diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000000000000000000000000000000000000..5b53d80d4c201a794b933a02d5772baa83b9d6cb
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,13 @@
+---
+Language:          Cpp
+BasedOnStyle:      Google
+BreakBeforeBraces: GNU
+SpaceBeforeParens: Never
+AllowShortLoopsOnASingleLine: false
+AllowShortIfStatementsOnASingleLine: false
+AlignConsecutiveAssignments: true
+ColumnLimit:       135
+ReflowComments:    true
+SortUsingDeclarations: true
+...
+
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..6a36a2cc477443c7a1256e8058cff01bc7b5930d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,92 @@
+
+
+#eclipse stuff
+.cproject
+.dep/
+.project
+.settings/
+.pydevproject
+
+#OS
+.gdb_history
+.DS_Store
+
+#binaries
+build
+Arepo
+Arepo.dSYM
+Tenet
+Tenet.dSYM
+Gadget4
+Gadget4.dSYM
+.build*/
+*.o
+*.check
+/extlibs
+
+doxygen
+WARNINGS
+processes_*.txt
+
+*~
+*.o*
+*.e*
+*\#
+*.sh
+
+#potentially large files
+*.h5
+*.hdf5
+*.dat
+*.bin
+
+#media files
+*.mp4
+*.png
+*.eps
+*.jpg
+*.jpeg
+*.pdf
+*.out
+
+*.aux
+*.toc
+*.log
+
+#Config files
+/Config.sh
+/Makefile.systype
+/param.txt
+/parameter.txt
+/Run1.tex
+/param.tex
+/param.param
+/*.param
+
+
+#Output folders
+/tests
+/out
+/**/output*/
+balance.txt
+cpu.txt
+info.txt
+domain.txt
+memory.txt
+timebins.txt
+timings.txt
+energy.txt
+cpu.csv
+*-usedvalues
+**/restartfiles/
+
+
+# special files in test directory
+tests/Aquarius/Aq-A-2-cont/Tree-PM-O3/*.txt
+tests/ICs
+
+# symbolic links in repository
+tests/MilliMillHyper/output
+tests/MilliMillHyper/output_tree
+tests/EuclidTest/G4_TreePM/output
+site/
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..5878855fc02759f333b9c61c7ed2d253332b3984
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,521 @@
+#/*******************************************************************************
+# * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+# * \copyright   by Volker Springel. Copyright (C) 2014, 2015 by Volker Springel
+# * \copyright   (volker.springel@h-its.org) and all contributing authors.
+# *******************************************************************************/
+#
+# You might be looking for the compile-time Makefile options of the
+# code if you are familiar with Gadget2...
+#
+# They have moved to a separate file.
+#
+# To build the code, do the following:
+#
+#  (1) Copy the file "Template-Config.sh"  to  "Config.sh"
+#
+#        cp Template-Config.sh Config.sh
+#
+#  (2) Edit "Config.sh" as needed for your application
+#
+#  (3) Run "make"
+#
+#
+#  New compile-time options should be added to the
+#  file "Template-Config.sh" only. Usually, they should be added
+#  there in the disabled/default version.
+#
+#  "Config.sh" should not be checked in to the repository.
+#
+#  Note: It is possible to override the default name of the
+#  Config.sh file, if desired, as well as the name of the
+#  executable. For example:
+#
+#   make CONFIG=MyNewConf.sh EXEC=Gadget4_new
+#
+#-----------------------------------------------------------------
+#
+# You might also be looking for the target system SYSTYPE option
+#
+# It has also moved to a separate file.
+#
+# To build the code, do the following:
+#
+# (A) set the SYSTYPE variable in your .bashrc (or similar file):
+#
+#        e.g. export SYSTYPE=Magny
+# or
+#
+# (B) set SYSTYPE in Makefile.systype
+#     This file has priority over your shell variable:
+#
+#    (1) Copy the file "Template-Makefile.systype"  to  "Makefile.systype"
+#
+#        cp Template-Makefile.systype Makefile.systype
+#
+#    (2) Uncomment your system in  "Makefile.systype".
+#
+# For the chosen system type, an if-clause should be defined below,
+# loading short definitions of library path names and/or compiler
+# names and options from the buildsystem/ directory. A new system
+# type should also be added to Template-Makefile.systype
+#
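+# For example, a minimal clause (with "MyCluster" as a placeholder system
+# name) could simply reuse the generic compiler and path definitions that
+# already ship in buildsystem/:
+#
+#   ifeq ($(SYSTYPE),"MyCluster")
+#   include buildsystem/Makefile.comp.gcc
+#   include buildsystem/Makefile.path.default
+#   endif
+#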
+
+
+ifdef DIR
+EXEC = $(DIR)/Gadget4
+CONFIG = $(DIR)/Config.sh
+BUILD_DIR = $(DIR)/build
+else
+EXEC   = Gadget4
+CONFIG   = Config.sh
+BUILD_DIR = build
+endif
+
+
+SRC_DIR = src
+
+###################
+#determine SYSTYPE#
+###################
+ifdef SYSTYPE
+SYSTYPE := "$(SYSTYPE)"
+-include Makefile.systype
+else
+include Makefile.systype
+endif
+
+
+
+
+$(info Build configuration:)
+$(info SYSTYPE: $(SYSTYPE))
+$(info CONFIG: $(CONFIG))
+$(info EXEC: $(EXEC))
+$(info )
+
+
+PYTHON   = /usr/bin/python
+
+RESULT     := $(shell CONFIG=$(CONFIG) PYTHON=$(PYTHON) BUILD_DIR=$(BUILD_DIR) SRC_DIR=$(SRC_DIR) CURDIR=$(CURDIR) make -f buildsystem/Makefile.config)
+$(info $(RESULT))
+CONFIGVARS := $(shell cat $(BUILD_DIR)/gadgetconfig.h)
+
+RESULT     := $(shell SRC_DIR=$(SRC_DIR) BUILD_DIR=$(BUILD_DIR) ./buildsystem/git_version.sh)
+
+##########################
+#define available Systems#
+##########################
+ifeq ($(SYSTYPE),"Generic-gcc")
+include buildsystem/Makefile.gen.libs
+include buildsystem/Makefile.comp.gcc
+endif
+ifeq ($(SYSTYPE),"Generic-intel")
+include buildsystem/Makefile.comp.gcc-paranoia
+include buildsystem/Makefile.gen.libs
+endif
+
+ifeq ($(SYSTYPE),"SuperMUC-NG")
+include buildsystem/Makefile.comp.supermuc-ng
+include buildsystem/Makefile.path.supermuc-ng
+endif
+
+ifeq ($(SYSTYPE),"SuperMUC-NG-OpenMPI")
+include buildsystem/Makefile.comp.supermuc-ng-openmpi
+include buildsystem/Makefile.path.supermuc-ng
+endif
+
+ifeq ($(SYSTYPE),"SuperMUC-NG-GCC")
+include buildsystem/Makefile.comp.supermuc-ng-gcc
+include buildsystem/Makefile.path.supermuc-ng-gcc
+endif
+
+ifeq ($(SYSTYPE), "Generic-gcc-single")
+include buildsystem/Makefile.comp.gcc
+include buildsystem/Makefile.gen.singlelibs
+endif
+
+ifeq ($(SYSTYPE), "Generic-intel-single")
+include buildsystem/Makefile.comp.gcc-paranoia
+include buildsystem/Makefile.gen.singlelibs
+endif
+
+ifeq ($(SYSTYPE),"Darwin")
+include buildsystem/Makefile.comp.gcc
+include buildsystem/Makefile.path.macports
+endif
+
+ifeq ($(SYSTYPE),"Magny")
+include buildsystem/Makefile.comp.gcc
+include buildsystem/Makefile.path.magny
+endif
+
+ifeq ($(SYSTYPE),"Freya")
+include buildsystem/Makefile.comp.freya
+include buildsystem/Makefile.path.freya
+endif
+
+#module load  gcc/7.2    gsl/2.2  hdf5-serial/gcc/1.8.18   fftw/gcc/3.3.6   
+ifeq ($(SYSTYPE),"FreyaOpenMPI")
+include buildsystem/Makefile.comp.freyaopenmpi
+include buildsystem/Makefile.path.freya
+endif
+
+
+ifeq ($(SYSTYPE),"Cobra")
+include buildsystem/Makefile.comp.cobra
+include buildsystem/Makefile.path.cobra
+endif
+
+ifeq ($(SYSTYPE),"RavenOpenMPI")
+include buildsystem/Makefile.comp.ravenopenmpi
+include buildsystem/Makefile.path.cobra
+endif
+
+ifeq ($(SYSTYPE),"CobraOpenMPI")
+include buildsystem/Makefile.comp.cobraopenmpi
+include buildsystem/Makefile.path.cobra
+endif
+
+ifeq ($(SYSTYPE),"Haswell")
+include buildsystem/Makefile.comp.gcc
+include buildsystem/Makefile.path.haswell
+endif
+
+ifeq ($(SYSTYPE),"gcc-paranoia")
+include buildsystem/Makefile.comp.gcc-paranoia
+include buildsystem/Makefile.path.mpa_desktop
+endif
+
+
+ifeq ($(SYSTYPE),"libs")
+include buildsystem/Makefile.comp.gcc
+include buildsystem/Makefile.path.libs
+endif
+
+ifeq ($(SYSTYPE),"hydra")
+include buildsystem/Makefile.comp.gcc
+include buildsystem/Makefile.path.hydra
+endif
+
+ifeq ($(SYSTYPE),"bwforcluster")
+include buildsystem/Makefile.comp.gcc
+include buildsystem/Makefile.path.bwforcluster
+endif
+
+
+ifndef LINKER
+LINKER = $(CPP)
+endif
+
+##########################################
+#determine the needed object/header files#
+##########################################
+SUBDIRS += .
+
+
+SUBDIRS += main
+OBJS    += main/begrun.o main/init.o main/main.o main/run.o
+INCL    += main/main.h main/simulation.h
+
+
+SUBDIRS += data
+OBJS    += data/mymalloc.o data/allvars.o data/test_symtensors.o
+INCL    += data/allvars.h data/dtypes.h data/mymalloc.h data/idstorage.h data/symtensors.h \
+           data/intposconvert.h data/constants.h data/simparticles.h \
+           data/macros.h data/particle_data.h data/sph_particle_data.h \
+           data/lightcone_particle_data.h data/lightcone_massmap_data.h data/lcparticles.h data/mmparticles.h
+
+
+SUBDIRS += domain
+OBJS    += domain/domain.o domain/domain_balance.o domain/domain_box.o \
+           domain/domain_exchange.o domain/domain_toplevel.o
+INCL    += domain/domain.h
+
+
+SUBDIRS += io
+OBJS    += io/hdf5_util.o io/snap_io.o io/parameters.o \
+           io/restart.o io/io.o io/test_io_bandwidth.o
+INCL    += io/io.h io/hdf5_util.h io/snap_io.h io/parameters.h \
+	         io/restart.h io/io_streamcount.h io/test_io_bandwidth.h
+
+
+SUBDIRS += logs
+OBJS    += logs/logs.o
+INCL    += logs/logs.h logs/timer.h
+
+SUBDIRS += gitversion
+INCL    += gitversion/version.h
+
+
+SUBDIRS += mpi_utils
+OBJS    += mpi_utils/hypercube_allgatherv.o mpi_utils/mpi_types.o mpi_utils/mpi_vars.o mpi_utils/sums_and_minmax.o \
+           mpi_utils/sizelimited_sendrecv.o mpi_utils/myalltoall.o mpi_utils/allreduce_sparse_double_sum.o mpi_utils/healthtest.o \
+           mpi_utils/allreduce_debugcheck.o mpi_utils/shared_mem_handler.o 
+INCL    += mpi_utils/mpi_utils.h mpi_utils/generic_comm.h mpi_utils/shared_mem_handler.h
+
+
+SUBDIRS += pm
+OBJS    += pm/pm_nonperiodic.o pm/pm_periodic.o \
+           pm/pm_mpi_fft.o
+INCL    += pm/pm.h pm/pm_mpi_fft.h pm/pm_periodic.h pm/pm_nonperiodic.h
+
+
+SUBDIRS += vectorclass
+OBJS    += vectorclass/instrset_detect.o
+INCL    +=
+
+
+SUBDIRS += sort
+OBJS    += sort/peano.o
+INCL    += sort/peano.h sort/cxxsort.h sort/parallel_sort.h
+
+
+SUBDIRS += sph
+OBJS    += sph/density.o sph/hydra.o sph/init_entropy.o sph/artificial_viscosity.o
+INCL    += sph/kernel.h sph/sph.h
+
+SUBDIRS += system
+OBJS    += system/pinning.o system/system.o
+INCL    += system/system.h system/pinning.h
+
+
+SUBDIRS += time_integration
+OBJS    += time_integration/driftfac.o time_integration/kicks.o \
+           time_integration/predict.o time_integration/timestep.o \
+           time_integration/timestep_treebased.o
+INCL    += time_integration/timestep.h time_integration/driftfac.h
+
+
+SUBDIRS += gravity
+OBJS    += gravity/gravity.o gravity/ewald.o gravity/ewald_test.o \
+           gravity/grav_forcetest.o \
+           gravity/grav_direct.o gravity/second_order_ics.o
+INCL    += gravity/ewald.h gravity/ewaldtensors.h gravity/grav_forcetest.h
+
+
+SUBDIRS += tree
+OBJS    += tree/tree.o
+INCL    += tree/tree.h
+
+
+SUBDIRS += gravtree
+OBJS    += gravtree/gravtree_build.o gravtree/gravtree.o gravtree/gwalk.o
+INCL    += gravtree/gravtree.h  gravtree/gwalk.h  
+
+
+SUBDIRS += ngbtree
+OBJS    += ngbtree/ngbtree_build.o 
+INCL    += ngbtree/ngbtree.h
+
+
+ifeq (COOLING,$(findstring COOLING,$(CONFIGVARS)))
+OBJS    += cooling_sfr/cooling.o cooling_sfr/sfr_eos.o cooling_sfr/starformation.o
+INCL    += cooling_sfr/cooling.h
+SUBDIRS += cooling_sfr
+endif
+
+
+ifeq (FOF,$(findstring FOF,$(CONFIGVARS)))
+OBJS    += fof/fof.o fof/fof_findgroups.o fof/fof_nearest.o fof/fof_io.o fof/foftree_build.o
+INCL    += fof/fof.h  fof/fof_io.h  fof/foftree.h
+SUBDIRS += fof
+endif
+
+
+ifeq (SUBFIND,$(findstring SUBFIND,$(CONFIGVARS)))
+OBJS	+= subfind/subfind.o subfind/subfind_treepotential.o \
+           subfind/subfind_processing.o subfind/subfind_density.o subfind/subfind_distribute.o subfind/subfind_findlinkngb.o \
+           subfind/subfind_nearesttwo.o subfind/subfind_properties.o subfind/subfind_unbind.o subfind/subfind_history.o \
+           subfind/subfind_so.o subfind/subfind_readid_io.o subfind/subfind_orphanids.o subfind/subfind_excursionset.o 
+INCL	+= subfind/subfind.h subfind/subfind_readid_io.h
+SUBDIRS += subfind
+endif
+
+
+ifeq (FMM,$(findstring FMM,$(CONFIGVARS)))
+SUBDIRS += fmm
+OBJS    += fmm/fmm.o
+INCL    += fmm/fmm.h
+endif
+
+
+ifeq (MERGERTREE,$(findstring MERGERTREE,$(CONFIGVARS)))
+OBJS    += mergertree/descendant.o mergertree/io_descendant.o mergertree/io_progenitors.o \
+           mergertree/postproc_descendants.o mergertree/io_readsnap.o mergertree/halotrees.o \
+           mergertree/io_halotrees.o mergertree/io_treelinks.o mergertree/io_readtrees_mbound.o \
+           mergertree/rearrange.o
+INCL    += mergertree/mergertree.h mergertree/io_descendant.h mergertree/io_progenitors.h \
+           mergertree/io_readsnap.h mergertree/io_treelinks.h mergertree/io_halotrees.h \
+           mergertree/io_readtrees_mbound.h
+SUBDIRS += mergertree
+endif
+
+
+ifeq (LIGHTCONE,$(findstring LIGHTCONE,$(CONFIGVARS)))
+SUBDIRS += lightcone
+OBJS    += lightcone/lightcone.o lightcone/lightcone_particle_io.o lightcone/lightcone_massmap_io.o      
+INCL    += lightcone/lightcone.h lightcone/lightcone_particle_io.h lightcone/lightcone_massmap_io.h 
+endif
+
+
+ifeq (LIGHTCONE_MASSMAPS,$(findstring LIGHTCONE_MASSMAPS,$(CONFIGVARS)))
+MAPS_LIBS += -lchealpix -lcfitsio #-lcurl
+endif
+
+ifeq (LIGHTCONE_PARTICLES,$(findstring LIGHTCONE_PARTICLES,$(CONFIGVARS)))
+MAPS_LIBS += -lchealpix -lcfitsio #-lcurl
+endif
+
+
+ifeq (NGENIC,$(findstring NGENIC,$(CONFIGVARS)))
+OBJS    += ngenic/ngenic.o ngenic/power.o ngenic/grid.o
+INCL    += ngenic/ngenic.h
+SUBDIRS += ngenic
+endif
+
+
+ifeq (DEBUG_MD5,$(findstring DEBUG_MD5,$(CONFIGVARS)))
+SUBDIRS += debug_md5
+OBJS    += debug_md5/calc_checksum.o debug_md5/Md5.o
+INCL    += debug_md5/Md5.h
+endif
+
+
+################################
+#determine the needed libraries#
+################################
+
+# we only need fftw if PMGRID is turned on
+ifeq (PMGRID, $(findstring PMGRID, $(CONFIGVARS)))
+ifeq (DOUBLEPRECISION_FFTW,$(findstring DOUBLEPRECISION_FFTW,$(CONFIGVARS)))  # test for double precision libraries
+FFTW_LIBS += -lfftw3
+else
+FFTW_LIBS += -lfftw3f
+endif
+else
+ifeq (NGENIC, $(findstring NGENIC, $(CONFIGVARS)))
+ifeq (DOUBLEPRECISION_FFTW,$(findstring DOUBLEPRECISION_FFTW,$(CONFIGVARS)))  # test for double precision libraries
+FFTW_LIBS += -lfftw3
+else
+FFTW_LIBS += -lfftw3f
+endif
+else
+
+endif
+endif
+
+ifeq (FORCETEST, $(findstring FORCETEST, $(CONFIGVARS)))
+FFTW_LIBS += -lfftw3
+endif
+
+
+HWLOC_LIBS  += -lhwloc
+
+ifneq (IMPOSE_PINNING,$(findstring IMPOSE_PINNING,$(CONFIGVARS)))
+HWLOC_INCL =
+HWLOC_LIBS =
+endif
+
+ifneq (VTUNE_INSTRUMENT,$(findstring VTUNE_INSTRUMENT,$(CONFIGVARS)))
+VTUNE_INCL =
+VTUNE_LIBS =
+endif
+
+GSL_LIBS   += -lgsl -lgslcblas
+HDF5_LIBS  += -lhdf5 -lz
+MATH_LIBS  = -lm
+
+MAKEFILES = $(MAKEFILE_LIST) buildsystem/Makefile.config
+
+##########################
+#combine compiler options#
+##########################
+
+CFLAGS = $(OPTIMIZE) $(OPT) $(HDF5_INCL) $(GSL_INCL) $(FFTW_INCL) $(HWLOC_INCL) $(VTUNE_INCL) $(MAPS_INCL) -I$(BUILD_DIR) -I$(SRC_DIR)
+
+LIBS = $(MATH_LIBS) $(HDF5_LIBS) $(GSL_LIBS) $(FFTW_LIBS) $(HWLOC_LIBS) $(VTUNE_LIBS) $(TEST_LIBS) $(MAPS_LIBS)
+
+
+SUBDIRS := $(addprefix $(BUILD_DIR)/,$(SUBDIRS))
+OBJS := $(addprefix $(BUILD_DIR)/,$(OBJS)) $(BUILD_DIR)/compile_time_info.o $(BUILD_DIR)/compile_time_info_hdf5.o $(BUILD_DIR)/version.o
+INCL := $(addprefix $(SRC_DIR)/,$(INCL)) $(BUILD_DIR)/gadgetconfig.h
+
+
+TO_CHECK := $(addsuffix .check, $(OBJS) $(patsubst $(SRC_DIR)%, $(BUILD_DIR)%, $(INCL)) )
+TO_CHECK +=  $(BUILD_DIR)/Makefile.check
+CONFIG_CHECK = $(BUILD_DIR)/$(notdir $(CONFIG)).check
+DOCS_CHECK = $(BUILD_DIR)/config.check  $(BUILD_DIR)/param.check
+
+
+################
+#create subdirs#
+################
+RESULT := $(shell mkdir -p $(SUBDIRS)  )
+
+###########################################
+#create info file for command line options#
+###########################################
+RESULT := $(shell echo 'static const char *compiler_flags="$(CPP) $(CFLAGS)";' > $(BUILD_DIR)/compiler-command-line-args.h )
+
+#############
+#build rules#
+#############
+
+all: check_docs check build
+
+build: $(EXEC)
+
+$(EXEC): $(OBJS)
+	$(LINKER) $(OPTIMIZE) $(OBJS) $(LIBS) -o $(EXEC)
+
+clean:
+	rm -f $(OBJS) $(EXEC)
+	rm -f $(BUILD_DIR)/compile_time_info.cc $(BUILD_DIR)/compile_time_info_hdf5.cc $(BUILD_DIR)/gadgetconfig.h
+	rm -f $(TO_CHECK) $(CONFIG_CHECK)
+	rm -f $(BUILD_DIR)/version.cc
+
+$(BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp $(INCL) $(MAKEFILES)
+	$(CPP) $(CFLAGS) -c $< -o $@
+
+$(BUILD_DIR)/%.o: $(SRC_DIR)/%.cc $(INCL) $(MAKEFILES)
+	$(CPP) $(CFLAGS) -c $< -o $@
+
+$(BUILD_DIR)/compile_time_info.o: $(BUILD_DIR)/compile_time_info.cc $(MAKEFILES)
+	$(CPP) $(CFLAGS) -c $< -o $@
+
+$(BUILD_DIR)/compile_time_info_hdf5.o: $(BUILD_DIR)/compile_time_info_hdf5.cc $(MAKEFILES)
+	$(CPP) $(CFLAGS) -c $< -o $@
+
+
+check: $(CONFIG_CHECK)
+
+check_docs: $(DOCS_CHECK)
+
+$(CONFIG_CHECK): $(TO_CHECK) $(CONFIG) buildsystem/check.py
+	@$(PYTHON) buildsystem/check.py 2 $(CONFIG) $(CONFIG_CHECK) defines_extra $(TO_CHECK)
+
+$(BUILD_DIR)/%.o.check: $(SRC_DIR)/%.cpp Template-Config.sh defines_extra buildsystem/check.py
+	@$(PYTHON) buildsystem/check.py 1 $< $@ Template-Config.sh defines_extra
+
+$(BUILD_DIR)/%.o.check: $(SRC_DIR)/%.cc Template-Config.sh defines_extra buildsystem/check.py
+	@$(PYTHON) buildsystem/check.py 1 $< $@ Template-Config.sh defines_extra
+
+$(BUILD_DIR)/%.h.check: $(SRC_DIR)/%.h Template-Config.sh defines_extra buildsystem/check.py
+	@$(PYTHON) buildsystem/check.py 1 $< $@ Template-Config.sh defines_extra
+
+$(BUILD_DIR)/%.o.check: $(BUILD_DIR)/%.cc Template-Config.sh defines_extra buildsystem/check.py
+	@$(PYTHON) buildsystem/check.py 1 $< $@ Template-Config.sh defines_extra
+
+$(BUILD_DIR)/%.h.check: $(BUILD_DIR)/%.h Template-Config.sh defines_extra buildsystem/check.py
+	@$(PYTHON) buildsystem/check.py 1 $< $@ Template-Config.sh defines_extra
+
+$(BUILD_DIR)/Makefile.check: Makefile Template-Config.sh defines_extra buildsystem/check.py
+	@$(PYTHON) buildsystem/check.py 3 $< $@ Template-Config.sh defines_extra
+
+$(BUILD_DIR)/config.check: documentation/04_config-options.md Template-Config.sh buildsystem/check.py
+	@$(PYTHON) buildsystem/check.py 4 Template-Config.sh $@  $<
+
+$(BUILD_DIR)/param.check: documentation/05_parameterfile.md $(SRC_DIR)/io/parameters.cc buildsystem/check.py
+	@$(PYTHON) buildsystem/check.py 5 $(SRC_DIR)/data/allvars.cc $@  $<
+
+.PHONY: all check build clean
diff --git a/Template-Config.sh b/Template-Config.sh
new file mode 100644
index 0000000000000000000000000000000000000000..71f8cf7625cde1362b22c4adc7f0fad2ba515ed9
--- /dev/null
+++ b/Template-Config.sh
@@ -0,0 +1,206 @@
+
+##################################################
+#  Enable/Disable compile-time options as needed #
+##################################################
+
+
+
+#----------------------------------------Parallelization options
+
+IMPOSE_PINNING
+#IMPOSE_PINNING_OVERRIDE_MODE
+#EXPLICIT_VECTORIZATION         # This uses AVX at selected places through the vectorclass C++ library
+#PRESERVE_SHMEM_BINARY_INVARIANCE
+#SIMPLE_DOMAIN_AGGREGATION
+
+#--------------------------------------- Basic operation mode of code
+PERIODIC
+#TWODIMS
+#ONEDIMS
+#LONG_X_BITS=2
+#LONG_Y_BITS=2
+#LONG_Z_BITS=1
+
+#NTYPES=6                       # Number of particle types. Note that this may only be changed from the default value of 6 if
+                                # HDF5 snapshot files are used.
+
+#GADGET2_HEADER                 # allows reading in of Gadget2/3 snapshots by using this code's header format for snapshot file formats 1 and 2
+#SECOND_ORDER_LPT_ICS           # treats second-order LPT ICs generated with Adrian Jenkins' code
+#LEAN                           # selects a special 'lean' mode of code operation, which is optimized for extreme memory saving
+
+#--------------------------------------- Gravity calculation
+SELFGRAVITY                     # switch on for self-gravity
+HIERARCHICAL_GRAVITY
+#FMM                            # enables FMM tree algorithm
+ALLOW_DIRECT_SUMMATION          # allows calculation of small number of particles with direct summation
+#EXTERNALGRAVITY                # switch on for external potential
+#EVALPOTENTIAL                  # computes gravitational potential
+#EXTRAPOTTERM                   # if this is set, the extra multipole term needed in principle for the potential is computed even though it does not enter the force
+#EXTRA_HIGH_EWALD_ACCURACY      # if this is activated, a third-order Taylor expansion is used for the Ewald corrections
+ 
+#MULTIPOLE_ORDER=2              # multipole order used in Tree and/or FMM: 2 gives monopoles+dipoles (the default), 3 quadrupoles, 4 octupoles, and 5 hexadecupoles
+#RANDOMIZE_DOMAINCENTER
+
+#--------------------------------------- TreePM Options
+PMGRID=512
+#ASMTH=1.25
+#RCUT=6.0
+#NTAB=128                       # size of short-range look-up table
+#TREEPM_NOTIMESPLIT             # if this is activated, long-range and short-range gravity are time-integrated on a single timestep
+#PLACEHIGHRESREGION=2
+#HRPMGRID=512                   # High-res PM grid size (optional, default is HRPMGRID=PMGRID)
+
+#FFT_COLUMN_BASED
+#PM_ZOOM_OPTIMIZED
+#GRAVITY_TALLBOX=2              # this can be used to set up gravity for boxes that are periodic in only two dimensions
+#TREE_NUM_BEFORE_NODESPLIT=10   # Optional number that tells how many particles are allowed in a tree node before it is split
+
+#--------------------------------------- Treatment of gravitational softening
+#INDIVIDUAL_GRAVITY_SOFTENING=4+8+16+32
+#NSOFTCLASSES=4                   # Number of different softening values. Normally equal to number of particle types, but can be chosen differently if DECOUPLE_TYPE_AND_SOFTTYPE is set.
+#ADAPTIVE_HYDRO_SOFTENING
+
+
+
+#--------------------------------------- SPH treatment and formulation
+#PRESSURE_ENTROPY_SPH           # enables the Hopkins 2013 pressure-entropy formulation
+#OUTPUT_PRESSURE_SPH_DENSITY    # also output the density computed in the pressure-entropy formulation
+#GAMMA=1.4
+#ISOTHERM_EQS
+#REUSE_HYDRO_ACCELERATIONS_FROM_PREVIOUS_STEP
+IMPROVED_VELOCITY_GRADIENTS # use higher-order gradients of the velocity according to Hu et al. (2014)
+VISCOSITY_LIMITER_FOR_LARGE_TIMESTEPS # limits the acceleration due to the viscosity
+
+#--------------------------------------- SPH kernels
+CUBIC_SPLINE_KERNEL           # uses the cubic spline kernel
+#WENDLAND_C2_KERNEL           # the Wendland C2 kernel from Dehnen & Aly 2012
+#WENDLAND_C4_KERNEL           # the Wendland C4 kernel from Dehnen & Aly 2012
+#WENDLAND_C6_KERNEL           # the Wendland C6 kernel from Dehnen & Aly 2012
+WENDLAND_BIAS_CORRECTION      # reduces self-contribution for Wendland kernels according to Dehnen & Aly 2012
+
+
+
+#--------------------------------------- SPH viscosity options
+#TIMEDEP_ART_VISC               # Enables time-dependent viscosity
+#HIGH_ART_VISC_START            # Start with high rather than low viscosity
+#NO_SHEAR_VISCOSITY_LIMITER     # Turns off the shear viscosity suppression
+OUTPUT_VISCOSITY_PARAMETER
+
+
+#--------------------------------------- Extra physics
+#COOLING
+#STARFORMATION
+
+
+#--------------------------------------- Time integration options
+#FORCE_EQUAL_TIMESTEPS          # this chooses a variable but global timestep
+
+
+#---------------------------------------- Single/double precision and data types
+POSITIONS_IN_32BIT              # if set, use 32-bit integers for positions
+POSITIONS_IN_64BIT              # if set, use 64-bit integers for positions
+POSITIONS_IN_128BIT             # if set, use 128-bit integers for positions
+DOUBLEPRECISION=1
+DOUBLEPRECISION_FFTW
+#OUTPUT_IN_DOUBLEPRECISION      # output files will be written in double precision
+#ENLARGE_DYNAMIC_RANGE_IN_TIME  # This extends the dynamic range of the integer timeline from 32 to 64 bit
+#IDS_32BIT                      # Selects 32-bit IDs for internal storage (default)
+#IDS_48BIT                      # Selects 48-bit IDs for internal storage
+#IDS_64BIT                      # Selects 64-bit IDs for internal storage
+#USE_SINGLEPRECISION_INTERNALLY  # reduces default double precision for most internal computations to single precision
+
+#---------------------------------------- Output/Input options
+#REDUCE_FLUSH
+#OUTPUT_VELOCITY_GRADIENT
+#OUTPUT_PRESSURE
+#OUTPUT_ENTROPY
+#OUTPUT_CHANGEOFENTROPY
+#OUTPUT_POTENTIAL
+#OUTPUT_ACCELERATION
+#OUTPUT_TIMESTEP
+#OUTPUT_DIVVEL                  # output  velocity divergence
+#OUTPUT_CURLVEL                 # output  velocity curl
+#OUTPUT_COOLHEAT                # output actual energy loss/gain in cooling/heating routine
+#POWERSPEC_ON_OUTPUT
+#OUTPUT_NON_SYNCHRONIZED_ALLOWED
+#OUTPUT_VELOCITIES_IN_HALF_PRECISION
+#OUTPUT_ACCELERATIONS_IN_HALF_PRECISION
+#OUTPUT_COORDINATES_AS_INTEGERS
+#ALLOW_HDF5_COMPRESSION
+
+#---------------------------------------- On the fly FOF groupfinder
+#FOF                            # enable FoF output
+#FOF_PRIMARY_LINK_TYPES=2       # 2^type for the primary dark matter type
+#FOF_SECONDARY_LINK_TYPES=1+16+32  # 2^type for the types linked to nearest primaries
+##FOF_GROUP_MIN_LEN=32          # default is 32
+##FOF_LINKLENGTH=0.16           # Linking length for FoF (default=0.2)
+#FOF_ALLOW_HUGE_GROUPLENGTH     # if this is set, groups and subhalos may contain more than 2 billion particles
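+# (The *_LINK_TYPES values above are bit masks over the particle types, with
+#  type t contributing 2^t: e.g. FOF_PRIMARY_LINK_TYPES=2 selects type 1, while
+#  1+16+32 = 2^0+2^4+2^5 selects types 0, 4 and 5.)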
+
+#---------------------------------------- Subfind
+SUBFIND                        # enables substructure finder
+#SUBFIND_STORE_LOCAL_DENSITY    # will calculate local densities and velocity dispersions for *all* particles
+                                # selected by FOF primary and secondary link types (not only for particles in groups),
+                                # and store them in snapshots
+#SUBFIND_ORPHAN_TREATMENT       # creates special snapshots with formerly most bound particles
+#SUBFIND_HBT                    # use previous subhalo catalogue instead of density excursions to define subhalo candidates
+
+#---------------------------------------- Merger tree code
+MERGERTREE                      # enables on-the-fly calculation of descendants, and merger tree construction from group catalogues
+
+#---------------------------------------- On-the-fly lightcone creation
+#LIGHTCONE
+#LIGHTCONE_PARTICLES
+#LIGHTCONE_MASSMAPS
+#LIGHTCONE_PARTICLES_GROUPS
+#LIGHTCONE_OUTPUT_ACCELERATIONS
+#LIGHTCONE_IMAGE_COMP_HSML_VELDISP
+
+#REARRANGE_OPTION
+
+#--------------------------------------- IC generation
+#NGENIC=256                     # generate cosmological ICs, set NGENIC to the FFT grid size used for IC generation
+#CREATE_GRID                    # start with a regular cartesian DM particle grid, instead of reading a glass file (for NGENIC)
+#GENERATE_GAS_IN_ICS            # add SPH particles to dark matter only ICs
+#SPLIT_PARTICLE_TYPE=4+8        # particle types to be split if GENERATE_GAS_IN_ICS is activated
+#NGENIC_FIX_MODE_AMPLITUDES     # when activated, this leaves the mode amplitudes at sqrt(P(k)), instead of sampling from a Rayleigh distribution
+#NGENIC_MIRROR_PHASES           # if this is activated, all phases are turned by 180 degrees
+#NGENIC_2LPT                    # applies 2LPT instead of just Zeldovich
+#NGENIC_TEST                    # can be used to create ICs, measure the power spectrum, and then stop
+
+#---------------------------------------- MPI related settings
+#ISEND_IRECV_IN_DOMAIN
+#USE_MPIALLTOALLV_IN_DOMAINDECOMP
+#MPI_HYPERCUBE_ALLGATHERV       # some MPI-libraries may use quite a bit of internal storage for MPI_Allgatherv. This uses hypercubes instead as a work-around
+#MPI_MESSAGE_SIZELIMIT_IN_MB=200
+#NUMPART_PER_TASK_LARGE         # Set this if the number of particles per task is very large (so that more than 2GB of data is comprised just by the particle data)
+
+#MAX_NUMBER_OF_RANKS_WITH_SHARED_MEMORY=64
+#NUMBER_OF_MPI_LISTENERS_PER_NODE=1
+
+#---------------------------------------- Testing and Debugging options
+DEBUG                           # enables core-dumps
+#DEBUG_ENABLE_FPU_EXCEPTIONS    # tries to enable FPU exceptions
+#DEBUG_SYMTENSORS               # carries out a few unit tests related to the tensor algebra routines of the code
+HOST_MEMORY_REPORTING           # reports after start-up the available system memory by analyzing /proc/meminfo
+#FORCETEST=0.001                # calculates direct summation forces for the given fraction of particles to check the accuracy of the tree force
+#FORCETEST_FIXEDPARTICLESET     # if this is set, always the same particles receive a force-accuracy check during a run
+#FORCETEST_TESTFORCELAW=1       # Special option for measuring the effective force law, can be set to 1 or 2 for TreePM+HighRes
+#VTUNE_INSTRUMENT
+#DEBUG_MD5                      # make MD5 check sum routines available for debugging
+#SQUASH_TEST
+#DOMAIN_SPECIAL_CHECK
+#RECREATE_UNIQUE_IDS            # Before carrying out the ID uniqueness test, this sets new IDs. Can be used if IC files contain broken IDs.
+#NO_STOP_BELOW_MINTIMESTEP      # do not stop when the code wants to go below minimum timestep
+#TILING=2                       # duplicates the ICs in each dimension
+#DO_NOT_PRODUCE_BIG_OUTPUT      # for scaling tests, one may disable the writing of restart files and snapshot dumps
+#MEASURE_TOTAL_MOMENTUM
+#ENABLE_HEALTHTEST
+#EWALD_TEST                     # a development test for the Ewald tables
+#STOP_AFTER_STEP=10             # ends a simulation after the specified timestep (to simplify scalability tests)
+
+#TREE_NO_SAFETY_BOX             # when set, this disables the geometric 'near node' protection
+
+#---------------------------------------- Postprocessing options
+
+#LGALAXIES                      # Semi-analytic L-Galaxies code
+
diff --git a/Template-Makefile.systype b/Template-Makefile.systype
new file mode 100644
index 0000000000000000000000000000000000000000..735b4b6b6ff65c6a723bbe58972446a3dd38ebc6
--- /dev/null
+++ b/Template-Makefile.systype
@@ -0,0 +1,17 @@
+# Select Target Computer 
+#
+# Please copy this file to Makefile.systype and uncomment your
+# system. Don't commit changes to this file unless you add support for
+# a new system.
+
+#SYSTYPE="Generic-gcc"
+#SYSTYPE="Generic-intel"
+#SYSTYPE="Generic-gcc-single"
+#SYSTYPE="Generic-intel-single"
+#SYSTYPE="Darwin"
+#SYSTYPE="Magny"
+#SYSTYPE="gcc-paranoia"	
+#SYSTYPE="libs"
+#SYSTYPE="hydra"
+#SYSTYPE="bwforcluster"
+
diff --git a/buildsystem/Makefile.comp.cobra b/buildsystem/Makefile.comp.cobra
new file mode 100644
index 0000000000000000000000000000000000000000..15a0110c2c46ff0694ff65655270fb36fbc9d306
--- /dev/null
+++ b/buildsystem/Makefile.comp.cobra
@@ -0,0 +1,16 @@
+CPP      =  mpiicpc  -std=c++11 # sets the C++-compiler
+OPTIMIZE =  -ggdb -O3 -march=native  -Wall -Wno-format-security
+
+ifeq (EXPLICIT_VECTORIZATION,$(findstring EXPLICIT_VECTORIZATION,$(CONFIGVARS)))
+CFLAGS_VECTOR += -mavx2  # enables generation of AVX instructions (used through vectorclass)
+CPV      =  $(CPP)
+else
+CFLAGS_VECTOR =
+CPV      =  $(CPP)
+endif
+
+ifeq (NUM_THREADS,$(findstring NUM_THREADS,$(CONFIGVARS)))
+OPTIMIZE +=  -fopenmp
+else
+OPTIMIZE +=  -Wno-unknown-pragmas
+endif
diff --git a/buildsystem/Makefile.comp.cobraopenmpi b/buildsystem/Makefile.comp.cobraopenmpi
new file mode 100644
index 0000000000000000000000000000000000000000..2a9430a4dc9abe85cfa9777feb0a63013c2e3079
--- /dev/null
+++ b/buildsystem/Makefile.comp.cobraopenmpi
@@ -0,0 +1,15 @@
+CPP      =  /u/vrs/Libs/openmpi-4.0.4/bin/mpicxx  -std=c++11 # sets the C++-Compiler
+OPTIMIZE =  -ggdb -O3  -Wall -Wno-format-security
+ifeq (EXPLICIT_VECTORIZATION,$(findstring EXPLICIT_VECTORIZATION,$(CONFIGVARS)))
+CFLAGS_VECTOR += -mavx2  # enables generation of AVX instructions (used through vectorclass)
+CPV      =  $(CPP)
+else
+CFLAGS_VECTOR =
+CPV      =  $(CPP)
+endif
+
+ifeq (NUM_THREADS,$(findstring NUM_THREADS,$(CONFIGVARS)))
+OPTIMIZE +=  -qopenmp
+else
+OPTIMIZE +=  -Wno-unknown-pragmas
+endif
diff --git a/buildsystem/Makefile.comp.freya b/buildsystem/Makefile.comp.freya
new file mode 100644
index 0000000000000000000000000000000000000000..15a0110c2c46ff0694ff65655270fb36fbc9d306
--- /dev/null
+++ b/buildsystem/Makefile.comp.freya
@@ -0,0 +1,16 @@
+CPP      =  mpiicpc  -std=c++11 # sets the C++-compiler
+OPTIMIZE =  -ggdb -O3 -march=native  -Wall -Wno-format-security
+
+ifeq (EXPLICIT_VECTORIZATION,$(findstring EXPLICIT_VECTORIZATION,$(CONFIGVARS)))
+CFLAGS_VECTOR += -mavx2  # enables generation of AVX instructions (used through vectorclass)
+CPV      =  $(CPP)
+else
+CFLAGS_VECTOR =
+CPV      =  $(CPP)
+endif
+
+ifeq (NUM_THREADS,$(findstring NUM_THREADS,$(CONFIGVARS)))
+OPTIMIZE +=  -fopenmp
+else
+OPTIMIZE +=  -Wno-unknown-pragmas
+endif
diff --git a/buildsystem/Makefile.comp.freyaopenmpi b/buildsystem/Makefile.comp.freyaopenmpi
new file mode 100644
index 0000000000000000000000000000000000000000..c4c0105565302e9be84e383ddae1c41e50e55817
--- /dev/null
+++ b/buildsystem/Makefile.comp.freyaopenmpi
@@ -0,0 +1,16 @@
+CPP      =  /freya/u/vrs/Libs/openmpi-4.0.1/bin/mpicxx  -std=c++11 # sets the C++-compiler
+OPTIMIZE =  -ggdb -O3 -march=native  -Wall -Wno-format-security
+
+ifeq (EXPLICIT_VECTORIZATION,$(findstring EXPLICIT_VECTORIZATION,$(CONFIGVARS)))
+CFLAGS_VECTOR += -mavx2  # enables generation of AVX instructions (used through vectorclass)
+CPV      =  $(CPP)
+else
+CFLAGS_VECTOR =
+CPV      =  $(CPP)
+endif
+
+ifeq (NUM_THREADS,$(findstring NUM_THREADS,$(CONFIGVARS)))
+OPTIMIZE +=  -fopenmp
+else
+OPTIMIZE +=  -Wno-unknown-pragmas
+endif
diff --git a/buildsystem/Makefile.comp.gcc b/buildsystem/Makefile.comp.gcc
new file mode 100644
index 0000000000000000000000000000000000000000..48e196c166958d635b974cff156cead60c4dcee5
--- /dev/null
+++ b/buildsystem/Makefile.comp.gcc
@@ -0,0 +1,16 @@
+CPP      =  mpicxx  -std=c++11 # sets the C++-compiler
+OPTIMIZE =  -ggdb -O3 -march=native  -Wall -Wno-format-security  # -Wdouble-promotion # -Wconversion    
+
+ifeq (EXPLICIT_VECTORIZATION,$(findstring EXPLICIT_VECTORIZATION,$(CONFIGVARS)))
+CFLAGS_VECTOR += -mavx -fabi-version=0  # enables generation of AVX instructions (used through vectorclass)
+CPV      =  $(CPP)
+else
+CFLAGS_VECTOR =
+CPV      =  $(CPP)
+endif
+
+ifeq (NUM_THREADS,$(findstring NUM_THREADS,$(CONFIGVARS)))
+OPTIMIZE +=  -fopenmp
+else
+OPTIMIZE +=  -Wno-unknown-pragmas
+endif
diff --git a/buildsystem/Makefile.comp.gcc-paranoia b/buildsystem/Makefile.comp.gcc-paranoia
new file mode 100644
index 0000000000000000000000000000000000000000..62a8b5c0165f5493801d16a2b71de8bb0b880947
--- /dev/null
+++ b/buildsystem/Makefile.comp.gcc-paranoia
@@ -0,0 +1,20 @@
+CPP      =  mpicxx -std=c++11 -Wwrite-strings -Wredundant-decls -Woverloaded-virtual -Wcast-qual -Wcast-align -Wpointer-arith -Wmissing-declarations # sets the C++-compiler
+OPTIMIZE =    -g -Wall -W -O3 -march=native
+
+# -fopenmp
+# -DDISABLE_MEMORY_MANAGER
+# -ffunction-sections -fdata-sections -Wl,--gc-sections -Wl,--print-gc-sections -Wl,--demangle
+
+ifeq (EXPLICIT_VECTORIZATION,$(findstring EXPLICIT_VECTORIZATION,$(CONFIGVARS)))
+CFLAGS_VECTOR += -mavx   # enables generation of AVX instructions (used through vectorclass)
+CPV      =  $(CPP)
+else
+CFLAGS_VECTOR =
+CPV      =  $(CPP)
+endif
+
+ifeq (NUM_THREADS,$(findstring NUM_THREADS,$(CONFIGVARS)))
+OPTIMIZE +=  -fopenmp
+else
+OPTIMIZE +=  -Wno-unknown-pragmas
+endif
diff --git a/buildsystem/Makefile.comp.ravenopenmpi b/buildsystem/Makefile.comp.ravenopenmpi
new file mode 100644
index 0000000000000000000000000000000000000000..f9892ce226fb3fd1dd2bef07a5eafed4403f3da7
--- /dev/null
+++ b/buildsystem/Makefile.comp.ravenopenmpi
@@ -0,0 +1,15 @@
+CPP      =  /u/vrs/Libs/openmpi-4.0.5/bin/mpicxx  -std=c++11 # sets the C++-Compiler
+OPTIMIZE =  -ggdb -O3  -Wall -Wno-format-security
+ifeq (EXPLICIT_VECTORIZATION,$(findstring EXPLICIT_VECTORIZATION,$(CONFIGVARS)))
+CFLAGS_VECTOR += -mavx2  # enables generation of AVX instructions (used through vectorclass)
+CPV      =  $(CPP)
+else
+CFLAGS_VECTOR =
+CPV      =  $(CPP)
+endif
+
+ifeq (NUM_THREADS,$(findstring NUM_THREADS,$(CONFIGVARS)))
+OPTIMIZE +=  -qopenmp
+else
+OPTIMIZE +=  -Wno-unknown-pragmas
+endif
diff --git a/buildsystem/Makefile.comp.supermuc-ng b/buildsystem/Makefile.comp.supermuc-ng
new file mode 100644
index 0000000000000000000000000000000000000000..275a8175e21e08ed5c27d0561a4f110ef257a022
--- /dev/null
+++ b/buildsystem/Makefile.comp.supermuc-ng
@@ -0,0 +1,17 @@
+CC       =  mpiicc   -std=c11  # sets the C-compiler
+CPP      =  mpiicpc  -std=c++11 # sets the C++-compiler
+OPTIMIZE =  -ggdb -O3 -march=native  -Wall -Wno-format-security 
+
+ifeq (EXPLICIT_VECTORIZATION,$(findstring EXPLICIT_VECTORIZATION,$(CONFIGVARS)))
+CFLAGS_VECTOR += -mavx2  # enables generation of AVX instructions (used through vectorclass)
+CPV      =  $(CPP)
+else
+CFLAGS_VECTOR = 
+CPV      =  $(CPP)
+endif
+
+ifeq (NUM_THREADS,$(findstring NUM_THREADS,$(CONFIGVARS)))
+OPTIMIZE +=  -fopenmp
+else
+OPTIMIZE +=  -Wno-unknown-pragmas
+endif
diff --git a/buildsystem/Makefile.comp.supermuc-ng-gcc b/buildsystem/Makefile.comp.supermuc-ng-gcc
new file mode 100644
index 0000000000000000000000000000000000000000..d3d93121ee7871b8764c0a5d2f56563a6ad0d5a0
--- /dev/null
+++ b/buildsystem/Makefile.comp.supermuc-ng-gcc
@@ -0,0 +1,17 @@
+CC       =  mpicc   -std=c11  # sets the C-compiler
+CPP      =  mpicxx  -std=c++11 # sets the C++-compiler
+OPTIMIZE =  -ggdb -O3 -march=native  -Wall -Wno-format-security 
+
+ifeq (EXPLICIT_VECTORIZATION,$(findstring EXPLICIT_VECTORIZATION,$(CONFIGVARS)))
+CFLAGS_VECTOR += -mavx2  # enables generation of AVX instructions (used through vectorclass)
+CPV      =  $(CPP)
+else
+CFLAGS_VECTOR = 
+CPV      =  $(CPP)
+endif
+
+ifeq (NUM_THREADS,$(findstring NUM_THREADS,$(CONFIGVARS)))
+OPTIMIZE +=  -fopenmp
+else
+OPTIMIZE +=  -Wno-unknown-pragmas
+endif
diff --git a/buildsystem/Makefile.comp.supermuc-ng-openmpi b/buildsystem/Makefile.comp.supermuc-ng-openmpi
new file mode 100644
index 0000000000000000000000000000000000000000..d3d93121ee7871b8764c0a5d2f56563a6ad0d5a0
--- /dev/null
+++ b/buildsystem/Makefile.comp.supermuc-ng-openmpi
@@ -0,0 +1,17 @@
+CC       =  mpicc   -std=c11  # sets the C-compiler
+CPP      =  mpicxx  -std=c++11 # sets the C++-compiler
+OPTIMIZE =  -ggdb -O3 -march=native  -Wall -Wno-format-security 
+
+ifeq (EXPLICIT_VECTORIZATION,$(findstring EXPLICIT_VECTORIZATION,$(CONFIGVARS)))
+CFLAGS_VECTOR += -mavx2  # enables generation of AVX instructions (used through vectorclass)
+CPV      =  $(CPP)
+else
+CFLAGS_VECTOR = 
+CPV      =  $(CPP)
+endif
+
+ifeq (NUM_THREADS,$(findstring NUM_THREADS,$(CONFIGVARS)))
+OPTIMIZE +=  -fopenmp
+else
+OPTIMIZE +=  -Wno-unknown-pragmas
+endif
diff --git a/buildsystem/Makefile.config b/buildsystem/Makefile.config
new file mode 100644
index 0000000000000000000000000000000000000000..51e7736c403be24e22ac924bbb9cedcff2c5f942
--- /dev/null
+++ b/buildsystem/Makefile.config
@@ -0,0 +1,12 @@
+#/*******************************************************************************
+# * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+# * \copyright   by Volker Springel. Copyright (C) 2014, 2015 by Volker Springel
+# * \copyright   (volker.springel@h-its.org) and all contributing authors.
+# *******************************************************************************/
+
+RESULT     := $(shell mkdir -p $(BUILD_DIR)  )
+
+all: $(BUILD_DIR)/gadgetconfig.h 
+
+$(BUILD_DIR)/gadgetconfig.h:  $(CONFIG) buildsystem/config.py
+	$(PYTHON) buildsystem/config.py $(CONFIG) $(BUILD_DIR) $(CURDIR) $(SRC_DIR)
diff --git a/buildsystem/Makefile.gen.libs b/buildsystem/Makefile.gen.libs
new file mode 100644
index 0000000000000000000000000000000000000000..a1763c8456a5b4bd458445f2de9dffa44eaf1308
--- /dev/null
+++ b/buildsystem/Makefile.gen.libs
@@ -0,0 +1,8 @@
+GSL_INCL   = -I$(LIB_DIR)/gsl/build/include
+GSL_LIBS   = -L$(LIB_DIR)/gsl/build/lib
+FFTW_INCL  = -I$(LIB_DIR)/fftw3/build/include
+FFTW_LIBS  = -L$(LIB_DIR)/fftw3/build/lib
+HDF5_INCL  = -I$(LIB_DIR)/hdf5/build/include
+HDF5_LIBS  = -L$(LIB_DIR)/hdf5/build/lib  
+HWLOC_INCL = -I$(LIB_DIR)/hwloc/build/include
+HWLOC_LIBS = -L$(LIB_DIR)/hwloc/build/lib
diff --git a/buildsystem/Makefile.lib b/buildsystem/Makefile.lib
new file mode 100644
index 0000000000000000000000000000000000000000..b99657d994ed6a6f471d124c6ba48903dc213d3f
--- /dev/null
+++ b/buildsystem/Makefile.lib
@@ -0,0 +1,78 @@
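+# Helper makefile that downloads and builds the external libraries used by the
+# code (GSL, single- and double-precision FFTW, HDF5, hwloc) into $(LIB_DIR).
+# A sketch of the intended use (not verified here) is to invoke it from the
+# top-level directory, e.g.
+#
+#    make -f buildsystem/Makefile.lib
+#
+# which installs headers and libraries under extlibs/include and extlibs/lib,
+# the layout that buildsystem/Makefile.path.libs expects via LIB_DIR.
+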
+LIB_DIR=extlibs
+
+FFTW_VERSION=3.3.4
+GSL_VERSION=1.16
+HDF5_VERSION=1.8.15-patch1
+HWLOC_VERSION=1.8.6
+
+libs: gsl fftw-single fftw-double hdf5 hwloc
+
+SHELL=/bin/bash
+
+$(LIB_DIR):
+	mkdir $(LIB_DIR)
+
+
+
+
+fftw-single: $(LIB_DIR)/include/sfftw.h
+
+fftw-double: $(LIB_DIR)/include/dfftw.h
+
+$(LIB_DIR)/fftw-$(FFTW_VERSION).tar.gz: | $(LIB_DIR)
+	cd $(LIB_DIR); wget http://www.fftw.org/fftw-$(FFTW_VERSION).tar.gz
+	
+$(LIB_DIR)/fftw-$(FFTW_VERSION)/: $(LIB_DIR)/fftw-$(FFTW_VERSION).tar.gz
+	cd $(LIB_DIR); tar -xf fftw-$(FFTW_VERSION).tar.gz
+		
+$(LIB_DIR)/include/sfftw.h: | $(LIB_DIR)/fftw-$(FFTW_VERSION)/
+	cd $(LIB_DIR)/fftw-$(FFTW_VERSION); export LD_LIBRARY_PATH=$(LIB_DIR)/lib; ./configure --prefix=$(CURDIR)/$(LIB_DIR)/ --enable-float; make; make install
+	
+$(LIB_DIR)/include/dfftw.h: | $(LIB_DIR)/fftw-$(FFTW_VERSION)/	
+	cd $(LIB_DIR)/fftw-$(FFTW_VERSION); export LD_LIBRARY_PATH=$(LIB_DIR)/lib; ./configure --prefix=$(CURDIR)/$(LIB_DIR)/ ; make; make install
+
+
+
+
+gsl: $(LIB_DIR)/include/gsl/
+
+$(LIB_DIR)/gsl-$(GSL_VERSION).tar.gz: | $(LIB_DIR)
+	cd $(LIB_DIR); wget  http://ftpmirror.gnu.org/gsl/gsl-$(GSL_VERSION).tar.gz
+		
+$(LIB_DIR)/gsl-$(GSL_VERSION)/: $(LIB_DIR)/gsl-$(GSL_VERSION).tar.gz
+	cd $(LIB_DIR); tar -xf gsl-$(GSL_VERSION).tar.gz
+	
+$(LIB_DIR)/include/gsl/: | $(LIB_DIR)/gsl-$(GSL_VERSION)/
+	cd $(LIB_DIR)/gsl-$(GSL_VERSION);export LD_LIBRARY_PATH=$(LIB_DIR)/lib; ./configure --prefix=$(CURDIR)/$(LIB_DIR)/; make; make install
+	
+	
+  
+  
+hwloc: $(LIB_DIR)/include/hwloc.h
+
+$(LIB_DIR)/hwloc-$(HWLOC_VERSION).tar.gz: | $(LIB_DIR)
+	cd $(LIB_DIR); wget  http://www.open-mpi.de/software/hwloc/v1.8/downloads/hwloc-$(HWLOC_VERSION).tar.gz
+
+$(LIB_DIR)/hwloc-$(HWLOC_VERSION)/:	$(LIB_DIR)/hwloc-$(HWLOC_VERSION).tar.gz
+	cd $(LIB_DIR); tar -xf hwloc-$(HWLOC_VERSION).tar.gz
+	
+$(LIB_DIR)/include/hwloc.h: | $(LIB_DIR)/hwloc-$(HWLOC_VERSION)/
+	cd $(LIB_DIR)/hwloc-$(HWLOC_VERSION); ./configure --prefix=$(CURDIR)/$(LIB_DIR)/; make; make install
+	
+	
+  
+  
+hdf5: $(LIB_DIR)/include/hdf5.h
+
+$(LIB_DIR)/hdf5-$(HDF5_VERSION).tar.gz: | $(LIB_DIR)
+	cd $(LIB_DIR); wget ftp://ftp.hdfgroup.org/HDF5/current/src/hdf5-$(HDF5_VERSION).tar.gz
+	
+$(LIB_DIR)/hdf5-$(HDF5_VERSION): $(LIB_DIR)/hdf5-$(HDF5_VERSION).tar.gz
+	cd $(LIB_DIR); tar -xf hdf5-$(HDF5_VERSION).tar.gz
+	
+$(LIB_DIR)/include/hdf5.h: | $(LIB_DIR)/hdf5-$(HDF5_VERSION)
+	/bin/bash -c 'cd $(LIB_DIR)/hdf5-$(HDF5_VERSION); export LD_LIBRARY_PATH=$(LIB_DIR)/lib; ./configure --prefix=$(CURDIR)/$(LIB_DIR)/ --enable-parallel; make; make install'
+
+
+
+.PHONY: gsl fftw-single fftw-double hdf5 hwloc
diff --git a/buildsystem/Makefile.path.bwforcluster b/buildsystem/Makefile.path.bwforcluster
new file mode 100644
index 0000000000000000000000000000000000000000..8063eae2f619939dede517a7de8d6e6c918aa648
--- /dev/null
+++ b/buildsystem/Makefile.path.bwforcluster
@@ -0,0 +1,10 @@
+GSL_INCL   =  -I/home/hd/hd_hd/hd_mm002/libraries/include
+GSL_LIBS   =  -L/home/hd/hd_hd/hd_mm002/libraries/lib  -Xlinker -R -Xlinker /home/hd/hd_hd/hd_mm002/libraries/lib
+FFTW_INCL  =  -I/home/hd/hd_hd/hd_mm002/libraries/include
+FFTW_LIBS  =  -L/home/hd/hd_hd/hd_mm002/libraries/lib  -Xlinker -R -Xlinker /home/hd/hd_hd/hd_mm002/libraries/lib
+HDF5_INCL  =  -I/home/hd/hd_hd/hd_mm002/libraries/include
+HDF5_LIBS  =  -L/home/hd/hd_hd/hd_mm002/libraries/lib  -Xlinker -R -Xlinker /home/hd/hd_hd/hd_mm002/libraries/lib
+HWLOC_INCL =  -I/home/hd/hd_hd/hd_mm002/libraries/include
+HWLOC_LIBS =  -L/home/hd/hd_hd/hd_mm002/libraries/lib  -Xlinker -R -Xlinker /home/hd/hd_hd/hd_mm002/libraries/lib
+MAPS_INCL  =  -I/home/hd/hd_hd/hd_bd123/Libs/include
+MAPS_LIBS  =  -L/home/hd/hd_hd/hd_bd123/Libs/lib
diff --git a/buildsystem/Makefile.path.cobra b/buildsystem/Makefile.path.cobra
new file mode 100644
index 0000000000000000000000000000000000000000..5d21542649205f03f8b54fbba64fab77e9335cf9
--- /dev/null
+++ b/buildsystem/Makefile.path.cobra
@@ -0,0 +1,11 @@
+GSL_INCL   =  -I$(GSL_HOME)/include
+GSL_LIBS   =  -L$(GSL_HOME)/lib  -Xlinker -R -Xlinker $(GSL_HOME)/lib
+FFTW_INCL  =  -I$(FFTW_HOME)/include
+FFTW_LIBS  =  -L$(FFTW_HOME)/lib  -Xlinker -R -Xlinker $(FFTW_HOME)/lib
+HDF5_INCL  =  -I$(HDF5_HOME)/include
+HDF5_LIBS  =  -L$(HDF5_HOME)/lib -Xlinker -R -Xlinker $(HDF5_HOME)/lib
+HWLOC_INCL =  
+HWLOC_LIBS =  
+MAPS_INCL  =  -I/u/vrs/Libs/include
+MAPS_LIBS  =  -L/u/vrs/Libs/lib  -Xlinker -R -Xlinker /u/vrs/Libs/lib
+
diff --git a/buildsystem/Makefile.path.default b/buildsystem/Makefile.path.default
new file mode 100644
index 0000000000000000000000000000000000000000..05676517fb0ca702a639057637f7f70a0310e032
--- /dev/null
+++ b/buildsystem/Makefile.path.default
@@ -0,0 +1,8 @@
+GSL_INCL   =
+GSL_LIBS   =
+FFTW_INCL  =
+FFTW_LIBS  =
+HDF5_INCL  =
+HDF5_LIBS  =
+HWLOC_INCL =
+HWLOC_LIBS =
diff --git a/buildsystem/Makefile.path.freya b/buildsystem/Makefile.path.freya
new file mode 100644
index 0000000000000000000000000000000000000000..60ced5836277f803fcc95654078e3966c12d8e5d
--- /dev/null
+++ b/buildsystem/Makefile.path.freya
@@ -0,0 +1,11 @@
+GSL_INCL   =  -I$(GSL_HOME)/include
+GSL_LIBS   =  -L$(GSL_HOME)/lib  -Xlinker -R -Xlinker $(GSL_HOME)/lib
+FFTW_INCL  =  -I$(FFTW_HOME)/include
+FFTW_LIBS  =  -L$(FFTW_HOME)/lib  -Xlinker -R -Xlinker $(FFTW_HOME)/lib
+HDF5_INCL  =  -I$(HDF5_HOME)/include
+HDF5_LIBS  =  -L$(HDF5_HOME)/lib -Xlinker -R -Xlinker $(HDF5_HOME)/lib
+HWLOC_INCL =  
+HWLOC_LIBS =  
+MAPS_INCL  =  -I/u/vrs/Libs/include
+MAPS_LIBS  =  -L/u/vrs/Libs/lib  -Xlinker -R -Xlinker /u/vrs/Libs/lib  -lcurl
+
diff --git a/buildsystem/Makefile.path.haswell b/buildsystem/Makefile.path.haswell
new file mode 100644
index 0000000000000000000000000000000000000000..6e0d278525c32a800904a5e508c9e2950bed6e71
--- /dev/null
+++ b/buildsystem/Makefile.path.haswell
@@ -0,0 +1,10 @@
+GSL_INCL   =  -I/hits/basement/tap/sw/libs/gsl-2.1/include
+GSL_LIBS   =  -L/hits/basement/tap/sw/libs/gsl-2.1/lib  -Xlinker -R -Xlinker /hits/basement/tap/sw/libs/gsl-2.1/lib
+FFTW_INCL  =  
+FFTW_LIBS  =  
+HDF5_INCL  =  -I/hits/basement/tap/sw/libs/hdf5-1.8.17/include
+HDF5_LIBS  =  -L/hits/basement/tap/sw/libs/hdf5-1.8.17/lib  -Xlinker -R -Xlinker /hits/basement/tap/sw/libs/hdf5-1.8.17/lib
+HWLOC_INCL =  -I/hits/basement/tap/sw/libs/hwloc-1.11.3/include
+HWLOC_LIBS =  -L/hits/basement/tap/sw/libs/hwloc-1.11.3/lib -Xlinker -R -Xlinker /hits/basement/tap/sw/libs/hwloc-1.11.3/lib
+VTUNE_INCL = -I/cm/shared/apps/intel/vtune_u13/vtune_amplifier_xe_2015/include
+VTUNE_LIBS = -L/cm/shared/apps/intel/vtune_u13/vtune_amplifier_xe_2015/lib64 -littnotify
diff --git a/buildsystem/Makefile.path.hydra b/buildsystem/Makefile.path.hydra
new file mode 100644
index 0000000000000000000000000000000000000000..2331b3b84bb04b68df163ca62b8f259f0b29eaea
--- /dev/null
+++ b/buildsystem/Makefile.path.hydra
@@ -0,0 +1,8 @@
+GSL_INCL   = -I/u/system/SLES11/soft/gsl/1.16/include
+GSL_LIBS   = -L/u/system/SLES11/soft/gsl/1.16/lib -Xlinker -R -Xlinker /u/system/SLES11/soft/gsl/1.16/lib
+FFTW_INCL  = -I/u/system/SLES11/soft/fftw/3.3.4/intel-14.0/mpi.ibm-1.4/include
+FFTW_LIBS  = -L/u/system/SLES11/soft/fftw/3.3.4/intel-14.0/mpi.ibm-1.4/lib -Xlinker -R -Xlinker /u/system/SLES11/soft/fftw/3.3.4/intel-14.0/mpi.ibm-1.4/lib
+HDF5_INCL  = -I/u/system/SLES11/soft/hdf5/1.8.14/intel14.0/serial/include
+HDF5_LIBS  =  -L/u/system/SLES11/soft/hdf5/1.8.14/intel14.0/serial/lib -Xlinker -R -Xlinker /u/system/SLES11/soft/hdf5/1.8.14/intel14.0/serial/lib
+HWLOC_INCL = -I/u/anbau/hwloc-1.11.2/include
+HWLOC_LIBS = -L/u/anbau/hwloc-1.11.2/lib -Xlinker -R -Xlinker /u/anbau/hwloc-1.11.2/lib
diff --git a/buildsystem/Makefile.path.libs b/buildsystem/Makefile.path.libs
new file mode 100644
index 0000000000000000000000000000000000000000..b201798a34f7f6e1101f69edc8d16183d58cb9bc
--- /dev/null
+++ b/buildsystem/Makefile.path.libs
@@ -0,0 +1,8 @@
+GSL_INCL   = -I$(LIB_DIR)/include
+GSL_LIBS   = -L$(LIB_DIR)/lib
+FFTW_INCL  = -I$(LIB_DIR)/include
+FFTW_LIBS  = -L$(LIB_DIR)/lib
+HDF5_INCL  = -I$(LIB_DIR)/include 
+HDF5_LIBS  = -L$(LIB_DIR)/lib 
+HWLOC_INCL = -I$(LIB_DIR)/include
+HWLOC_LIBS = -L$(LIB_DIR)/lib
diff --git a/buildsystem/Makefile.path.macports b/buildsystem/Makefile.path.macports
new file mode 100644
index 0000000000000000000000000000000000000000..30ff448554e32c63904cc1f9a02f4a13b5d9fdb7
--- /dev/null
+++ b/buildsystem/Makefile.path.macports
@@ -0,0 +1,10 @@
+GSL_INCL   = -I/opt/local/include 
+GSL_LIBS   = -L/opt/local/
+FFTW_INCL  = -I/opt/local/include
+FFTW_LIBS  = -L/opt/local/lib
+HDF5_INCL  = -I/opt/local/include
+HDF5_LIBS  = -L/opt/local/lib  
+HWLOC_INCL = -I/opt/local/include
+HWLOC_LIBS = -L/opt/local/lib 
+MAPS_INCL  = -I/opt/local/include/healpix_cxx 
+MAPS_LIBS  = -L/opt/local/lib 
diff --git a/buildsystem/Makefile.path.magny b/buildsystem/Makefile.path.magny
new file mode 100644
index 0000000000000000000000000000000000000000..5d81a622aed81453de0bf1016e7a4c23dd85445b
--- /dev/null
+++ b/buildsystem/Makefile.path.magny
@@ -0,0 +1,12 @@
+GSL_INCL   =  -I/hits/tap/sw/libs/gsl-1.15/include
+GSL_LIBS   =  -L/hits/tap/sw/libs/gsl-1.15/lib  -Xlinker -R -Xlinker /hits/tap/sw/libs/gsl-1.15/lib
+FFTW_INCL  =  -I/hits/tap/sw/libs/fftw-3.3.4/include
+FFTW_LIBS  =  -L/hits/tap/sw/libs/fftw-3.3.4/lib  -Xlinker -R -Xlinker /hits/tap/sw/libs/fftw-3.3.4/lib
+HDF5_INCL  =  -I/hits/tap/sw/libs/hdf5-1.8.10/include
+HDF5_LIBS  =  -L/hits/tap/sw/libs/hdf5-1.8.10/lib -Xlinker -R -Xlinker /hits/tap/sw/libs/hdf5-1.8.10/lib
+HWLOC_INCL =  -I/hits/tap/sw/libs/hwloc-1.5.1/include
+HWLOC_LIBS =  -L/hits/tap/sw/libs/hwloc-1.5.1/lib -Xlinker -R -Xlinker /hits/tap/sw/libs/hwloc-1.5.1/lib
+VTUNE_INCL =  -I/cm/shared/apps/intel/vtune_u13/vtune_amplifier_xe_2015/include
+VTUNE_LIBS =  -L/cm/shared/apps/intel/vtune_u13/vtune_amplifier_xe_2015/lib64 -littnotify
+MAPS_INCL  =  -I/hits/tap/sw/libs/healpix/include
+MAPS_LIBS  =  -L/hits/tap/sw/libs/healpix/lib
diff --git a/buildsystem/Makefile.path.mpa_desktop b/buildsystem/Makefile.path.mpa_desktop
new file mode 100644
index 0000000000000000000000000000000000000000..367466c883972ede31b8c358065bd8c4d94d5cab
--- /dev/null
+++ b/buildsystem/Makefile.path.mpa_desktop
@@ -0,0 +1,8 @@
+GSL_INCL   =  -I/usr/common/pdsoft/appl/gsl/include
+GSL_LIBS   =  -L/usr/common/pdsoft/appl/gsl/lib
+FFTW_INCL  =
+FFTW_LIBS  =
+HDF5_INCL  =  -I$(HDF5_HOME)/include
+HDF5_LIBS  =  -L$(HDF5_HOME)/lib 
+HWLOC_INCL =
+HWLOC_LIBS = 
diff --git a/buildsystem/Makefile.path.supermuc-ng b/buildsystem/Makefile.path.supermuc-ng
new file mode 100644
index 0000000000000000000000000000000000000000..28326c9797e3c1fdc27ad0254f55b3e00aeda1ee
--- /dev/null
+++ b/buildsystem/Makefile.path.supermuc-ng
@@ -0,0 +1,11 @@
+GSL_INCL   =  -I$(GSL_BASE)/include
+GSL_LIBS   =  -L$(GSL_BASE)/lib  -Xlinker -R -Xlinker $(GSL_BASE)/lib
+FFTW_INCL  =  -I$(FFTW_BASE)/include
+FFTW_LIBS  =  -L$(FFTW_BASE)/lib  -Xlinker -R -Xlinker $(FFTW_BASE)/lib
+HDF5_INCL  =  -I$(HDF5_BASE)/include
+HDF5_LIBS  =  -L$(HDF5_BASE)/lib -Xlinker -R -Xlinker $(HDF5_BASE)/lib
+HWLOC_INCL =  
+HWLOC_LIBS =  
+MAPS_INCL  =  -I/dss/dsshome1/04/lu79qih9/Libs/include
+MAPS_LIBS  =  -L/dss/dsshome1/04/lu79qih9/Libs/lib  -Xlinker -R -Xlinker /dss/dsshome1/04/lu79qih9/Libs/lib
+
diff --git a/buildsystem/Makefile.path.supermuc-ng-gcc b/buildsystem/Makefile.path.supermuc-ng-gcc
new file mode 100644
index 0000000000000000000000000000000000000000..2fb7334731cf86c0fef79195d0f1ef41f245cd49
--- /dev/null
+++ b/buildsystem/Makefile.path.supermuc-ng-gcc
@@ -0,0 +1,11 @@
+GSL_INCL   =  -I$(GSL_BASE)/include
+GSL_LIBS   =  -L$(GSL_BASE)/lib  -Xlinker -R -Xlinker $(GSL_BASE)/lib
+FFTW_INCL  =  -I$(FFTW_BASE)/include
+FFTW_LIBS  =  -L$(FFTW_BASE)/lib  -Xlinker -R -Xlinker $(FFTW_BASE)/lib
+HDF5_INCL  =  -I$(HDF5_BASE)/include
+HDF5_LIBS  =  -L$(HDF5_BASE)/lib -Xlinker -R -Xlinker $(HDF5_BASE)/lib
+HWLOC_INCL =  
+HWLOC_LIBS =  
+MAPS_INCL  =  -I/dss/dsshome1/04/lu79qih9/Libs/SuperMUC-NG-GCC/include
+MAPS_LIBS  =  -L/dss/dsshome1/04/lu79qih9/Libs/SuperMUC-NG-GCC/lib  -Xlinker -R -Xlinker /dss/dsshome1/04/lu79qih9/Libs/SuperMUC-NG-GCC/lib
+
diff --git a/buildsystem/check.py b/buildsystem/check.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f75a57c8ba0a1609821579dcdc2e45238e948f8
--- /dev/null
+++ b/buildsystem/check.py
@@ -0,0 +1,337 @@
+import re 
+import sys
+import os
+
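+# Usage overview (as invoked from the top-level Makefile; the first argument
+# selects the check mode):
+#   check.py 1 <source file>  <out> Template-Config.sh defines_extra   - check macros used in one source/header file
+#   check.py 2 <Config.sh>    <out> defines_extra <*.check files>      - check that activated Config.sh options are actually used
+#   check.py 3 <Makefile>     <out> Template-Config.sh defines_extra   - check macros referenced in the Makefile
+#   check.py 4 Template-Config.sh <out> <04_config-options.md>         - check that all config options are documented
+#   check.py 5 <allvars.cc>   <out> <05_parameterfile.md>              - check that all parameters are documented
+
+# Collect all macro names referenced in a preprocessor #if/#elif expression;
+# continuation lines ending in a backslash are followed across line breaks.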
+def parseIf(string, defines, fin):
+    s = string
+    
+    while len(s) > 0:
+        if s.startswith("defined"):
+            s = s[7:]
+            continue
+            
+        if s.startswith("//"):
+            return
+            
+        m = re.search("[a-zA-Z_][a-zA-Z_0-9]*",s)
+        if m is not None and m.start() == 0:
+            defines.update([m.group()])
+            #print "%s : %s"%(string,m.group())
+            s = s[m.span()[1]:]
+            continue
+        
+        if s.startswith("/*"):
+            m = re.search(r"\*/",s)
+            if m is not None:
+                s = s[m.span()[1]:]
+                continue
+            else:
+                return
+            
+        if s[0] == "\n":
+            return
+            
+        if s == "\\\n":
+            string = fin.readline()
+            s = string
+            continue
+            
+        if s[0] in '<>+-*/=!&|() 0123456789\t':
+            s = s[1:]
+            continue
+            
+        print("Strange character in '%s' detected: '%s', skipping it."%(string, s[0]))
+        s = s[1:]
+            
+            
+
+#Find all macros used in a source (.cc/.cpp) or header file
+def filter_code(fin):
+    defines = set()
+    
+    line = fin.readline()
+    while line != "":
+        s = line.lstrip()
+        #print s
+        if s.startswith("#if "):
+            parseIf(s[4:],defines,fin)
+        elif s.startswith("#elseif "):
+            parseIf(s[8:],defines,fin)
+        elif s.startswith("#elif "):
+            parseIf(s[6:],defines,fin)    
+            
+        elif s.startswith("#ifdef ") or s.startswith("#ifndef "):
+            if s.startswith("#ifdef "):
+                s = s[7:]
+            else:
+                s = s[8:]
+                
+            s = s.lstrip()
+            m = re.search("[a-zA-Z_][a-zA-Z_0-9]*",s)
+            if m is not None and m.start() == 0:
+                defines.update([m.group()])
+            else:
+                print("Strange #ifdef/#ifndef: '%s'. Skipping it." % s)
+            
+        line = fin.readline()
+    
+    return defines
+    
+#Find all items of Template-Config.sh
+def filter_template_config(fin):
+    defines = set()
+    for line in fin:
+        s = line.split()
+        if(len(s)>0):
+            d = re.findall("^#*([a-zA-Z_][a-zA-Z_0-9]*)",s[0])
+            for dd in d:
+                defines.update([dd])
+                #print dd
+    
+    return defines
+
+#Find all items of src/data/allvars.cc
+def filter_template_ioparam(fin):
+    defines = set()
+    s = fin.read()
+    d = re.findall(r"add_param\(\"([a-zA-Z_][a-zA-Z_0-9]*)\"",s)
+    for dd in d:
+        defines.update([dd])
+
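+    # The following parameter names are added by hand; presumably they are
+    # assembled dynamically in the source (per softening class / particle type)
+    # and therefore not picked up by the add_param() regex above.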
+    defines.update(["SofteningComovingClass0"])
+    defines.update(["SofteningMaxPhysClass0"])
+    defines.update(["SofteningClassOfPartType0"])
+        
+    return defines
+
+
+#Find all active items on Config.sh    
+def filter_config(fin):
+    defines = set()
+    for line in fin:
+        s = line.split()
+        if(len(s)>0):
+            d = re.findall("^([a-zA-Z_][a-zA-Z_0-9]*)",s[0])
+            for dd in d:
+                defines.update([dd])
+    
+    return defines
+    
+#Find all macros used in Makefile
+def filter_makefile(fin):
+    defines = set()
+    for line in fin:
+        s = line.strip()
+        if s.startswith("ifeq"):
+            d = re.findall(r"ifeq\s*\(([a-zA-Z_][a-zA-Z_0-9]*)\s*,\s*\$\(findstring",s)
+            for dd in d:
+                defines.update([dd])
+        if s.startswith("ifneq"):
+            d = re.findall(r"ifneq\s*\(([a-zA-Z_][a-zA-Z_0-9]*)\s*,\s*\$\(findstring",s)
+            for dd in d:
+                defines.update([dd])
+    
+    return defines
+    
+#Parse all documented options/parameters from a markdown documentation file (e.g. documentation/04_config-options.md)
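+#(names are recognized when they appear in bold markdown, i.e. a "**OPTION**" pattern such as **PMGRID**, per the regex below)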
+def filter_readme_config(fin):
+    defines = set()
+    s = fin.read()
+#    d = re.findall(r"^\*\*([a-zA-Z_][a-zA-Z_0-9]*)\*\*\s*\n\s*\n[\W\w]{40}",s)
+    d = re.findall(r"\*\*([a-zA-Z_][a-zA-Z_0-9]*)\*\*",s)
+    for dd in d:
+        defines.update([dd])
+        
+    return defines
+    
+    
+def load(fin):
+    s = set()
+    
+    for i in fin:
+        if not i.startswith("#"):
+            s.update([i.strip()])
+        
+    return s
+    
+def write(s, fout):
+    fout = open(fout,'w')
+    for i in sorted(s):
+        fout.write(i + os.linesep)
+        
+    fout.close()
+    
+#Check source files for illegal macros
+def check_code(fin, fout, template, extra):
+    allowed = filter_template_config(template)
+    #print allowed
+    allowed.update(filter_config(extra))
+    
+    used = filter_code(fin)
+    #print used
+    
+    diff =  sorted(used.difference(allowed))
+    
+    if len(diff) > 0:
+        print("\nIllegal macros/options detected in file %s.\nCheck for potential typos and add them either to the file 'Template-Config.sh' or 'defines_extra'\n('defines_extra' is for macros which are either internally defined or should not appear in Template-Config.sh).\nIn case you want to suppress this check, build with 'make build' instead of 'make'.\n"%fin.name)
+        for i in diff:
+            print(i)
+        print("")
+        exit(1)
+        
+    write(used,fout)
+    exit(0)
+    
+#Check Makefile for illegal options
+def check_makefile(fin, fout, template, extra):
+    allowed = filter_template_config(template)
+    #print allowed
+    allowed.update(filter_config(extra))
+    
+    used = filter_makefile(fin)
+    #print used
+    
+    diff =  sorted(used.difference(allowed))
+    
+    if len(diff) > 0:
+        print("\nIllegal macros/options detected in file %s.\nCheck for potential typos and add them either to the file 'Template-Config.sh' or 'defines_extra'\n('defines_extra' is for macros which are either internally defined or should not appear in Template-Config.sh).\nIn case you want to suppress this check, build with 'make build' instead of 'make'.\n"%fin.name)
+        for i in diff:
+            print(i)
+        print("")
+        exit(1)
+        
+    write(used,fout)
+    exit(0)
+    
+#Check Config.sh for illegal options
+def check_config(fin, fout, args, extra):
+    allowed = filter_config(extra)
+    
+    for file in args:
+        allowed.update(load(open(file,'r')))
+
+    used = filter_config(fin)
+    
+    diff =  sorted(used.difference(allowed))
+    
+    if len(diff) > 0:
+        print("\nThe following options are active in %s, but are not used in any of the\nsource code files being compiled into the final executable.\nPlease check for typos and deactivate the options.\nIn case you want to suppress this check, build with 'make build' instead of 'make'.\n"%fin.name)
+        for i in diff:
+            print(i)
+        print("")
+        exit(1)
+        
+    write(used,fout)
+    exit(0)
+
+#Check whether all Template-Config.sh options are documented
+def check_documentation(fin, fout, fdoc):
+    documented = filter_readme_config(fdoc)
+    
+    used = filter_template_config(fin)
+    
+    diff =  sorted(used.difference(documented))
+    
+    ex = False
+    
+    if len(diff) > 0:
+        print("\nERROR: The following options are undocumented in %s, but appear in %s.\n       Please add a proper documentation!\n"%(fdoc.name,fin.name))
+        for i in diff:
+            print(i)
+        print("")
+        ex = True
+        
+    diff =  sorted(documented.difference(used))
+    
+    if len(diff) > 0:
+        print("\nERROR: The following options are documented in %s, but are not used in %s anymore.\n       Please remove redundant documentation!\n"%(fdoc.name,fin.name))
+        for i in diff:
+            print(i)
+        print("")
+        ex = True
+        
+    if ex:
+        exit(1)
+        
+    write(used,fout)
+    exit(0)    
+
+#Check whether all parameters registered in src/data/allvars.cc are documented
+def check_parameters(fin, fout, fdoc):
+    documented = filter_readme_config(fdoc)
+    
+    used = filter_template_ioparam(fin)
+    
+    diff =  sorted(used.difference(documented))
+    
+    ex = False
+    
+    if len(diff) > 0:
+        print("\nERROR: The following parameters are undocumented in %s, but appear in %s.\n       Please add a proper documentation!\n"%(fdoc.name,fin.name))
+        for i in diff:
+            print(i)
+        print("")
+        ex = True
+        
+    diff =  sorted(documented.difference(used))
+    
+    if len(diff) > 0:
+        print("\nERROR: The following parameters are documented in %s, but are not used in %s.\n       Please remove redundant documentation!\n"%(fdoc.name,fin.name))
+        for i in diff:
+            print(i)
+        print("")
+        ex = True
+        
+    if ex:
+        exit(1)
+        
+    write(used,fout)
+    exit(0)    
+
+    
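+#Modes of operation (selected by the first command line argument):
+#  1 - check a source file for macros that are not declared in Template-Config.sh or defines_extra
+#  2 - check that all options activated in Config.sh are actually used by the compiled sources
+#  3 - check the Makefile for undeclared macros
+#  4 - check that every Template-Config.sh option is documented (and vice versa)
+#  5 - check that every run-time parameter is documented (and vice versa)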
+if __name__ == "__main__":
+    if len(sys.argv) < 4:
+        exit(1)
+        
+    mode = int(sys.argv[1])
+    if mode < 1 or mode > 5:
+        print("Unknown mode")
+        exit(1)
+        
+    fin = open(sys.argv[2],'r')
+    fout = sys.argv[3]
+    
+    if mode == 1:
+        print("Checking %s for illegal define macros"%sys.argv[2])
+        template = open(sys.argv[4],'r')
+        extra = open(sys.argv[5],'r')
+        
+        check_code(fin, fout, template, extra)
+        
+    if mode == 2:
+        print("Checking active options of %s"%sys.argv[2])
+        extra = open(sys.argv[4],'r')
+        check_config(fin, fout, sys.argv[5:], extra)
+        
+    if mode == 3:
+        print("Checking %s for illegal define macros"%sys.argv[2])
+        template = open(sys.argv[4],'r')
+        extra = open(sys.argv[5],'r')
+        
+        check_makefile(fin, fout, template, extra)
+        
+    if mode == 4:
+        print("Checking %s for documented options"%sys.argv[2])
+        template = open(sys.argv[2],'r')
+        doc = open(sys.argv[4],'r')
+        
+        check_documentation(fin, fout, doc)
+                
+    if mode == 5:
+        print("Checking %s for documented parameters"%sys.argv[2])
+        template = open(sys.argv[2],'r')
+        doc = open(sys.argv[4],'r')
+        
+        check_parameters(fin, fout, doc)
+        
+        
diff --git a/buildsystem/config.py b/buildsystem/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e98b05ca4a65e58c48c49792d76a76097ba194f
--- /dev/null
+++ b/buildsystem/config.py
@@ -0,0 +1,126 @@
+import re 
+import sys
+import os
+
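+#Generate, from an active Config.sh, the files gadgetconfig.h,
+#compile_time_info.cc and compile_time_info_hdf5.cc in the build directory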
+
+def get_options(fin):
+    options = {}
+    for line in fin:
+        s = line.split()
+        if(len(s)>0):
+            if(s[0][0] != "#"):
+                val = s[0].split("=")
+                if len(val) > 1:
+                    options[val[0]] =  val[1]
+                else:
+                    options[val[0]] = None
+    
+    return options
+    
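+#Write gadgetconfig.h: one #define per active option, in sorted order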
+def out1(options, fname):
+    f = open(fname, "w")
+    
+    keys = list(options.keys())
+    keys.sort()
+    
+    for key in keys:
+        if options[key] is None:
+            f.write("#define " + key + "\n")
+        else:
+            f.write("#define " + key + " " + options[key] + "\n")
+            
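+#Write compile_time_info.cc: output_compile_time_options() prints the list of
+#active options at run time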
+def out2(options, fname):
+    f = open(fname, "w")
+    
+    str = """#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include \"gadgetconfig.h\"
+#include \"data/dtypes.h\"
+#include \"data/allvars.h\"
+#include \"main/main.h\"
+void output_compile_time_options(void)\n{
+printf(
+"""
+    f.write(str)
+    
+    keys = list(options.keys())
+    keys.sort()
+    
+    for key in keys:
+        if options[key] is None:
+            f.write("\"    " + key + "\\n\"\n")
+        else:
+            f.write("\"    " + key + "=" + options[key] + "\\n\"\n")
+            
+    str = """);
+}"""
+    f.write(str)
+    
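+#Write compile_time_info_hdf5.cc: write_compile_time_options_in_hdf5() stores
+#each option as an HDF5 attribute (an empty string for plain switches, a
+#double for options that carry a value)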
+def out3(options, fname):
+    f = open(fname, "w")
+    
+    str = """#include <mpi.h>
+#include <stdio.h>
+#include <hdf5.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <math.h>
+#include \"gadgetconfig.h\"
+#include \"data/constants.h\"
+#include \"data/dtypes.h\"
+#include \"data/macros.h\"
+#include \"io/io.h\"
+hid_t my_H5Acreate(hid_t loc_id, const char *attr_name, hid_t type_id, hid_t space_id, hid_t acpl_id);
+hid_t my_H5Screate(H5S_class_t type);
+herr_t my_H5Aclose(hid_t attr_id, const char *attr_name);
+herr_t my_H5Awrite(hid_t attr_id, hid_t mem_type_id, const void *buf, const char *attr_name);
+herr_t my_H5Sclose(hid_t dataspace_id, H5S_class_t type);
+herr_t my_H5Tclose(hid_t type_id);
+
+void IO_Def::write_compile_time_options_in_hdf5(hid_t handle)
+{
+hid_t hdf5_dataspace, hdf5_attribute;
+double val;
+hid_t atype = H5Tcopy(H5T_C_S1);
+H5Tset_size(atype, 1);\n
+"""
+    f.write(str)
+    
+    keys = list(options.keys())
+    keys.sort()
+    
+    for key in keys:
+        f.write("hdf5_dataspace = my_H5Screate(H5S_SCALAR);")
+            
+        if options[key] is None:
+            f.write("hdf5_attribute = my_H5Acreate(handle, \"" + key + "\" , atype, hdf5_dataspace, H5P_DEFAULT);\n")
+            f.write("my_H5Awrite(hdf5_attribute, atype, \"\", \"" + key + "\");\n")
+        else:
+            f.write("hdf5_attribute = my_H5Acreate(handle, \"" + key + "\" , H5T_NATIVE_DOUBLE, hdf5_dataspace, H5P_DEFAULT);\n")
+            f.write("val = " + options[key] + ";\n")
+            f.write("my_H5Awrite(hdf5_attribute, H5T_NATIVE_DOUBLE, &val, \"" + key + "\");\n")
+            
+        f.write("my_H5Aclose(hdf5_attribute, \"" + key + "\");\n")
+        f.write("my_H5Sclose(hdf5_dataspace, H5S_SCALAR);\n\n")
+
+    f.write("my_H5Tclose(atype);\n")
+    f.write("}\n")
+    f.write("\n")
+
+if __name__ == "__main__":
+    if len(sys.argv) < 5:
+        print("Usage: python config.py <Config.sh> <build_dir> <curr_dir> <src_dir>")
+        exit(1)
+        
+       
+    fin = open(sys.argv[1],'r')
+    fout = sys.argv[2]
+    
+    options = get_options(fin)
+    
+    out1(options, sys.argv[2] + "/gadgetconfig.h")
+    out2(options, sys.argv[2] + "/compile_time_info.cc")
+    out3(options, sys.argv[2] + "/compile_time_info_hdf5.cc")
diff --git a/buildsystem/git_version.sh b/buildsystem/git_version.sh
new file mode 100755
index 0000000000000000000000000000000000000000..a2ff7ccc9ccf9e0c7589b3218df967e52b389314
--- /dev/null
+++ b/buildsystem/git_version.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
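+# Write the current git commit hash and date into $BUILD_DIR/version.cc,
+# regenerating the file only if the commit has changed so that unnecessary
+# recompilations are avoided.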
+
+DATE=$(git log -n 1 2> /dev/null | head -n4 | grep "Date" | cut -d' ' -f4-)
+COMMIT=$(git log -n 1 2> /dev/null | head -n1 | grep "commit" | cut -d' ' -f2-)
+
+
+if  [[ $DATE == "" ]]
+    then
+    DATE="unknown"
+fi
+
+if  [[ $COMMIT == "" ]]
+    then
+    COMMIT="unknown"
+fi
+
+
+
+if [ -f "$BUILD_DIR/version.cc" ]
+    then
+
+    COMMIT2=$(grep "GIT_COMMIT" "$BUILD_DIR/version.cc")
+
+    if [[ $COMMIT2 == *$COMMIT* ]] #it's the same commit
+    then
+        exit
+    fi
+fi
+
+cp "$SRC_DIR/gitversion/version" "$BUILD_DIR/version.cc"
+sed -i.bu "s/_DATE_/$DATE/g" "$BUILD_DIR/version.cc"
+sed -i.bu "s/_COMMIT_/$COMMIT/g" "$BUILD_DIR/version.cc"
+rm "$BUILD_DIR/version.cc.bu"
diff --git a/defines_extra b/defines_extra
new file mode 100644
index 0000000000000000000000000000000000000000..fb52addc8147aacd4380ab1fcd489de9f6c8df83
--- /dev/null
+++ b/defines_extra
@@ -0,0 +1,186 @@
+#defines for headers
+VERSION_H
+HDF5UTIL_H
+ALLVARS_H
+MAIN_H
+EWALD_H
+DTYPES_H
+DOMAIN_H
+DOMAIN_PRIVATE_H
+GADGET_CXXUTILS_H
+TIMER_H
+TIMESTEP_H
+TAGS_H
+TREE_H
+MMPART_H
+MMPARTDATA_H
+KERNEL_H
+DRIFTFAC_H
+DENSITY_H
+UPDATETHERMO_H
+FORCETREE_H
+READSNAP_IO_H
+TREELINKS_IO_H
+TREE_PRIVATE_H
+GENERIC_COMM_H
+SYMTENSORS_H
+FMM_H
+SUBREADID_IO_H
+FMM_PRIVATE_H
+LGALAXIES_GALSNAP_IO_H
+IO_H
+IDSTORAGE_H
+LIGHTCONE_MASSMAP_IO_H
+DESCENDANT_IO_H
+PROGENITORS_IO_H
+GRAVTREE_WALK_H
+LCPART_H
+HALOTREES_IO_H
+SPH_H
+LCPARTDATA_H
+LIGHTCONE_IO_H
+FOF_IO_H
+LGALAXIES_GALSNAP_FINISH_IO_H
+SIMPART_H
+SPHPARTDATA_H
+SHAREDMEM_H
+LGALAXIES_H
+READTREES_MBOUND_IO_H
+FOFTREE_H
+LGALAXIES_OPTIONS_H
+GRAVITY_EWALDTENSORS_H
+SETCOMM_H
+LGALAXIES_TREE_IO_H
+PM_PERIODIC_H
+PINNING_H
+IO_STREAMCOUNT_H
+PARAMETERS_H
+IO_LIGHTCONE_READ_H
+RESTART_H
+FOF_H
+SNAP_READ_WRITE_H
+SIMULATION_H
+SUBFIND_H
+SUBFIND_PRIVATE_H
+SUBFIND_PRIVATE_CPP_H
+FOF_PRIVATE_H
+PM_NONPERIODIC_H
+SRC_DATA_SYMTENSOR_INDICES_H_
+FOF_CPP_H
+MACROS_H
+HEALTHTEST_H
+GRAV_FORCETEST_H
+TEST_IO_BANDWIDTH_H
+PARTDATA_H
+CONSTANTS_H
+LIGHTCONE_H
+IO_PRIVATE_H
+IO_FOF_PRIVATE_H
+LOGS_H
+SYSTEM_H
+SYSTEM_PRIVATE_H
+MYMALLOC_H
+MPI_UTILS_H
+PM_H
+SORT_H
+PARALLEL_SORT_H
+GADGET4_CXXSORT_H
+SPH_H
+TIME_INTEGRATION_H
+RR_MEMORY_H
+NGBTREE_H_
+HDF5_UTIL_H
+DATA_H
+PM_MPI_FFT_H
+LOCKS_H
+COOLING_PRIVATE_H
+COOLING_H
+MERGERTREE_H
+NGENIC_H
+CONVERT_H
+LGALAXIES_GALTREE_IO_H
+
+#system defines
+_MSC_FULL_VER
+_MSC_VER
+__GNUC__
+__INTEL_COMPILER
+__clang__
+__cplusplus
+__linux__
+
+#internally used
+M_PI
+TIMER_STRUCT
+TIMER_ENUM
+OMIT_GENERIC_GET_NUMNODES
+GENERIC_EXTENDED
+GRID
+GRIDX
+GRIDY
+GRIDZ
+LONG_X
+LONG_Y
+LONG_Z
+EWALD_TAYLOR_ORDER
+TESTGRID
+THREEDIMS
+BINS_PS
+DIRECT_SUMMATION_THRESHOLD
+POWERSPEC_FOLDFAC
+FFTW
+
+LIGHTCONE_ALLOC_FAC
+LIGHTCONE_MASSMAP_ALLOC_FAC
+LIGHTCONE_MAX_BOXREPLICAS
+LIGHTCONE_MAX_FILLFACTOR
+LIGHTCONE_ORDER_NSIDE
+
+NSOFTCLASSES_HYDRO
+STAR_TYPE
+MAXLEN_OUTPUTLIST
+VCL_NAMESPACE
+MAX_VARIATION_TOLERANCE
+ALLOC_TOLERANCE
+
+#to be moved to Template-Config.sh, or removed from code
+
+
+
+FMM_DEBUG
+CHECK_LOCAL_RANK
+
+
+LGALAXIES_COMPUTE_OBS_MAGS
+LGALAXIES_HALOPROPERTIES
+LGALAXIES_ICL
+LGALAXIES_METALS_SELF
+LGALAXIES_OUTPUT_MOMAF_INPUTS
+LGALAXIES_OUTPUT_REST_MAGS
+LGALAXIES_TRACK_BURST
+LGALAXIES_GUO10
+LGALAXIES_GUO13
+LGALAXIES_GALAXYTREE
+LGALAXIES_MASS_CHECKS
+LGALAXIES_OVERWRITE_OUTPUT
+LGALAXIES_HENRIQUES13
+LGALAXIES_CHIEFFI
+LGALAXIES_DTD
+LGALAXIES_PORTINARI
+LGALAXIES_KITZBICHLER
+LGALAXIES_LIGHT_OUTPUT
+LGALAXIES_OUTPUT_L_CONE_INPUTS
+LGALAXIES_OUTPUT_OBS_MAGS
+LGALAXIES_STAR_FORMATION_HISTORY
+LGALAXIES_POST_PROCESS_MAGS
+LGALAXIES_PHOTTABLES_PRECOMPUTED
+LGALAXIES_SPEC_PHOTABLES_ON_THE_FLY
+LGALAXIES_CONTINUOUS_TREES
+
+COMPUTE_SPECPHOT_PROPERTIES
+USE_MEMORY_TO_MINIMIZE_IO
+
+CORRECT_CIC
+DIFFERENT_TRANSFER_FUNC
+MULTICOMPONENTGLASSFILE
+NEUTRINOS
diff --git a/documentation/10_examples.md b/documentation/10_examples.md
index 567be1402e72f2acc00abf15aa9a6be5641bdfbe..30544622ee4ad3d5e7f9c296bfed2d6a8ac298f5 100644
Binary files a/documentation/10_examples.md and b/documentation/10_examples.md differ
diff --git a/documentation/12_lgalaxies.md b/documentation/12_lgalaxies.md
deleted file mode 100644
index 00e4a1cf05931ff500751401d6a59477e020599a..0000000000000000000000000000000000000000
--- a/documentation/12_lgalaxies.md
+++ /dev/null
@@ -1,313 +0,0 @@
-
-Semi-analytic galaxy formation postprocessing              {#sam}
-=============================================
-
-
-Parameterfile for L-Galaxies
-============================
-
-
-The following parameters refer to the semi-analytic model L-Galaxies as
-described last in Henriquez et al. (2015)
-
-**FileWithOutputRedshifts**    outputredshifts.txt
-
-This file lists the output redshifts at which time sclices are supposed to 
-be generated.
- 
-
--------
-
-**McFile**    reionfile.txt
-
-This file lists the reionization history adopted in the semi-analytic model.
-
--------
-
-**FileNameGalaxies**
-
-Dummy explanation.
-
--------
-
-**SpecPhotDir**
-
-Dummy explanation.
-
--------
-
-**PhotPrefix**
-
-Dummy explanation.
-
--------
-
-**SpecPhotIMF**
-
-Dummy explanation.
-
--------
-
-**FileWithFilterNames**
-
-Dummy explanation.
-
--------
-
-**CoolFunctionsDir**   
-
-Dummy explanation.
-
--------
-
-**LastDarkMatterSnapShot**
-
-Dummy explanation.
-
--------
-
-**FirstFile**
-
-Dummy explanation.
-
--------
-
-**LastFile**  
-
-Dummy explanation.
-
--------
-
-**ReionizationModel**
-
-Dummy explanation.
-
--------
-
-**DiskRadiusModel**
-
-Dummy explanation.
-
--------
-
-**StarFormationModel**
-
-Dummy explanation.
-
--------
-
-**FeedbackReheatingModel**
-
-Dummy explanation.
-
--------
-
-**FeedbackEjectionModel**
-
-Dummy explanation.
-
--------
-
-**FateOfSatellitesGas**
-
-Dummy explanation.
-
--------
-
-**ReIncorporationModel**
-
-Dummy explanation.
-
--------
-
-**AGNRadioModeModel**
-
-Dummy explanation.
-
--------
-
-**DiskInstabilityModel**
-
-Dummy explanation.
-
--------
-
-**BHGrowthInDiskInstabilityModel**
-
-Dummy explanation.
-
--------
-
-**HotGasStripingModel**
-
-Dummy explanation.
-
--------
-
-**DisruptionModel**
-
-Dummy explanation.
-
--------
-
-**StarBurstModel**
-
-Dummy explanation.
-
--------
-
-**BulgeFormationInMinorMergersOn**
-
-Dummy explanation.
-
--------
-
-**MetallicityOption** 
-
-Dummy explanation.
-
--------
-
-**Reionization_z0**
-
-Dummy explanation.
-
--------
-
-**Reionization_zr**
-
-Dummy explanation.
-
--------
-
-**Yield**
-
-Dummy explanation.
-
--------
-
-**RecycleFraction**
-
-Dummy explanation.
-
--------
-
-**ThreshMajorMerger**
-
-Dummy explanation.
-
--------
-
-**MergerTimeMultiplier**
-
-Dummy explanation.
-
--------
-
-**RamPressureStrip_CutOffMass**
-
-Dummy explanation.
-
--------
-
-**SfrEfficiency**
-
-Dummy explanation.
-
--------
-
-**SfrColdCrit**
-
-Dummy explanation.
-
--------
-
-**SfrBurstEfficiency**
-
-Dummy explanation.
-
--------
-
-**SfrBurstSlope**
-
-Dummy explanation.
-
--------
-
-**AgnEfficiency**
-
-Dummy explanation.
-
--------
-
-**BlackHoleGrowthRate**
-
-Dummy explanation.
-
--------
-
-**BlackHoleSeedMass**
-
-Dummy explanation.
-
--------
-
-**BlackHoleCutoffVelocity**
-
-Dummy explanation.
-
--------
-
-**FeedbackReheatingEpsilon**
-
-Dummy explanation.
-
--------
-
-**ReheatPreVelocity**
-
-Dummy explanation.
-
--------
-
-**ReheatSlope**
-
-Dummy explanation.
-
--------
-
-**FeedbackEjectionEfficiency**
-
-Dummy explanation.
-
--------
-
-**EjectPreVelocity**
-
-Dummy explanation.
-
--------
-
-**EjectSlope**
-
-Dummy explanation.
-
--------
-
-**ReIncorporationFactor**
-
-Dummy explanation.
-
--------
-
-**EnergySN**
-
-Dummy explanation.
-
--------
-
-**EtaSN**
-
-Dummy explanation.
-
--------
diff --git a/documentation/13_specialparams.md b/documentation/13_specialparams.md
deleted file mode 100644
index ff0f60c297620acd97606617c2c71d7937a9dfc0..0000000000000000000000000000000000000000
--- a/documentation/13_specialparams.md
+++ /dev/null
@@ -1,69 +0,0 @@
-
-Parameters for special postprocesing options              {#lcimage}
-============================================
-
-
-Parameterfile for Lightcone Imaging Tool
-========================================
-
-
-**LightConeImageConeNr**   0
-
-For the lightcone image code, which cone should be processed.
-
--------
-
-**LightConeImageCornerX**
-
-Lower left corner of image in comoving coordinates.
-
--------
-
-**LightConeImageCornerY**
-
-Lower left corner of image in comoving coordinates.
-
--------
-
-**LightConeImageLengthX**
-
-Horizontal extension of image in comoving coordinates.
-
--------
-
-**LightConeImageLengthY**
-
-
-Vertical extension of image in comoving coordinates.
-
--------
-
-**LightConeImagePicName**
-
-Partial file name of image file
-
--------
-
-**LightConeImagePixelsX**
-
-Number of pixels for image in x-dimension.
-
--------
-
-**LightConeImagePixelsY**
-
-Number of pixels for image in y-dimension.
-
--------
-
-**LightConeImageFirstConeDir**
-
-First cone dir to be read in for image.
-
--------
-
-**LightConeImageLastConeDir**
-
-Last cone dir to be read in for image.
-
--------
diff --git a/examples/CollidingGalaxiesSFR/Config.sh b/examples/CollidingGalaxiesSFR/Config.sh
new file mode 100644
index 0000000000000000000000000000000000000000..26504b5e009a309bd3fe6b39496985d0c973361f
--- /dev/null
+++ b/examples/CollidingGalaxiesSFR/Config.sh
@@ -0,0 +1,15 @@
+
+
+POSITIONS_IN_64BIT
+TREE_NUM_BEFORE_NODESPLIT=4
+GADGET2_HEADER
+
+SELFGRAVITY
+NTYPES=6
+NSOFTCLASSES=2
+
+MULTIPOLE_ORDER=3
+DOUBLEPRECISION=1
+
+COOLING
+STARFORMATION
diff --git a/examples/CollidingGalaxiesSFR/TREECOOL b/examples/CollidingGalaxiesSFR/TREECOOL
new file mode 100644
index 0000000000000000000000000000000000000000..730a1f8b997e59448606759bf4e4f219903669c1
--- /dev/null
+++ b/examples/CollidingGalaxiesSFR/TREECOOL
@@ -0,0 +1,171 @@
+  0.000 3.03516e-14 1.37296e-14 3.04873e-16 1.74434e-25 1.76233e-25 1.00198e-26
+  0.005 3.20557e-14 1.47386e-14 3.14717e-16 1.85463e-25 1.87090e-25 1.03701e-26
+  0.010 3.37379e-14 1.57232e-14 3.24518e-16 1.96306e-25 1.97710e-25 1.07195e-26
+  0.015 3.54076e-14 1.66914e-14 3.34310e-16 2.07032e-25 2.08173e-25 1.10691e-26
+  0.020 3.70746e-14 1.76519e-14 3.44133e-16 2.17717e-25 2.18565e-25 1.14202e-26
+  0.025 3.87497e-14 1.86137e-14 3.54027e-16 2.28440e-25 2.28979e-25 1.17740e-26
+  0.030 4.04442e-14 1.95867e-14 3.64036e-16 2.39288e-25 2.39514e-25 1.21319e-26
+  0.035 4.21704e-14 2.05814e-14 3.74207e-16 2.50352e-25 2.50277e-25 1.24953e-26
+  0.040 4.39415e-14 2.16092e-14 3.84589e-16 2.61733e-25 2.61381e-25 1.28660e-26
+  0.045 4.57713e-14 2.26820e-14 3.95237e-16 2.73534e-25 2.72948e-25 1.32455e-26
+  0.050 4.76748e-14 2.38127e-14 4.06206e-16 2.85868e-25 2.85108e-25 1.36356e-26
+  0.055 4.96649e-14 2.50126e-14 4.17547e-16 2.98833e-25 2.97975e-25 1.40379e-26
+  0.060 5.17433e-14 2.62842e-14 4.29269e-16 3.12445e-25 3.11577e-25 1.44526e-26
+  0.065 5.39090e-14 2.76279e-14 4.41369e-16 3.26695e-25 3.25918e-25 1.48795e-26
+  0.070 5.61603e-14 2.90436e-14 4.53847e-16 3.41576e-25 3.41001e-25 1.53185e-26
+  0.075 5.84958e-14 3.05314e-14 4.66697e-16 3.57078e-25 3.56831e-25 1.57694e-26
+  0.080 6.09147e-14 3.20921e-14 4.79923e-16 3.73198e-25 3.73418e-25 1.62321e-26
+  0.085 6.34210e-14 3.37301e-14 4.93541e-16 3.89966e-25 3.90806e-25 1.67070e-26
+  0.090 6.60202e-14 3.54508e-14 5.07577e-16 4.07420e-25 4.09053e-25 1.71947e-26
+  0.095 6.87178e-14 3.72597e-14 5.22055e-16 4.25603e-25 4.28216e-25 1.76960e-26
+  0.100 7.15197e-14 3.91629e-14 5.37001e-16 4.44557e-25 4.48359e-25 1.82115e-26
+  0.105 7.44316e-14 4.11660e-14 5.52443e-16 4.64323e-25 4.69541e-25 1.87419e-26
+  0.110 7.74567e-14 4.32728e-14 5.68396e-16 4.84924e-25 4.91808e-25 1.92875e-26
+  0.115 8.05977e-14 4.54866e-14 5.84877e-16 5.06376e-25 5.15205e-25 1.98486e-26
+  0.120 8.38574e-14 4.78108e-14 6.01902e-16 5.28699e-25 5.39777e-25 2.04256e-26
+  0.125 8.72386e-14 5.02490e-14 6.19488e-16 5.51911e-25 5.65571e-25 2.10187e-26
+  0.130 9.07449e-14 5.28052e-14 6.37654e-16 5.76035e-25 5.92640e-25 2.16283e-26
+  0.135 9.43816e-14 5.54857e-14 6.56428e-16 6.01112e-25 6.21054e-25 2.22550e-26
+  0.140 9.81554e-14 5.82973e-14 6.75838e-16 6.27188e-25 6.50889e-25 2.28995e-26
+  0.145 1.02073e-13 6.12473e-14 6.95918e-16 6.54310e-25 6.82228e-25 2.35627e-26
+  0.150 1.06141e-13 6.43433e-14 7.16699e-16 6.82530e-25 7.15153e-25 2.42451e-26
+  0.155 1.10366e-13 6.75924e-14 7.38214e-16 7.11896e-25 7.49749e-25 2.49477e-26
+  0.160 1.14754e-13 7.09995e-14 7.60494e-16 7.42442e-25 7.86090e-25 2.56710e-26
+  0.165 1.19310e-13 7.45687e-14 7.83569e-16 7.74196e-25 8.24246e-25 2.64158e-26
+  0.170 1.24039e-13 7.83043e-14 8.07471e-16 8.07192e-25 8.64293e-25 2.71828e-26
+  0.175 1.28947e-13 8.22108e-14 8.32231e-16 8.41460e-25 9.06308e-25 2.79726e-26
+  0.180 1.34040e-13 8.62930e-14 8.57885e-16 8.77036e-25 9.50374e-25 2.87861e-26
+  0.185 1.39325e-13 9.05574e-14 8.84475e-16 9.13966e-25 9.96583e-25 2.96241e-26
+  0.190 1.44808e-13 9.50112e-14 9.12045e-16 9.52302e-25 1.04503e-24 3.04876e-26
+  0.195 1.50497e-13 9.96618e-14 9.40641e-16 9.92097e-25 1.09583e-24 3.13776e-26
+  0.200 1.56400e-13 1.04517e-13 9.70311e-16 1.03341e-24 1.14907e-24 3.22951e-26
+  0.205 1.62524e-13 1.09584e-13 1.00111e-15 1.07629e-24 1.20488e-24 3.32411e-26
+  0.210 1.68878e-13 1.14873e-13 1.03309e-15 1.12079e-24 1.26336e-24 3.42171e-26
+  0.215 1.75471e-13 1.20392e-13 1.06634e-15 1.16696e-24 1.32463e-24 3.52245e-26
+  0.220 1.82312e-13 1.26150e-13 1.10090e-15 1.21487e-24 1.38882e-24 3.62649e-26
+  0.225 1.89411e-13 1.32158e-13 1.13687e-15 1.26456e-24 1.45604e-24 3.73398e-26
+  0.230 1.96778e-13 1.38425e-13 1.17430e-15 1.31609e-24 1.52642e-24 3.84509e-26
+  0.235 2.04420e-13 1.44957e-13 1.21328e-15 1.36952e-24 1.60010e-24 3.95998e-26
+  0.240 2.12348e-13 1.51762e-13 1.25387e-15 1.42491e-24 1.67719e-24 4.07880e-26
+  0.245 2.20569e-13 1.58846e-13 1.29613e-15 1.48232e-24 1.75781e-24 4.20173e-26
+  0.250 2.29093e-13 1.66216e-13 1.34015e-15 1.54182e-24 1.84212e-24 4.32893e-26
+  0.255 2.37931e-13 1.73879e-13 1.38603e-15 1.60347e-24 1.93024e-24 4.46063e-26
+  0.260 2.47094e-13 1.81844e-13 1.43393e-15 1.66735e-24 2.02234e-24 4.59720e-26
+  0.265 2.56597e-13 1.90119e-13 1.48409e-15 1.73355e-24 2.11859e-24 4.73907e-26
+  0.270 2.66456e-13 1.98715e-13 1.53671e-15 1.80214e-24 2.21919e-24 4.88670e-26
+  0.275 2.76684e-13 2.07640e-13 1.59204e-15 1.87323e-24 2.32432e-24 5.04058e-26
+  0.280 2.87295e-13 2.16903e-13 1.65028e-15 1.94689e-24 2.43415e-24 5.20114e-26
+  0.285 2.98297e-13 2.26504e-13 1.71157e-15 2.02315e-24 2.54877e-24 5.36861e-26
+  0.290 3.09693e-13 2.36443e-13 1.77598e-15 2.10204e-24 2.66824e-24 5.54317e-26
+  0.295 3.21489e-13 2.46719e-13 1.84363e-15 2.18357e-24 2.79263e-24 5.72499e-26
+  0.300 3.33687e-13 2.57331e-13 1.91459e-15 2.26777e-24 2.92200e-24 5.91426e-26
+  0.305 3.46292e-13 2.68275e-13 1.98896e-15 2.35463e-24 3.05640e-24 6.11109e-26
+  0.310 3.59296e-13 2.79549e-13 2.06668e-15 2.44414e-24 3.19585e-24 6.31526e-26
+  0.315 3.72693e-13 2.91149e-13 2.14768e-15 2.53628e-24 3.34037e-24 6.52651e-26
+  0.320 3.86474e-13 3.03070e-13 2.23188e-15 2.63101e-24 3.48997e-24 6.74449e-26
+  0.325 4.00628e-13 3.15309e-13 2.31918e-15 2.72828e-24 3.64466e-24 6.96887e-26
+  0.330 4.15148e-13 3.27859e-13 2.40950e-15 2.82807e-24 3.80444e-24 7.19932e-26
+  0.335 4.30037e-13 3.40716e-13 2.50279e-15 2.93035e-24 3.96932e-24 7.43572e-26
+  0.340 4.45297e-13 3.53874e-13 2.59903e-15 3.03515e-24 4.13931e-24 7.67797e-26
+  0.345 4.60935e-13 3.67327e-13 2.69817e-15 3.14243e-24 4.31441e-24 7.92600e-26
+  0.350 4.76953e-13 3.81068e-13 2.80019e-15 3.25221e-24 4.49461e-24 8.17967e-26
+  0.355 4.93350e-13 3.95087e-13 2.90502e-15 3.36442e-24 4.67986e-24 8.43878e-26
+  0.360 5.10092e-13 4.09358e-13 3.01249e-15 3.47887e-24 4.86986e-24 8.70267e-26
+  0.365 5.27139e-13 4.23848e-13 3.12243e-15 3.59530e-24 5.06427e-24 8.97056e-26
+  0.370 5.44447e-13 4.38524e-13 3.23464e-15 3.71346e-24 5.26271e-24 9.24160e-26
+  0.375 5.61966e-13 4.53349e-13 3.34890e-15 3.83305e-24 5.46475e-24 9.51489e-26
+  0.380 5.79656e-13 4.68292e-13 3.46479e-15 3.95381e-24 5.67002e-24 9.78927e-26
+  0.385 5.97501e-13 4.83345e-13 3.58126e-15 4.07565e-24 5.87839e-24 1.00628e-25
+  0.390 6.15495e-13 4.98501e-13 3.69703e-15 4.19851e-24 6.08976e-24 1.03333e-25
+  0.395 6.33628e-13 5.13759e-13 3.81073e-15 4.32233e-24 6.30403e-24 1.05985e-25
+  0.400 6.51892e-13 5.29112e-13 3.92091e-15 4.44704e-24 6.52109e-24 1.08557e-25
+  0.405 6.70266e-13 5.44548e-13 4.02635e-15 4.57250e-24 6.74074e-24 1.11029e-25
+  0.410 6.88697e-13 5.60022e-13 4.12706e-15 4.69834e-24 6.96240e-24 1.13388e-25
+  0.415 7.07118e-13 5.75479e-13 4.22336e-15 4.82413e-24 7.18542e-24 1.15628e-25
+  0.420 7.25457e-13 5.90861e-13 4.31558e-15 4.94937e-24 7.40907e-24 1.17741e-25
+  0.425 7.43639e-13 6.06104e-13 4.40409e-15 5.07356e-24 7.63257e-24 1.19718e-25
+  0.430 7.61593e-13 6.21150e-13 4.48926e-15 5.19622e-24 7.85520e-24 1.21555e-25
+  0.435 7.79283e-13 6.35966e-13 4.57121e-15 5.31709e-24 8.07643e-24 1.23264e-25
+  0.440 7.96680e-13 6.50521e-13 4.65009e-15 5.43595e-24 8.29579e-24 1.24861e-25
+  0.445 8.13752e-13 6.64782e-13 4.72602e-15 5.55253e-24 8.51278e-24 1.26364e-25
+  0.450 8.30465e-13 6.78717e-13 4.79916e-15 5.66661e-24 8.72686e-24 1.27789e-25
+  0.455 8.46779e-13 6.92287e-13 4.86956e-15 5.77787e-24 8.93742e-24 1.29153e-25
+  0.460 8.62630e-13 7.05442e-13 4.93682e-15 5.88589e-24 9.14370e-24 1.30445e-25
+  0.465 8.77947e-13 7.18124e-13 5.00044e-15 5.99020e-24 9.34488e-24 1.31651e-25
+  0.470 8.92654e-13 7.30275e-13 5.05989e-15 6.09029e-24 9.54006e-24 1.32757e-25
+  0.475 9.06670e-13 7.41833e-13 5.11460e-15 6.18564e-24 9.72833e-24 1.33747e-25
+  0.480 9.19911e-13 7.52732e-13 5.16403e-15 6.27569e-24 9.90870e-24 1.34606e-25
+  0.485 9.32296e-13 7.62912e-13 5.20780e-15 6.35990e-24 1.00802e-23 1.35328e-25
+  0.490 9.43739e-13 7.72311e-13 5.24556e-15 6.43770e-24 1.02419e-23 1.35906e-25
+  0.495 9.54150e-13 7.80863e-13 5.27693e-15 6.50848e-24 1.03926e-23 1.36336e-25
+  0.500 9.63436e-13 7.88500e-13 5.30153e-15 6.57161e-24 1.05313e-23 1.36613e-25
+  0.505 9.71506e-13 7.95155e-13 5.31900e-15 6.62650e-24 1.06569e-23 1.36730e-25
+  0.510 9.78299e-13 8.00783e-13 5.32906e-15 6.67274e-24 1.07686e-23 1.36681e-25
+  0.515 9.83755e-13 8.05339e-13 5.33145e-15 6.70994e-24 1.08658e-23 1.36463e-25
+  0.520 9.87817e-13 8.08778e-13 5.32590e-15 6.73772e-24 1.09477e-23 1.36068e-25
+  0.525 9.90421e-13 8.11053e-13 5.31214e-15 6.75566e-24 1.10135e-23 1.35491e-25
+  0.530 9.91509e-13 8.12119e-13 5.28993e-15 6.76339e-24 1.10625e-23 1.34728e-25
+  0.535 9.91034e-13 8.11944e-13 5.25917e-15 6.76061e-24 1.10942e-23 1.33775e-25
+  0.540 9.88954e-13 8.10498e-13 5.21981e-15 6.74704e-24 1.11081e-23 1.32634e-25
+  0.545 9.85225e-13 8.07751e-13 5.17181e-15 6.72241e-24 1.11036e-23 1.31305e-25
+  0.550 9.79804e-13 8.03673e-13 5.11512e-15 6.68643e-24 1.10803e-23 1.29786e-25
+  0.555 9.72656e-13 7.98241e-13 5.04976e-15 6.63889e-24 1.10377e-23 1.28080e-25
+  0.560 9.63786e-13 7.91453e-13 4.97591e-15 6.57980e-24 1.09756e-23 1.26189e-25
+  0.565 9.53210e-13 7.83316e-13 4.89379e-15 6.50925e-24 1.08941e-23 1.24117e-25
+  0.570 9.40948e-13 7.73839e-13 4.80367e-15 6.42734e-24 1.07932e-23 1.21870e-25
+  0.575 9.27023e-13 7.63033e-13 4.70584e-15 6.33420e-24 1.06729e-23 1.19453e-25
+  0.580 9.11479e-13 7.50925e-13 4.60072e-15 6.23011e-24 1.05334e-23 1.16874e-25
+  0.585 8.94420e-13 7.37592e-13 4.48914e-15 6.11574e-24 1.03758e-23 1.14153e-25
+  0.590 8.75971e-13 7.23127e-13 4.37207e-15 5.99193e-24 1.02012e-23 1.11313e-25
+  0.595 8.56270e-13 7.07630e-13 4.25056e-15 5.85960e-24 1.00109e-23 1.08379e-25
+  0.600 8.35464e-13 6.91213e-13 4.12575e-15 5.71971e-24 9.80648e-24 1.05377e-25
+  0.605 8.13689e-13 6.73973e-13 3.99881e-15 5.57316e-24 9.58915e-24 1.02335e-25
+  0.610 7.91014e-13 6.55949e-13 3.87085e-15 5.42035e-24 9.35945e-24 9.92793e-26
+  0.615 7.67495e-13 6.37169e-13 3.74305e-15 5.26160e-24 9.11776e-24 9.62355e-26
+  0.620 7.43195e-13 6.17664e-13 3.61667e-15 5.09725e-24 8.86454e-24 9.32324e-26
+  0.625 7.18183e-13 5.97467e-13 3.49305e-15 4.92770e-24 8.60025e-24 9.03005e-26
+  0.630 6.92510e-13 5.76609e-13 3.37306e-15 4.75324e-24 8.32529e-24 8.74586e-26
+  0.635 6.66160e-13 5.55097e-13 3.25568e-15 4.57383e-24 8.03966e-24 8.46801e-26
+  0.640 6.39103e-13 5.32935e-13 3.13941e-15 4.38935e-24 7.74325e-24 8.19269e-26
+  0.645 6.11305e-13 5.10128e-13 3.02264e-15 4.19968e-24 7.43598e-24 7.91577e-26
+  0.650 5.82733e-13 4.86680e-13 2.90365e-15 4.00470e-24 7.11776e-24 7.63286e-26
+  0.655 5.53398e-13 4.62628e-13 2.78116e-15 3.80458e-24 6.78897e-24 7.34068e-26
+  0.660 5.23475e-13 4.38110e-13 2.65588e-15 3.60049e-24 6.45155e-24 7.04071e-26
+  0.665 4.93190e-13 4.13298e-13 2.52895e-15 3.39394e-24 6.10796e-24 6.73562e-26
+  0.670 4.62781e-13 3.88374e-13 2.40165e-15 3.18652e-24 5.76080e-24 6.42827e-26
+  0.675 4.32506e-13 3.63534e-13 2.27531e-15 2.97993e-24 5.41289e-24 6.12171e-26
+  0.680 4.02608e-13 3.38965e-13 2.15110e-15 2.77581e-24 5.06689e-24 5.81862e-26
+  0.685 3.73229e-13 3.14778e-13 2.02940e-15 2.57509e-24 4.72449e-24 5.51973e-26
+  0.690 3.44498e-13 2.91080e-13 1.91041e-15 2.37866e-24 4.38723e-24 5.22542e-26
+  0.695 3.16553e-13 2.67978e-13 1.79434e-15 2.18744e-24 4.05678e-24 4.93605e-26
+  0.700 2.89539e-13 2.45592e-13 1.68142e-15 2.00239e-24 3.73490e-24 4.65201e-26
+  0.705 2.63589e-13 2.24030e-13 1.57187e-15 1.82444e-24 3.42325e-24 4.37371e-26
+  0.710 2.38785e-13 2.03361e-13 1.46587e-15 1.65414e-24 3.12293e-24 4.10162e-26
+  0.715 2.15194e-13 1.83648e-13 1.36361e-15 1.49198e-24 2.83496e-24 3.83623e-26
+  0.720 1.92888e-13 1.64957e-13 1.26529e-15 1.33849e-24 2.56041e-24 3.57804e-26
+  0.725 1.71944e-13 1.47356e-13 1.17110e-15 1.19420e-24 2.30039e-24 3.32758e-26
+  0.730 1.52421e-13 1.30901e-13 1.08119e-15 1.05956e-24 2.05587e-24 3.08532e-26
+  0.735 1.34328e-13 1.15602e-13 9.95540e-16 9.34625e-25 1.82712e-24 2.85142e-26
+  0.740 1.17654e-13 1.01458e-13 9.14061e-16 8.19349e-25 1.61426e-24 2.62600e-26
+  0.745 1.02391e-13 8.84631e-14 8.36666e-16 7.13671e-25 1.41738e-24 2.40916e-26
+  0.750 8.85234e-14 7.66120e-14 7.63247e-16 6.17513e-25 1.23658e-24 2.20098e-26
+  0.755 7.60273e-14 6.58897e-14 6.93737e-16 5.30720e-25 1.07181e-24 2.00163e-26
+  0.760 6.48482e-14 5.62575e-14 6.28219e-16 4.52939e-25 9.22646e-25 1.81158e-26
+  0.765 5.49201e-14 4.76680e-14 5.66814e-16 3.83742e-25 7.88556e-25 1.63134e-26
+  0.770 4.61705e-14 4.00688e-14 5.09642e-16 3.22657e-25 6.68935e-25 1.46149e-26
+  0.775 3.85199e-14 3.34019e-14 4.56825e-16 2.69166e-25 5.63114e-25 1.30260e-26
+  0.780 3.18847e-14 2.76053e-14 4.08322e-16 2.22719e-25 4.70327e-25 1.15496e-26
+  0.785 2.61848e-14 2.26161e-14 3.63550e-16 1.82784e-25 3.89653e-25 1.01791e-26
+  0.790 2.13351e-14 1.83663e-14 3.21775e-16 1.48789e-25 3.20070e-25 8.90547e-27
+  0.795 1.72430e-14 1.47819e-14 2.82204e-16 1.20115e-25 2.60465e-25 7.71863e-27
+  0.800 1.38082e-14 1.17826e-14 2.43984e-16 9.60883e-26 2.09630e-25 6.60776e-27
+  0.805 1.09348e-14 9.28974e-15 2.06562e-16 7.60575e-26 1.66418e-25 5.56549e-27
+  0.810 8.55931e-15 7.24513e-15 1.70516e-16 5.95623e-26 1.30119e-25 4.59785e-27
+  0.815 6.62143e-15 5.59106e-15 1.36700e-16 4.61541e-26 1.00067e-25 3.71403e-27
+  0.820 5.05572e-15 4.26510e-15 1.06025e-16 3.53472e-26 7.55377e-26 2.92375e-27
+  0.825 3.79149e-15 3.19987e-15 7.94636e-17 2.66172e-26 5.57487e-26 2.23726e-27
+  0.830 2.76009e-15 2.33100e-15 5.76164e-17 1.94610e-26 3.98834e-26 1.65825e-27
+  0.835 1.91144e-15 1.61533e-15 3.97497e-17 1.35327e-26 2.71614e-26 1.16780e-27
+  0.840 1.19633e-15 1.01154e-15 2.47998e-17 8.49865e-27 1.67609e-26 7.41001e-28
+  0.845 5.61822e-16 4.75209e-16 1.16223e-17 4.00000e-27 7.79785e-27 3.51094e-28
+  0.850 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
diff --git a/examples/CollidingGalaxiesSFR/job.sh b/examples/CollidingGalaxiesSFR/job.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d9e95e22886b8a2dbe3889fcf122bd10c8e20c93
--- /dev/null
+++ b/examples/CollidingGalaxiesSFR/job.sh
@@ -0,0 +1,17 @@
+#!/bin/bash -l 
+#SBATCH --mail-type=BEGIN,END,FAIL
+#SBATCH --mail-user=vspringel@mpa-garching.mpg.de
+#SBATCH --time=24:00:00
+#SBATCH --nodes=2
+#SBATCH --ntasks-per-node=40
+#SBATCH --job-name GalCollSFR
+
+echo
+echo "Running on hosts: $SLURM_NODELIST"
+echo "Running on $SLURM_NNODES nodes."
+echo "Running on $SLURM_NPROCS processors."
+echo "Current working directory is `pwd`"
+echo
+
+mpiexec -np $SLURM_NPROCS  ./Gadget4 param.txt
+
diff --git a/examples/CollidingGalaxiesSFR/param.txt b/examples/CollidingGalaxiesSFR/param.txt
new file mode 100644
index 0000000000000000000000000000000000000000..968798a72ef85c280d7d2ebc0d67dc56a7b5916f
--- /dev/null
+++ b/examples/CollidingGalaxiesSFR/param.txt
@@ -0,0 +1,99 @@
+%----  Relevant files 
+InitCondFile        /u/vrs/Simulations/ICs/ExampleICs/ics_collision_g4.dat
+OutputDir           ./output
+SnapshotFileBase    snapshot
+OutputListFilename  empty.txt
+
+
+%---- File formats
+ICFormat           1
+SnapFormat         3 
+
+%---- CPU-time limits
+TimeLimitCPU              90000  % in seconds
+CpuTimeBetRestartFile     7200    % in seconds
+
+%----- Memory allocation
+MaxMemSize        4000
+
+%---- Characteristics of run
+TimeBegin           0.0        % Begin of the simulation
+TimeMax	            4.0        % End of the simulation
+
+%---- Basic code options that set the type of simulation
+ComovingIntegrationOn    0
+
+%---- Cosmological parameters
+Omega0	              0
+OmegaLambda           0
+OmegaBaryon           0
+HubbleParam         1.0
+BoxSize               0
+Hubble                0
+
+%---- Output frequency and output parameters
+OutputListOn              0 
+TimeBetSnapshot           0.05
+TimeOfFirstSnapshot       0.0
+TimeBetStatistics         0.05
+NumFilesPerSnapshot       1
+MaxFilesWithConcurrentIO  1 
+
+%---- Accuracy of time integration
+ErrTolIntAccuracy        0.012 
+CourantFac               0.20
+MaxSizeTimestep          0.01
+MinSizeTimestep          0.0 
+
+%---- Tree algorithm, force accuracy, domain update frequency
+TypeOfOpeningCriterion                1
+ErrTolTheta                           0.6
+ErrTolThetaMax                        1.2
+ErrTolForceAcc                        0.005
+TopNodeFactor                         3.0
+
+ActivePartFracForNewDomainDecomp      0.01
+ 
+%---- Initial density estimate
+DesNumNgb              64
+MaxNumNgbDeviation     1 
+
+%---- System of units
+UnitLength_in_cm         3.085678e21        ;  1.0 kpc/h
+UnitMass_in_g            1.989e43           ;  1.0e10/h solar masses
+UnitVelocity_in_cm_per_s 1e5                ;  1 km/sec
+GravityConstantInternal  0
+
+
+%---- Gravitational softening length
+SofteningComovingClass0     0.1
+SofteningComovingClass1     0.3
+
+SofteningMaxPhysClass0       0.1
+SofteningMaxPhysClass1       0.3
+
+SofteningClassOfPartType0    0
+SofteningClassOfPartType1    1
+SofteningClassOfPartType2    0
+SofteningClassOfPartType3    0
+SofteningClassOfPartType4    0
+SofteningClassOfPartType5    0
+
+
+%----- SPH
+ArtBulkViscConst       1.0
+MinEgySpec             0
+InitGasTemp            0
+
+
+%----- Star formation
+
+MaxSfrTimescale     1.5         % Gas consumption timescale (multi-phase model)
+FactorSN            0.1         % beta, mass fraction of massive stars (multi-phase model)
+FactorEVP           1000        % A_0, evaporation parameter (multi-phase model)
+TempSupernova       1e+08       % T_SN, effective "supernova temperature",sets feedback energy (multi-phase model)
+TempClouds          1000        % temperature of cold clouds (multi-phase model)
+CritOverDensity     57.7        % overdensity threshold value for cosmological sims
+CritPhysDensity     0           % critical physical density for star formation (in cm^(-3))
+TreecoolFile        TREECOOL
+
diff --git a/examples/DM-L50-N128/Config.sh b/examples/DM-L50-N128/Config.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d4f9c32a922cf90f7072264d5aeb97b00d2e7afa
--- /dev/null
+++ b/examples/DM-L50-N128/Config.sh
@@ -0,0 +1,40 @@
+
+# Basic code operation
+
+    LEAN
+
+    PERIODIC
+    SELFGRAVITY
+    RANDOMIZE_DOMAINCENTER
+    
+# Gravity options
+
+    PMGRID=384
+    TREEPM_NOTIMESPLIT
+    ASMTH=2.0
+    
+# Softening types and particle types
+
+    NSOFTCLASSES=1
+    NTYPES=2
+
+# Floating point accuracy
+
+    POSITIONS_IN_32BIT
+    DOUBLEPRECISION=2
+
+# Group finding
+
+    FOF
+
+# Miscellaneous code options
+
+    POWERSPEC_ON_OUTPUT
+
+# IC generation via N-GenIC
+
+    NGENIC=256
+    NGENIC_2LPT
+    CREATE_GRID
+    IDS_32BIT
+
diff --git a/examples/DM-L50-N128/job.sh b/examples/DM-L50-N128/job.sh
new file mode 100755
index 0000000000000000000000000000000000000000..316dc41cf7ec2299b2e420e52206163bb1991c4d
--- /dev/null
+++ b/examples/DM-L50-N128/job.sh
@@ -0,0 +1,18 @@
+#!/bin/bash -l
+#SBATCH --mail-type=BEGIN,END,FAIL
+#SBATCH --mail-user=vspringel@mpa-garching.mpg.de
+#SBATCH --time=24:00:00
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=40
+#SBATCH --job-name DM-L50-N128
+
+
+echo
+echo "Running on hosts: $SLURM_NODELIST"
+echo "Running on $SLURM_NNODES nodes."
+echo "Running on $SLURM_NPROCS processors."
+echo "Current working directory is `pwd`"
+echo
+
+mpiexec -np $SLURM_NPROCS  ./Gadget4 param.txt 
+
diff --git a/examples/DM-L50-N128/outputs.txt b/examples/DM-L50-N128/outputs.txt
new file mode 100644
index 0000000000000000000000000000000000000000..19d8a1f33cad6fa5f5271768dc22d0f16635c993
--- /dev/null
+++ b/examples/DM-L50-N128/outputs.txt
@@ -0,0 +1,3 @@
+0.25
+0.5
+1.0
diff --git a/examples/DM-L50-N128/param.txt b/examples/DM-L50-N128/param.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e8356001dce520dc765eca0ab35f9cf9f51fcf14
--- /dev/null
+++ b/examples/DM-L50-N128/param.txt
@@ -0,0 +1,96 @@
+
+%----  Relevant files 
+InitCondFile         ./dummy.dat       % we will here create the ICs upon startup
+OutputDir            ./output
+SnapshotFileBase     snapshot
+OutputListFilename   outputs.txt
+
+
+%---- File formats
+ICFormat             1
+SnapFormat           3 
+
+%---- CPU-time limits
+TimeLimitCPU              86400   % 24h, in seconds
+CpuTimeBetRestartFile     7200    % 2h,  in seconds
+
+%----- Memory allocation
+MaxMemSize                1800    % in MByte
+
+%---- Characteristics of run
+TimeBegin                 0.015625   % Begin of the simulation, z = 63
+TimeMax	                  1.0        % End of the simulation,   z = 0
+
+%---- Basic code options that set the type of simulation
+ComovingIntegrationOn     1 
+
+%---- Cosmological parameters
+Omega0	                  0.308
+OmegaLambda               0.692
+OmegaBaryon               0.0482
+HubbleParam               0.678
+Hubble                    100.0
+BoxSize                   50.0
+
+%---- Output frequency and output parameters
+OutputListOn              1 
+TimeBetSnapshot           0.0
+TimeOfFirstSnapshot       0.0
+TimeBetStatistics         0.01
+NumFilesPerSnapshot       1
+MaxFilesWithConcurrentIO  1 
+
+%---- Accuracy of time integration
+ErrTolIntAccuracy        0.01 
+CourantFac               0.3
+MaxSizeTimestep          0.005
+MinSizeTimestep          0.0 
+
+%---- Tree algorithm, force accuracy, domain update frequency
+TypeOfOpeningCriterion                1
+ErrTolTheta                           0.75
+ErrTolThetaMax                        1.0
+ErrTolForceAcc                        0.002
+TopNodeFactor                         3.0
+
+ActivePartFracForNewDomainDecomp      0.01
+ActivePartFracForPMinsteadOfEwald     0.05
+
+%---- Initial density estimate
+DesNumNgb                        64
+MaxNumNgbDeviation               1 
+
+%---- System of units
+UnitLength_in_cm         3.085678e24        ;  Mpc / h
+UnitMass_in_g            1.989e43           ;  1.0e10 Msun / h
+UnitVelocity_in_cm_per_s 1e5                ;  1 km/sec
+GravityConstantInternal  0
+
+
+%---- Gravitational softening length
+SofteningComovingClass0      0.01           ; 10 kpc/h
+SofteningMaxPhysClass0       0.01
+
+SofteningClassOfPartType0    0
+SofteningClassOfPartType1    0
+
+
+%----- SPH
+ArtBulkViscConst             1.0
+MinEgySpec                   0
+InitGasTemp                  0
+
+
+%----- N-GenIC
+NSample                                           128
+GridSize                                          128
+Seed                                              181170
+SphereMode                                        1
+PowerSpectrumType                                 0
+ReNormalizeInputSpectrum                          1
+PrimordialIndex                                   1.0
+ShapeGamma                                        0.21
+Sigma8                                            0.9
+PowerSpectrumFile                                 powerspec
+InputSpectrum_UnitLength_in_cm                    3.085678e21
+
diff --git a/examples/DM-Zoom-Aq-C-5/Config.sh b/examples/DM-Zoom-Aq-C-5/Config.sh
new file mode 100644
index 0000000000000000000000000000000000000000..cbaab755259d0a4c0bc0b141fd0d3057ab1f5585
--- /dev/null
+++ b/examples/DM-Zoom-Aq-C-5/Config.sh
@@ -0,0 +1,19 @@
+
+    SELFGRAVITY
+    PERIODIC
+    PMGRID=1024
+    PM_ZOOM_OPTIMIZED
+    
+    GADGET2_HEADER
+
+    FOF
+    FOF_GROUP_MIN_LEN=32
+    FOF_PRIMARY_LINK_TYPES=2
+    SUBFIND
+    EVALPOTENTIAL
+
+    NSOFTCLASSES=4
+    INDIVIDUAL_GRAVITY_SOFTENING=4+8+16+32
+    
+    POWERSPEC_ON_OUTPUT
+
diff --git a/examples/DM-Zoom-Aq-C-5/ExpansionList_16 b/examples/DM-Zoom-Aq-C-5/ExpansionList_16
new file mode 100644
index 0000000000000000000000000000000000000000..3dbd8d52d4db707c4eb39f29ac74dcf249a072de
--- /dev/null
+++ b/examples/DM-Zoom-Aq-C-5/ExpansionList_16
@@ -0,0 +1,16 @@
+  0.0289500717 1
+  0.041920119 1
+  0.0607029683 1
+  0.0879106611 1
+  0.127352452 1
+  0.184663064 1
+  0.267668583 1
+  0.35425973 1
+  0.434352807 1
+  0.511473107 1
+  0.587742558 1
+  0.66465946 1
+  0.7433989 1
+  0.82495401 1
+  0.910212068 1
+  1. 1
diff --git a/examples/DM-Zoom-Aq-C-5/job.sh b/examples/DM-Zoom-Aq-C-5/job.sh
new file mode 100755
index 0000000000000000000000000000000000000000..6d8d646bfad0fa0b8f507df76a8bdd0461ab7aaf
--- /dev/null
+++ b/examples/DM-Zoom-Aq-C-5/job.sh
@@ -0,0 +1,17 @@
+#!/bin/bash -l
+#SBATCH --mail-type=BEGIN,END,FAIL
+#SBATCH --mail-user=vspringel@mpa-garching.mpg.de
+#SBATCH --time=24:00:00
+#SBATCH --nodes=2
+#SBATCH --ntasks-per-node=40
+#SBATCH --job-name Aq-C-5
+
+
+echo
+echo "Running on hosts: $SLURM_NODELIST"
+echo "Running on $SLURM_NNODES nodes."
+echo "Running on $SLURM_NPROCS processors."
+echo "Current working directory is `pwd`"
+echo
+
+mpiexec -np $SLURM_NPROCS  ./Gadget4 param.txt 
diff --git a/examples/DM-Zoom-Aq-C-5/param.txt b/examples/DM-Zoom-Aq-C-5/param.txt
new file mode 100644
index 0000000000000000000000000000000000000000..db5569705c994faa3fc94f9bacf6561d1e30626e
--- /dev/null
+++ b/examples/DM-Zoom-Aq-C-5/param.txt
@@ -0,0 +1,100 @@
+%----  Relevant files 
+InitCondFile        /u/vrs/Simulations/ICs/ExampleICs/Aq-C-5-aquila-dm-fixed
+OutputDir           ./output
+SnapshotFileBase    snapshot
+OutputListFilename  ./ExpansionList_16
+
+
+%---- File formats
+ICFormat           1
+SnapFormat         3 
+
+%---- CPU-time limits
+TimeLimitCPU              86400   % in seconds
+CpuTimeBetRestartFile     7200    % in seconds
+
+%----- Memory allocation
+MaxMemSize        4300
+
+%---- Characteristics of run
+TimeBegin           0.0078125  % Begin of the simulation
+TimeMax	            1.0        % End of the simulation
+
+%---- Basic code options that set the type of simulation
+ComovingIntegrationOn    1 
+
+%---- Cosmological parameters
+Omega0	              0.25
+OmegaLambda           0.75
+OmegaBaryon           0.04
+HubbleParam           0.73
+BoxSize               100.0
+Hubble                100.0
+
+
+%---- Output frequency and output parameters
+OutputListOn              1 
+TimeBetSnapshot           0.0
+TimeOfFirstSnapshot       0.0
+TimeBetStatistics         0.01
+NumFilesPerSnapshot       8
+MaxFilesWithConcurrentIO  8 
+
+%---- Accuracy of time integration
+ErrTolIntAccuracy        0.012 
+CourantFac               0.3
+MaxSizeTimestep          0.005
+MinSizeTimestep          0.0 
+
+%---- Tree algorithm, force accuracy, domain update frequency
+TypeOfOpeningCriterion                1
+ErrTolTheta                           0.7
+ErrTolThetaMax                        1.0
+ErrTolForceAcc                        0.0025
+TopNodeFactor                         5
+
+ActivePartFracForNewDomainDecomp     0.01
+%%ActivePartFracForPMinsteadOfEwald  0.05
+ 
+%---- Initial density estimate
+DesNumNgb              64
+MaxNumNgbDeviation     1 
+
+%---- System of units
+UnitLength_in_cm         3.085678e24        ;  1.0 Mpc
+UnitMass_in_g            1.989e43           ;  1.0e10 solar masses
+UnitVelocity_in_cm_per_s 1e5                ;  1 km/sec
+GravityConstantInternal  0
+
+%---- Gravitational softening length
+SofteningComovingClass0     0.001
+SofteningComovingClass1     0.007
+SofteningComovingClass2     0.025
+SofteningComovingClass3     0.081
+
+
+SofteningMaxPhysClass0     0.0005
+SofteningMaxPhysClass1     0.007
+SofteningMaxPhysClass2     0.025
+SofteningMaxPhysClass3     0.081
+
+
+SofteningClassOfPartType0  0
+SofteningClassOfPartType1  0
+SofteningClassOfPartType2  1
+SofteningClassOfPartType3  2
+SofteningClassOfPartType4  3
+SofteningClassOfPartType5  3
+
+
+
+%----- Subfind
+
+DesLinkNgb             20
+
+%----- SPH
+ArtBulkViscConst       1.0
+MinEgySpec             0
+InitGasTemp            0
+
+
diff --git a/examples/G2-cluster/Config.sh b/examples/G2-cluster/Config.sh
new file mode 100644
index 0000000000000000000000000000000000000000..bdd3233e49cb9dbb3e36bdcb33487ad34f27545a
--- /dev/null
+++ b/examples/G2-cluster/Config.sh
@@ -0,0 +1,16 @@
+
+# Basic code operation
+
+    SELFGRAVITY
+    NTYPES=4
+
+# Gravity options
+  
+    PMGRID=128
+    TREEPM_NOTIMESPLIT
+    
+    
+# Miscellaneous code options
+
+    DOUBLEPRECISION=1
+    GADGET2_HEADER
diff --git a/examples/G2-cluster/job.sh b/examples/G2-cluster/job.sh
new file mode 100755
index 0000000000000000000000000000000000000000..25da6b6f19867afd4f866d4abb45397601ad3083
--- /dev/null
+++ b/examples/G2-cluster/job.sh
@@ -0,0 +1,17 @@
+#!/bin/bash 
+#SBATCH --mail-type=BEGIN,END,FAIL
+#SBATCH --mail-user=vspringel@mpa-garching.mpg.de
+#SBATCH --time=1:00:00
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=16
+#SBATCH --job-name G2-cluster
+
+echo
+echo "Running on hosts: $SLURM_NODELIST"
+echo "Running on $SLURM_NNODES nodes."
+echo "Running on $SLURM_NPROCS processors."
+echo "Current working directory is `pwd`"
+echo
+
+mpiexec -np $SLURM_NPROCS  ./Gadget4 param.txt
+
diff --git a/examples/G2-cluster/param.txt b/examples/G2-cluster/param.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3aac3bad570dbe387423334dd7cd39d8578521a1
--- /dev/null
+++ b/examples/G2-cluster/param.txt
@@ -0,0 +1,89 @@
+%----  Relevant files 
+InitCondFile        /u/vrs/Simulations/ICs/ExampleICs/cluster_littleendian.dat
+OutputDir           ./output
+SnapshotFileBase    snapshot
+OutputListFilename  empty.txt
+
+
+%---- File formats
+ICFormat           1
+SnapFormat         3 
+
+%---- CPU-time limits
+TimeLimitCPU              180000  % in seconds
+CpuTimeBetRestartFile     7200    % in seconds
+
+%----- Memory allocation
+MaxMemSize        2300
+
+%---- Characteristics of run
+TimeBegin           0.041666666     % Begin of the simulation (z=23)
+TimeMax	            1.0             % End of the simulation   (z=0)
+
+%---- Basic code options that set the type of simulation
+ComovingIntegrationOn    1
+
+%---- Cosmological parameters
+Omega0	              0.3
+OmegaLambda           0.7
+OmegaBaryon           0
+HubbleParam           0.7
+Hubble                0.1
+BoxSize               0
+
+%---- Output frequency and output parameters
+OutputListOn              0 
+TimeBetSnapshot           1.8384163   % 5 constant steps in log(a)
+TimeOfFirstSnapshot       0.047619048  % z=20
+TimeBetStatistics         0.05
+NumFilesPerSnapshot       1
+MaxFilesWithConcurrentIO  1 
+
+%---- Accuracy of time integration
+ErrTolIntAccuracy        0.012 
+CourantFac               0.15
+MaxSizeTimestep          0.025
+MinSizeTimestep          0.0 
+
+%---- Tree algorithm, force accuracy, domain update frequency
+TypeOfOpeningCriterion                1
+ErrTolTheta                           0.5
+ErrTolThetaMax                        1.0
+ErrTolForceAcc                        0.005
+TopNodeFactor                         2.5
+
+ActivePartFracForNewDomainDecomp      0.01
+ActivePartFracForPMinsteadOfEwald     0.05
+
+%---- Initial density estimate
+DesNumNgb              64
+MaxNumNgbDeviation     1 
+
+%---- System of units
+UnitLength_in_cm         3.085678e21        ;  1.0 kpc/h
+UnitMass_in_g            1.989e43           ;  1.0e10/h solar masses
+UnitVelocity_in_cm_per_s 1e5                ;  1 km/sec
+GravityConstantInternal  0
+
+
+%---- Gravitational softening length
+SofteningComovingClass0      0
+SofteningComovingClass1      72.0
+SofteningComovingClass2      180.0
+SofteningComovingClass3      500.0
+
+SofteningMaxPhysClass0       0
+SofteningMaxPhysClass1       12.0
+SofteningMaxPhysClass2       30.0
+SofteningMaxPhysClass3       150.0
+
+SofteningClassOfPartType0    0
+SofteningClassOfPartType1    1
+SofteningClassOfPartType2    2
+SofteningClassOfPartType3    3
+
+
+%----- SPH
+ArtBulkViscConst       1.0
+MinEgySpec             0
+InitGasTemp            0
diff --git a/examples/G2-galaxy/Config.sh b/examples/G2-galaxy/Config.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d5c731dbe2b2b6d0e718b06f957665c6b871152b
--- /dev/null
+++ b/examples/G2-galaxy/Config.sh
@@ -0,0 +1,16 @@
+
+# Basic code operation
+
+    SELFGRAVITY
+    NTYPES=3
+
+# Gravity options
+  
+    EVALPOTENTIAL
+    
+# Miscellaneous code options
+
+    DOUBLEPRECISION=1
+    GADGET2_HEADER
+    
+
diff --git a/examples/G2-galaxy/job.sh b/examples/G2-galaxy/job.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f1d2f1aa94723102be8550937d5ac878950732be
--- /dev/null
+++ b/examples/G2-galaxy/job.sh
@@ -0,0 +1,20 @@
+#!/bin/bash 
+#SBATCH --mail-type=BEGIN,END,FAIL
+#SBATCH --mail-user=vspringel@mpa-garching.mpg.de
+#SBATCH --time=1:00:00
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=16
+#SBATCH --job-name G2-galaxy
+
+echo
+echo "Running on hosts: $SLURM_NODELIST"
+echo "Running on $SLURM_NNODES nodes."
+echo "Running on $SLURM_NPROCS processors."
+echo "Current working directory is `pwd`"
+echo
+
+mpiexec -np $SLURM_NPROCS  ./Gadget4 param.txt
+
+
+
+
diff --git a/examples/G2-galaxy/param.txt b/examples/G2-galaxy/param.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ed6433f3e16d4e42b0d6108c1f62f6925c53663a
--- /dev/null
+++ b/examples/G2-galaxy/param.txt
@@ -0,0 +1,85 @@
+%----  Relevant files 
+InitCondFile        /u/vrs/Simulations/ICs/ExampleICs/galaxy_littleendian.dat
+OutputDir           ./output
+SnapshotFileBase    snapshot
+OutputListFilename  empty.txt
+
+
+%---- File formats
+ICFormat           1
+SnapFormat         3 
+
+%---- CPU-time limits
+TimeLimitCPU              180000  % in seconds
+CpuTimeBetRestartFile     7200    % in seconds
+
+%----- Memory allocation
+MaxMemSize        2300
+
+%---- Characteristics of run
+TimeBegin           0.0        % Begin of the simulation
+TimeMax	            3.0        % End of the simulation
+
+%---- Basic code options that set the type of simulation
+ComovingIntegrationOn    0
+
+%---- Cosmological parameters
+Omega0	              0
+OmegaLambda           0
+OmegaBaryon           0
+HubbleParam         1.0
+Hubble                0
+BoxSize               0
+
+%---- Output frequency and output parameters
+OutputListOn              0 
+TimeBetSnapshot           0.5
+TimeOfFirstSnapshot       0.0
+TimeBetStatistics         0.05
+NumFilesPerSnapshot       1
+MaxFilesWithConcurrentIO  1 
+
+%---- Accuracy of time integration
+ErrTolIntAccuracy        0.012 
+CourantFac               0.15
+MaxSizeTimestep          0.01
+MinSizeTimestep          0.0 
+
+%---- Tree algorithm, force accuracy, domain update frequency
+TypeOfOpeningCriterion                1
+ErrTolTheta                           0.5
+ErrTolThetaMax                        0.9
+ErrTolForceAcc                        0.005
+TopNodeFactor                         2.5
+
+ActivePartFracForNewDomainDecomp      0.01
+ 
+%---- Initial density estimate
+DesNumNgb              64
+MaxNumNgbDeviation     1 
+
+%---- System of units
+UnitLength_in_cm         3.085678e21        ;  1.0 kpc/h
+UnitMass_in_g            1.989e43           ;  1.0e10/h solar masses
+UnitVelocity_in_cm_per_s 1e5                ;  1 km/sec
+GravityConstantInternal  0
+
+
+%---- Gravitational softening length
+SofteningComovingClass0     0
+SofteningComovingClass1     1.0
+SofteningComovingClass2     0.4
+
+SofteningMaxPhysClass0       0
+SofteningMaxPhysClass1       1.0
+SofteningMaxPhysClass2       0.4
+
+SofteningClassOfPartType0    0
+SofteningClassOfPartType1    1
+SofteningClassOfPartType2    2
+
+
+%----- SPH
+ArtBulkViscConst       1.0
+MinEgySpec             0
+InitGasTemp            0
diff --git a/examples/G2-galaxy/plot_energy.pro b/examples/G2-galaxy/plot_energy.pro
new file mode 100644
index 0000000000000000000000000000000000000000..918fc60847c71f3d4a2c9e127518e42134d0c539
--- /dev/null
+++ b/examples/G2-galaxy/plot_energy.pro
@@ -0,0 +1,29 @@
+
+da = dblarr(16, 61)
+
+openr, 1, "output_old/energy.txt"
+readf, 1, da
+close, 1
+
+ti =  da(0,*)   ; time
+th =  da(1,*)   ; thermal energy
+po =  da(2,*)   ; potential energy
+ke =  da(3,*)   ; kinetic energy
+
+
+tot = th + ke + po
+
+window,xsize=1000,ysize=900
+
+!p.multi=[0,1,2]
+
+plot, ti,(po-po(0))/abs(po(0)),charsize=2.0, yrange=[-1,1] * max([ abs((po-po(0))/abs(po(0))), abs((ke-ke(0))/abs(ke(0)))])
+
+oplot,ti,(ke-ke(0))/abs(ke(0)),linestyle=2, color=255
+oplot,ti,(po-po(0))/abs(po(0)),linestyle=3, color=255*256L
+
+plot,ti,(tot-tot(0))/abs(tot(0)),charsize=2.0
+
+
+end
+
diff --git a/examples/G2-gassphere/Config.sh b/examples/G2-gassphere/Config.sh
new file mode 100644
index 0000000000000000000000000000000000000000..cdcec751afdab053160a1def4dbf68ac0338c8b1
--- /dev/null
+++ b/examples/G2-gassphere/Config.sh
@@ -0,0 +1,14 @@
+
+# Basic code operation
+
+    SELFGRAVITY
+    NTYPES=1
+
+# Gravity options
+  
+    EVALPOTENTIAL
+    
+# Miscellaneous code options
+
+    DOUBLEPRECISION=1
+    GADGET2_HEADER
diff --git a/examples/G2-gassphere/param.txt b/examples/G2-gassphere/param.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4dd77c2d8cace94c5756f8e4001472f24fb55e7e
--- /dev/null
+++ b/examples/G2-gassphere/param.txt
@@ -0,0 +1,79 @@
+%----  Relevant files 
+InitCondFile        /u/vrs/Simulations/ICs/ExampleICs/gassphere_littleendian.dat
+OutputDir           ./output
+SnapshotFileBase    snapshot
+OutputListFilename  empty.txt
+
+
+%---- File formats
+ICFormat           1
+SnapFormat         3 
+
+%---- CPU-time limits
+TimeLimitCPU              180000  % in seconds
+CpuTimeBetRestartFile     7200    % in seconds
+
+%----- Memory allocation
+MaxMemSize        2300
+
+%---- Characteristics of run
+TimeBegin           0.0        % Begin of the simulation
+TimeMax	            3.0        % End of the simulation
+
+%---- Basic code options that set the type of simulation
+ComovingIntegrationOn    0
+
+%---- Cosmological parameters
+Omega0	              0
+OmegaLambda           0
+OmegaBaryon           0
+HubbleParam         1.0
+Hubble                0
+BoxSize               0
+
+%---- Output frequency and output parameters
+OutputListOn              0 
+TimeBetSnapshot           0.2
+TimeOfFirstSnapshot       0.0
+TimeBetStatistics         0.05
+NumFilesPerSnapshot       1
+MaxFilesWithConcurrentIO  1 
+
+%---- Accuracy of time integration
+ErrTolIntAccuracy        0.025 
+CourantFac               0.15
+MaxSizeTimestep          0.02
+MinSizeTimestep          0.0 
+
+%---- Tree algorithm, force accuracy, domain update frequency
+TypeOfOpeningCriterion                1
+ErrTolTheta                           0.5
+ErrTolThetaMax                        0.9
+ErrTolForceAcc                        0.005
+TopNodeFactor                         2.5
+
+ActivePartFracForNewDomainDecomp      0.01
+ 
+%---- Initial density estimate
+DesNumNgb              64
+MaxNumNgbDeviation     1 
+
+%---- System of units
+UnitLength_in_cm         1.0
+UnitMass_in_g            1.0
+UnitVelocity_in_cm_per_s 1.0
+GravityConstantInternal  1.0
+
+
+%---- Gravitational softening length
+SofteningComovingClass0     0.04
+
+SofteningMaxPhysClass0      0.04
+
+SofteningClassOfPartType0    0
+
+
+%----- SPH
+ArtBulkViscConst       1.0
+MinEgySpec             0
+InitGasTemp            0
diff --git a/examples/G2-gassphere/plot_energy_gassphere.pro b/examples/G2-gassphere/plot_energy_gassphere.pro
new file mode 100644
index 0000000000000000000000000000000000000000..c0f30ba3af4476d71666c674b2b1f2f3e0ff84aa
--- /dev/null
+++ b/examples/G2-gassphere/plot_energy_gassphere.pro
@@ -0,0 +1,30 @@
+
+da = dblarr(8, 61)
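+; one row per TimeBetStatistics output of the example run (assumed here: 0.05 from t=0 to t=3, i.e. 61 rows);
+; adjust the array dimensions if the run parameters are changed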
+
+openr, 1, "output/energy.txt"
+readf, 1, da
+close, 1
+
+ti =  da(0,*)   ; time
+th =  da(1,*)   ; thermal energy
+po =  da(2,*)   ; potential energy
+ke =  da(3,*)   ; kinetic energy
+
+
+tot = th + ke + po
+
+window,xsize=1000,ysize=900
+
+!p.multi=[0,1,2]
+
+plot, ti, tot, charsize=2.0, yrange=[-2,1.5], ystyle=1
+
+oplot,ti, ke, color=255*256L^2 + 100*256L + 100
+oplot,ti, po, color=255*256L
+oplot,ti, th, color=255
+
+plot,ti,(tot-tot(0))/abs(tot(0)),charsize=2.0
+
+
+end
+
diff --git a/examples/G2-lcdm-gas/Config.sh b/examples/G2-lcdm-gas/Config.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e2877b9174b2dea20fe30f318c1864915d21d884
--- /dev/null
+++ b/examples/G2-lcdm-gas/Config.sh
@@ -0,0 +1,17 @@
+
+# Basic code operation
+
+    PERIODIC
+    SELFGRAVITY
+    NTYPES=2
+    NSOFTCLASSES=1
+
+# Gravity options
+  
+    PMGRID=128
+
+    
+# Miscellaneous code options
+
+    DOUBLEPRECISION=1
+    GADGET2_HEADER
diff --git a/examples/G2-lcdm-gas/job.sh b/examples/G2-lcdm-gas/job.sh
new file mode 100755
index 0000000000000000000000000000000000000000..2b083d0530754c9c83c36be166f73f0cc8258dc0
--- /dev/null
+++ b/examples/G2-lcdm-gas/job.sh
@@ -0,0 +1,17 @@
+#!/bin/bash 
+#SBATCH --mail-type=BEGIN,END,FAIL
+#SBATCH --mail-user=vspringel@mpa-garching.mpg.de
+#SBATCH --time=1:00:00
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=16
+#SBATCH --job-name G2-lcdm-gas
+
+echo
+echo "Running on hosts: $SLURM_NODELIST"
+echo "Running on $SLURM_NNODES nodes."
+echo "Running on $SLURM_NPROCS processors."
+echo "Current working directory is `pwd`"
+echo
+
+mpiexec -np $SLURM_NPROCS  ./Gadget4 param.txt
+
diff --git a/examples/G2-lcdm-gas/outputs_lcdm_gas.txt b/examples/G2-lcdm-gas/outputs_lcdm_gas.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c0007a810dab74cec146b941de8005ec088723e8
--- /dev/null
+++ b/examples/G2-lcdm-gas/outputs_lcdm_gas.txt
@@ -0,0 +1,5 @@
+0.166667
+0.250000
+0.333333 
+0.5
+1.0
diff --git a/examples/G2-lcdm-gas/param.txt b/examples/G2-lcdm-gas/param.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f2ad4884de0ed842e1c11a13dab951d483e58f5c
--- /dev/null
+++ b/examples/G2-lcdm-gas/param.txt
@@ -0,0 +1,79 @@
+%----  Relevant files 
+InitCondFile        /u/vrs/Simulations/ICs/ExampleICs/lcdm_gas_littleendian.dat
+OutputDir           ./output
+SnapshotFileBase    snapshot
+OutputListFilename  outputs_lcdm_gas.txt
+
+
+%---- File formats
+ICFormat           1
+SnapFormat         3 
+
+%---- CPU-time limits
+TimeLimitCPU              180000  % in seconds
+CpuTimeBetRestartFile     7200    % in seconds
+
+%----- Memory allocation
+MaxMemSize        2300
+
+%---- Characteristics of run
+TimeBegin           0.090909091     % Begin of the simulation, z=10
+TimeMax	            1.0             % End of the simulation   
+
+%---- Basic code options that set the type of simulation
+ComovingIntegrationOn    1
+
+%---- Cosmological parameters
+Omega0	              0.3
+OmegaLambda           0.7
+OmegaBaryon           0.04
+HubbleParam           0.7
+Hubble                0.1
+BoxSize               50000.0
+
+%---- Output frequency and output parameters
+OutputListOn              1
+TimeBetSnapshot           1.8384163   % 5 constant steps in log(a)
+TimeOfFirstSnapshot       0.047619048  % z=20
+TimeBetStatistics         0.05
+NumFilesPerSnapshot       1
+MaxFilesWithConcurrentIO  1 
+
+%---- Accuracy of time integration
+ErrTolIntAccuracy        0.012 
+CourantFac               0.15
+MaxSizeTimestep          0.025
+MinSizeTimestep          0.0 
+
+%---- Tree algorithm, force accuracy, domain update frequency
+TypeOfOpeningCriterion                1
+ErrTolTheta                           0.5
+ErrTolThetaMax                        1.0
+ErrTolForceAcc                        0.0025
+TopNodeFactor                         2.5
+
+ActivePartFracForNewDomainDecomp      0.01
+ 
+%---- Initial density estimate
+DesNumNgb              64
+MaxNumNgbDeviation     1 
+
+%---- System of units
+UnitLength_in_cm         3.085678e21        ;  1.0 kpc/h
+UnitMass_in_g            1.989e43           ;  1.0e10/h solar masses
+UnitVelocity_in_cm_per_s 1e5                ;  1 km/sec
+GravityConstantInternal  0
+
+
+%---- Gravitational softening length
+SofteningComovingClass0     600.0
+SofteningMaxPhysClass0      600.0
+
+SofteningClassOfPartType0    0
+SofteningClassOfPartType1    0
+
+
+%----- SPH
+ArtBulkViscConst       1.0
+MinEgySpec             0
+InitGasTemp            1000.0    % in K
diff --git a/examples/SantaBarbara-PSPH-64/Config.sh b/examples/SantaBarbara-PSPH-64/Config.sh
new file mode 100644
index 0000000000000000000000000000000000000000..3891cc515f3979c2f83128f393d3705ffc712be3
--- /dev/null
+++ b/examples/SantaBarbara-PSPH-64/Config.sh
@@ -0,0 +1,31 @@
+
+
+       PERIODIC
+       SELFGRAVITY
+
+       TREEPM_NOTIMESPLIT  
+
+       PMGRID=384
+       ASMTH=3.0
+       RCUT=8.0
+
+       MULTIPOLE_ORDER=3
+
+       RANDOMIZE_DOMAINCENTER
+
+       DOUBLEPRECISION=1
+       DOUBLEPRECISION_FFTW
+
+       POSITIONS_IN_64BIT              
+       IDS_32BIT
+
+       FOF 
+       SUBFIND
+
+       
+       NTYPES=2 
+       NSOFTCLASSES=1
+
+       GADGET2_HEADER
+
+       PRESSURE_ENTROPY_SPH
\ No newline at end of file
diff --git a/examples/SantaBarbara-PSPH-64/job.sh b/examples/SantaBarbara-PSPH-64/job.sh
new file mode 100755
index 0000000000000000000000000000000000000000..83db58deec340b500458f064e8f16e95d7b63131
--- /dev/null
+++ b/examples/SantaBarbara-PSPH-64/job.sh
@@ -0,0 +1,17 @@
+#!/bin/bash -l 
+#SBATCH --mail-type=BEGIN,END,FAIL
+#SBATCH --mail-user=vspringel@mpa-garching.mpg.de
+#SBATCH --time=24:00:00
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=40
+#SBATCH --job-name SB64
+
+echo
+echo "Running on hosts: $SLURM_NODELIST"
+echo "Running on $SLURM_NNODES nodes."
+echo "Running on $SLURM_NPROCS processors."
+echo "Current working directory is `pwd`"
+echo
+
+mpiexec -np $SLURM_NPROCS  ./Gadget4 param.txt 
+
diff --git a/examples/SantaBarbara-PSPH-64/outputs.txt b/examples/SantaBarbara-PSPH-64/outputs.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d3827e75a5cadb9fe4a27e1cb9b6d192e7323120
--- /dev/null
+++ b/examples/SantaBarbara-PSPH-64/outputs.txt
@@ -0,0 +1 @@
+1.0
diff --git a/examples/SantaBarbara-PSPH-64/param.txt b/examples/SantaBarbara-PSPH-64/param.txt
new file mode 100644
index 0000000000000000000000000000000000000000..13d5d8055b391e9d45a5b6e14c3cc98ccebb2d63
--- /dev/null
+++ b/examples/SantaBarbara-PSPH-64/param.txt
@@ -0,0 +1,75 @@
+
+        InitCondFile                                      /u/vrs/Simulations/ICs/ExampleICs/sb_64.dat
+        OutputDir                                         ./output
+        SnapshotFileBase                                  snap
+
+	ActivePartFracForPMinsteadOfEwald                 0
+
+
+        TimeLimitCPU                                      90000
+
+        ICFormat                                          1
+        SnapFormat                                        3
+
+
+        ComovingIntegrationOn                             1
+
+        NumFilesPerSnapshot                               1
+
+        TimeBegin                                         0.047619
+        TimeMax                                           1
+
+        Omega0                                            1.0
+        OmegaLambda                                       0.0
+        OmegaBaryon                                       0.1
+        HubbleParam                                       0.5
+        Hubble                                            0.1
+
+        BoxSize                                           32000
+
+        OutputListFilename                                ./outputs.txt
+        OutputListOn                                      1
+        TimeBetSnapshot                                   1.1
+        TimeOfFirstSnapshot                               0.1
+
+        CpuTimeBetRestartFile                             7200
+        TimeBetStatistics                                 0.25
+
+        MaxSizeTimestep                                   0.005
+        MinSizeTimestep                                   0
+        ErrTolIntAccuracy                                 0.0125
+
+        TypeOfOpeningCriterion                            1
+        ErrTolTheta                                       0.7
+        ErrTolForceAcc                                    0.005
+
+        ErrTolThetaMax                                    0.9
+
+        TopNodeFactor                                     3.5
+
+        ActivePartFracForNewDomainDecomp                  0
+
+        MaxFilesWithConcurrentIO                          32
+
+        MaxMemSize                                        4000
+        DesNumNgb                                         64
+        MaxNumNgbDeviation                                1
+        ArtBulkViscConst                                  1
+        InitGasTemp                                       1000.0
+        MinEgySpec                                        0
+        CourantFac                                        0.15
+
+	DesLinkNgb                                        20
+
+        SofteningComovingClass0                           12.5
+        SofteningMaxPhysClass0                            12.5
+     
+      
+        SofteningClassOfPartType0                         0
+        SofteningClassOfPartType1                         0 
+
+
+        UnitLength_in_cm                                  3.08568e+21
+        UnitMass_in_g                                     1.989e+43
+        UnitVelocity_in_cm_per_s                          100000
+        GravityConstantInternal                           0
diff --git a/make-examples.sh b/make-examples.sh
new file mode 100755
index 0000000000000000000000000000000000000000..945bbad051096dac1f802854832ac130c00c2b33
--- /dev/null
+++ b/make-examples.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+
+# Dark matter cosmological simulation with 128^3 resolution in a 50 Mpc/h box for
+# which ICs are created on the fly, and the LEAN option is used.
+make -j 8 DIR=examples/DM-L50-N128
+
+
+# DM-only zoom simulation of the formation of a Milky Way-sized halo.
+make -j 8 DIR=examples/DM-Zoom-Aq-C-5
+
+
+# Simple galaxy collision with gas and ongoing star formation.
+make -j 8 DIR=examples/CollidingGalaxiesSFR
+
+
+# A simulation of the Santa Barbara cluster at 2 x 64^3 resolution using pressure-SPH.
+make -j 8 DIR=examples/SantaBarbara-PSPH-64
+
+
+
+# The following examples are realizations of the (small) test problems contained
+# in the GADGET-2 distribution from Springel (2005).
+
+# collisionless galaxy collision
+make -j 8 DIR=examples/G2-galaxy
+
+# cosmological simulation of a galaxy cluster
+make -j 8 DIR=examples/G2-cluster 
+
+# gravitational collapse of a cold gas cloud ("Evrard test")
+make -j 8 DIR=examples/G2-gassphere 
+
+# low resolution cosmological simulation with gas
+make -j 8 DIR=examples/G2-lcdm-gas 
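+
+
+# After building, each example can be run from within its own directory, either through the
+# bundled Slurm job scripts or directly, e.g. (assuming the executable is placed in the example
+# directory, as the job scripts expect, and that an MPI environment is available):
+#
+#   cd examples/G2-lcdm-gas && mpiexec -np 16 ./Gadget4 param.txt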
diff --git a/src/cooling_sfr/cooling.cc b/src/cooling_sfr/cooling.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e825ebbe01853377e1e975c37d78e40b0ab33a07
--- /dev/null
+++ b/src/cooling_sfr/cooling.cc
@@ -0,0 +1,744 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file cooling.cc
+ *
+ *  \brief Module for gas radiative cooling
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef COOLING
+
+#include <gsl/gsl_math.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <algorithm>
+
+#include "../cooling_sfr/cooling.h"
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../logs/logs.h"
+#include "../logs/timer.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+
+/** \brief Compute the new internal energy per unit mass.
+ *
+ *   The function solves for the new internal energy per unit mass of the gas by integrating the equation
+ *   for the internal energy with an implicit Euler scheme. The root of the resulting non-linear equation,
+ *   which gives the new internal energy, is found with the bisection method.
+ *   Arguments are passed in code units.
+ *
+ *   \param u_old the initial (before cooling is applied) internal energy per unit mass of the gas particle
+ *   \param rho   the proper density of the gas particle
+ *   \param dt    the duration of the time step
+ *   \param ne_guess electron number density relative to hydrogen number density (for molecular weight computation)
+ *   \return the new internal energy per unit mass of the gas particle
+ */
+double coolsfr::DoCooling(double u_old, double rho, double dt, double *ne_guess, gas_state *gs, do_cool_data *DoCool)
+{
+  DoCool->u_old_input    = u_old;
+  DoCool->rho_input      = rho;
+  DoCool->dt_input       = dt;
+  DoCool->ne_guess_input = *ne_guess;
+
+  if(!gsl_finite(u_old))
+    Terminate("invalid input: u_old=%g\n", u_old);
+
+  if(u_old < 0 || rho < 0)
+    Terminate("invalid input: u_old=%g  rho=%g  dt=%g  All.MinEgySpec=%g\n", u_old, rho, dt, All.MinEgySpec);
+
+  rho *= All.UnitDensity_in_cgs * All.HubbleParam * All.HubbleParam; /* convert to physical cgs units */
+  u_old *= All.UnitPressure_in_cgs / All.UnitDensity_in_cgs;
+  dt *= All.UnitTime_in_s / All.HubbleParam;
+
+  gs->nHcgs       = gs->XH * rho / PROTONMASS; /* hydrogen number dens in cgs units */
+  double ratefact = gs->nHcgs * gs->nHcgs / rho;
+
+  double u       = u_old;
+  double u_lower = u;
+  double u_upper = u;
+
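+  /* the implicit update amounts to solving  u - u_old - ratefact * LambdaNet(u) * dt = 0  for the new u */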
+  double LambdaNet = CoolingRateFromU(u, rho, ne_guess, gs, DoCool);
+
+  /* bracketing */
+
+  if(u - u_old - ratefact * LambdaNet * dt < 0) /* heating */
+    {
+      u_upper *= sqrt(1.1);
+      u_lower /= sqrt(1.1);
+      while(u_upper - u_old - ratefact * CoolingRateFromU(u_upper, rho, ne_guess, gs, DoCool) * dt < 0)
+        {
+          u_upper *= 1.1;
+          u_lower *= 1.1;
+        }
+    }
+
+  if(u - u_old - ratefact * LambdaNet * dt > 0)
+    {
+      u_lower /= sqrt(1.1);
+      u_upper *= sqrt(1.1);
+      while(u_lower - u_old - ratefact * CoolingRateFromU(u_lower, rho, ne_guess, gs, DoCool) * dt > 0)
+        {
+          u_upper /= 1.1;
+          u_lower /= 1.1;
+        }
+    }
+
+  int iter = 0;
+  double du;
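+  /* bisection on the bracket [u_lower, u_upper] until its relative width drops below 1e-6 */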
+  do
+    {
+      u = 0.5 * (u_lower + u_upper);
+
+      LambdaNet = CoolingRateFromU(u, rho, ne_guess, gs, DoCool);
+
+      if(u - u_old - ratefact * LambdaNet * dt > 0)
+        {
+          u_upper = u;
+        }
+      else
+        {
+          u_lower = u;
+        }
+
+      du = u_upper - u_lower;
+
+      iter++;
+
+      if(iter >= (MAXITER - 10))
+        printf("u= %g\n", u);
+    }
+  while(fabs(du / u) > 1.0e-6 && iter < MAXITER);
+
+  if(iter >= MAXITER)
+    Terminate(
+        "failed to converge in DoCooling(): DoCool->u_old_input=%g\nDoCool->rho_input= %g\nDoCool->dt_input= "
+        "%g\nDoCool->ne_guess_input= %g\n",
+        DoCool->u_old_input, DoCool->rho_input, DoCool->dt_input, DoCool->ne_guess_input);
+
+  u *= All.UnitDensity_in_cgs / All.UnitPressure_in_cgs; /* to internal units */
+
+  return u;
+}
+
+/** \brief Return the cooling time.
+ *
+ *  If we actually have heating, a cooling time of 0 is returned.
+ *
+ *  \param u_old the initial (before cooling is applied) internal energy per unit mass of the gas particle
+ *  \param rho   the proper density of the gas particle
+ *  \param ne_guess electron number density relative to hydrogen number density (for molecular weight computation)
+ */
+double coolsfr::GetCoolingTime(double u_old, double rho, double *ne_guess, gas_state *gs, do_cool_data *DoCool)
+{
+  DoCool->u_old_input    = u_old;
+  DoCool->rho_input      = rho;
+  DoCool->ne_guess_input = *ne_guess;
+
+  rho *= All.UnitDensity_in_cgs * All.HubbleParam * All.HubbleParam; /* convert to physical cgs units */
+  u_old *= All.UnitPressure_in_cgs / All.UnitDensity_in_cgs;
+
+  gs->nHcgs       = gs->XH * rho / PROTONMASS; /* hydrogen number dens in cgs units */
+  double ratefact = gs->nHcgs * gs->nHcgs / rho;
+
+  double u = u_old;
+
+  double LambdaNet = CoolingRateFromU(u, rho, ne_guess, gs, DoCool);
+
+  /* bracketing */
+
+  if(LambdaNet >= 0) /* oops, we actually have heating due to the UV background */
+    return 0;
+
+  double coolingtime = u_old / (-ratefact * LambdaNet);
+
+  coolingtime *= All.HubbleParam / All.UnitTime_in_s;
+
+  return coolingtime;
+}
+
+/** \brief Compute gas temperature from internal energy per unit mass.
+ *
+ *   This function determines the electron fraction, and hence the mean
+ *   molecular weight. With this it arrives at a self-consistent temperature.
+ *   Element abundances and the rates for the emission are also computed.
+ *
+ *  \param u   internal energy per unit mass
+ *  \param rho gas density
+ *  \param ne_guess electron number density relative to hydrogen number density
+ *  \return the gas temperature
+ */
+double coolsfr::convert_u_to_temp(double u, double rho, double *ne_guess, gas_state *gs, const do_cool_data *DoCool)
+{
+  double u_input   = u;
+  double rho_input = rho;
+  double ne_input  = *ne_guess;
+
+  double mu   = (1 + 4 * gs->yhelium) / (1 + gs->yhelium + *ne_guess);
+  double temp = GAMMA_MINUS1 / BOLTZMANN * u * PROTONMASS * mu;
+
+  double max = 0;
+  int iter   = 0;
+  double temp_old;
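+  /* iterate temperature and electron fraction to mutual consistency;
+     the temperature update is damped by the (1 + max) factor to aid convergence */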
+  do
+    {
+      double ne_old = *ne_guess;
+
+      find_abundances_and_rates(log10(temp), rho, ne_guess, gs, DoCool);
+      temp_old = temp;
+
+      mu = (1 + 4 * gs->yhelium) / (1 + gs->yhelium + *ne_guess);
+
+      double temp_new = GAMMA_MINUS1 / BOLTZMANN * u * PROTONMASS * mu;
+
+      max = std::max<double>(max, temp_new / (1 + gs->yhelium + *ne_guess) * fabs((*ne_guess - ne_old) / (temp_new - temp_old + 1.0)));
+
+      temp = temp_old + (temp_new - temp_old) / (1 + max);
+      iter++;
+
+      if(iter > (MAXITER - 10))
+        printf("-> temp= %g ne=%g\n", temp, *ne_guess);
+    }
+  while(fabs(temp - temp_old) > 1.0e-3 * temp && iter < MAXITER);
+
+  if(iter >= MAXITER)
+    {
+      printf("failed to converge in convert_u_to_temp()\n");
+      printf("u_input= %g\nrho_input=%g\n ne_input=%g\n", u_input, rho_input, ne_input);
+      printf("DoCool->u_old_input=%g\nDoCool->rho_input= %g\nDoCool->dt_input= %g\nDoCool->ne_guess_input= %g\n", DoCool->u_old_input,
+             DoCool->rho_input, DoCool->dt_input, DoCool->ne_guess_input);
+      Terminate("convergence failure");
+    }
+
+  return temp;
+}
+
+/** \brief Computes the actual abundance ratios.
+ *
+ *  The chemical composition of the gas is primordial (no metals are present)
+ *
+ *  \param logT     log10 of gas temperature
+ *  \param rho      gas density
+ *  \param ne_guess electron number density relative to hydrogen number density
+ */
+void coolsfr::find_abundances_and_rates(double logT, double rho, double *ne_guess, gas_state *gs, const do_cool_data *DoCool)
+{
+  double logT_input = logT;
+  double rho_input  = rho;
+  double ne_input   = *ne_guess;
+
+  if(!gsl_finite(logT))
+    Terminate("logT=%g\n", logT);
+
+  if(logT <= Tmin) /* everything neutral */
+    {
+      gs->nH0   = 1.0;
+      gs->nHe0  = gs->yhelium;
+      gs->nHp   = 0;
+      gs->nHep  = 0;
+      gs->nHepp = 0;
+      gs->ne    = 0;
+      *ne_guess = 0;
+      return;
+    }
+
+  if(logT >= Tmax) /* everything is ionized */
+    {
+      gs->nH0   = 0;
+      gs->nHe0  = 0;
+      gs->nHp   = 1.0;
+      gs->nHep  = 0;
+      gs->nHepp = gs->yhelium;
+      gs->ne    = gs->nHp + 2.0 * gs->nHepp;
+      *ne_guess = gs->ne; /* note: in units of the hydrogen number density */
+      return;
+    }
+
+  double t    = (logT - Tmin) / deltaT;
+  int j       = (int)t;
+  double fhi  = t - j;
+  double flow = 1 - fhi;
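+  /* j indexes the lower bin of the tabulated rates in log T; fhi and flow are the linear interpolation weights */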
+
+  if(*ne_guess == 0)
+    *ne_guess = 1.0;
+
+  gs->nHcgs = gs->XH * rho / PROTONMASS; /* hydrogen number dens in cgs units */
+
+  gs->ne       = *ne_guess;
+  double neold = gs->ne;
+  int niter    = 0;
+  gs->necgs    = gs->ne * gs->nHcgs;
+
+  /* evaluate number densities iteratively (cf KWH eqns 33-38) in units of nH */
+  do
+    {
+      niter++;
+
+      gs->aHp   = flow * RateT[j].AlphaHp + fhi * RateT[j + 1].AlphaHp;
+      gs->aHep  = flow * RateT[j].AlphaHep + fhi * RateT[j + 1].AlphaHep;
+      gs->aHepp = flow * RateT[j].AlphaHepp + fhi * RateT[j + 1].AlphaHepp;
+      gs->ad    = flow * RateT[j].Alphad + fhi * RateT[j + 1].Alphad;
+      gs->geH0  = flow * RateT[j].GammaeH0 + fhi * RateT[j + 1].GammaeH0;
+      gs->geHe0 = flow * RateT[j].GammaeHe0 + fhi * RateT[j + 1].GammaeHe0;
+      gs->geHep = flow * RateT[j].GammaeHep + fhi * RateT[j + 1].GammaeHep;
+
+      if(gs->necgs <= 1.e-25 || pc.J_UV == 0)
+        {
+          gs->gJH0ne = gs->gJHe0ne = gs->gJHepne = 0;
+        }
+      else
+        {
+          gs->gJH0ne  = pc.gJH0 / gs->necgs;
+          gs->gJHe0ne = pc.gJHe0 / gs->necgs;
+          gs->gJHepne = pc.gJHep / gs->necgs;
+        }
+
+      gs->nH0 = gs->aHp / (gs->aHp + gs->geH0 + gs->gJH0ne); /* eqn (33) */
+      gs->nHp = 1.0 - gs->nH0;                               /* eqn (34) */
+
+      if((gs->gJHe0ne + gs->geHe0) <= SMALLNUM) /* no ionization at all */
+        {
+          gs->nHep  = 0.0;
+          gs->nHepp = 0.0;
+          gs->nHe0  = gs->yhelium;
+        }
+      else
+        {
+          gs->nHep = gs->yhelium /
+                     (1.0 + (gs->aHep + gs->ad) / (gs->geHe0 + gs->gJHe0ne) + (gs->geHep + gs->gJHepne) / gs->aHepp); /* eqn (35) */
+          gs->nHe0  = gs->nHep * (gs->aHep + gs->ad) / (gs->geHe0 + gs->gJHe0ne);                                     /* eqn (36) */
+          gs->nHepp = gs->nHep * (gs->geHep + gs->gJHepne) / gs->aHepp;                                               /* eqn (37) */
+        }
+
+      neold = gs->ne;
+
+      gs->ne    = gs->nHp + gs->nHep + 2 * gs->nHepp; /* eqn (38) */
+      gs->necgs = gs->ne * gs->nHcgs;
+
+      if(pc.J_UV == 0)
+        break;
+
+      double nenew = 0.5 * (gs->ne + neold);
+      gs->ne       = nenew;
+      gs->necgs    = gs->ne * gs->nHcgs;
+
+      if(fabs(gs->ne - neold) < 1.0e-4)
+        break;
+
+      if(niter > (MAXITER - 10))
+        printf("ne= %g  niter=%d\n", gs->ne, niter);
+    }
+  while(niter < MAXITER);
+
+  if(niter >= MAXITER)
+    Terminate(
+        "no convergence reached in find_abundances_and_rates(): logT_input= %g  rho_input= %g  ne_input= %g "
+        "DoCool->u_old_input=%g\nDoCool->rho_input= %g\nDoCool->dt_input= %g\nDoCool->ne_guess_input= %g\n",
+        logT_input, rho_input, ne_input, DoCool->u_old_input, DoCool->rho_input, DoCool->dt_input, DoCool->ne_guess_input);
+
+  gs->bH0  = flow * RateT[j].BetaH0 + fhi * RateT[j + 1].BetaH0;
+  gs->bHep = flow * RateT[j].BetaHep + fhi * RateT[j + 1].BetaHep;
+  gs->bff  = flow * RateT[j].Betaff + fhi * RateT[j + 1].Betaff;
+
+  *ne_guess = gs->ne;
+}
+
+/** \brief Get cooling rate from gas internal energy.
+ *
+ *  This function first computes the self-consistent temperature
+ *  and abundance ratios, and then it calculates
+ *  (heating rate-cooling rate)/n_h^2 in cgs units
+ *
+ *  \param u   gas internal energy per unit mass
+ *  \param rho gas density
+ *  \param ne_guess electron number density relative to hydrogen number density
+ */
+double coolsfr::CoolingRateFromU(double u, double rho, double *ne_guess, gas_state *gs, const do_cool_data *DoCool)
+{
+  double temp = convert_u_to_temp(u, rho, ne_guess, gs, DoCool);
+
+  return CoolingRate(log10(temp), rho, ne_guess, gs, DoCool);
+}
+
+/** \brief  This function computes the self-consistent temperature and abundance ratios.
+ *
+ *  Used only in the file io.c (and possibly not needed there).
+ *
+ *  \param u   internal energy per unit mass
+ *  \param rho gas density
+ *  \param ne_guess electron number density relative to hydrogen number density
+ *  \param nH0_pointer pointer to the neutral hydrogen fraction (set to current value in the GasState struct)
+ *  \param nHeII_pointer pointer to the ionised helium fraction (set to current value in the GasState struct)
+ */
+double coolsfr::AbundanceRatios(double u, double rho, double *ne_guess, double *nH0_pointer, double *nHeII_pointer)
+{
+  gas_state gs          = GasState;
+  do_cool_data DoCool   = DoCoolData;
+  DoCool.u_old_input    = u;
+  DoCool.rho_input      = rho;
+  DoCool.ne_guess_input = *ne_guess;
+
+  rho *= All.UnitDensity_in_cgs * All.HubbleParam * All.HubbleParam; /* convert to physical cgs units */
+  u *= All.UnitPressure_in_cgs / All.UnitDensity_in_cgs;
+
+  double temp = convert_u_to_temp(u, rho, ne_guess, &gs, &DoCool);
+
+  *nH0_pointer   = gs.nH0;
+  *nHeII_pointer = gs.nHep;
+
+  return temp;
+}
+
+/** \brief  Calculate (heating rate-cooling rate)/n_h^2 in cgs units.
+ *
+ *  \param logT     log10 of gas temperature
+ *  \param rho      gas density
+ *  \param nelec    electron number density relative to hydrogen number density
+ *  \return         (heating rate-cooling rate)/n_h^2
+ */
+double coolsfr::CoolingRate(double logT, double rho, double *nelec, gas_state *gs, const do_cool_data *DoCool)
+{
+  double Lambda, Heat;
+
+  if(logT <= Tmin)
+    logT = Tmin + 0.5 * deltaT; /* floor at Tmin */
+
+  gs->nHcgs = gs->XH * rho / PROTONMASS; /* hydrogen number dens in cgs units */
+
+  if(logT < Tmax)
+    {
+      find_abundances_and_rates(logT, rho, nelec, gs, DoCool);
+
+      /* Compute cooling and heating rate (cf KWH Table 1) in units of nH**2 */
+      double T = pow(10.0, logT);
+
+      double LambdaExcH0   = gs->bH0 * gs->ne * gs->nH0;
+      double LambdaExcHep  = gs->bHep * gs->ne * gs->nHep;
+      double LambdaExc     = LambdaExcH0 + LambdaExcHep; /* excitation */
+      double LambdaIonH0   = 2.18e-11 * gs->geH0 * gs->ne * gs->nH0;
+      double LambdaIonHe0  = 3.94e-11 * gs->geHe0 * gs->ne * gs->nHe0;
+      double LambdaIonHep  = 8.72e-11 * gs->geHep * gs->ne * gs->nHep;
+      double LambdaIon     = LambdaIonH0 + LambdaIonHe0 + LambdaIonHep; /* ionization */
+      double LambdaRecHp   = 1.036e-16 * T * gs->ne * (gs->aHp * gs->nHp);
+      double LambdaRecHep  = 1.036e-16 * T * gs->ne * (gs->aHep * gs->nHep);
+      double LambdaRecHepp = 1.036e-16 * T * gs->ne * (gs->aHepp * gs->nHepp);
+      double LambdaRecHepd = 6.526e-11 * gs->ad * gs->ne * gs->nHep;
+      double LambdaRec     = LambdaRecHp + LambdaRecHep + LambdaRecHepp + LambdaRecHepd;
+      double LambdaFF      = gs->bff * (gs->nHp + gs->nHep + 4 * gs->nHepp) * gs->ne;
+      Lambda               = LambdaExc + LambdaIon + LambdaRec + LambdaFF;
+
+      if(All.ComovingIntegrationOn)
+        {
+          double redshift    = 1 / All.Time - 1;
+          double LambdaCmptn = 5.65e-36 * gs->ne * (T - 2.73 * (1. + redshift)) * pow(1. + redshift, 4.) / gs->nHcgs;
+
+          Lambda += LambdaCmptn;
+        }
+
+      Heat = 0;
+      if(pc.J_UV != 0)
+        Heat += (gs->nH0 * pc.epsH0 + gs->nHe0 * pc.epsHe0 + gs->nHep * pc.epsHep) / gs->nHcgs;
+    }
+  else /* here we're outside of tabulated rates, T>Tmax K */
+    {
+      /* at high T (fully ionized); only free-free and Compton cooling are present. Assumes no heating. */
+
+      Heat = 0;
+
+      /* very hot: H and He both fully ionized */
+      gs->nHp   = 1.0;
+      gs->nHep  = 0;
+      gs->nHepp = gs->yhelium;
+      gs->ne    = gs->nHp + 2.0 * gs->nHepp;
+      *nelec    = gs->ne; /* note: in units of the hydrogen number density */
+
+      double T        = pow(10.0, logT);
+      double LambdaFF = 1.42e-27 * sqrt(T) * (1.1 + 0.34 * exp(-(5.5 - logT) * (5.5 - logT) / 3)) * (gs->nHp + 4 * gs->nHepp) * gs->ne;
+      double LambdaCmptn;
+      if(All.ComovingIntegrationOn)
+        {
+          double redshift = 1 / All.Time - 1;
+          /* add inverse Compton cooling off the microwave background */
+          LambdaCmptn = 5.65e-36 * gs->ne * (T - 2.73 * (1. + redshift)) * pow(1. + redshift, 4.) / gs->nHcgs;
+        }
+      else
+        LambdaCmptn = 0;
+
+      Lambda = LambdaFF + LambdaCmptn;
+    }
+
+  return (Heat - Lambda);
+}
+
+/** \brief Make cooling rates interpolation table.
+ *
+ *  Set up interpolation tables in T for cooling rates given in KWH, ApJS, 105, 19
+ */
+void coolsfr::MakeRateTable(void)
+{
+  GasState.yhelium = (1 - GasState.XH) / (4 * GasState.XH);
+  GasState.mhboltz = PROTONMASS / BOLTZMANN;
+
+  deltaT          = (Tmax - Tmin) / NCOOLTAB;
+  GasState.ethmin = pow(10.0, Tmin) * (1. + GasState.yhelium) / ((1. + 4. * GasState.yhelium) * GasState.mhboltz * GAMMA_MINUS1);
+  /* minimum internal energy for neutral gas */
+
+  for(int i = 0; i <= NCOOLTAB; i++)
+    {
+      RateT[i].BetaH0 = RateT[i].BetaHep = RateT[i].Betaff = RateT[i].AlphaHp = RateT[i].AlphaHep = RateT[i].AlphaHepp =
+          RateT[i].Alphad = RateT[i].GammaeH0 = RateT[i].GammaeHe0 = RateT[i].GammaeHep = 0;
+
+      double T     = pow(10.0, Tmin + deltaT * i);
+      double Tfact = 1.0 / (1 + sqrt(T / 1.0e5));
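+      /* the (x / T < 70) guards below skip terms whose exponential factor exp(-x / T) is negligibly small */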
+
+      /* collisional excitation */
+      /* Cen 1992 */
+      if(118348 / T < 70)
+        RateT[i].BetaH0 = 7.5e-19 * exp(-118348 / T) * Tfact;
+      if(473638 / T < 70)
+        RateT[i].BetaHep = 5.54e-17 * pow(T, -0.397) * exp(-473638 / T) * Tfact;
+
+      /* free-free */
+      RateT[i].Betaff = 1.43e-27 * sqrt(T) * (1.1 + 0.34 * exp(-(5.5 - log10(T)) * (5.5 - log10(T)) / 3));
+
+      /* recombination */
+
+      /* Cen 1992 */
+      /* Hydrogen II */
+      RateT[i].AlphaHp = 8.4e-11 * pow(T / 1000, -0.2) / (1. + pow(T / 1.0e6, 0.7)) / sqrt(T);
+      /* Helium II */
+      RateT[i].AlphaHep = 1.5e-10 * pow(T, -0.6353);
+      /* Helium III */
+      RateT[i].AlphaHepp = 4. * RateT[i].AlphaHp;
+      /* Cen 1992 */
+      /* dielectric recombination */
+      if(470000 / T < 70)
+        RateT[i].Alphad = 1.9e-3 * pow(T, -1.5) * exp(-470000 / T) * (1. + 0.3 * exp(-94000 / T));
+
+      /* collisional ionization */
+      /* Cen 1992 */
+      /* Hydrogen */
+      if(157809.1 / T < 70)
+        RateT[i].GammaeH0 = 5.85e-11 * sqrt(T) * exp(-157809.1 / T) * Tfact;
+      /* Helium */
+      if(285335.4 / T < 70)
+        RateT[i].GammaeHe0 = 2.38e-11 * sqrt(T) * exp(-285335.4 / T) * Tfact;
+      /* Helium II */
+      if(631515.0 / T < 70)
+        RateT[i].GammaeHep = 5.68e-12 * sqrt(T) * exp(-631515.0 / T) * Tfact;
+    }
+}
+
+/** \brief Read table input for ionizing parameters.
+ *
+ *  \param fname name of the file that contains the tabulated parameters
+ */
+void coolsfr::ReadIonizeParams(char *fname)
+{
+  NheattabUVB = 0;
+  int i, iter;
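+  /* two passes through the file: the first counts the number of table rows, the second reads them into the allocated table */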
+  for(iter = 0, i = 0; iter < 2; iter++)
+    {
+      FILE *fdcool;
+      if(!(fdcool = fopen(fname, "r")))
+        Terminate(" Cannot read ionization table in file `%s'\n", fname);
+      if(iter == 0)
+        while(fscanf(fdcool, "%*g %*g %*g %*g %*g %*g %*g") != EOF)
+          NheattabUVB++;
+      if(iter == 1)
+        while(fscanf(fdcool, "%g %g %g %g %g %g %g", &PhotoTUVB[i].variable, &PhotoTUVB[i].gH0, &PhotoTUVB[i].gHe, &PhotoTUVB[i].gHep,
+                     &PhotoTUVB[i].eH0, &PhotoTUVB[i].eHe, &PhotoTUVB[i].eHep) != EOF)
+          i++;
+      fclose(fdcool);
+
+      if(iter == 0)
+        {
+          PhotoTUVB = (photo_table *)Mem.mymalloc("PhotoT", NheattabUVB * sizeof(photo_table));
+          mpi_printf("COOLING: read ionization table with %d entries in file `%s'.\n", NheattabUVB, fname);
+        }
+    }
+  /* ignore zeros at end of treecool file */
+  for(i = 0; i < NheattabUVB; ++i)
+    if(PhotoTUVB[i].gH0 == 0.0)
+      break;
+
+  NheattabUVB = i;
+  mpi_printf("COOLING: using %d ionization table entries from file `%s'.\n", NheattabUVB, fname);
+
+  if(NheattabUVB < 1)
+    Terminate("The length of the cooling table has to have at least one entry");
+}
+
+/** \brief Set the ionization parameters for the UV background.
+ */
+void coolsfr::IonizeParamsUVB(void)
+{
+  if(!All.ComovingIntegrationOn)
+    {
+      SetZeroIonization();
+      return;
+    }
+
+  if(NheattabUVB == 1)
+    {
+      /* treat the one value given as constant with redshift */
+      pc.J_UV   = 1;
+      pc.gJH0   = PhotoTUVB[0].gH0;
+      pc.gJHe0  = PhotoTUVB[0].gHe;
+      pc.gJHep  = PhotoTUVB[0].gHep;
+      pc.epsH0  = PhotoTUVB[0].eH0;
+      pc.epsHe0 = PhotoTUVB[0].eHe;
+      pc.epsHep = PhotoTUVB[0].eHep;
+    }
+  else
+    {
+      double redshift = 1 / All.Time - 1;
+      double logz     = log10(redshift + 1.0);
+      int ilow        = 0;
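+      /* find the table interval that brackets the current log(1+z), then interpolate the rates linearly in log space */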
+      for(int i = 0; i < NheattabUVB; i++)
+        {
+          if(PhotoTUVB[i].variable < logz)
+            ilow = i;
+          else
+            break;
+        }
+
+      if(logz > PhotoTUVB[NheattabUVB - 1].variable || ilow >= NheattabUVB - 1)
+        {
+          SetZeroIonization();
+        }
+      else
+        {
+          double dzlow = logz - PhotoTUVB[ilow].variable;
+          double dzhi  = PhotoTUVB[ilow + 1].variable - logz;
+
+          if(PhotoTUVB[ilow].gH0 == 0 || PhotoTUVB[ilow + 1].gH0 == 0)
+            {
+              SetZeroIonization();
+            }
+          else
+            {
+              pc.J_UV   = 1;
+              pc.gJH0   = pow(10., (dzhi * log10(PhotoTUVB[ilow].gH0) + dzlow * log10(PhotoTUVB[ilow + 1].gH0)) / (dzlow + dzhi));
+              pc.gJHe0  = pow(10., (dzhi * log10(PhotoTUVB[ilow].gHe) + dzlow * log10(PhotoTUVB[ilow + 1].gHe)) / (dzlow + dzhi));
+              pc.gJHep  = pow(10., (dzhi * log10(PhotoTUVB[ilow].gHep) + dzlow * log10(PhotoTUVB[ilow + 1].gHep)) / (dzlow + dzhi));
+              pc.epsH0  = pow(10., (dzhi * log10(PhotoTUVB[ilow].eH0) + dzlow * log10(PhotoTUVB[ilow + 1].eH0)) / (dzlow + dzhi));
+              pc.epsHe0 = pow(10., (dzhi * log10(PhotoTUVB[ilow].eHe) + dzlow * log10(PhotoTUVB[ilow + 1].eHe)) / (dzlow + dzhi));
+              pc.epsHep = pow(10., (dzhi * log10(PhotoTUVB[ilow].eHep) + dzlow * log10(PhotoTUVB[ilow + 1].eHep)) / (dzlow + dzhi));
+            }
+        }
+    }
+}
+
+/** \brief Reset the ionization parameters.
+ */
+void coolsfr::SetZeroIonization(void) { memset(&pc, 0, sizeof(photo_current)); }
+
+/** \brief Wrapper function to set the ionizing background.
+ */
+void coolsfr::IonizeParams(void) { IonizeParamsUVB(); }
+
+/** \brief Initialize the cooling module.
+ *
+ *   This function initializes the cooling module. In particular,
+ *   it allocates the memory for the cooling rate and ionization tables
+ *   and initializes them.
+ */
+void coolsfr::InitCool(void)
+{
+  /* set default hydrogen mass fraction */
+  GasState.XH = HYDROGEN_MASSFRAC;
+
+  /* zero photo-ionization/heating rates */
+  SetZeroIonization();
+
+  /* allocate and construct rate table */
+  RateT = (rate_table *)Mem.mymalloc("RateT", (NCOOLTAB + 1) * sizeof(rate_table));
+  MakeRateTable();
+
+  /* read photo tables */
+  ReadIonizeParams(All.TreecoolFile);
+
+  All.Time = All.TimeBegin;
+  All.set_cosmo_factors_for_current_time();
+
+  IonizeParams();
+}
+
+/** \brief Apply the isochoric cooling to all the active gas particles.
+ *
+ */
+void coolsfr::cooling_only(simparticles *Sp) /* normal cooling routine when star formation is disabled */
+{
+  TIMER_START(CPU_COOLING_SFR);
+  All.set_cosmo_factors_for_current_time();
+
+  gas_state gs        = GasState;
+  do_cool_data DoCool = DoCoolData;
+
+  for(int i = 0; i < Sp->TimeBinsHydro.NActiveParticles; i++)
+    {
+      int target = Sp->TimeBinsHydro.ActiveParticleList[i];
+      if(Sp->P[target].getType() == 0)
+        {
+          if(Sp->P[target].getMass() == 0 && Sp->P[target].ID.get() == 0)
+            continue; /* skip particles that have been swallowed or eliminated */
+
+          cool_sph_particle(Sp, target, &gs, &DoCool);
+        }
+    }
+  TIMER_STOP(CPU_COOLING_SFR);
+}
+
+/** \brief Apply the isochoric cooling to a given gas particle.
+ *
+ *  This function applies the normal isochoric cooling to a single gas particle.
+ *  Once the cooling has been applied according to one of the cooling models implemented,
+ *  the internal energy per unit mass, the total energy and the pressure of the particle are updated.
+ *
+ *  \param i index of the gas particle to which cooling is applied
+ */
+void coolsfr::cool_sph_particle(simparticles *Sp, int i, gas_state *gs, do_cool_data *DoCool)
+{
+  double dens = Sp->SphP[i].Density;
+
+  double dt = (Sp->P[i].getTimeBinHydro() ? (((integertime)1) << Sp->P[i].getTimeBinHydro()) : 0) * All.Timebase_interval;
+
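+  /* convert the time-step to a physical time interval */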
+  double dtime = All.cf_atime * dt / All.cf_atime_hubble_a;
+
+  double utherm = Sp->get_utherm_from_entropy(i);
+
+  double ne      = Sp->SphP[i].Ne; /* electron abundance (gives ionization state and mean molecular weight) */
+  double unew    = DoCooling(std::max<double>(All.MinEgySpec, utherm), dens * All.cf_a3inv, dtime, &ne, gs, DoCool);
+  Sp->SphP[i].Ne = ne;
+
+  if(unew < 0)
+    Terminate("invalid internal energy: i=%d unew=%g\n", i, unew);
+
+  double du = unew - utherm;
+
+  if(unew < All.MinEgySpec)
+    du = All.MinEgySpec - utherm;
+
+  utherm += du;
+
+#ifdef OUTPUT_COOLHEAT
+  if(dtime > 0)
+    Sp->SphP[i].CoolHeat = du * Sp->P[i].getMass() / dtime;
+#endif
+
+  Sp->set_entropy_from_utherm(utherm, i);
+  Sp->SphP[i].set_thermodynamic_variables();
+}
+
+#endif
diff --git a/src/cooling_sfr/cooling.h b/src/cooling_sfr/cooling.h
new file mode 100644
index 0000000000000000000000000000000000000000..40eb0568e158f2aa0f0d4aa30025471c8f5c775d
--- /dev/null
+++ b/src/cooling_sfr/cooling.h
@@ -0,0 +1,136 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file cooling.h
+ *
+ *  \brief defines a class for dealing with cooling and star formation
+ */
+
+#ifndef COOLING_H
+#define COOLING_H
+
+#ifdef COOLING
+
+#include "../data/simparticles.h"
+#include "../mpi_utils/setcomm.h"
+
+class coolsfr : public setcomm
+{
+ public:
+  coolsfr(MPI_Comm comm) : setcomm(comm) {}
+
+  double AbundanceRatios(double u, double rho, double *ne_guess, double *nH0_pointer, double *nHeII_pointer);
+
+  void InitCool(void);
+  void IonizeParams(void);
+
+  void cooling_only(simparticles *Sp);
+
+#ifdef STARFORMATION
+  void sfr_create_star_particles(simparticles *Sp);
+
+  void set_units_sfr(void);
+
+  void cooling_and_starformation(simparticles *Sp);
+
+  void init_clouds(void);
+#endif
+
+ private:
+#define NCOOLTAB 2000
+
+  /* data for gas state */
+  struct gas_state
+  {
+    double ne, necgs, nHcgs;
+    double bH0, bHep, bff, aHp, aHep, aHepp, ad, geH0, geHe0, geHep;
+    double gJH0ne, gJHe0ne, gJHepne;
+    double nH0, nHp, nHep, nHe0, nHepp;
+    double XH, yhelium;
+    double mhboltz;
+    double ethmin; /* minimum internal energy for neutral gas */
+  };
+
+  /* tabulated rates */
+  struct rate_table
+  {
+    double BetaH0, BetaHep, Betaff;
+    double AlphaHp, AlphaHep, Alphad, AlphaHepp;
+    double GammaeH0, GammaeHe0, GammaeHep;
+  };
+
+  /* photo-ionization/heating rate table */
+  struct photo_table
+  {
+    float variable;       /* logz for UVB */
+    float gH0, gHe, gHep; /* photo-ionization rates */
+    float eH0, eHe, eHep; /* photo-heating rates */
+  };
+
+  /* current interpolated photo-ionization/heating rates */
+  struct photo_current
+  {
+    char J_UV;
+    double gJH0, gJHep, gJHe0, epsH0, epsHep, epsHe0;
+  };
+
+  /* cooling data */
+  struct do_cool_data
+  {
+    double u_old_input, rho_input, dt_input, ne_guess_input;
+  };
+
+  gas_state GasState;      /**< gas state */
+  do_cool_data DoCoolData; /**< cooling data */
+
+  rate_table *RateT;      /**< tabulated rates */
+  photo_table *PhotoTUVB; /**< photo-ionization/heating rate table for UV background*/
+  photo_current pc;       /**< current interpolated photo rates */
+
+  double Tmin = 1.0; /**< min temperature in log10 */
+  double Tmax = 9.0; /**< max temperature in log10 */
+  double deltaT;     /**< log10 of temperature spacing in the interpolation tables */
+  int NheattabUVB;   /**< length of UVB photo table */
+
+#ifdef COOLING
+  double DoCooling(double u_old, double rho, double dt, double *ne_guess, gas_state *gs, do_cool_data *DoCool);
+  double GetCoolingTime(double u_old, double rho, double *ne_guess, gas_state *gs, do_cool_data *DoCool);
+  void cool_sph_particle(simparticles *Sp, int i, gas_state *gs, do_cool_data *DoCool);
+
+  void SetZeroIonization(void);
+#endif
+
+  void integrate_sfr(void);
+
+  double CoolingRate(double logT, double rho, double *nelec, gas_state *gs, const do_cool_data *DoCool);
+  double CoolingRateFromU(double u, double rho, double *ne_guess, gas_state *gs, const do_cool_data *DoCool);
+  void find_abundances_and_rates(double logT, double rho, double *ne_guess, gas_state *gs, const do_cool_data *DoCool);
+  void IonizeParamsUVB(void);
+  void ReadIonizeParams(char *fname);
+
+  double convert_u_to_temp(double u, double rho, double *ne_guess, gas_state *gs, const do_cool_data *DoCool);
+
+  void MakeRateTable(void);
+
+#ifdef STARFORMATION
+  const int WriteMiscFiles = 1;
+
+  void make_star(simparticles *Sp, int i, double prob, MyDouble mass_of_star, double *sum_mass_stars);
+  void spawn_star_from_sph_particle(simparticles *Sp, int igas, double birthtime, int istar, MyDouble mass_of_star);
+  void convert_sph_particle_into_star(simparticles *Sp, int i, double birthtime);
+
+  int stars_spawned;           /**< local number of star particles spawned in the time step */
+  int tot_stars_spawned;       /**< global number of star particles spawned in the time step */
+  int stars_converted;         /**< local number of gas cells converted into stars in the time step */
+  int tot_stars_converted;     /**< global number of gas cells converted into stars in the time step */
+  int altogether_spawned;      /**< local number of star+wind particles spawned in the time step */
+  int tot_altogether_spawned;  /**< global number of star+wind particles spawned in the time step */
+  double cum_mass_stars = 0.0; /**< cumulative mass of stars created in the time step (global value) */
+#endif
+};
+
+#endif
+#endif
diff --git a/src/cooling_sfr/sfr_eos.cc b/src/cooling_sfr/sfr_eos.cc
new file mode 100644
index 0000000000000000000000000000000000000000..176667d22a4f3ff9588c6eeafea4e0ceab952e02
--- /dev/null
+++ b/src/cooling_sfr/sfr_eos.cc
@@ -0,0 +1,458 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file sfr_eos.cc
+ *
+ *  \brief Star formation rate routines for the effective multi-phase model
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef STARFORMATION
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../cooling_sfr/cooling.h"
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../logs/logs.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+
+/** \brief Main driver for star formation and gas cooling.
+ *
+ *  This function loops over all the active gas cells. If a given cell
+ *  meets the criteria for star formation, the multi-phase model is
+ *  activated, the properties of the cell are updated accordingly, and the
+ *  star formation rate is computed. Otherwise, the standard isochoric
+ *  cooling is applied to the gas cell by calling the function
+ *  cool_sph_particle() and the star formation rate is set to 0.
+ */
+void coolsfr::cooling_and_starformation(simparticles *Sp)
+{
+  TIMER_START(CPU_COOLING_SFR);
+
+  /* clear the SFR stored in the active timebins */
+  for(int bin = 0; bin < TIMEBINS; bin++)
+    if(Sp->TimeBinSynchronized[bin])
+      Sp->TimeBinSfr[bin] = 0;
+
+  All.set_cosmo_factors_for_current_time();
+
+  gas_state gs    = GasState;
+  do_cool_data dc = DoCoolData;
+
+  for(int i = 0; i < Sp->TimeBinsHydro.NActiveParticles; i++)
+    {
+      int target = Sp->TimeBinsHydro.ActiveParticleList[i];
+      if(Sp->P[target].getType() == 0)
+        {
+          if(Sp->P[target].getMass() == 0 && Sp->P[target].ID.get() == 0)
+            continue; /* skip cells that have been swallowed or eliminated */
+
+          double dens = Sp->SphP[target].Density;
+
+          double dt =
+              (Sp->P[target].getTimeBinHydro() ? (((integertime)1) << Sp->P[target].getTimeBinHydro()) : 0) * All.Timebase_interval;
+          /*  the actual time-step */
+
+          double dtime = All.cf_atime * dt / All.cf_atime_hubble_a;
+
+          /* check whether conditions for star formation are fulfilled.
+           *
+           * f=1  normal cooling
+           * f=0  star formation
+           */
+          int flag = 1; /* default is normal cooling */
+
+          if(dens * All.cf_a3inv >= All.PhysDensThresh)
+            flag = 0;
+
+          if(All.ComovingIntegrationOn)
+            if(dens < All.OverDensThresh)
+              flag = 1;
+
+          if(flag == 1) /* normal implicit isochoric cooling */
+            {
+              Sp->SphP[target].Sfr = 0;
+              cool_sph_particle(Sp, target, &gs, &dc);
+            }
+
+          if(flag == 0) /* active star formation */
+            {
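+              /* effective multi-phase model: a fraction x of the gas mass sits in cold clouds, the rest in a hot
+                 phase with specific energy egyhot; egyeff is the effective specific energy of the mixture and
+                 tsfr the star formation timescale */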
+              double tsfr = sqrt(All.PhysDensThresh / (dens * All.cf_a3inv)) * All.MaxSfrTimescale;
+
+              double factorEVP = pow(dens * All.cf_a3inv / All.PhysDensThresh, -0.8) * All.FactorEVP;
+
+              double egyhot = All.EgySpecSN / (1 + factorEVP) + All.EgySpecCold;
+
+              double ne = Sp->SphP[target].Ne;
+
+              double tcool        = GetCoolingTime(egyhot, dens * All.cf_a3inv, &ne, &gs, &dc);
+              Sp->SphP[target].Ne = ne;
+
+              double y = tsfr / tcool * egyhot / (All.FactorSN * All.EgySpecSN - (1 - All.FactorSN) * All.EgySpecCold);
+
+              double x = 1 + 1 / (2 * y) - sqrt(1 / y + 1 / (4 * y * y));
+
+              double egyeff = egyhot * (1 - x) + All.EgySpecCold * x;
+
+              double cloudmass = x * Sp->P[target].getMass();
+              double utherm    = Sp->get_utherm_from_entropy(target);
+
+              if(tsfr < dtime)
+                tsfr = dtime;
+
+              if(dt > 0)
+                {
+                  if(Sp->P[target].getTimeBinHydro()) /* upon start-up, we need to protect against dt==0 */
+                    {
+                      double trelax     = tsfr * (1 - x) / x / (All.FactorSN * (1 + factorEVP));
+                      double egycurrent = utherm;
+
+                      double unew;
+                      if(egycurrent > egyeff)
+                        {
+                          double dtcool = dtime;
+                          ne            = Sp->SphP[target].Ne;
+
+                          unew = DoCooling(egycurrent, dens * All.cf_a3inv, dtcool, &ne, &gs, &dc);
+
+                          if(unew < egyeff)
+                            {
+                              unew = egyeff;
+                            }
+                        }
+                      else
+                        unew = (egyeff + (egycurrent - egyeff) * exp(-dtime / trelax));
+
+                      double du = unew - utherm;
+                      if(unew < All.MinEgySpec)
+                        du = All.MinEgySpec - utherm;
+
+                      utherm += du;
+                      Sp->set_entropy_from_utherm(utherm, target);
+                      Sp->SphP[target].DtEntropy = 0.0;
+
+#ifdef OUTPUT_COOLHEAT
+                      if(dtime > 0)
+                        Sp->SphP[target].CoolHeat = du * Sp->P[target].getMass() / dtime;
+#endif
+                      Sp->SphP[target].set_thermodynamic_variables();
+                    }
+                }
+
+              if(utherm > 1.01 * egyeff)
+                Sp->SphP[target].Sfr = 0;
+              else
+                {
+                  /* note that we convert the star formation rate to solar masses per year */
+                  Sp->SphP[target].Sfr =
+                      (1 - All.FactorSN) * cloudmass / tsfr * (All.UnitMass_in_g / SOLAR_MASS) / (All.UnitTime_in_s / SEC_PER_YEAR);
+                }
+
+              Sp->TimeBinSfr[Sp->P[target].getTimeBinHydro()] += Sp->SphP[target].Sfr;
+            }
+        }
+    } /* end of main loop over active particles */
+
+  TIMER_STOP(CPU_COOLING_SFR);
+}
+
+/** \brief Initialize the parameters of effective multi-phase model.
+ *
+ *   In particular this function computes the value of PhysDensThresh, that is
+ *   the physical density threshold above which star formation is active, if its
+ *   value was set to 0 in the parameter file.
+ */
+void coolsfr::init_clouds(void)
+{
+  gas_state gs    = GasState;
+  do_cool_data dc = DoCoolData;
+
+  if(All.PhysDensThresh == 0)
+    {
+      double A0 = All.FactorEVP;
+
+      double egyhot = All.EgySpecSN / A0;
+
+      double meanweight = 4 / (8 - 5 * (1 - HYDROGEN_MASSFRAC)); /* note: assuming FULL ionization */
+
+      double u4 = 1 / meanweight * (1.0 / GAMMA_MINUS1) * (BOLTZMANN / PROTONMASS) * 1.0e4;
+      u4 *= All.UnitMass_in_g / All.UnitEnergy_in_cgs;
+
+      double dens;
+      if(All.ComovingIntegrationOn)
+        dens = 1.0e6 * 3 * All.Hubble * All.Hubble / (8 * M_PI * All.G);
+      else
+        dens = 1.0e6 * (1.0e-29 / All.UnitDensity_in_cgs);
+
+      if(All.ComovingIntegrationOn)
+        {
+          All.Time = 1.0; /* to guarantee that we get the z=0 rate */
+          All.set_cosmo_factors_for_current_time();
+          IonizeParams();
+        }
+
+      double ne = 1.0;
+      SetZeroIonization();
+
+      double tcool = GetCoolingTime(egyhot, dens, &ne, &gs, &dc);
+
+      double coolrate = egyhot / tcool / dens;
+
+      double x = (egyhot - u4) / (egyhot - All.EgySpecCold);
+
+      All.PhysDensThresh =
+          x / pow(1 - x, 2) * (All.FactorSN * All.EgySpecSN - (1 - All.FactorSN) * All.EgySpecCold) / (All.MaxSfrTimescale * coolrate);
+
+      mpi_printf(
+          "\nA0= %g  \nComputed: PhysDensThresh= %g  (int units)         %g h^2 cm^-3\nEXPECTED FRACTION OF COLD GAS AT THRESHOLD = "
+          "%g\n\ntcool=%g dens=%g egyhot=%g\n",
+          A0, All.PhysDensThresh, All.PhysDensThresh / (PROTONMASS / HYDROGEN_MASSFRAC / All.UnitDensity_in_cgs), x, tcool, dens,
+          egyhot);
+
+      dens = All.PhysDensThresh * 10;
+
+      double neff;
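+      /* raise the density until the effective equation-of-state slope neff = dlnP/dlnrho (estimated by finite
+         differences) drops to 4/3; this density marks the onset of the run-away regime reported below */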
+      do
+        {
+          double tsfr      = sqrt(All.PhysDensThresh / (dens)) * All.MaxSfrTimescale;
+          double factorEVP = pow(dens / All.PhysDensThresh, -0.8) * All.FactorEVP;
+          egyhot           = All.EgySpecSN / (1 + factorEVP) + All.EgySpecCold;
+
+          ne = 0.5;
+
+          tcool = GetCoolingTime(egyhot, dens, &ne, &gs, &dc);
+
+          double y      = tsfr / tcool * egyhot / (All.FactorSN * All.EgySpecSN - (1 - All.FactorSN) * All.EgySpecCold);
+          x             = 1 + 1 / (2 * y) - sqrt(1 / y + 1 / (4 * y * y));
+          double egyeff = egyhot * (1 - x) + All.EgySpecCold * x;
+
+          double peff = GAMMA_MINUS1 * dens * egyeff;
+
+          double fac = 1 / (log(dens * 1.025) - log(dens));
+          dens *= 1.025;
+
+          neff = -log(peff) * fac;
+
+          tsfr      = sqrt(All.PhysDensThresh / (dens)) * All.MaxSfrTimescale;
+          factorEVP = pow(dens / All.PhysDensThresh, -0.8) * All.FactorEVP;
+          egyhot    = All.EgySpecSN / (1 + factorEVP) + All.EgySpecCold;
+
+          ne = 0.5;
+
+          tcool = GetCoolingTime(egyhot, dens, &ne, &gs, &dc);
+
+          y      = tsfr / tcool * egyhot / (All.FactorSN * All.EgySpecSN - (1 - All.FactorSN) * All.EgySpecCold);
+          x      = 1 + 1 / (2 * y) - sqrt(1 / y + 1 / (4 * y * y));
+          egyeff = egyhot * (1 - x) + All.EgySpecCold * x;
+
+          peff = GAMMA_MINUS1 * dens * egyeff;
+
+          neff += log(peff) * fac;
+        }
+      while(neff > 4.0 / 3);
+
+      double thresholdStarburst = dens;
+
+      mpi_printf("Run-away sets in for dens=%g\nDynamic range for quiescent star formation= %g\n", thresholdStarburst,
+                 thresholdStarburst / All.PhysDensThresh);
+
+      integrate_sfr();
+
+      if(ThisTask == 0)
+        {
+          double sigma = 10.0 / All.Hubble * 1.0e-10 / pow(1.0e-3, 2);
+
+          printf("Isotherm sheet central density: %g   z0=%g\n", M_PI * All.G * sigma * sigma / (2 * GAMMA_MINUS1) / u4,
+                 GAMMA_MINUS1 * u4 / (2 * M_PI * All.G * sigma));
+          myflush(stdout);
+        }
+
+      if(All.ComovingIntegrationOn)
+        {
+          All.Time = All.TimeBegin;
+          All.set_cosmo_factors_for_current_time();
+          IonizeParams();
+        }
+    }
+}
+
+/** \brief Compute the effective equation of state for the gas and
+ *         the integrated SFR per unit area.
+ *
+ *  This function computes the effective equation of state for the gas and
+ *  the integrated SFR per unit area. It saves the results into two files:
+ *  eos.txt for the equation of state and sfrrate.txt for the integrated SFR.
+ *  In the latter case, the SFR is determined by integrating along the vertical
+ *  direction the gas density of an infinite self-gravitating isothermal sheet.
+ *  The integrated gas density is saved as well, so effectively sfrrate.txt
+ *  contains the Kennicutt-Schmidt law of the star formation model.
+ */
+void coolsfr::integrate_sfr(void)
+{
+  double meanweight = 4 / (8 - 5 * (1 - HYDROGEN_MASSFRAC)); /* note: assuming FULL ionization */
+  double u4         = 1 / meanweight * (1.0 / GAMMA_MINUS1) * (BOLTZMANN / PROTONMASS) * 1.0e4;
+  u4 *= All.UnitMass_in_g / All.UnitEnergy_in_cgs;
+  gas_state gs    = GasState;
+  do_cool_data dc = DoCoolData;
+
+  if(All.ComovingIntegrationOn)
+    {
+      All.Time = 1.0; /* to guarantee that we get the z=0 rate */
+      All.set_cosmo_factors_for_current_time();
+      IonizeParams();
+    }
+
+  FILE *fd = (WriteMiscFiles && (ThisTask == 0)) ? fopen("eos.txt", "w") : NULL;
+
+  for(double rho = All.PhysDensThresh; rho <= 1000 * All.PhysDensThresh; rho *= 1.1)
+    {
+      double tsfr = sqrt(All.PhysDensThresh / rho) * All.MaxSfrTimescale;
+
+      double factorEVP = pow(rho / All.PhysDensThresh, -0.8) * All.FactorEVP;
+
+      double egyhot = All.EgySpecSN / (1 + factorEVP) + All.EgySpecCold;
+
+      double ne = 1.0;
+
+      double tcool = GetCoolingTime(egyhot, rho, &ne, &gs, &dc);
+
+      double y = tsfr / tcool * egyhot / (All.FactorSN * All.EgySpecSN - (1 - All.FactorSN) * All.EgySpecCold);
+      double x = 1 + 1 / (2 * y) - sqrt(1 / y + 1 / (4 * y * y));
+
+      double egyeff = egyhot * (1 - x) + All.EgySpecCold * x;
+
+      double P0 = GAMMA_MINUS1 * rho * egyeff;
+
+      if(WriteMiscFiles && (ThisTask == 0))
+        {
+          fprintf(fd, "%g %g\n", rho, P0);
+        }
+    }
+
+  if(WriteMiscFiles && (ThisTask == 0))
+    {
+      fclose(fd);
+      fd = fopen("sfrrate.txt", "w");
+    }
+
+  for(double rho0 = All.PhysDensThresh; rho0 <= 10000 * All.PhysDensThresh; rho0 *= 1.02)
+    {
+      double rho = rho0;
+      double q   = 0;
+      double dz  = 0.001;
+
+      double sigma = 0, sigmasfr = 0, sigma_u4 = 0, x = 0;
+
+      while(rho > 0.0001 * rho0)
+        {
+          double tsfr, P0, gam;
+          if(rho > All.PhysDensThresh)
+            {
+              tsfr = sqrt(All.PhysDensThresh / rho) * All.MaxSfrTimescale;
+
+              double factorEVP = pow(rho / All.PhysDensThresh, -0.8) * All.FactorEVP;
+
+              double egyhot = All.EgySpecSN / (1 + factorEVP) + All.EgySpecCold;
+
+              double ne = 1.0;
+
+              double tcool = GetCoolingTime(egyhot, rho, &ne, &gs, &dc);
+
+              double y = tsfr / tcool * egyhot / (All.FactorSN * All.EgySpecSN - (1 - All.FactorSN) * All.EgySpecCold);
+              x        = 1 + 1 / (2 * y) - sqrt(1 / y + 1 / (4 * y * y));
+
+              double egyeff = egyhot * (1 - x) + All.EgySpecCold * x;
+
+              P0        = GAMMA_MINUS1 * rho * egyeff;
+              double P1 = P0;
+
+              double rho2       = 1.1 * rho;
+              double tsfr2      = sqrt(All.PhysDensThresh / rho2) * All.MaxSfrTimescale;
+              double factorEVP2 = pow(rho2 / All.PhysDensThresh, -0.8) * All.FactorEVP;
+              double egyhot2    = All.EgySpecSN / (1 + factorEVP2) + All.EgySpecCold;
+
+              double tcool2  = GetCoolingTime(egyhot2, rho2, &ne, &gs, &dc);
+              double y2      = tsfr2 / tcool2 * egyhot2 / (All.FactorSN * All.EgySpecSN - (1 - All.FactorSN) * All.EgySpecCold);
+              double x2      = 1 + 1 / (2 * y2) - sqrt(1 / y2 + 1 / (4 * y2 * y2));
+              double egyeff2 = egyhot2 * (1 - x2) + All.EgySpecCold * x2;
+              double P2      = GAMMA_MINUS1 * rho2 * egyeff2;
+
+              gam = log(P2 / P1) / log(rho2 / rho);
+            }
+          else
+            {
+              tsfr = 0;
+
+              P0  = GAMMA_MINUS1 * rho * u4;
+              gam = 1.0;
+
+              sigma_u4 += rho * dz;
+            }
+
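+          /* one explicit Euler step of vertical hydrostatic equilibrium for the sheet: with q = drho/dz,
+             combining dP/dz = -rho dPhi/dz, d^2Phi/dz^2 = 4 pi G rho and the local power law
+             P = P0 (rho/rho_local)^gam gives dq/dz = -(gam-2) q^2/rho - 4 pi G rho^3/(gam P0) */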
+          double drho = q;
+          double dq   = -(gam - 2) / rho * q * q - 4 * M_PI * All.G / (gam * P0) * rho * rho * rho;
+
+          sigma += rho * dz;
+          if(tsfr > 0)
+            {
+              sigmasfr += (1 - All.FactorSN) * rho * x / tsfr * dz;
+            }
+
+          rho += drho * dz;
+          q += dq * dz;
+        }
+
+      sigma *= 2; /* to include the other side */
+      sigmasfr *= 2;
+      sigma_u4 *= 2;
+
+      sigma *= All.HubbleParam * (All.UnitMass_in_g / SOLAR_MASS) * PARSEC * PARSEC / (All.UnitLength_in_cm * All.UnitLength_in_cm);
+      sigmasfr *= All.HubbleParam * All.HubbleParam * (All.UnitMass_in_g / SOLAR_MASS) * (SEC_PER_YEAR / All.UnitTime_in_s);
+      sigma_u4 *= All.HubbleParam * (All.UnitMass_in_g / SOLAR_MASS) * PARSEC * PARSEC / (All.UnitLength_in_cm * All.UnitLength_in_cm);
+
+      if(WriteMiscFiles && (ThisTask == 0))
+        {
+          fprintf(fd, "%g %g %g %g\n", rho0, sigma, sigmasfr, sigma_u4);
+        }
+    }
+
+  if(All.ComovingIntegrationOn)
+    {
+      All.Time = All.TimeBegin;
+      All.set_cosmo_factors_for_current_time();
+      IonizeParams();
+    }
+
+  if(WriteMiscFiles && (ThisTask == 0))
+    fclose(fd);
+}
+
+/** \brief Set the appropriate units for the parameters of the multi-phase model.
+ */
+void coolsfr::set_units_sfr(void)
+{
+  All.OverDensThresh = All.CritOverDensity * All.OmegaBaryon * 3 * All.Hubble * All.Hubble / (8 * M_PI * All.G);
+
+  All.PhysDensThresh = All.CritPhysDensity * PROTONMASS / HYDROGEN_MASSFRAC / All.UnitDensity_in_cgs;
+
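+  /* convert the model temperatures (TempClouds, TempSupernova) to specific energies per unit mass,
+     u = k_B T / ((gamma-1) mu m_p), and then from cgs to internal code units */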
+  double meanweight = 4 / (1 + 3 * HYDROGEN_MASSFRAC); /* note: assuming NEUTRAL GAS */
+
+  All.EgySpecCold = 1 / meanweight * (1.0 / GAMMA_MINUS1) * (BOLTZMANN / PROTONMASS) * All.TempClouds;
+  All.EgySpecCold *= All.UnitMass_in_g / All.UnitEnergy_in_cgs;
+
+  meanweight = 4 / (8 - 5 * (1 - HYDROGEN_MASSFRAC)); /* note: assuming FULL ionization */
+
+  All.EgySpecSN = 1 / meanweight * (1.0 / GAMMA_MINUS1) * (BOLTZMANN / PROTONMASS) * All.TempSupernova;
+  All.EgySpecSN *= All.UnitMass_in_g / All.UnitEnergy_in_cgs;
+}
+#endif
diff --git a/src/cooling_sfr/starformation.cc b/src/cooling_sfr/starformation.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4f454b63bad01dafd94f86fffe316ebdb516523a
--- /dev/null
+++ b/src/cooling_sfr/starformation.cc
@@ -0,0 +1,324 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file starformation.cc
+ *
+ *  \brief Generic creation routines for creating star particles
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef STARFORMATION
+
+#include <assert.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../cooling_sfr/cooling.h"
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../logs/logs.h"
+#include "../logs/timer.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+
+/** \brief This routine creates star/wind particles according to their respective rates.
+ *
+ *  This function loops over all the active gas cells. If in a given cell the SFR is
+ *  greater than zero, the probability of forming a star or a wind particle is computed
+ *  and the corresponding particle is created stichastically according to the model
+ *  in Springel & Hernquist (2003, MNRAS). It also saves information about the formed stellar
+ *  mass and the star formation rate in the file FdSfr.
+ */
+void coolsfr::sfr_create_star_particles(simparticles *Sp)
+{
+  TIMER_START(CPU_COOLING_SFR);
+
+  double dt, dtime;
+  MyDouble mass_of_star;
+  double sum_sm, total_sm, rate, sum_mass_stars, total_sum_mass_stars;
+  double p = 0, pall = 0, prob, p_decide;
+  double rate_in_msunperyear;
+  double totsfrrate;
+  double w = 0;
+
+  All.set_cosmo_factors_for_current_time();
+
+  stars_spawned = stars_converted = 0;
+
+  sum_sm = sum_mass_stars = 0;
+
+  for(int i = 0; i < Sp->TimeBinsHydro.NActiveParticles; i++)
+    {
+      int target = Sp->TimeBinsHydro.ActiveParticleList[i];
+      if(Sp->P[target].getType() == 0)
+        {
+          if(Sp->P[target].getMass() == 0 && Sp->P[target].ID.get() == 0)
+            continue; /* skip cells that have been swallowed or eliminated */
+
+          dt = (Sp->P[target].getTimeBinHydro() ? (((integertime)1) << Sp->P[target].getTimeBinHydro()) : 0) * All.Timebase_interval;
+          /*  the actual time-step */
+
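+          /* convert the step of the integration variable (d ln a in comoving runs) into a physical time
+             interval, dtime = dt / H(a); in non-comoving runs the cosmological factors are unity */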
+          dtime = All.cf_atime * dt / All.cf_atime_hubble_a;
+
+          mass_of_star = 0;
+          prob         = 0;
+          p            = 0;
+
+          if(Sp->SphP[target].Sfr > 0)
+            {
+              p = Sp->SphP[target].Sfr / ((All.UnitMass_in_g / SOLAR_MASS) / (All.UnitTime_in_s / SEC_PER_YEAR)) * dtime /
+                  Sp->P[target].getMass();
+              pall = p;
+              sum_sm += Sp->P[target].getMass() * (1 - exp(-p));
+
+              w = get_random_number();
+
+              Sp->SphP[target].Metallicity += w * METAL_YIELD * (1 - exp(-p));
+              Sp->SphP[target].MassMetallicity = Sp->SphP[target].Metallicity * Sp->P[target].getMass();
+              Sp->P[target].Metallicity        = Sp->SphP[target].Metallicity;
+
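+              /* with p the expected gas mass fraction turned into stars in this step, drawing a star of
+                 mass mass_of_star with probability prob = (M_gas/mass_of_star) * (1 - exp(-p)) reproduces
+                 the expected stellar mass on average; here mass_of_star = M_gas, so the whole cell is
+                 converted with probability 1 - exp(-p) */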
+              mass_of_star = Sp->P[target].getMass();
+
+              prob = Sp->P[target].getMass() / mass_of_star * (1 - exp(-pall));
+            }
+
+          if(prob == 0)
+            continue;
+
+          if(prob < 0)
+            Terminate("prob < 0");
+
+          /* decide what process to consider (currently available: make a star or kick to wind) */
+          p_decide = get_random_number();
+
+          if(p_decide < p / pall) /* ok, a star formation is considered */
+            make_star(Sp, target, prob, mass_of_star, &sum_mass_stars);
+
+          if(Sp->SphP[target].Sfr > 0)
+            {
+              if(Sp->P[target].getType() == 0) /* to protect using a particle that has been turned into a star */
+                {
+                  Sp->SphP[target].Metallicity += (1 - w) * METAL_YIELD * (1 - exp(-p));
+                  Sp->SphP[target].MassMetallicity = Sp->SphP[target].Metallicity * Sp->P[target].getMass();
+                }
+            }
+          Sp->P[target].Metallicity = Sp->SphP[target].Metallicity;
+        }
+    } /* end of main loop over active gas particles */
+
+  MPI_Allreduce(&stars_spawned, &tot_stars_spawned, 1, MPI_INT, MPI_SUM, Communicator);
+  MPI_Allreduce(&stars_converted, &tot_stars_converted, 1, MPI_INT, MPI_SUM, Communicator);
+
+  if(tot_stars_spawned > 0 || tot_stars_converted > 0)
+    {
+      mpi_printf("SFR: spawned %d stars, converted %d gas particles into stars\n", tot_stars_spawned, tot_stars_converted);
+    }
+
+  tot_altogether_spawned = tot_stars_spawned;
+  altogether_spawned     = stars_spawned;
+  if(tot_altogether_spawned)
+    {
+      /* need to assign new unique IDs to the spawned stars */
+
+      if(All.MaxID == 0) /* MaxID not calculated yet */
+        {
+          /* determine maximum ID */
+          MyIDType maxid = 0;
+          for(int i = 0; i < Sp->NumPart; i++)
+            if(Sp->P[i].ID.get() > maxid)
+              {
+                maxid = Sp->P[i].ID.get();
+              }
+
+          MyIDType *tmp = (MyIDType *)Mem.mymalloc("tmp", NTask * sizeof(MyIDType));
+
+          MPI_Allgather(&maxid, sizeof(MyIDType), MPI_BYTE, tmp, sizeof(MyIDType), MPI_BYTE, Communicator);
+
+          for(int i = 0; i < NTask; i++)
+            if(tmp[i] > maxid)
+              maxid = tmp[i];
+
+          All.MaxID = maxid;
+
+          Mem.myfree(tmp);
+        }
+
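+      /* assign the new stars globally unique IDs: gather the number spawned on every task and give this
+         task the ID block starting at MaxID + 1 + (number of stars spawned on all lower-ranked tasks) */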
+      int *list = (int *)Mem.mymalloc("list", NTask * sizeof(int));
+
+      MPI_Allgather(&altogether_spawned, 1, MPI_INT, list, 1, MPI_INT, Communicator);
+
+      MyIDType newid = All.MaxID + 1;
+
+      for(int i = 0; i < ThisTask; i++)
+        newid += list[i];
+
+      Mem.myfree(list);
+
+      for(int i = 0; i < altogether_spawned; i++)
+        Sp->P[Sp->NumPart + i].ID.set(newid++);
+
+      All.MaxID += tot_altogether_spawned;
+    }
+
+  /* Note: New tree construction can be avoided because of  `force_add_star_to_tree()' */
+  if(tot_stars_spawned > 0 || tot_stars_converted > 0)
+    {
+      Sp->TotNumPart += tot_stars_spawned;
+      Sp->TotNumGas -= tot_stars_converted;
+      Sp->NumPart += stars_spawned;
+    }
+
+  double sfrrate = 0;
+  for(int bin = 0; bin < TIMEBINS; bin++)
+    if(Sp->TimeBinsHydro.TimeBinCount[bin])
+      sfrrate += Sp->TimeBinSfr[bin];
+
+  MPI_Allreduce(&sfrrate, &totsfrrate, 1, MPI_DOUBLE, MPI_SUM, Communicator);
+
+  MPI_Reduce(&sum_sm, &total_sm, 1, MPI_DOUBLE, MPI_SUM, 0, Communicator);
+  MPI_Reduce(&sum_mass_stars, &total_sum_mass_stars, 1, MPI_DOUBLE, MPI_SUM, 0, Communicator);
+  if(ThisTask == 0)
+    {
+      if(All.TimeStep > 0)
+        rate = total_sm / (All.TimeStep / All.cf_atime_hubble_a);
+      else
+        rate = 0;
+
+      /* compute the cumulative mass of stars (->>> CHECK ME!!!) */
+      cum_mass_stars += total_sum_mass_stars;
+
+      /* convert to solar masses per yr */
+      rate_in_msunperyear = rate * (All.UnitMass_in_g / SOLAR_MASS) / (All.UnitTime_in_s / SEC_PER_YEAR);
+
+      fprintf(Logs.FdSfr, "%14e %14e %14e %14e %14e %14e\n", All.Time, total_sm, totsfrrate, rate_in_msunperyear, total_sum_mass_stars,
+              cum_mass_stars);
+      myflush(Logs.FdSfr);
+    }
+
+  TIMER_STOP(CPU_COOLING_SFR);
+}
+
+/** \brief Convert an SPH particle into a star.
+ *
+ *  This function converts an active star-forming gas particle into a star.
+ *  The particle is converted in place, and the fields necessary for the
+ *  creation of the star particle are initialized.
+ *
+ *  \param i index of the gas particle to be converted
+ *  \param birthtime time of birth (in code units) of the stellar particle
+ */
+void coolsfr::convert_sph_particle_into_star(simparticles *Sp, int i, double birthtime)
+{
+  Sp->P[i].setType(STAR_TYPE);
+#if NSOFTCLASSES > 1
+  Sp->P[i].setSofteningClass(All.SofteningClassOfPartType[Sp->P[i].getType()]);
+#endif
+#ifdef INDIVIDUAL_GRAVITY_SOFTENING
+  if(((1 << Sp->P[i].getType()) & (INDIVIDUAL_GRAVITY_SOFTENING)))
+    Sp->P[i].setSofteningClass(Sp->get_softening_type_from_mass(Sp->P[i].getMass()));
+#endif
+
+  Sp->TimeBinSfr[Sp->P[i].getTimeBinHydro()] -= Sp->SphP[i].Sfr;
+
+  Sp->P[i].StellarAge = birthtime;
+
+  return;
+}
+
+/** \brief Spawn a star particle from an SPH gas particle.
+ *
+ *  This function spawns a star particle from an active star-forming
+ *  SPH gas particle. The particle information of the gas is copied to the
+ *  location istar and the fields necessary for the creation of the star
+ *  particle are initialized. The total mass of the gas particle is split
+ *  between the newly spawned star and the gas particle.
+ *  (This function is probably unnecessary.)
+ *
+ *  \param igas index of the gas cell from which the star is spawned
+ *  \param birthtime time of birth (in code units) of the stellar particle
+ *  \param istar index of the spawned stellar particle
+ *  \param mass_of_star the mass of the spawned stellar particle
+ */
+void coolsfr::spawn_star_from_sph_particle(simparticles *Sp, int igas, double birthtime, int istar, MyDouble mass_of_star)
+{
+  Sp->P[istar] = Sp->P[igas];
+  Sp->P[istar].setType(STAR_TYPE);
+#if NSOFTCLASSES > 1
+  Sp->P[istar].setSofteningClass(All.SofteningClassOfPartType[Sp->P[istar].getType()]);
+#endif
+#ifdef INDIVIDUAL_GRAVITY_SOFTENING
+  if(((1 << Sp->P[istar].getType()) & (INDIVIDUAL_GRAVITY_SOFTENING)))
+    Sp->P[istar].setSofteningClass(Sp->get_softening_type_from_mass(Sp->P[istar].getMass()));
+#endif
+
+  Sp->TimeBinsGravity.ActiveParticleList[Sp->TimeBinsGravity.NActiveParticles++] = istar;
+
+  Sp->TimeBinsGravity.timebin_add_particle(istar, igas, Sp->P[istar].TimeBinGrav, Sp->TimeBinSynchronized[Sp->P[istar].TimeBinGrav]);
+
+  Sp->P[istar].setMass(mass_of_star);
+
+  Sp->P[istar].StellarAge = birthtime;
+
+  /* now change the conserved quantities in the cell in proportion */
+  double fac = (Sp->P[igas].getMass() - Sp->P[istar].getMass()) / Sp->P[igas].getMass();
+
+  Sp->P[igas].setMass(fac * Sp->P[igas].getMass());
+
+  return;
+}
+
+/** \brief Make a star particle from a SPH gas particle.
+ *
+ *  Given a gas cell where star formation is active and the probability
+ *  of forming a star, this function decides whether to convert the gas
+ *  particle into a star particle or to spawn a new star, depending on the
+ *  target mass for the star.
+ *
+ *  \param i index of the gas cell
+ *  \param prob probability of making a star
+ *  \param mass_of_star desired mass of the star particle
+ *  \param sum_mass_stars holds the mass of all the stars created at the current time-step (for the local task)
+ */
+void coolsfr::make_star(simparticles *Sp, int i, double prob, MyDouble mass_of_star, double *sum_mass_stars)
+{
+  if(mass_of_star > Sp->P[i].getMass())
+    Terminate("mass_of_star > P[i].Mass");
+
+  if(get_random_number() < prob)
+    {
+      if(mass_of_star == Sp->P[i].getMass())
+        {
+          /* here we turn the gas particle itself into a star particle */
+          stars_converted++;
+
+          *sum_mass_stars += Sp->P[i].getMass();
+
+          convert_sph_particle_into_star(Sp, i, All.Time);
+        }
+      else
+        {
+          /* in this case we spawn a new star particle, only reducing the mass in the cell by mass_of_star */
+          altogether_spawned = stars_spawned;
+          if(Sp->NumPart + altogether_spawned >= Sp->MaxPart)
+            Terminate("NumPart=%d spwawn %d particles no space left (Sp.MaxPart=%d)\n", Sp->NumPart, altogether_spawned, Sp->MaxPart);
+
+          int j = Sp->NumPart + altogether_spawned; /* index of new star */
+
+          spawn_star_from_sph_particle(Sp, i, All.Time, j, mass_of_star);
+
+          *sum_mass_stars += mass_of_star;
+          stars_spawned++;
+        }
+    }
+}
+
+#endif /* closes STARFORMATION */
diff --git a/src/data/allvars.cc b/src/data/allvars.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2c3efe6c2a9eb29a6fd916230c87fa02a058405c
--- /dev/null
+++ b/src/data/allvars.cc
@@ -0,0 +1,279 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file allvars.cc
+ *
+ *  \brief instance and code for an object dealing with global parameters and variables
+ */
+
+#include "gadgetconfig.h"
+
+#include "../data/allvars.h"
+#include "../data/constants.h"
+#include "../data/dtypes.h"
+#include "../data/macros.h"
+#include "../time_integration/driftfac.h"
+
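+/*! \brief Update the cosmological conversion factors (the cf_* fields) for the current value of Time.
+ *
+ *  In comoving runs these hold the various powers of the scale factor and the Hubble rate needed by the
+ *  dynamics; in non-comoving runs they are set to unity (and cf_H to zero).
+ */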
+void global_data_all_processes::set_cosmo_factors_for_current_time(void)
+{
+  if(ComovingIntegrationOn)
+    {
+      cf_atime    = Time;
+      cf_atime2   = Time * Time;
+      cf_ainv     = 1 / Time;
+      cf_a2inv    = 1 / (Time * Time);
+      cf_a3inv    = 1 / (Time * Time * Time);
+      cf_afac1    = pow(Time, 3 * GAMMA_MINUS1);
+      cf_afac2    = 1 / pow(Time, 3 * GAMMA - 2);
+      cf_afac3    = pow(Time, 3 * (1 - GAMMA) / 2.0);
+      cf_hubble_a = cf_H = Driftfac.hubble_function(Time);
+      cf_atime_hubble_a  = Time * cf_hubble_a;
+      cf_atime2_hubble_a = Time * Time * cf_hubble_a;
+      cf_redshift        = 1 / Time - 1;
+    }
+  else
+    {
+      cf_atime           = 1;
+      cf_atime2          = 1;
+      cf_ainv            = 1;
+      cf_a2inv           = 1;
+      cf_a3inv           = 1;
+      cf_afac1           = 1;
+      cf_afac2           = 1;
+      cf_afac3           = 1;
+      cf_hubble_a        = 1;
+      cf_H               = 0;
+      cf_atime_hubble_a  = 1;
+      cf_atime2_hubble_a = 1;
+      cf_redshift        = 0;
+    }
+}
+
+void global_data_all_processes::register_parameters(void)
+{
+  add_param("InitCondFile", InitCondFile, PARAM_STRING, PARAM_FIXED);
+
+  add_param("OutputDir", OutputDir, PARAM_STRING, PARAM_CHANGEABLE);
+  add_param("SnapshotFileBase", SnapshotFileBase, PARAM_STRING, PARAM_CHANGEABLE);
+  add_param("OutputListFilename", OutputListFilename, PARAM_STRING, PARAM_CHANGEABLE);
+  add_param("OutputListOn", &OutputListOn, PARAM_INT, PARAM_CHANGEABLE);
+
+  add_param("Omega0", &Omega0, PARAM_DOUBLE, PARAM_FIXED);
+  add_param("OmegaBaryon", &OmegaBaryon, PARAM_DOUBLE, PARAM_FIXED);
+  add_param("OmegaLambda", &OmegaLambda, PARAM_DOUBLE, PARAM_FIXED);
+  add_param("Hubble", &Hubble, PARAM_DOUBLE, PARAM_FIXED);
+  add_param("HubbleParam", &HubbleParam, PARAM_DOUBLE, PARAM_FIXED);
+  add_param("BoxSize", &BoxSize, PARAM_DOUBLE, PARAM_FIXED);
+
+  add_param("MaxMemSize", &MaxMemSize, PARAM_INT, PARAM_CHANGEABLE);
+  add_param("TimeOfFirstSnapshot", &TimeOfFirstSnapshot, PARAM_DOUBLE, PARAM_CHANGEABLE);
+  add_param("CpuTimeBetRestartFile", &CpuTimeBetRestartFile, PARAM_DOUBLE, PARAM_CHANGEABLE);
+  add_param("TimeBetStatistics", &TimeBetStatistics, PARAM_DOUBLE, PARAM_CHANGEABLE);
+  add_param("TimeBegin", &TimeBegin, PARAM_DOUBLE, PARAM_FIXED);
+  add_param("TimeMax", &TimeMax, PARAM_DOUBLE, PARAM_FIXED); /* can be changed nevertheless through special function */
+
+  add_param("TimeBetSnapshot", &TimeBetSnapshot, PARAM_DOUBLE, PARAM_CHANGEABLE);
+
+  add_param("UnitVelocity_in_cm_per_s", &UnitVelocity_in_cm_per_s, PARAM_DOUBLE, PARAM_FIXED);
+  add_param("UnitLength_in_cm", &UnitLength_in_cm, PARAM_DOUBLE, PARAM_FIXED);
+  add_param("UnitMass_in_g", &UnitMass_in_g, PARAM_DOUBLE, PARAM_FIXED);
+  add_param("GravityConstantInternal", &GravityConstantInternal, PARAM_DOUBLE, PARAM_FIXED);
+
+  add_param("ErrTolIntAccuracy", &ErrTolIntAccuracy, PARAM_DOUBLE, PARAM_CHANGEABLE);
+  add_param("ErrTolTheta", &ErrTolTheta, PARAM_DOUBLE, PARAM_CHANGEABLE);
+  add_param("ErrTolThetaMax", &ErrTolThetaMax, PARAM_DOUBLE, PARAM_CHANGEABLE);
+  add_param("ErrTolForceAcc", &ErrTolForceAcc, PARAM_DOUBLE, PARAM_CHANGEABLE);
+
+  add_param("MaxSizeTimestep", &MaxSizeTimestep, PARAM_DOUBLE, PARAM_CHANGEABLE);
+  add_param("MinSizeTimestep", &MinSizeTimestep, PARAM_DOUBLE, PARAM_CHANGEABLE);
+  add_param("ArtBulkViscConst", &ArtBulkViscConst, PARAM_DOUBLE, PARAM_CHANGEABLE);
+  add_param("CourantFac", &CourantFac, PARAM_DOUBLE, PARAM_CHANGEABLE);
+  add_param("DesNumNgb", &DesNumNgb, PARAM_INT, PARAM_CHANGEABLE);
+  add_param("TopNodeFactor", &TopNodeFactor, PARAM_DOUBLE, PARAM_CHANGEABLE);
+  add_param("ActivePartFracForNewDomainDecomp", &ActivePartFracForNewDomainDecomp, PARAM_DOUBLE, PARAM_CHANGEABLE);
+  add_param("MaxNumNgbDeviation", &MaxNumNgbDeviation, PARAM_DOUBLE, PARAM_CHANGEABLE);
+
+  add_param("ComovingIntegrationOn", &ComovingIntegrationOn, PARAM_INT, PARAM_FIXED);
+
+  add_param("ICFormat", &ICFormat, PARAM_INT, PARAM_CHANGEABLE);
+  add_param("SnapFormat", &SnapFormat, PARAM_INT, PARAM_CHANGEABLE);
+
+  add_param("NumFilesPerSnapshot", &NumFilesPerSnapshot, PARAM_INT, PARAM_CHANGEABLE);
+  add_param("MaxFilesWithConcurrentIO", &MaxFilesWithConcurrentIO, PARAM_INT, PARAM_CHANGEABLE);
+
+  add_param("TypeOfOpeningCriterion", &TypeOfOpeningCriterion, PARAM_INT, PARAM_FIXED);
+
+  add_param("TimeLimitCPU", &TimeLimitCPU, PARAM_DOUBLE, PARAM_CHANGEABLE);
+
+  add_param("InitGasTemp", &InitGasTemp, PARAM_DOUBLE, PARAM_FIXED);
+  add_param("MinEgySpec", &MinEgySpec, PARAM_DOUBLE, PARAM_CHANGEABLE);
+
+  for(int i = 0; i < NSOFTCLASSES; i++)
+    {
+      char buf_l[100];
+      sprintf(buf_l, "SofteningComovingClass%d", i);
+      add_param(buf_l, &SofteningComoving[i], PARAM_DOUBLE, PARAM_FIXED);
+    }
+
+  for(int i = 0; i < NSOFTCLASSES; i++)
+    {
+      char buf_l[100];
+      sprintf(buf_l, "SofteningMaxPhysClass%d", i);
+      add_param(buf_l, &SofteningMaxPhys[i], PARAM_DOUBLE, PARAM_FIXED);
+    }
+
+  for(int i = 0; i < NTYPES; i++)
+    {
+      char buf_l[100];
+      sprintf(buf_l, "SofteningClassOfPartType%d", i);
+      add_param(buf_l, &SofteningClassOfPartType[i], PARAM_INT, PARAM_FIXED);
+    }
+
+#if defined(TREEPM_NOTIMESPLIT) || defined(PLACEHIGHRESREGION)
+  add_param("ActivePartFracForPMinsteadOfEwald", &ActivePartFracForPMinsteadOfEwald, PARAM_DOUBLE, PARAM_CHANGEABLE);
+#endif
+
+#ifdef ADAPTIVE_HYDRO_SOFTENING
+  add_param("MinimumComovingHydroSoftening", &MinimumComovingHydroSoftening, PARAM_DOUBLE, PARAM_FIXED);
+  add_param("AdaptiveHydroSofteningSpacing", &AdaptiveHydroSofteningSpacing, PARAM_DOUBLE, PARAM_FIXED);
+  add_param("GasSoftFactor", &GasSoftFactor, PARAM_DOUBLE, PARAM_FIXED);
+#endif
+
+#ifdef TIMEDEP_ART_VISC
+  add_param("ViscosityAlphaMin", &AlphaMin, PARAM_DOUBLE, PARAM_CHANGEABLE);
+#endif
+
+#ifdef SUBFIND
+  add_param("DesLinkNgb", &DesLinkNgb, PARAM_INT, PARAM_CHANGEABLE);
+#endif
+
+#ifdef LIGHTCONE_PARTICLES
+  add_param("LightConeDefinitionFile", LightConeDefinitionFile, PARAM_STRING, PARAM_CHANGEABLE);
+#endif
+
+#ifdef LIGHTCONE_MASSMAPS
+  add_param("LightConeMassMapsNside", &LightConeMassMapsNside, PARAM_INT, PARAM_FIXED);
+  add_param("LightConeMassMapThickness", &LightConeMassMapThickness, PARAM_DOUBLE, PARAM_CHANGEABLE);
+  add_param("LightConeMassMapMaxRedshift", &LightConeMassMapMaxRedshift, PARAM_DOUBLE, PARAM_CHANGEABLE);
+#endif
+
+#ifdef REDUCE_FLUSH
+  add_param("FlushCpuTimeDiff", &FlushCpuTimeDiff, PARAM_DOUBLE, PARAM_CHANGEABLE);
+#endif
+
+#ifdef COOLING
+  add_param("TreecoolFile", TreecoolFile, PARAM_STRING, PARAM_CHANGEABLE);
+#endif
+
+#ifdef STARFORMATION
+  add_param("CritOverDensity", &CritOverDensity, PARAM_DOUBLE, PARAM_FIXED);
+  add_param("CritPhysDensity", &CritPhysDensity, PARAM_DOUBLE, PARAM_FIXED);
+  add_param("FactorSN", &FactorSN, PARAM_DOUBLE, PARAM_FIXED);
+  add_param("FactorEVP", &FactorEVP, PARAM_DOUBLE, PARAM_FIXED);
+  add_param("TempSupernova", &TempSupernova, PARAM_DOUBLE, PARAM_FIXED);
+  add_param("TempClouds", &TempClouds, PARAM_DOUBLE, PARAM_FIXED);
+  add_param("MaxSfrTimescale", &MaxSfrTimescale, PARAM_DOUBLE, PARAM_FIXED);
+#endif
+
+#ifdef NGENIC
+  add_param("NSample", &NSample, PARAM_INT, PARAM_FIXED);
+  add_param("SphereMode", &SphereMode, PARAM_INT, PARAM_FIXED);
+  add_param("PowerSpectrumType", &PowerSpectrumType, PARAM_INT, PARAM_FIXED);
+  add_param("ReNormalizeInputSpectrum", &ReNormalizeInputSpectrum, PARAM_INT, PARAM_FIXED);
+  add_param("PrimordialIndex", &PrimordialIndex, PARAM_DOUBLE, PARAM_FIXED);
+  add_param("ShapeGamma", &ShapeGamma, PARAM_DOUBLE, PARAM_FIXED);
+  add_param("Sigma8", &Sigma8, PARAM_DOUBLE, PARAM_FIXED);
+  add_param("PowerSpectrumFile", PowerSpectrumFile, PARAM_STRING, PARAM_FIXED);
+  add_param("InputSpectrum_UnitLength_in_cm", &InputSpectrum_UnitLength_in_cm, PARAM_DOUBLE, PARAM_FIXED);
+  add_param("Seed", &NgenicSeed, PARAM_INT, PARAM_FIXED);
+#endif
+
+#ifdef CREATE_GRID
+  add_param("GridSize", &GridSize, PARAM_INT, PARAM_FIXED);
+#endif
+}
+
+/*! \brief This function reads a table with a list of desired output times.
+ *
+ *  The table does not have to be ordered in any way, but may not contain more than
+ *  MAXLEN_OUTPUTLIST entries.
+ *
+ *  \param fname The file name of the outputlist
+ */
+void global_data_all_processes::read_outputlist(char *fname)
+{
+  if(ThisTask == 0)
+    {
+      FILE *fd;
+
+      if(!(fd = fopen(fname, "r")))
+        {
+          Terminate("can't read output list in file '%s'\n", fname);
+        }
+
+      OutputListLength = 0;
+
+      while(1)
+        {
+          char buf[512];
+          if(fgets(buf, 500, fd) != buf)
+            break;
+
+          int flag;
+          int count = sscanf(buf, " %lg %d ", &OutputListTimes[OutputListLength], &flag);
+
+          if(count == 1)
+            flag = 1;
+
+          if(count == 1 || count == 2)
+            {
+              if(OutputListLength >= MAXLEN_OUTPUTLIST)
+                Terminate("\ntoo many entries in output-list. You should increase MAXLEN_OUTPUTLIST=%d.\n", MAXLEN_OUTPUTLIST);
+
+              OutputListFlag[OutputListLength] = flag;
+              OutputListLength++;
+            }
+        }
+
+      fclose(fd);
+
+      mpi_printf("\nfound %d times in output-list.\n", OutputListLength);
+    }
+
+  /* tell all other processes */
+  MPI_Bcast(get_data_ptr(), get_data_size(), MPI_BYTE, 0, Communicator);
+}
+
+void global_data_all_processes::some_parameter_checks(void)
+{
+  if(MaxFilesWithConcurrentIO > NTask)
+    {
+      mpi_printf("NOTICE: MaxFilesWithConcurrentIO has been reduced to the number of processors\n");
+      MaxFilesWithConcurrentIO = NTask;
+    }
+
+  if(MaxFilesWithConcurrentIO == 0)
+    {
+      mpi_printf("NOTICE: MaxFilesWithConcurrentIO has been set to be equal to the number of processors\n");
+      MaxFilesWithConcurrentIO = NTask;
+    }
+
+  if(SnapFormat < 1 || SnapFormat > 3)
+    Terminate("Unsupported File-Format: SnapFormat = %d\n", SnapFormat);
+
+  if(NTask < NumFilesPerSnapshot)
+    {
+      mpi_printf("WARNING: Number of processors less than 'NumFilesPerSnapshot=%d' - reducing this to NumFilesPerSnapshot=%d\n",
+                 NumFilesPerSnapshot, NTask);
+      NumFilesPerSnapshot = NTask;
+    }
+
+  for(int i = 0; i < NTYPES; i++)
+    {
+      if(SofteningClassOfPartType[i] >= NSOFTCLASSES || SofteningClassOfPartType[i] < 0)
+        Terminate("SofteningClassOfPartType%d  invalid (NSOFTCLASSES=%d)\n", i, NSOFTCLASSES);
+    }
+}
diff --git a/src/data/allvars.h b/src/data/allvars.h
new file mode 100644
index 0000000000000000000000000000000000000000..435be921ebcc574ac3cf1e621fd87aae3e16d254
--- /dev/null
+++ b/src/data/allvars.h
@@ -0,0 +1,363 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+/*! \file allvars.h
+ *  \brief declares a structure for global parameters and variables.
+ *
+ *  This file declares a structure for holding global parameters and variables and objects.
+ *  Further variables should be added here. The actual instance of this object is provided
+ *  by the file 'allvars.cc'.
+ */
+
+#ifndef ALLVARS_H
+#define ALLVARS_H
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+
+#include "../data/constants.h"
+#include "../data/dtypes.h"
+#include "../data/macros.h"
+#include "../io/parameters.h"
+
+/** Data which is the SAME for all tasks (mostly code parameters read
+ * from the parameter file).  Holding this data in a structure is
+ * convenient for writing/reading the restart file, and it allows the
+ * introduction of new global variables in a simple way. The only
+ * thing to do is to introduce them into this structure.
+ */
+struct global_data_all_processes : public parameters
+{
+#if defined(COOLING)
+  char TreecoolFile[255];
+#endif
+
+#ifdef INDIVIDUAL_GRAVITY_SOFTENING
+  double AvgType1Mass;
+#endif
+
+  double TopNodeFactor;
+
+  int ICFormat; /**< selects different versions of IC file-format */
+
+  int SnapFormat; /**< selects different versions of snapshot file-formats */
+
+  int NumFilesPerSnapshot;      /**< number of files in multi-file snapshot dumps */
+  int MaxFilesWithConcurrentIO; /**< maximum number of files that may be written simultaneously when
+                                      writing/reading restart-files, or when writing snapshot files */
+
+  double TreeAllocFactor; /**< Each processor allocates a number of nodes which is TreeAllocFactor times
+                               the maximum(!) number of particles.  Note: A typical local tree for N
+                               particles needs usually about ~0.65*N nodes. */
+
+  double TopNodeAllocFactor; /**< Each processor allocates a number of nodes which is TopNodeAllocFactor times
+                                  the maximum(!) number of particles.  Note: A typical local tree for N
+                                  particles needs usually about ~0.65*N nodes. */
+
+  double NgbTreeAllocFactor; /**< Each processor allocates a number of nodes for the neighbor search which is NgbTreeAllocFactor times
+                                   the maximum(!) number of gas particles.  Note: A typical local tree for N
+                                   particles needs usually about ~0.65*N nodes. */
+
+  double ForeignNodeAllocFactor;
+
+  double ForeignPointAllocFactor;
+
+  enum restart_options RestartFlag; /**< taken from command line used to start code. 0 is normal start-up from
+                                         initial conditions, 1 is resuming a run from a set of restart files, while 2
+                                         marks a restart from a snapshot file. */
+
+  int MaxMemSize; /**< size of maximum memory consumption in MB */
+
+  /* some SPH parameters */
+
+  int DesNumNgb; /**< Desired number of SPH neighbours */
+
+#ifdef LEAN
+  MyDouble PartMass;
+#endif
+
+#ifdef TIMEDEP_ART_VISC
+  double AlphaMin; /*!< Minimum of allowed viscosity parameter */
+#endif
+
+#ifdef SUBFIND
+  int DesLinkNgb;
+#endif
+
+  long long GlobalNSynchronizedHydro;
+  long long GlobalNSynchronizedGravity;
+
+  double MaxNumNgbDeviation; /**< Maximum allowed deviation neighbour number */
+
+  double ArtBulkViscConst; /*!< Sets the parameter \f$\alpha\f$ of the artificial viscosity */
+  double InitGasTemp;      /**< may be used to set the temperature in the IC's */
+  double InitGasU;         /**< the same, but converted to thermal energy per unit mass */
+  double MinEgySpec;       /**< the minimum allowed temperature expressed as energy per unit mass */
+
+  int FlagICsContainedEntropy;
+
+  /* some force counters  */
+
+  long long TotNumOfForces;     /**< counts total number of force computations  */
+  long long TotNumDirectForces; /**< counts total number of direct force computations  */
+  long long TotNumDensity;      /**< counts total number of SPH density calculations  */
+  long long TotNumHydro;        /**< counts total number of SPH hydro force calculations  */
+
+  /* various cosmological factors that are only a function of the current scale factor, and in non-comoving runs are set to 1 */
+  double cf_atime, cf_atime2, cf_ainv, cf_a2inv, cf_a3inv, cf_afac1, cf_afac2, cf_afac3, cf_hubble_a, cf_atime_hubble_a,
+      cf_atime2_hubble_a, cf_redshift;
+  /* Hubble rate at the current time, valid both for comoving and non-comoving integration */
+  double cf_H;
+
+  double accel_normalize_fac; /* used in I/O to normalize accelerations if reduced precision storage is used */
+
+  /* system of units  */
+  double UnitTime_in_s;            /**< factor to convert internal time unit to seconds/h */
+  double UnitMass_in_g;            /**< factor to convert internal mass unit to grams/h */
+  double UnitVelocity_in_cm_per_s; /**< factor to convert internal velocity unit to cm/sec */
+  double UnitLength_in_cm;         /**< factor to convert internal length unit to cm/h */
+  double UnitPressure_in_cgs;      /**< factor to convert internal pressure unit to cgs units (little 'h' still
+                                   around!) */
+  double UnitDensity_in_cgs;       /**< factor to convert internal density unit to g/cm^3*h^2 */
+  double UnitCoolingRate_in_cgs;   /**< factor to convert internal cooling rate to cgs units */
+  double UnitEnergy_in_cgs;        /**< factor to convert internal energy to cgs units */
+  double UnitTime_in_Megayears;    /**< factor to convert internal time to megayears/h */
+  double UnitTime_in_years;        /**< factor to convert internal time to years/h */
+  double GravityConstantInternal;  /**< If set to zero in the parameterfile, the internal value of the
+                                   gravitational constant is set to the Newtonian value based on the system of
+                                   units specified. Otherwise the value provided is taken as internal gravity
+                                   constant G. */
+  double G;                        /**< Gravity-constant in internal units */
+
+  /* Cosmology */
+
+  double Hubble;      /**< Hubble-constant in internal units */
+  double Omega0;      /**< matter density in units of the critical density (at z=0) */
+  double OmegaLambda; /**< vacuum energy density relative to critical density (at z=0) */
+  double OmegaBaryon; /**< baryon density in units of the critical density (at z=0) */
+  double HubbleParam; /**< little `h', i.e. can be used to scale unit system to absorb uncertain value of Hubble constant.  Only needed
+                       * to get absolute physical values for cooling physics
+                       */
+
+  double BoxSize; /**< Boxsize in case periodic boundary conditions are used */
+
+  /* Code options */
+
+  int ComovingIntegrationOn;  /**< flags that comoving integration is enabled */
+  int TypeOfOpeningCriterion; /**< determines tree cell-opening criterion: 0 for Barnes-Hut, 1 for relative
+                                   criterion */
+  int OutputListOn;           /**< flags that output times are listed in a specified file */
+
+  int LowestActiveTimeBin;
+  int HighestActiveTimeBin;
+  int LowestOccupiedTimeBin;
+  int HighestOccupiedTimeBin;
+  int LowestOccupiedGravTimeBin;
+  int HighestOccupiedGravTimeBin;
+  int HighestSynchronizedTimeBin;
+  int SmallestTimeBinWithDomainDecomposition;
+  double ActivePartFracForNewDomainDecomp;
+  /* parameters determining output frequency */
+
+#if defined(TREEPM_NOTIMESPLIT) || defined(PLACEHIGHRESREGION)
+  double ActivePartFracForPMinsteadOfEwald;
+#endif
+
+  int SnapshotFileCount;        /**< number of snapshot that is written next */
+  double TimeBetSnapshot;       /**< simulation time interval between snapshot files */
+  double TimeOfFirstSnapshot;   /**< simulation time of first snapshot files */
+  double CpuTimeBetRestartFile; /**< cpu-time between regularly generated restart files */
+  double TimeLastRestartFile;   /**< cpu-time when last restart-file was written */
+  double TimeBetStatistics;     /**< simulation time interval between computations of energy statistics */
+  double TimeLastStatistics;    /**< simulation time when the energy statistics was computed the last time */
+  int NumCurrentTiStep;         /**< counts the number of system steps taken up to this point */
+
+  /* Current time of the simulation, global step, and end of simulation */
+
+  double Time;      /**< current time of the simulation */
+  double TimeBegin; /**< time of initial conditions of the simulation */
+  double TimeStep;  /**< difference between current times of previous and current timestep */
+  double TimeMax;   /**< marks the point of time until the simulation is to be evolved */
+  double TimeOld;   /**< time of previous synchronization point, needed only for logging purposes */
+
+  /* variables for organizing discrete timeline */
+
+  double Timebase_interval;  /**< factor to convert from floating point time interval to integer timeline */
+  integertime Ti_Current;    /**< current time on integer timeline */
+  integertime Ti_nextoutput; /**< next output time on integer timeline */
+  integertime Ti_lastoutput;
+
+#if defined(PMGRID) && !defined(TREEPM_NOTIMESPLIT)
+  integertime PM_Ti_endstep, PM_Ti_begstep;
+#endif
+
+#if defined(EVALPOTENTIAL) && defined(PMGRID) && defined(PERIODIC)
+  double TotalMass;
+#endif
+
+  inline double get_absolutetime_from_integertime(integertime ti)
+  {
+    if(ComovingIntegrationOn)
+      return TimeBegin * exp(ti * Timebase_interval);
+    else
+      return TimeBegin + ti * Timebase_interval;
+  }
+
+  char DumpFlag_nextoutput;
+
+  integertime Ti_begstep[TIMEBINS]; /**< marks start of current step of each timebin on integer timeline */
+
+#ifdef FORCE_EQUAL_TIMESTEPS
+  integertime GlobalTimeStep;
+#endif
+
+  /* variables that keep track of CPU consumption */
+
+  double TimeLimitCPU;
+
+  double CPUForLastPMExecution;
+
+  /* tree code opening criterion */
+
+  double ErrTolTheta;    /**< BH tree opening angle */
+  double ErrTolThetaMax; /**< maximum BH tree opening angle when relative criterion is in use */
+  double ErrTolForceAcc; /**< parameter for relative opening criterion in tree walk */
+
+  char RelOpeningCriterionInUse; /**< flags that an old acceleration is now known, so that the relative opening criterion can be used */
+
+  /* adjusts accuracy of time-integration */
+
+  double ErrTolIntAccuracy; /**< accuracy tolerance parameter \f$ \eta \f$ for timestep criterion. The
+                                 timestep is \f$ \Delta t = \sqrt{\frac{2 \eta \epsilon}{a}} \f$ */
+
+  double MinSizeTimestep; /**< minimum allowed timestep. Normally, the simulation terminates if the
+                               timestep determined by the timestep criteria falls below this limit. */
+  double MaxSizeTimestep; /**< maximum allowed timestep */
+
+  double CourantFac; /**< SPH-Courant factor */
+
+  int CPU_TimeBinCountMeasurements[TIMEBINS];
+  double CPU_TimeBinMeasurements[TIMEBINS][NUMBER_OF_MEASUREMENTS_TO_RECORD];
+
+  /* gravitational and hydrodynamical softening lengths (given in terms of an `equivalent' Plummer softening
+   * length)
+   *
+   * five groups of particles are supported 0=gas,1=halo,2=disk,3=bulge,4=stars
+   */
+
+  int SofteningClassOfPartType[NTYPES];
+
+  double SofteningComoving[NSOFTCLASSES]; /**< comoving gravitational softening lengths for each softening type */
+  double SofteningMaxPhys[NSOFTCLASSES];  /**< maximum physical gravitational softening lengths for each softening type */
+
+  double SofteningTable[NSOFTCLASSES +
+                        NSOFTCLASSES_HYDRO]; /**< current (comoving) gravitational softening lengths for each softening type */
+  double ForceSoftening[NSOFTCLASSES + NSOFTCLASSES_HYDRO +
+                        2]; /**< the same, but multiplied by a factor 2.8 - at that scale the force is Newtonian */
+
+#ifdef ADAPTIVE_HYDRO_SOFTENING
+  double MinimumComovingHydroSoftening;
+  double AdaptiveHydroSofteningSpacing;
+  double GasSoftFactor;
+#endif
+
+  /** If particle masses are all equal for one type, the corresponding entry in MassTable is set to this
+   *  value, allowing the size of the snapshot files to be reduced.
+   */
+  double MassTable[NTYPES];
+
+  /* some filenames */
+  char InitCondFile[MAXLEN_PATH], OutputDir[MAXLEN_PATH], SnapshotFileBase[MAXLEN_PATH], OutputListFilename[MAXLEN_PATH];
+
+  /** table with desired output times */
+  double OutputListTimes[MAXLEN_OUTPUTLIST];
+  char OutputListFlag[MAXLEN_OUTPUTLIST];
+  int OutputListLength; /**< number of times stored in table of desired output times */
+
+#ifdef SECOND_ORDER_LPT_ICS
+  double LptScalingfactor;
+#endif
+
+#if defined(PMGRID) && !defined(TREEPM_NOTIMESPLIT)
+  double DtDisplacement;
+#endif
+
+#ifdef LIGHTCONE
+
+#ifdef LIGHTCONE_PARTICLES
+  char LightConeDefinitionFile[MAXLEN_PATH];
+  int LightconeFileCount;
+#endif
+
+#ifdef LIGHTCONE_MASSMAPS
+  int LightConeMassMapsNside;
+  double LightConeMassMapThickness;
+  double LightConeMassMapMaxRedshift;
+  int CurrentMassMapBoundary;
+#endif
+
+  int LightConeImageConeNr;
+  double LightConeImageLengthX;
+  double LightConeImageLengthY;
+  double LightConeImageCornerX;
+  double LightConeImageCornerY;
+  int LightConeImagePixelsX;
+  int LightConeImagePixelsY;
+  char LightConeImagePicName[MAXLEN_PATH];
+  int LightConeImageFirstConeDir;
+  int LightConeImageLastConeDir;
+#endif
+
+#ifdef STARFORMATION /* star formation and feedback sector */
+  double CritOverDensity;
+  double CritPhysDensity;
+  double OverDensThresh;
+  double PhysDensThresh;
+  double EgySpecSN;
+  double EgySpecCold;
+  double FactorEVP;
+  double TempSupernova;
+  double TempClouds;
+  double MaxSfrTimescale;
+  double FactorSN;
+  MyIDType MaxID;
+#endif
+
+#ifdef REDUCE_FLUSH
+  double FlushCpuTimeDiff;
+  double FlushLast;
+#endif
+
+#ifdef NGENIC
+  int NSample;
+  int SphereMode;
+  int PowerSpectrumType;
+  int ReNormalizeInputSpectrum;
+  double PrimordialIndex;
+  double ShapeGamma;
+  double Sigma8;
+  char PowerSpectrumFile[MAXLEN_PATH];
+  double InputSpectrum_UnitLength_in_cm;
+  int NgenicSeed;
+#endif
+
+#ifdef CREATE_GRID
+  int GridSize;
+#endif
+
+  void set_cosmo_factors_for_current_time(void);
+  void register_parameters(void);
+  void read_outputlist(char *fname);
+  void some_parameter_checks(void);
+
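+  /* helpers used to communicate (e.g. via MPI_Bcast in read_outputlist) everything that is stored after
+     the 'parameters' base part of this object in one contiguous block */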
+  inline char *get_data_ptr(void) { return (char *)this + sizeof(parameters); }
+
+  inline size_t get_data_size(void) { return sizeof(global_data_all_processes) - sizeof(parameters); }
+};
+
+extern global_data_all_processes All;
+
+#endif
diff --git a/src/data/constants.h b/src/data/constants.h
new file mode 100644
index 0000000000000000000000000000000000000000..061f3dcf1f399232ce76119d03fa274f732352df
--- /dev/null
+++ b/src/data/constants.h
@@ -0,0 +1,426 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file constants.h
+ *
+ *  \brief declares global constants and checks plausibility of configuration
+ */
+
+#ifndef CONSTANTS_H
+#define CONSTANTS_H
+
+#define GADGET_VERSION "4.0" /* code version string */
+
+#define FILEFORMAT_LEGACY1 1
+#define FILEFORMAT_LEGACY2 2
+#define FILEFORMAT_HDF5 3
+
+#define MODE_LOCAL_PARTICLES 0
+#define MODE_IMPORTED_PARTICLES 1
+#define MODE_DEFAULT 2
+#define MODE_LOCAL_NO_EXPORT 3
+
+#define FIRST_HALF_STEP 0
+#define SECOND_HALF_STEP 1
+
+#define FLAG_OUTSIDE 0
+#define FLAG_INSIDE 1
+#define FLAG_BOUNDARYOVERLAP 2
+
+#define LOW_MESH 0  /* low-res  mesh selector */
+#define HIGH_MESH 1 /* high-res mesh selector */
+
+#define MAX_THREADS 128
+
+#ifndef DIRECT_SUMMATION_THRESHOLD
+#define DIRECT_SUMMATION_THRESHOLD 500
+#endif
+
+#define NUMBER_OF_MEASUREMENTS_TO_RECORD 6
+
+#define MAX_FIRST_ELEMENTS_CONSIDERED \
+  5 /* This sets the number of lowest loaded tasks to be considered for assignment of next domain patch */
+
+#define COMMBUFFERSIZE (32 * 1024LL * 1024LL)
+
+#ifndef MPI_MESSAGE_SIZELIMIT_IN_MB
+#define MPI_MESSAGE_SIZELIMIT_IN_MB 200
+#endif
+
+#define MPI_MESSAGE_SIZELIMIT_IN_BYTES ((MPI_MESSAGE_SIZELIMIT_IN_MB)*1024LL * 1024LL)
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+#define TO_MBYTE_FAC (1.0 / (1024.0 * 1024.0))
+
+#ifndef LIGHTCONE_ALLOC_FAC
+#define LIGHTCONE_ALLOC_FAC 0.1
+#endif
+
+#ifndef LIGHTCONE_MASSMAP_ALLOC_FAC
+#define LIGHTCONE_MASSMAP_ALLOC_FAC 1.0
+#endif
+
+#ifndef LIGHTCONE_MAX_FILLFACTOR
+#define LIGHTCONE_MAX_FILLFACTOR 0.9
+#endif
+
+#ifndef ALLOC_TOLERANCE
+#define ALLOC_TOLERANCE 0.2
+#endif
+
+#define ALLOC_STARBH_ROOM 0.02
+
+#define MAX_FLOAT_NUMBER 1e37
+#define MIN_FLOAT_NUMBER 1e-37
+#define MAX_DOUBLE_NUMBER 1e306
+#define MIN_DOUBLE_NUMBER 1e-306
+#define SMALLNUM 1e-60
+
+#ifdef DOUBLEPRECISION
+#if(DOUBLEPRECISION == 2)
+#define MAX_REAL_NUMBER MAX_FLOAT_NUMBER
+#define MIN_REAL_NUMBER MIN_FLOAT_NUMBER
+#else
+#define MAX_REAL_NUMBER MAX_DOUBLE_NUMBER
+#define MIN_REAL_NUMBER MIN_DOUBLE_NUMBER
+#endif
+#else
+#define MAX_REAL_NUMBER MAX_FLOAT_NUMBER
+#define MIN_REAL_NUMBER MIN_FLOAT_NUMBER
+#endif
+
+#if !defined(GAMMA) && !defined(ISOTHERM_EQS)
+#define GAMMA (5.0 / 3) /**< adiabatic index of simulated gas */
+#endif
+
+#ifdef ISOTHERM_EQS
+#if defined(GAMMA)
+#error "ISOTHERM_EQS overwrites your definition of GAMMA"
+#endif
+#undef GAMMA
+#define GAMMA 1.0
+#endif
+
+#define GAMMA_MINUS1 (GAMMA - 1)
+
+#define HYDROGEN_MASSFRAC 0.76 /**< mass fraction of hydrogen, relevant only for radiative cooling */
+
+#define METAL_YIELD 0.02 /**< effective metal yield for star formation */
+
+/* ... often used physical constants (cgs units; NIST 2010) */
+
+#define GRAVITY 6.6738e-8
+#define SOLAR_MASS 1.989e33
+#define BOLTZMANN 1.38065e-16
+#define CLIGHT 2.99792458e10
+
+#define PARSEC 3.085678e18
+#define PROTONMASS 1.67262178e-24
+#define HUBBLE 3.2407789e-18 /* in h/sec */
+
+#define SEC_PER_MEGAYEAR 3.15576e13
+#define SEC_PER_YEAR 3.15576e7
+
+#ifndef FOF_PRIMARY_LINK_TYPES
+#define FOF_PRIMARY_LINK_TYPES 2
+#endif
+
+#ifndef FOF_SECONDARY_LINK_TYPES
+#define FOF_SECONDARY_LINK_TYPES 0
+#endif
+
+#ifndef FOF_LINKLENGTH
+#define FOF_LINKLENGTH 0.2
+#endif
+
+#ifndef FOF_GROUP_MIN_LEN
+#define FOF_GROUP_MIN_LEN 32
+#endif
+
+#if defined(PMGRID) && !defined(HRPMGRID)
+#define HRPMGRID PMGRID
+#endif
+
+#if defined(SUBFIND) && !defined(SELFGRAVITY)
+#error "Running SUBFIND without SELFGRAVITY enabled does not make sense."
+#endif
+
+#if defined(SUBFIND) && !defined(FOF)
+#error "Running SUBFIND without FOF is not possible."
+#endif
+
+#if defined(TILING) && !defined(RECREATE_UNIQUE_IDS)
+#error "Running with TILING requires RECREATE_UNIQUE_IDS"
+#endif
+
+#if defined(MULTIPOLE_ORDER)
+#if(MULTIPOLE_ORDER < 1) || (MULTIPOLE_ORDER > 5)
+#error "MULTIPOLE_ORDER must be either 1, 2, 3, 4, or 5"
+#endif
+#else
+#define MULTIPOLE_ORDER 1
+#endif
+
+#if defined(FORCETEST) && !defined(EVALPOTENTIAL)
+#error "Running with FORCETEST requires EVALPOTENTIAL."
+#endif
+
+#if defined(DEBUG_ENABLE_FPU_EXCEPTIONS) && !defined(__linux__)
+#warning "DEBUG_ENABLE_FPU_EXCEPTIONS only works under Linux."
+#undef DEBUG_ENABLE_FPU_EXCEPTIONS
+#endif
+
+#if defined(HOST_MEMORY_REPORTING) && !defined(__linux__)
+#warning "HOST_MEMORY_REPORTING only works under Linux."
+#undef HOST_MEMORY_REPORTING
+#endif
+
+#if !defined(HOST_MEMORY_REPORTING) && defined(__linux__)
+#define HOST_MEMORY_REPORTING  // let's switch it always on under Linux
+#endif
+
+#if defined(STARFORMATION) && !defined(COOLING)
+#error "STARFORMATION requires COOLING"
+#endif
+
+#if defined(FORCE_EQUAL_TIMESTEPS) && defined(HIERARCHICAL_GRAVITY)
+#error "FORCE_EQUAL_TIMESTEPS cannot be used together with HIERARCHICAL_GRAVITY"
+#endif
+
+#if defined(EXTRAPOTTERM) && !defined(EVALPOTENTIAL)
+#error "EXTRAPOTTERM makes only sense for EVALPOTENTIAL"
+#endif
+
+#if defined(MERGERTREE) && !defined(SUBFIND)
+#error "MERGERTREE requires SUBFIND."
+#endif
+
+#if defined(POWERSPEC_ON_OUTPUT) && !(defined(PERIODIC) && defined(PMGRID))
+#error "The option POWERSPEC_ON_OUTPUT requires PMGRID and PERIODIC."
+#endif
+
+#if defined(CREATE_GRID) && !defined(NGENIC)
+#error "CREATE_GRID only makes sense with NGENIC"
+#endif
+
+#if defined(OUTPUT_COORDINATES_AS_INTEGERS) && !defined(PERIODIC)
+#error "The OUTPUT_COORDINATES_AS_INTEGERS option is only allowed when PERIODIC is on"
+#endif
+
+#if defined(ALLOW_DIRECT_SUMMATION) && !defined(HIERARCHICAL_GRAVITY)
+#error "The option ALLOW_DIRECT_SUMMATION is only availble when HIERARCHICAL_GRAVITY is used"
+#endif
+
+#if defined(PMGRID) && !defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+#error "If PMGRID is used without PERIODIC, TREEPM_NOTIMESPLIT needs to be activated"
+#endif
+
+#if defined(PLACEHIGHRESREGION) && !defined(RANDOMIZE_DOMAINCENTER)
+#error "PLACEHIGHRESREGION requires RANDOMIZE_DOMAINCENTER."
+#endif
+
+#if defined(PLACEHIGHRESREGION) && !defined(PMGRID)
+#error "PLACEHIGHRESREGION requires PMGRID."
+#endif
+
+#if defined(TREEPM_NOTIMESPLIT) && !defined(PMGRID)
+#error "The option TREEPM_NOTIMESPLIT requires PMGRID."
+#endif
+
+#if defined(HRPMGRID) && !defined(PMGRID)
+#error "It doesn't make sense to set HRPMGRID without having PMGRID."
+#endif
+
+#if defined(POSITIONS_IN_64BIT) && defined(POSITIONS_IN_128BIT)
+#error "The options POSITIONS_IN_64BIT and POSITIONS_IN_128BIT should not be activated together."
+#endif
+
+#if !defined(EVALPOTENTIAL) && defined(FORCETEST)
+#error "When you enable FORCETEST you should also switch on EVALPOTENTIAL"
+#endif
+
+#if(defined(LONG_X_BITS) || defined(LONG_Y_BITS) || defined(LONG_Z_BITS)) && !defined(PERIODIC)
+#error "LONG_X/Y/Z_BITS requires the PERIODIC option"
+#endif
+
+#if defined(LIGHTCONE_PARTICLES) && !defined(LIGHTCONE)
+#error "The option LIGHTCONE_PARTICLES requires LIGHTCONE"
+#endif
+
+#if defined(LIGHTCONE_MASSMAPS) && !defined(LIGHTCONE)
+#error "The option LIGHTCONE_MASSMAPS requires LIGHTCONE"
+#endif
+
+#if defined(LIGHTCONE) && !defined(LIGHTCONE_PARTICLES) && !defined(LIGHTCONE_MASSMAPS)
+#error "The option LIGHTCONE requires selection of at least one of LIGHTCONE_PARTICLES or LIGHTCONE_MASSMAPS"
+#endif
+
+#if defined(SUBFIND_HBT) && !defined(MERGERTREE)
+#error "The option SUBFIND_HBT requires MERGERTREE"
+#endif
+
+#if defined(LIGHTCONE_PARTICLES) && defined(LIGHTCONE_OUTPUT_ACCELERATIONS) && defined(PMGRID) && \
+    (!defined(TREEPM_NOTIMESPLIT) || defined(HIERARCHICAL_GRAVITY))
+#error "LIGHTCONE_OUTPUT_ACCELERATIONS only works with PMGRID if TREEPM_NOTIMESPLIT is used and HIERARCHICAL_GRAVITY is not used"
+#endif
+
+#ifndef ASMTH
+/** ASMTH gives the scale of the short-range/long-range force split in units of FFT-mesh cells */
+#define ASMTH 1.25
+#endif
+#ifndef RCUT
+/** RCUT gives the maximum distance (in units of the scale used for the force split) out to which short-range
+ * forces are evaluated in the short-range tree walk.
+ */
+#define RCUT 7.0
+#endif
+
+#ifndef MAXLEN_OUTPUTLIST
+#define MAXLEN_OUTPUTLIST 1100 /**< maximum number of entries in output list */
+#endif
+
+#define MAXLEN_PATH 512        /**< maximum length of various filenames (full path) */
+#define MAXLEN_PATH_EXTRA 2048 /**< maximum length of various filenames, plus extra space */
+
+#define BASENUMBER 100
+
+#define MAXITER 10000
+
+#ifndef NTYPES
+#define NTYPES 6
+#endif
+
+#ifndef NSOFTCLASSES
+#define NSOFTCLASSES NTYPES
+#endif
+
+#ifdef ADAPTIVE_HYDRO_SOFTENING
+#ifndef NSOFTCLASSES_HYDRO
+#define NSOFTCLASSES_HYDRO 64
+#endif
+#else
+#undef NSOFTCLASSES_HYDRO
+#define NSOFTCLASSES_HYDRO 0
+#endif
+
+#ifdef ENLARGE_DYNAMIC_RANGE_IN_TIME
+typedef long long integertime;
+#define TIMEBINS 60
+#define TIMEBASE                                                                                           \
+  (((long long)1) << TIMEBINS) /* The simulated timespan is mapped onto the integer interval [0,TIMEBASE], \
+                                *  where TIMEBASE needs to be a power of 2. */
+#else
+typedef int integertime;
+#define TIMEBINS 29
+#define TIMEBASE (1 << TIMEBINS)
+#endif
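+
+/* a particle on time bin n is advanced with an integer step of 2^n on this timeline; the code treats
+   bin 0 as a zero timestep (see e.g. the dt computation in sfr_create_star_particles) */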
+
+#if(NSOFTCLASSES + NSOFTCLASSES_HYDRO) >= 128
+#error "(NSOFTCLASSES + NSOFTCLASSES_HYDRO) must be smaller than 128"
+#endif
+
+#if NSOFTCLASSES < 1
+#error "NSOFTCLASSES must be at least 1"
+#endif
+
+#ifdef GADGET2_HEADER
+#if NTYPES > 6
+#error "NTYPES may not be larger than 6 if GADGET2_HEADER is set"
+#endif
+#endif
+
+#ifndef STAR_TYPE
+#define STAR_TYPE 4
+#endif
+
+#if defined(STARFORMATION) && (STAR_TYPE >= NTYPES)
+#error "STAR_TYPE must be an available type according to the set NTYPES"
+#endif
+
+#ifdef ONEDIMS
+#define NUMDIMS 1
+#define KERNEL_COEFF_1 (4.0 / 3)
+#define KERNEL_COEFF_2 (8.0)
+#define KERNEL_COEFF_3 (24.0)
+#define KERNEL_COEFF_4 (16.0)
+#define KERNEL_COEFF_5 (8.0 / 3)
+#define KERNEL_COEFF_6 (-8.0)
+#define NORM_COEFF 2.0
+#else
+#ifndef TWODIMS
+#define NUMDIMS 3                     /**< For 3D-normalized kernel */
+#define KERNEL_COEFF_1 2.546479089470 /**< Coefficients for SPH spline kernel and its derivative */
+#define KERNEL_COEFF_2 15.278874536822
+#define KERNEL_COEFF_3 45.836623610466
+#define KERNEL_COEFF_4 30.557749073644
+#define KERNEL_COEFF_5 5.092958178941
+#define KERNEL_COEFF_6 (-15.278874536822)
+#define NORM_COEFF 4.188790204786 /**< Coefficient for kernel normalization. Note:  4.0/3 * PI = 4.188790204786 */
+#else
+#define NUMDIMS 2                                 /**< For 2D-normalized kernel */
+#define KERNEL_COEFF_1 (5.0 / 7 * 2.546479089470) /**< Coefficients for SPH spline kernel and its derivative */
+#define KERNEL_COEFF_2 (5.0 / 7 * 15.278874536822)
+#define KERNEL_COEFF_3 (5.0 / 7 * 45.836623610466)
+#define KERNEL_COEFF_4 (5.0 / 7 * 30.557749073644)
+#define KERNEL_COEFF_5 (5.0 / 7 * 5.092958178941)
+#define KERNEL_COEFF_6 (5.0 / 7 * (-15.278874536822))
+#define NORM_COEFF M_PI /**< Coefficient for kernel normalization. */
+#endif
+#endif /* ONEDIMS */
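+
+/* for reference: the 3D values above are 8/pi, 48/pi, 144/pi, 96/pi, 16/pi and -48/pi, the coefficients of
+   the standard cubic spline kernel (with NORM_COEFF = 4*pi/3); the 2D variants carry the extra factor 5/7 */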
+
+#define SOFTFAC1 (32.0 / 3) /**< Coefficients for gravitational softening */
+#define SOFTFAC2 32.0
+#define SOFTFAC3 (-38.4)
+#define SOFTFAC4 (-2.8)
+#define SOFTFAC5 (16.0 / 3)
+#define SOFTFAC6 6.4
+#define SOFTFAC7 (-9.6)
+#define SOFTFAC8 (64.0 / 3)
+#define SOFTFAC9 (-48.0)
+#define SOFTFAC10 38.4
+#define SOFTFAC11 (-32.0 / 3)
+#define SOFTFAC12 (-1.0 / 15)
+#define SOFTFAC13 (-3.2)
+#define SOFTFAC14 (1.0 / 15)
+#define SOFTFAC15 (-16.0)
+#define SOFTFAC16 9.6
+#define SOFTFAC17 (-64.0 / 30)
+#define SOFTFAC18 128.0
+#define SOFTFAC19 (-115.2)
+#define SOFTFAC20 (64.0 / 3)
+#define SOFTFAC21 (-96.0)
+#define SOFTFAC22 115.2
+#define SOFTFAC23 (-128.0 / 3)
+#define SOFTFAC24 (4.0 / 30)
+
+#define SOFTFAC30 (32.0 / 3)
+#define SOFTFAC31 (-576.0 / 5)
+#define SOFTFAC32 (128.0)
+#define SOFTFAC33 (-1152.0 / 5)
+#define SOFTFAC34 (384.0)
+#define SOFTFAC35 (2.0 * 384.0)
+
+#define SOFTFAC40 (64.0 / 3)
+#define SOFTFAC41 (2.0 / 15)
+#define SOFTFAC42 (-96.0)
+#define SOFTFAC43 (576.0 / 5)
+#define SOFTFAC44 (-128.0 / 3)
+#define SOFTFAC45 (-96.0)
+#define SOFTFAC46 (-2.0 / 5)
+#define SOFTFAC47 (1152.0 / 5)
+#define SOFTFAC48 (-128.0)
+#define SOFTFAC49 (8.0 / 5)
+#define SOFTFAC50 (-256.0)
+#define SOFTFAC51 (-8.0)
+
+#define SQRT_PI 1.772453850906           /* sqrt(M_PI) */
+#define FACT1 0.366025403785             /* FACT1 = 0.5 * (sqrt(3)-1) */
+#define FACTSQRT3HALF 0.866025403785     /* sqrt(3)/2 */
+#define FACTSQRT3 (2.0 * 0.866025403785) /* sqrt(3) */
+#endif
diff --git a/src/data/dtypes.h b/src/data/dtypes.h
new file mode 100644
index 0000000000000000000000000000000000000000..fbae1e2311aab88285ddcb01a511141f6c6124ab
--- /dev/null
+++ b/src/data/dtypes.h
@@ -0,0 +1,429 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file dtypes.h
+ *
+ *  \brief defines some custom data types used by the code
+ */
+
+#ifndef DTYPES_H
+#define DTYPES_H
+
+#include <stdint.h>
+#include <cstddef>
+#ifdef EXPLICIT_VECTORIZATION
+#include "../vectorclass/vectorclass.h"
+#endif
+
+#if !defined(POSITIONS_IN_32BIT) && !defined(POSITIONS_IN_64BIT) && !defined(POSITIONS_IN_128BIT)
+/* nothing has been chosen as part of the configuration, so use a default value */
+#ifndef DOUBLEPRECISION
+#define POSITIONS_IN_32BIT
+#else
+#define POSITIONS_IN_64BIT
+#endif
+#endif
+
+/* Exactly one of the symbols POSITIONS_IN_32BIT, POSITIONS_IN_64BIT or POSITIONS_IN_128BIT needs to be defined; otherwise
+ * a compile-time error is desirable.
+ */
+
+#ifdef POSITIONS_IN_32BIT
+typedef uint32_t MyIntPosType;
+typedef int32_t MySignedIntPosType;
+#define BITS_FOR_POSITIONS 32
+#ifdef EXPLICIT_VECTORIZATION
+typedef Vec4ui Vec4MyIntPosType;
+typedef Vec4i Vec4MySignedIntPosType;
+#endif
+#endif
+
+#ifdef POSITIONS_IN_64BIT
+typedef uint64_t MyIntPosType;
+typedef int64_t MySignedIntPosType;
+#define BITS_FOR_POSITIONS 64
+#ifdef EXPLICIT_VECTORIZATION
+typedef Vec4uq Vec4MyIntPosType;
+typedef Vec4q Vec4MySignedIntPosType;
+#endif
+#endif
+
+#ifdef POSITIONS_IN_128BIT
+typedef uint128_t MyIntPosType;
+typedef int128_t MySignedIntPosType;
+#define BITS_FOR_POSITIONS 128
+#ifdef EXPLICIT_VECTORIZATION
+#error "EXPLICIT_VECTORIZATION and POSITIONS_IN_128BIT do not work together"
+#endif
+#endif
+
+#if !defined(IDS_32BIT) && !defined(IDS_48BIT) && !defined(IDS_64BIT)
+#define IDS_32BIT
+#endif
+
+#ifdef IDS_32BIT
+typedef unsigned int MyIDType;
+#else
+typedef unsigned long long MyIDType;
+#endif
+
+#ifdef FOF_ALLOW_HUGE_GROUPLENGTH
+typedef long long MyLenType;
+#else
+typedef int MyLenType;
+#endif
+
+#ifdef USE_SINGLEPRECISION_INTERNALLY
+typedef float MyReal;
+#else
+typedef double MyReal;
+#endif
+
+#ifndef DOUBLEPRECISION /* default is single-precision */
+typedef float MyFloat;
+typedef float MyDouble;
+typedef float MyNgbTreeFloat;
+#define MPI_MYFLOAT MPI_FLOAT
+#define MPI_MYDOUBLE MPI_FLOAT
+#define H5T_NATIVE_MYFLOAT H5T_NATIVE_FLOAT
+#define H5T_NATIVE_MYDOUBLE H5T_NATIVE_FLOAT
+#else
+#if(DOUBLEPRECISION == 2) /* mixed precision */
+typedef float MyFloat;
+typedef double MyDouble;
+typedef float MyNgbTreeFloat;
+#define MPI_MYFLOAT MPI_FLOAT
+#define MPI_MYDOUBLE MPI_DOUBLE
+#define H5T_NATIVE_MYFLOAT H5T_NATIVE_FLOAT
+#define H5T_NATIVE_MYDOUBLE H5T_NATIVE_DOUBLE
+#else /* everything double-precision */
+typedef double MyFloat;
+typedef double MyDouble;
+typedef double MyNgbTreeFloat;
+#define MPI_MYFLOAT MPI_DOUBLE
+#define MPI_MYDOUBLE MPI_DOUBLE
+#define H5T_NATIVE_MYFLOAT H5T_NATIVE_DOUBLE
+#define H5T_NATIVE_MYDOUBLE H5T_NATIVE_DOUBLE
+#endif
+#endif
+
+#ifdef ENLARGE_DYNAMIC_RANGE_IN_TIME
+typedef long long integertime;
+#define TIMEBINS 60
+#define TIMEBASE                                                                                           \
+  (((long long)1) << TIMEBINS) /* The simulated timespan is mapped onto the integer interval [0,TIMEBASE], \
+                                *  where TIMEBASE needs to be a power of 2. */
+#else
+typedef int integertime;
+#define TIMEBINS 29
+#define TIMEBASE (1 << TIMEBINS)
+#endif
+
+#ifndef NUMBER_OF_MPI_LISTENERS_PER_NODE
+#define NUMBER_OF_MPI_LISTENERS_PER_NODE 1
+#endif
+
+#ifndef MAX_NUMBER_OF_RANKS_WITH_SHARED_MEMORY
+#define MAX_NUMBER_OF_RANKS_WITH_SHARED_MEMORY 64
+#endif
+
+#if MAX_NUMBER_OF_RANKS_WITH_SHARED_MEMORY <= 32
+typedef uint32_t node_bit_field;
+#elif MAX_NUMBER_OF_RANKS_WITH_SHARED_MEMORY <= 64
+typedef uint64_t node_bit_field;
+#else
+#error "unsupported MAX_NUMBER_OF_RANKS_WITH_SHARED_MEMORY setting"
+#endif
+
+struct offset_tuple
+{
+  char n[3];
+
+  offset_tuple() {} /* constructor */
+
+  offset_tuple(const char x) /* constructor  */
+  {
+    n[0] = x;
+    n[1] = x;
+    n[2] = x;
+  }
+
+  offset_tuple(const char x, const char y, const char z) /* constructor  */
+  {
+    n[0] = x;
+    n[1] = y;
+    n[2] = z;
+  }
+};
+
+struct location
+{
+  int task;
+  int index;
+};
+
+inline bool operator==(const location &left, const location &right) { return left.task == right.task && left.index == right.index; }
+
+inline bool operator!=(const location &left, const location &right) { return left.task != right.task || left.index != right.index; }
+
+inline bool operator<(const location &left, const location &right)
+{
+  if(left.task < right.task)
+    return true;
+  else if(left.task == right.task)
+    {
+      if(left.index < right.index)
+        return true;
+      else
+        return false;
+    }
+  else
+    return false;
+}
+
+struct halotrees_table
+{
+  int HaloCount;
+  long long FirstHalo;
+  long long TreeID;
+};
+
+struct parttrees_table
+{
+  int ParticleCount;
+  long long ParticleFirst;
+  long long TreeID;
+};
+
+struct times_catalogue
+{
+  double Time;
+  double Redshift;
+};
+
+class peanokey
+{
+ public:
+  MyIntPosType hs, is, ls; /* 'hs'-high significance, 'is'-intermediate, 'ls'-low significance bits */
+};
+
+inline bool operator>=(const peanokey &a, const peanokey &b)
+{
+  if(a.hs < b.hs)
+    return false;
+  else if(a.hs > b.hs)
+    return true;
+  else if(a.is < b.is)
+    return false;
+  else if(a.is > b.is)
+    return true;
+  else if(a.ls < b.ls)
+    return false;
+  else
+    return true;
+}
+
+inline bool operator<(const peanokey &a, const peanokey &b)
+{
+  if(a.hs < b.hs)
+    return true;
+  else if(a.hs > b.hs)
+    return false;
+  else if(a.is < b.is)
+    return true;
+  else if(a.is > b.is)
+    return false;
+  else if(a.ls < b.ls)
+    return true;
+  else
+    return false;
+}
+
+inline peanokey operator+(const peanokey &a, const peanokey &b)
+{
+  peanokey c;
+
+  c.ls = a.ls + b.ls;
+  c.is = a.is + b.is;
+  c.hs = a.hs + b.hs;
+
+  if(c.is < a.is || c.is < b.is) /* overflow has occurred */
+    {
+      c.hs += 1;
+    }
+
+  if(c.ls < a.ls || c.ls < b.ls) /* overflow has occurred */
+    {
+      c.is += 1;
+      if(c.is == 0) /* overflown again */
+        c.hs += 1;
+    }
+
+  /* note: for hs we don't check for overflow explicitly as this would not be represented in the type anyhow */
+
+  return c;
+}
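+
+/* Editorial sketch (illustrative, not part of the original code): the addition above treats
+ * (hs, is, ls) as one wide unsigned integer made of three MyIntPosType limbs and propagates the
+ * carries manually. With hypothetical 8-bit limbs, adding a = {hs=0, is=0, ls=200} and
+ * b = {hs=0, is=0, ls=100} gives c.ls = 44 (300 mod 256); since c.ls < a.ls an overflow is
+ * detected and c.is is incremented, so the result represents 1*256 + 44 = 300 as expected. */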
+
+inline peanokey get_peanokey_offset(unsigned int j, int bits) /* this returns the peanokey corresponding to j << bits */
+{
+  peanokey key = {j, j, j};
+
+  if(bits < BITS_FOR_POSITIONS)
+    key.ls <<= bits;
+  else
+    key.ls = 0;
+
+  int is_bits = bits - BITS_FOR_POSITIONS;
+
+  if(is_bits <= -BITS_FOR_POSITIONS)
+    key.is = 0;
+  else if(is_bits <= 0)
+    key.is >>= -is_bits;
+  else if(is_bits < BITS_FOR_POSITIONS)
+    key.is <<= is_bits;
+  else
+    key.is = 0;
+
+  int hs_bits = bits - 2 * BITS_FOR_POSITIONS;
+
+  if(hs_bits <= -BITS_FOR_POSITIONS)
+    key.hs = 0;
+  else if(hs_bits <= 0)
+    key.hs >>= -hs_bits;
+  else if(hs_bits < BITS_FOR_POSITIONS)
+    key.hs <<= hs_bits;
+  else
+    key.hs = 0;
+
+  return key;
+}
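+
+/* Editorial sketch (illustrative, not part of the original code): get_peanokey_offset() spreads the
+ * shifted value j << bits across the three limbs. For example, get_peanokey_offset(1, BITS_FOR_POSITIONS)
+ * yields ls = 0, is = 1, hs = 0, i.e. the value 1 placed at the bottom of the intermediate limb;
+ * shifts that straddle a limb boundary are handled by the complementary right shifts above. */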
+
+enum mysnaptype
+{
+  NORMAL_SNAPSHOT,
+  MOST_BOUND_PARTICLE_SNAPHOT,
+  MOST_BOUND_PARTICLE_SNAPHOT_REORDERED
+};
+
+enum restart_options
+{
+  RST_BEGIN,
+  RST_RESUME,
+  RST_STARTFROMSNAP,
+  RST_FOF,
+  RST_POWERSPEC,
+  RST_CONVERTSNAP,
+  RST_CREATEICS,
+  RST_CALCDESC,
+  RST_MAKETREES,
+  RST_IOBANDWIDTH,
+  RST_LCIMAGE,
+  RST_LGALAXIES,
+  RST_LCREARRANGE,
+  RST_SNPREARRANGE
+};
+
+struct data_partlist
+{
+  int Task;  /**< The task the item was exported to. */
+  int Index; /**< The particle index of the item on the sending task. */
+};
+
+struct thread_data
+{
+  int Nexport;
+  int NexportNodes;
+
+  double Interactions; /*!< The total cost of the particles/nodes processed by each thread */
+
+  double Ewaldcount; /*!< The total cost for the Ewald correction per thread */
+  int FirstExec;     /*!< Keeps track of whether a given thread executes the gravity_primary_loop() for the first time */
+
+  size_t ExportSpace;
+  size_t InitialSpace;
+  size_t ItemSize;
+
+  int *P_CostCount;
+  int *TreePoints_CostCount;
+  int *Node_CostCount;
+
+  data_partlist *PartList;
+  int *Ngblist;
+  int *Shmranklist;
+  int *Exportflag;
+};
+
+#ifdef LONG_X_BITS
+#define LONG_X (1 << (LONG_X_BITS))
+#define MAX_LONG_X_BITS LONG_X_BITS
+#else
+#define LONG_X 1
+#define MAX_LONG_X_BITS 0
+#endif
+
+#ifdef LONG_Y_BITS
+#define LONG_Y (1 << (LONG_Y_BITS))
+#define MAX_LONG_Y_BITS LONG_Y_BITS
+#else
+#define LONG_Y 1
+#define MAX_LONG_Y_BITS 0
+#endif
+
+#ifdef LONG_Z_BITS
+#define LONG_Z (1 << (LONG_Z_BITS))
+#define MAX_LONG_Z_BITS LONG_Z_BITS
+#else
+#define LONG_Z 1
+#define MAX_LONG_Z_BITS 0
+#endif
+
+#define LONG_BITS_MAX(A, B) (((A) > (B)) ? (A) : (B))
+
+#define LEVEL_ALWAYS_OPEN LONG_BITS_MAX(MAX_LONG_X_BITS, LONG_BITS_MAX(MAX_LONG_Y_BITS, MAX_LONG_Z_BITS))
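+
+/* Editorial note (illustrative, based only on the arithmetic above): setting e.g. LONG_X_BITS to 2
+ * yields LONG_X = 4 and MAX_LONG_X_BITS = 2, and LEVEL_ALWAYS_OPEN evaluates to the largest of the
+ * three per-axis bit counts; the name suggests it marks how many top tree levels must always be
+ * opened when the box is stretched along one or more axes. */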
+
+#ifdef GRAVITY_TALLBOX
+
+#if(GRAVITY_TALLBOX == 0)
+#define DBX 2
+#define DBX_EXTRA 6
+#define BOXX (1.0 / LONG_Y)
+#define BOXY (1.0 / LONG_Z)
+#else
+#define DBX 1
+#define DBX_EXTRA 0
+#endif
+
+#if(GRAVITY_TALLBOX == 1)
+#define DBY 2
+#define DBY_EXTRA 6
+#define BOXX (1.0 / LONG_X)
+#define BOXY (1.0 / LONG_Z)
+#else
+#define DBY 1
+#define DBY_EXTRA 0
+#endif
+
+#if(GRAVITY_TALLBOX == 2)
+#define DBZ 2
+#define DBZ_EXTRA 6
+#define BOXX (1.0 / LONG_X)
+#define BOXY (1.0 / LONG_Y)
+#else
+#define DBZ 1
+#define DBZ_EXTRA 0
+#endif
+
+#else
+
+#define DBX 1
+#define DBY 1
+#define DBZ 1
+#define DBX_EXTRA 0
+#define DBY_EXTRA 0
+#define DBZ_EXTRA 0
+#endif
+
+#endif
diff --git a/src/data/idstorage.h b/src/data/idstorage.h
new file mode 100644
index 0000000000000000000000000000000000000000..2f1964575e8b8bdb0445ce77ca4e33742d25e680
--- /dev/null
+++ b/src/data/idstorage.h
@@ -0,0 +1,149 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file idstorage.h
+ *
+ *  \brief defines a class that we use to store and manipulate the particle IDs
+ */
+
+#ifndef IDSTORAGE_H
+#define IDSTORAGE_H
+
+#include <climits>
+
+#if !defined(IDS_48BIT)
+#define ID_MSB ((MyIDType)(~((MyIDType)(~((MyIDType)0)) >> ((MyIDType)1))))
+#define ID_MSK ((MyIDType)(((MyIDType)(~((MyIDType)0)) >> ((MyIDType)1))))
+#define HALONR_MAX ((MyIDType)(((MyIDType)(~((MyIDType)0)) >> ((MyIDType)1))))
+#else
+#define ID_MSB ((unsigned short)(~((unsigned short)(~((unsigned short)0)) >> ((unsigned short)1))))
+#define ID_MSK ((unsigned short)(((unsigned short)(~((unsigned short)0)) >> ((unsigned short)1))))
+#define HALONR_MAX ((MyIDType)(((MyIDType)(~((MyIDType)0)) >> ((MyIDType)17))))
+#endif
+
+/* used to store a subhalo length in an approximate (but quite accurate) way in just two bytes */
+struct approxlen
+{
+#define ALEN_MAX 1000000000.0
+#define ALEN_MIN 10.0
+
+ private:
+  unsigned short alen;
+
+ public:
+  inline void set(long long size)
+  {
+    double l = log(size / ALEN_MIN) / log(ALEN_MAX / ALEN_MIN) * (USHRT_MAX - 1) + 1;
+
+    if(l < 1)
+      alen = 0;
+    else if(l > USHRT_MAX)
+      alen = USHRT_MAX;
+    else
+      alen = (unsigned short)(l + 0.5);
+  }
+
+  inline long long get(void)
+  {
+    // relative accuracy of this encoding is about 0.00012 for particle numbers between 10 and 10^9
+    if(alen == 0)
+      return 0;
+    else
+      {
+        return (long long)(ALEN_MIN * exp((alen - 1.0) / (USHRT_MAX - 1) * log(ALEN_MAX / ALEN_MIN)) + 0.5);
+      }
+  }
+};
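+
+/* Editorial sketch (illustrative, not part of the original code): the two-byte logarithmic encoding
+ * above maps lengths in [ALEN_MIN, ALEN_MAX] onto the unsigned short range, e.g.
+ *
+ *   approxlen al;
+ *   al.set(100000);          // store a subhalo length of 1e5 particles
+ *   long long n = al.get();  // returns approximately 100000 (relative error ~1.2e-4)
+ */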
+
+struct compactrank_t
+{
+ private:
+  unsigned char rank;
+
+ public:
+  inline void set(MyLenType nr)
+  {
+    if(nr > UCHAR_MAX)
+      nr = UCHAR_MAX;
+    rank = nr;
+  }
+
+  inline unsigned char get(void) { return rank; }
+};
+
+class MyIDStorage
+{
+ private:
+#if !defined(IDS_48BIT)
+  MyIDType id;
+#else
+  unsigned short id[3];
+#endif
+
+ public:
+  inline MyIDType get(void) const
+  {
+#if !defined(IDS_48BIT)
+    return id & ID_MSK;
+#else
+    return (((MyIDType)(id[0] & ID_MSK)) << 32) + (((MyIDType)id[1]) << 16) + id[2];
+#endif
+  }
+
+  inline void set(MyIDType ID)
+  {
+#if !defined(IDS_48BIT)
+    id = ID;
+#else
+    id[2] = (unsigned short)ID;
+    id[1] = (unsigned short)(ID >> 16);
+    id[0] = (unsigned short)(ID >> 32);
+#endif
+  }
+
+  inline void mark_as_formerly_most_bound(void)
+  {
+    /* we set the most significant bit */
+#if !defined(IDS_48BIT)
+    id |= ID_MSB;
+#else
+    id[0] |= ID_MSB;
+#endif
+  }
+
+  inline bool is_previously_most_bound(void)
+  {
+    /* we test the most significant bit */
+#if defined(IDS_48BIT)
+    if(id[0] & ID_MSB)
+      return true;
+#else
+    if(id & ID_MSB)
+      return true;
+#endif
+    return false;
+  }
+};
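+
+/* Editorial sketch (illustrative, not part of the original code): with IDS_48BIT the ID is packed
+ * into three 16-bit halves (id[0] most significant) and the top bit of id[0] doubles as the
+ * "formerly most bound" flag, e.g.
+ *
+ *   MyIDStorage s;
+ *   s.set(12345678901ULL);            // store a particle ID that needs more than 32 bits
+ *   s.mark_as_formerly_most_bound();  // sets the flag bit without disturbing the stored ID
+ *   MyIDType back = s.get();          // returns 12345678901 again, the flag bit is masked off
+ */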
+
+class MyHaloNrType : public MyIDStorage
+{
+ public:
+  inline MyHaloNrType &operator+=(const long long &x)
+  {
+    set(get() + x);
+    return *this;
+  }
+};
+
+inline bool operator<(const MyHaloNrType &left, const MyHaloNrType &right) { return left.get() < right.get(); }
+
+inline bool operator>(const MyHaloNrType &left, const MyHaloNrType &right) { return left.get() > right.get(); }
+
+inline bool operator!=(const MyHaloNrType &left, const MyHaloNrType &right) { return left.get() != right.get(); }
+
+inline bool operator==(const MyHaloNrType &left, const MyHaloNrType &right) { return left.get() == right.get(); }
+
+#endif /* IDSTORAGE_H */
diff --git a/src/data/intposconvert.h b/src/data/intposconvert.h
new file mode 100644
index 0000000000000000000000000000000000000000..aefed36c911afba031c5a5e4214d268095fe4247
--- /dev/null
+++ b/src/data/intposconvert.h
@@ -0,0 +1,480 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file intposconvert.h
+ *
+ *  \brief defines a class to convert between integer coordinates and floating point positions
+ */
+
+#ifndef CONVERT_H
+#define CONVERT_H
+
+#include "allvars.h"
+#include "dtypes.h"
+
+#include <cmath>
+
+#define MSB ((MyIntPosType)(~((MyIntPosType)(~((MyIntPosType)0)) >> ((MyIntPosType)1))))
+
+#if defined(LONG_X_BITS)
+#define HBITS_X ((MyIntPosType)(~((MyIntPosType)(~((MyIntPosType)0)) >> ((MyIntPosType)LONG_X_BITS))))
+#endif
+
+#if defined(LONG_Y_BITS)
+#define HBITS_Y ((MyIntPosType)(~((MyIntPosType)(~((MyIntPosType)0)) >> ((MyIntPosType)LONG_Y_BITS))))
+#endif
+
+#if defined(LONG_Z_BITS)
+#define HBITS_Z ((MyIntPosType)(~((MyIntPosType)(~((MyIntPosType)0)) >> ((MyIntPosType)LONG_Z_BITS))))
+#endif
+
+class intposconvert
+{
+ public:
+  MyReal FacIntToCoord;
+  MyReal FacCoordToInt;
+  MyReal RegionLen;
+
+#ifndef PERIODIC
+  MyReal RegionCorner[3];
+  MyReal RegionCenter[3];
+#endif
+
+#ifdef RANDOMIZE_DOMAINCENTER
+  MyIntPosType CurrentShiftVector[3];
+#endif
+
+#ifdef EXPLICIT_VECTORIZATION
+  inline Vec4d nearest_image_intpos_to_doublepos_vectorial(MyIntPosType const &a, MyIntPosType const &b0, MyIntPosType const &b1,
+                                                           MyIntPosType const &b2, MyIntPosType const &b3)
+  {
+#if defined(GRAVITY_TALLBOX) || defined(LONG_X_BITS) || defined(LONG_Y_BITS) || defined(LONG_Z_BITS)
+    Terminate("not working in this combination");
+#endif
+
+    Vec4MyIntPosType delta = a - Vec4MyIntPosType(b0, b1, b2, b3);
+
+    Vec4MySignedIntPosType intpos = (Vec4MySignedIntPosType)delta;
+
+    Vec4d pos = to_double(intpos);
+
+    return pos * FacIntToCoord;
+  }
+#endif
+
+  inline void constrain_intpos(MyIntPosType *pos)
+  {
+#ifdef PERIODIC /* restrict the position to the primary range */
+
+#if defined(LONG_X_BITS)
+    pos[0] = (pos[0] << LONG_X_BITS) >> LONG_X_BITS;
+#endif
+
+#if defined(LONG_Y_BITS)
+    pos[1] = (pos[1] << LONG_Y_BITS) >> LONG_Y_BITS;
+#endif
+
+#if defined(LONG_Z_BITS)
+    pos[2] = (pos[2] << LONG_Z_BITS) >> LONG_Z_BITS;
+#endif
+
+#endif
+  }
+
+  /* function to determine the nearest periodic image distance vector in the floating point type T, exploiting integer wrap around */
+  template <typename T>
+  inline void diff_intpos_to_pos(MyIntPosType *a, MyIntPosType *b, T *posdiff, offset_tuple off = 0)
+  {
+    if(a[0] > b[0])
+      posdiff[0] = (a[0] - b[0]) * FacIntToCoord + All.BoxSize * off.n[0];
+    else
+      posdiff[0] = (b[0] - a[0]) * (-FacIntToCoord) + All.BoxSize * off.n[0];
+
+    if(a[1] > b[1])
+      posdiff[1] = (a[1] - b[1]) * FacIntToCoord + All.BoxSize * off.n[1];
+    else
+      posdiff[1] = (b[1] - a[1]) * (-FacIntToCoord) + All.BoxSize * off.n[1];
+
+    if(a[2] > b[2])
+      posdiff[2] = (a[2] - b[2]) * FacIntToCoord + All.BoxSize * off.n[2];
+    else
+      posdiff[2] = (b[2] - a[2]) * (-FacIntToCoord) + All.BoxSize * off.n[2];
+  }
+
+  inline MyIntPosType nearest_image_intpos_to_intpos_X(const MyIntPosType a, const MyIntPosType b)
+  {
+#if defined(LONG_X_BITS)
+    MyIntPosType delta = (a << LONG_X_BITS) - (b << LONG_X_BITS);
+
+    if(delta & MSB) /* tests MSB */
+      {
+        delta >>= LONG_X_BITS;
+        delta |= HBITS_X;
+      }
+    else
+      delta >>= LONG_X_BITS;
+
+    return delta;
+#else
+    return a - b;
+#endif
+  }
+
+  inline MyIntPosType nearest_image_intpos_to_intpos_Y(const MyIntPosType a, const MyIntPosType b)
+  {
+#if defined(LONG_Y_BITS)
+    MyIntPosType delta = (a << LONG_Y_BITS) - (b << LONG_Y_BITS);
+
+    if(delta & MSB) /* tests MSB */
+      {
+        delta >>= LONG_Y_BITS;
+        delta |= HBITS_Y;
+      }
+    else
+      delta >>= LONG_Y_BITS;
+
+    return delta;
+#else
+    return a - b;
+#endif
+  }
+
+  inline MyIntPosType nearest_image_intpos_to_intpos_Z(const MyIntPosType a, const MyIntPosType b)
+  {
+#if defined(LONG_Z_BITS)
+    MyIntPosType delta = (a << LONG_Z_BITS) - (b << LONG_Z_BITS);
+
+    if(delta & MSB) /* tests MSB */
+      {
+        delta >>= LONG_Z_BITS;
+        delta |= HBITS_Z;
+      }
+    else
+      delta >>= LONG_Z_BITS;
+
+    return delta;
+#else
+    return a - b;
+#endif
+  }
+
+  /* function to determine the nearest periodic image distance vector in T, exploiting integer wrap around */
+  template <typename T>
+  inline void nearest_image_intpos_to_pos(const MyIntPosType *const a, const MyIntPosType *const b, T *posdiff)
+  {
+    MyIntPosType delta[3];
+    MySignedIntPosType *intpos = (MySignedIntPosType *)delta;
+
+    /* we use all these casts here to prevent implicit type promotions from messing this up for types shorter than int, such as
+     * unsigned char */
+
+#if defined(GRAVITY_TALLBOX) && (GRAVITY_TALLBOX == 0)
+    if(a[0] >= b[0])
+      {
+        delta[0]   = a[0] - b[0];
+        posdiff[0] = delta[0] * FacIntToCoord;
+      }
+    else
+      {
+        delta[0]   = b[0] - a[0];
+        posdiff[0] = delta[0] * (-FacIntToCoord);
+      }
+#else
+
+#if defined(LONG_X_BITS)
+    delta[0] = (a[0] << LONG_X_BITS) - (b[0] << LONG_X_BITS);
+
+    if(delta[0] & MSB) /* tests MSB */
+      {
+        delta[0] >>= LONG_X_BITS;
+        delta[0] |= HBITS_X;
+      }
+    else
+      delta[0] >>= LONG_X_BITS;
+#else
+    delta[0] = a[0] - b[0];
+#endif
+    posdiff[0] = intpos[0] * FacIntToCoord;
+
+#endif
+
+      /** --- **/
+
+#if defined(GRAVITY_TALLBOX) && (GRAVITY_TALLBOX == 1)
+    if(a[1] >= b[1])
+      {
+        delta[1]   = a[1] - b[1];
+        posdiff[1] = delta[1] * FacIntToCoord;
+      }
+    else
+      {
+        delta[1]   = b[1] - a[1];
+        posdiff[1] = delta[1] * (-FacIntToCoord);
+      }
+#else
+
+#if defined(LONG_Y_BITS)
+    delta[1]   = (a[1] << LONG_Y_BITS) - (b[1] << LONG_Y_BITS);
+
+    if(delta[1] & MSB) /* tests MSB */
+      {
+        delta[1] >>= LONG_Y_BITS;
+        delta[1] |= HBITS_Y;
+      }
+    else
+      delta[1] >>= LONG_Y_BITS;
+#else
+    delta[1] = a[1] - b[1];
+#endif
+
+    posdiff[1] = intpos[1] * FacIntToCoord;
+#endif
+
+      /** --- **/
+
+#if defined(GRAVITY_TALLBOX) && (GRAVITY_TALLBOX == 2)
+    if(a[2] >= b[2])
+      {
+        delta[2]   = a[2] - b[2];
+        posdiff[2] = delta[2] * FacIntToCoord;
+      }
+    else
+      {
+        delta[2]   = b[2] - a[2];
+        posdiff[2] = delta[2] * (-FacIntToCoord);
+      }
+#else
+
+#if defined(LONG_Z_BITS)
+    delta[2]   = (a[2] << LONG_Z_BITS) - (b[2] << LONG_Z_BITS);
+
+    if(delta[2] & MSB) /* tests MSB */
+      {
+        delta[2] >>= LONG_Z_BITS;
+        delta[2] |= HBITS_Z;
+      }
+    else
+      delta[2] >>= LONG_Z_BITS;
+#else
+    delta[2] = a[2] - b[2];
+#endif
+    posdiff[2] = intpos[2] * FacIntToCoord;
+
+#endif
+  }
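+
+  /* Editorial sketch (illustrative, not part of the original code): the nearest periodic image is
+   * obtained here by letting the unsigned subtraction wrap around and then reinterpreting the
+   * result as a signed quantity. With hypothetical 8-bit positions, a = 250 and b = 10 give
+   * delta = 240, which read as a signed 8-bit value is -16, i.e. the shorter of the two periodic
+   * separations; multiplication by FacIntToCoord then converts this to a physical displacement. */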
+
+  inline void nearest_image_intpos_to_absolute_intdist(const MyIntPosType *a, const MyIntPosType *b, MyIntPosType *delta)
+  {
+#if defined(LONG_X_BITS)
+    delta[0] = (a[0] << LONG_X_BITS) - (b[0] << LONG_X_BITS);
+
+    if(delta[0] & (~((~((MyIntPosType)0)) >> 1))) /* tests MSB */
+      {
+        delta[0] >>= LONG_X_BITS;
+        delta[0] |= (~((~((MyIntPosType)0)) >> LONG_X_BITS));
+      }
+    else
+      delta[0] >>= LONG_X_BITS;
+#else
+    delta[0]   = a[0] - b[0];
+#endif
+
+#if defined(LONG_Y_BITS)
+    delta[1] = (a[1] << LONG_Y_BITS) - (b[1] << LONG_Y_BITS);
+
+    if(delta[1] & (~((~((MyIntPosType)0)) >> 1))) /* tests MSB */
+      {
+        delta[1] >>= LONG_Y_BITS;
+        delta[1] |= (~((~((MyIntPosType)0)) >> LONG_Y_BITS));
+      }
+    else
+      delta[1] >>= LONG_Y_BITS;
+#else
+    delta[1]   = a[1] - b[1];
+#endif
+
+#if defined(LONG_Z_BITS)
+    delta[2] = (a[2] << LONG_Z_BITS) - (b[2] << LONG_Z_BITS);
+
+    if(delta[2] & (~((~((MyIntPosType)0)) >> 1))) /* tests MSB */
+      {
+        delta[2] >>= LONG_Z_BITS;
+        delta[2] |= (~((~((MyIntPosType)0)) >> LONG_Z_BITS));
+      }
+    else
+      delta[2] >>= LONG_Z_BITS;
+#else
+    delta[2]   = a[2] - b[2];
+#endif
+
+    if(delta[0] & (~((~((MyIntPosType)0)) >> 1))) /* tests MSB, i.e. negative if interpreted as signed int */
+      delta[0] = -delta[0];
+
+    if(delta[1] & (~((~((MyIntPosType)0)) >> 1))) /* tests MSB, i.e. negative if interpreted as signed int */
+      delta[1] = -delta[1];
+
+    if(delta[2] & (~((~((MyIntPosType)0)) >> 1))) /* tests MSB, i.e. negative if interpreted as signed int */
+      delta[2] = -delta[2];
+  }
+
+#if defined(POSITIONS_IN_32BIT)
+  template <typename T>
+  inline MySignedIntPosType pos_to_signedintpos(T posdiff)
+  {
+    return std::lrint(posdiff * FacCoordToInt);
+  }
+
+  template <typename T>
+  inline void pos_to_signedintpos(T *posdiff, MySignedIntPosType *intpos)
+  {
+    for(int j = 0; j < 3; j++)
+      intpos[j] = std::lrint(posdiff[j] * FacCoordToInt);
+  }
+#elif defined(POSITIONS_IN_64BIT)
+  template <typename T>
+  inline MySignedIntPosType pos_to_signedintpos(T posdiff)
+  {
+    return std::llrint(posdiff * FacCoordToInt);
+  }
+
+  template <typename T>
+  inline void pos_to_signedintpos(T *posdiff, MySignedIntPosType *intpos)
+  {
+    for(int j = 0; j < 3; j++)
+      intpos[j] = std::llrint(posdiff[j] * FacCoordToInt);
+  }
+#else
+  template <typename T>
+  inline MySignedIntPosType pos_to_signedintpos(T posdiff)
+  {
+    return static_cast<MySignedIntPosType>(posdiff * FacCoordToInt);
+  }
+
+  template <typename T>
+  inline void pos_to_signedintpos(T *posdiff, MySignedIntPosType *intpos)
+  {
+    for(int j = 0; j < 3; j++)
+      intpos[j] = static_cast<MySignedIntPosType>(posdiff[j] * FacCoordToInt);
+  }
+#endif
+
+  template <typename T>
+  inline void signedintpos_to_pos(MySignedIntPosType *intpos, T *pos)
+  {
+    for(int j = 0; j < 3; j++)
+      pos[j] = intpos[j] * FacIntToCoord;
+  }
+
+  inline double signedintpos_to_distanceorigin(MySignedIntPosType *intpos)
+  {
+    double pos[3];
+    for(int j = 0; j < 3; j++)
+      pos[j] = intpos[j] * FacIntToCoord;
+    return sqrt(pos[0] * pos[0] + pos[1] * pos[1] + pos[2] * pos[2]);
+  }
+
+  template <typename T>
+  inline void intpos_to_pos(MyIntPosType *intpos, T *posdiff)
+  {
+#ifdef RANDOMIZE_DOMAINCENTER
+#ifdef PERIODIC
+    MyIntPosType xyz[3];
+    for(int j = 0; j < 3; j++)
+      xyz[j] = intpos[j] - CurrentShiftVector[j];
+    constrain_intpos(xyz);
+    for(int j = 0; j < 3; j++)
+      posdiff[j] = xyz[j] * FacIntToCoord;
+#else
+    for(int j = 0; j < 3; j++)
+      posdiff[j] = (intpos[j] - CurrentShiftVector[j]) * FacIntToCoord + RegionCorner[j];
+#endif
+#else
+#ifdef PERIODIC
+    for(int j = 0; j < 3; j++)
+      posdiff[j] = intpos[j] * FacIntToCoord;
+#else
+    for(int j = 0; j < 3; j++)
+      posdiff[j] = intpos[j] * FacIntToCoord + RegionCorner[j];
+#endif
+#endif
+  }
+
+  inline void intpos_to_intpos(MyIntPosType *intpos, MyIntPosType *xyz)
+  {
+#ifdef RANDOMIZE_DOMAINCENTER
+    for(int j = 0; j < 3; j++)
+      xyz[j] = intpos[j] - CurrentShiftVector[j];
+#else
+    for(int j = 0; j < 3; j++)
+      xyz[j] = intpos[j];
+#endif
+
+#ifdef PERIODIC
+    constrain_intpos(xyz);
+#else
+    Terminate("integer coordinate output not defined if PERIODIC is not no");
+#endif
+  }
+
+  template <typename T>
+  inline T constrain_pos(T pos)
+  {
+    int rep = 0;
+
+    while(pos < 0)
+      {
+        pos += ((T)(~((MyIntPosType)0)) + static_cast<T>(1.0));
+
+        rep++;
+        if(rep > MAXITER)
+          Terminate("rep > MAX_ITER");
+      }
+
+    while(pos >= ((T)(~((MyIntPosType)0)) + static_cast<T>(1.0)))
+      {
+        pos -= ((T)(~((MyIntPosType)0)) + static_cast<T>(1.0));
+
+        rep++;
+        if(rep > MAXITER)
+          Terminate("rep > MAX_ITER");
+      }
+
+    return pos;
+  }
+
+  template <typename T>
+  inline void pos_to_intpos(T *posdiff, MyIntPosType *intpos)
+  {
+#ifdef RANDOMIZE_DOMAINCENTER
+#ifdef PERIODIC
+    for(int j = 0; j < 3; j++)
+      {
+        intpos[j] = constrain_pos(posdiff[j] * FacCoordToInt);
+        intpos[j] += CurrentShiftVector[j];
+      }
+    constrain_intpos(intpos);
+#else
+    for(int j = 0; j < 3; j++)
+      {
+        intpos[j] = constrain_pos((posdiff[j] - RegionCorner[j]) * FacCoordToInt);
+        intpos[j] += CurrentShiftVector[j];
+      }
+#endif
+#else
+#ifdef PERIODIC
+    for(int j = 0; j < 3; j++)
+      intpos[j] = constrain_pos(posdiff[j] * FacCoordToInt);
+
+    constrain_intpos(intpos);
+#else
+    for(int j = 0; j < 3; j++)
+      intpos[j] = constrain_pos((posdiff[j] - RegionCorner[j]) * FacCoordToInt);
+#endif
+#endif
+  }
+};
+
+#endif
diff --git a/src/data/lcparticles.h b/src/data/lcparticles.h
new file mode 100644
index 0000000000000000000000000000000000000000..bcbebba516346fde47703f9114745721cb5765ca
--- /dev/null
+++ b/src/data/lcparticles.h
@@ -0,0 +1,224 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file lcparticles.h
+ *
+ *  \brief declares a class responsible for holding the (buffered) particles on the lightcone
+ */
+
+#ifndef LCPART_H
+#define LCPART_H
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES)
+
+#include <math.h>
+
+#include "gadgetconfig.h"
+
+#include "../data/constants.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/lightcone_particle_data.h"
+#include "../data/macros.h"
+#include "../data/mymalloc.h"
+#include "../data/particle_data.h"
+#include "../data/sph_particle_data.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../mpi_utils/setcomm.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+
+class lcparticles : public intposconvert, public setcomm
+{
+ public:
+  lcparticles(MPI_Comm comm) : setcomm(comm) {}
+
+  int NumPart;    /**< number of particles on the LOCAL processor */
+  int NumGas = 0; /* this is added here to simplify the template code */
+
+  int MaxPart;        /**< This gives the maximum number of particles that can be stored on one processor. */
+  int MaxPartSph = 0; /* this is added here to simplify the template code */
+
+  long long TotNumPart;
+  long long TotNumGas;
+
+  typedef lightcone_particle_data pdata;
+
+  /*! This structure holds all the information that is
+   * stored for each particle of the simulation.
+   */
+  lightcone_particle_data *P; /*!< holds particle data on local processor */
+
+  /* the following structure holds data that is stored for each SPH particle in addition to the collisionless
+   * variables.
+   */
+
+  sph_particle_data *SphP = NULL; /* the current code does not yet support actual SPH particles on the lightcone */
+
+  subfind_data *PS;
+
+  int *HealPixTab_PartCount;
+  int Npix;
+  int FirstPix;
+  int NpixLoc;
+
+  /* This routine allocates memory for
+   * particle storage, both the collisionless and the SPH particles.
+   * The memory for the ordered binary tree of the timeline
+   * is also allocated.
+   */
+  void allocate_memory(void)
+  {
+    if(MaxPart < BASENUMBER)
+      MaxPart = BASENUMBER;
+
+    P = (lightcone_particle_data *)Mem.mymalloc_movable_clear(&P, "P", MaxPart * sizeof(lightcone_particle_data));
+  }
+
+  void free_memory(void) { Mem.myfree(P); }
+
+  void reallocate_memory_maxpart(int maxpartNew)
+  {
+    MaxPart = maxpartNew;
+
+    if(MaxPart < BASENUMBER)
+      MaxPart = BASENUMBER;
+
+    P = (lightcone_particle_data *)Mem.myrealloc_movable(P, MaxPart * sizeof(lightcone_particle_data));
+
+    /*
+    if(NumPart > MaxPart)  // should be ok because this only happens when P has already been copied away
+      Terminate("NumPart=%d > MaxPart=%d", NumPart, MaxPart);
+      */
+  }
+
+  void reallocate_memory_maxpartsph(int maxpartNew)
+  {
+    // Don't need to do anything here.
+  }
+
+  bool TestIfAboveFillFactor(int SpMaxPart)
+  {
+    int max_in[2] = {NumPart, SpMaxPart}, max_out[2];
+    MPI_Allreduce(max_in, max_out, 2, MPI_INT, MPI_MAX, Communicator);
+
+    /* also recompute the total number of particles in buffer to have current value */
+    sumup_large_ints(1, &NumPart, &TotNumPart, Communicator);
+
+    if(max_out[0] > 0 && (All.Ti_Current >= TIMEBASE || max_out[0] >= LIGHTCONE_MAX_FILLFACTOR * max_out[1]))
+      return true;
+    else
+      return false;
+  }
+
+  static bool compare_ID(const lightcone_particle_data &a, const lightcone_particle_data &b) { return a.ID.get() < b.ID.get(); }
+
+  static bool compare_ipnest(const lightcone_particle_data &a, const lightcone_particle_data &b) { return a.ipnest < b.ipnest; }
+
+#ifdef REARRANGE_OPTION
+  static bool compare_TreeID_ID(const lightcone_particle_data &a, const lightcone_particle_data &b)
+  {
+    if(a.TreeID < b.TreeID)
+      return true;
+
+    if(a.TreeID > b.TreeID)
+      return false;
+
+    return a.ID.get() < b.ID.get();
+  }
+#endif
+
+  void dump_particles(void) {}
+
+  inline int drift_particle(lightcone_particle_data *P, sph_particle_data *SphP, integertime time1, bool ignore_light_cone = false)
+  {
+    return 0;
+  }
+
+  inline MyFloat get_Hsml(int i) { return 0; }
+
+  inline MyFloat get_DtHsml(int i) { return 0; }
+
+  inline MyFloat get_OldAcc(int i) { return 0; }
+
+  inline MyFloat get_Csnd(int i) { return 0; }
+
+  inline double get_utherm_from_entropy(int i) { return 0; }
+
+  inline int getTimeBinSynchronized(int bin) { return 1; }
+
+  void fill_active_gravity_list_with_all_particles(void) {}
+
+#ifdef FOF
+  MyIDStorage *MinID;
+  int *Head, *Len, *Next, *Tail, *MinIDTask;
+  MyFloat *fof_nearest_distance;
+  MyFloat *fof_nearest_hsml;
+#if defined(LIGHTCONE_PARTICLES_GROUPS)
+  double *DistanceOrigin;
+#endif
+
+  struct bit_flags
+  {
+    unsigned char Nonlocal : 2, MinIDChanged : 2, Marked : 2, Changed : 2;
+  } * Flags;
+
+  double LinkL;
+
+  void link_two_particles(int target, int j)
+  {
+    if(Head[target] != Head[j]) /* only if not yet linked */
+      {
+        int p, s;
+        if(Len[Head[target]] > Len[Head[j]]) /* p group is longer */
+          {
+            p = target;
+            s = j;
+          }
+        else
+          {
+            p = j;
+            s = target;
+          }
+        Next[Tail[Head[p]]] = Head[s];
+
+        Tail[Head[p]] = Tail[Head[s]];
+
+        Len[Head[p]] += Len[Head[s]];
+
+        if(MinID[Head[s]].get() < MinID[Head[p]].get())
+          {
+            MinID[Head[p]]     = MinID[Head[s]];
+            MinIDTask[Head[p]] = MinIDTask[Head[s]];
+          }
+
+#if defined(LIGHTCONE_PARTICLES_GROUPS)
+        if(DistanceOrigin[Head[p]] < DistanceOrigin[Head[s]])
+          DistanceOrigin[Head[s]] = DistanceOrigin[Head[p]];
+        else
+          DistanceOrigin[Head[p]] = DistanceOrigin[Head[s]];
+#endif
+
+        int ss = Head[s];
+        do
+          Head[ss] = Head[p];
+        while((ss = Next[ss]) >= 0);
+      }
+  }
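+
+  /* Editorial sketch (illustrative, not part of the original code): Head/Next/Tail/Len implement
+   * each FOF group as a singly linked particle list with union-by-size. If particles {0,1} form
+   * one group and {2} another, link_two_particles(0, 2) appends the shorter list after the tail of
+   * the longer one, adds its length, keeps the smaller MinID, and relabels Head[] of every spliced
+   * particle so that all members of the merged group point to the same head. */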
+
+#ifdef SUBFIND
+  struct nearest_r2_data
+  {
+    double dist[2];
+  } * R2Loc;
+
+#endif
+#endif
+};
+
+#endif
+
+#endif
diff --git a/src/data/lightcone_massmap_data.h b/src/data/lightcone_massmap_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..75d6256094f3e31866cad9ececa63f9b224d76e2
--- /dev/null
+++ b/src/data/lightcone_massmap_data.h
@@ -0,0 +1,52 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file lightcone_massmap_data.h
+ *
+ *  \brief defines a structure to hold data for lightcone particles projected onto a healpix map
+ */
+
+#ifndef MMPARTDATA_H
+#define MMPARTDATA_H
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_MASSMAPS)
+
+#include "../data/constants.h"
+#include "../data/dtypes.h"
+#include "../data/macros.h"
+
+struct lightcone_massmap_data
+{
+  MyFloat Ascale;
+  int PixIndex; /* global healpix index */
+  int Task;
+
+#ifndef LEAN
+ private:
+  MyDouble Mass;
+
+ public:
+#endif
+
+  inline MyDouble getMass(void)
+  {
+#ifdef LEAN
+    return All.PartMass;
+#else
+    return Mass;
+#endif
+  }
+
+  inline void setMass(MyDouble mass)
+  {
+#ifndef LEAN
+    Mass = mass;
+#endif
+  }
+};
+
+#endif
+#endif
diff --git a/src/data/lightcone_particle_data.h b/src/data/lightcone_particle_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..7e3260dd2dadbf67f275fbca70956c0f5acf6d25
--- /dev/null
+++ b/src/data/lightcone_particle_data.h
@@ -0,0 +1,153 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file lightcone_particle_data.h
+ *
+ *  \brief declares a structure for holding particles on the lightcone
+ */
+
+#ifndef LCPARTDATA_H
+#define LCPARTDATA_H
+
+#include "gadgetconfig.h"
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES)
+
+#include "../data/constants.h"
+#include "../data/dtypes.h"
+#include "../data/idstorage.h"
+#include "../data/macros.h"
+#include "../data/symtensors.h"
+
+struct lightcone_particle_data
+{
+  MyIntPosType IntPos[3];
+  MyFloat Vel[3];
+#ifdef LIGHTCONE_OUTPUT_ACCELERATIONS
+  vector<MyFloat> GravAccel;
+#endif
+  MyFloat Ascale;
+  MyIDStorage ID;
+
+  long ipnest;
+
+#ifdef REARRANGE_OPTION
+  unsigned long long TreeID;
+#endif
+
+#ifndef LEAN
+ private:
+  MyDouble Mass; /**< particle mass */
+ public:
+#endif
+
+#ifndef LEAN
+ private:
+  unsigned char Type; /**< flags particle type.  0=gas, 1=halo, 2=disk, 3=bulge, 4=stars, 5=bndry */
+ public:
+#endif
+
+#if NSOFTCLASSES > 1
+ private:
+  unsigned char
+      SofteningClass : 7; /* we use only 7 bits here so that we can stuff 1 bit for ActiveFlag into it in the Tree_Points structure */
+ public:
+#endif
+
+#if defined(MERGERTREE) && defined(SUBFIND)
+  // compactrank_t PrevRankInSubhalo;  // 1-byte
+  MyHaloNrType PrevSubhaloNr;   // 6-byte
+  approxlen PrevSizeOfSubhalo;  // 2-byte
+#endif
+
+#ifdef LIGHTCONE_IMAGE_COMP_HSML_VELDISP
+  int NumNgb;
+  MyFloat Hsml;
+  MyFloat VelDisp;
+  MyFloat Density;
+  MyFloat Vx;
+  MyFloat Vy;
+  MyFloat Vz;
+#endif
+
+#if defined(LIGHTCONE_PARTICLES_GROUPS) && defined(FOF)
+ private:
+  bool FlagSaveDistance;
+
+ public:
+  inline void setFlagSaveDistance(void) { FlagSaveDistance = true; }
+  inline void clearFlagSaveDistance(void) { FlagSaveDistance = false; }
+  inline bool getFlagSaveDistance(void) { return FlagSaveDistance; }
+
+  inline unsigned char getSofteningClass(void)
+  {
+#if NSOFTCLASSES > 1
+    return SofteningClass;
+#else
+    return 0;
+#endif
+  }
+  inline void setSofteningClass(unsigned char softclass)
+  {
+#if NSOFTCLASSES > 1
+    SofteningClass = softclass;
+#endif
+  }
+
+#endif
+
+  inline unsigned char getType(void)
+  {
+#ifdef LEAN
+    return 1;
+#else
+    return Type;
+#endif
+  }
+
+  inline double getAscale(void) { return Ascale; }
+
+  inline void setType(unsigned char type)
+  {
+#ifndef LEAN
+    Type = type;
+#endif
+  }
+
+  inline MyDouble getMass(void)
+  {
+#ifdef LEAN
+    return All.PartMass;
+#else
+    return Mass;
+#endif
+  }
+
+  inline integertime get_Ti_Current(void) { return 0; }
+
+  inline void setMass(MyDouble mass)
+  {
+#ifndef LEAN
+    Mass = mass;
+#endif
+  }
+
+  inline float getOldAcc(void) { return 0; }
+
+  inline signed char getTimeBinGrav(void) { return 0; }
+  inline signed char getTimeBinHydro(void) { return 0; }
+  inline int getGravCost(void) { return 0; }
+
+#if defined(MERGERTREE) && defined(SUBFIND)
+  inline void setPrevSubhaloNr(int nr) {}
+  inline void setPrevRankInSubhalo(int nr) {}
+  inline long long getPrevSubhaloNr(void) { return 0; }
+  inline int getPrevRankInSubhalo(void) { return 0; }
+#endif
+};
+
+#endif
+#endif
diff --git a/src/data/macros.h b/src/data/macros.h
new file mode 100644
index 0000000000000000000000000000000000000000..3128e740664613316237119124a2dbdb36429e91
--- /dev/null
+++ b/src/data/macros.h
@@ -0,0 +1,56 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file macros.h
+ *
+ *  \brief defines macros for run termination or for warnings
+ */
+
+#ifndef MACROS_H
+#define MACROS_H
+
+#ifdef MPI_HYPERCUBE_ALLGATHERV
+#define MPI_Allgatherv MPI_hypercube_Allgatherv
+#endif
+
+#define Terminate(...)                                                                                                      \
+  {                                                                                                                         \
+    {                                                                                                                       \
+      char termbuf1__[8000], termbuf2__[8000];                                                                              \
+      int thistask;                                                                                                         \
+      MPI_Comm_rank(MPI_COMM_WORLD, &thistask);                                                                             \
+      sprintf(termbuf1__, "Code termination on task=%d, function %s(), file %s, line %d", thistask, __FUNCTION__, __FILE__, \
+              __LINE__);                                                                                                    \
+      sprintf(termbuf2__, __VA_ARGS__);                                                                                     \
+      printf("%s: %s\n", termbuf1__, termbuf2__);                                                                           \
+      fflush(stdout);                                                                                                       \
+      MPI_Abort(MPI_COMM_WORLD, 1);                                                                                         \
+    }                                                                                                                       \
+    exit(0);                                                                                                                \
+  }
+#define warn(...)                                                                                                                \
+  {                                                                                                                              \
+    char termbuf1__[8000], termbuf2__[8000];                                                                                     \
+    int thistask;                                                                                                                \
+    MPI_Comm_rank(MPI_COMM_WORLD, &thistask);                                                                                    \
+    sprintf(termbuf1__, "Code warning on task=%d, function %s(), file %s, line %d", thistask, __FUNCTION__, __FILE__, __LINE__); \
+    sprintf(termbuf2__, __VA_ARGS__);                                                                                            \
+    printf("%s: %s\n", termbuf1__, termbuf2__);                                                                                  \
+    myflush(stdout);                                                                                                             \
+    FILE *fd__ = fopen("WARNINGS", "w");                                                                                         \
+    fclose(fd__);                                                                                                                \
+  }
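+
+/* Editorial note (illustrative usage, not part of the original code): both macros take printf-style
+ * arguments, e.g.
+ *
+ *   if(NumPart > MaxPart)
+ *     Terminate("NumPart=%d > MaxPart=%d", NumPart, MaxPart);
+ *
+ * Terminate() prints the source location and aborts the whole MPI run, while warn() only logs the
+ * message and creates/updates the WARNINGS file. */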
+
+/*! \brief A wrapper for the printf() function
+ *
+ *  This macro has the same functionalities of the standard printf()
+ *  function. However, data is written to the standard output only for
+ *  the task with rank 0
+ *
+ *  \param fmt string that contains format arguments
+ */
+
+#endif
diff --git a/src/data/mmparticles.h b/src/data/mmparticles.h
new file mode 100644
index 0000000000000000000000000000000000000000..8560421ad864dfe555457166bed8e9501ce9bf51
--- /dev/null
+++ b/src/data/mmparticles.h
@@ -0,0 +1,64 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file mmparticles.h
+ *
+ *  \brief defines a class that holds particles for projection onto lightcone massmaps
+ */
+
+#ifndef MMPART_H
+#define MMPART_H
+
+/* mass map particle */
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_MASSMAPS)
+
+#include <math.h>
+
+#include "../data/constants.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/lightcone_massmap_data.h"
+#include "../data/macros.h"
+#include "../data/mymalloc.h"
+#include "../mpi_utils/setcomm.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+#include "gadgetconfig.h"
+
+class mmparticles : public setcomm
+{
+ public:
+  mmparticles(MPI_Comm comm) : setcomm(comm) {}
+
+  int NumPart; /**< number of particles on the LOCAL processor */
+  int MaxPart; /**< This gives the maximum number of particles that can be stored on one processor. */
+
+  int Npix; /* total number of pixels of Healpix tessellation of lightcone */
+  int FirstPix;
+  int NpixLoc;
+
+  lightcone_massmap_data *P; /*!< holds the mass point particle data on local processor */
+
+  void allocate_memory(void)
+  {
+    P = (lightcone_massmap_data *)Mem.mymalloc_movable_clear(&P, "P", MaxPart * sizeof(lightcone_massmap_data));
+  }
+
+  void reallocate_memory_maxpart(int maxpartNew)
+  {
+    MaxPart = maxpartNew;
+
+    P = (lightcone_massmap_data *)Mem.myrealloc_movable(P, MaxPart * sizeof(lightcone_massmap_data));
+
+    if(NumPart > MaxPart)
+      Terminate("NumPart=%d > MaxPart=%d", NumPart, MaxPart);
+  }
+};
+
+#endif
+
+#endif
diff --git a/src/data/mymalloc.cc b/src/data/mymalloc.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bf29495ec523d5f18ccbe655fcf7e1b3efa3b978
--- /dev/null
+++ b/src/data/mymalloc.cc
@@ -0,0 +1,627 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../logs/logs.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../mpi_utils/shared_mem_handler.h"
+#include "../system/system.h"
+
+/** \file mymalloc.cc
+ *
+ *  \brief Manager for dynamic memory allocation
+ *
+ *  This module handles the dynamic memory allocation.
+ *  To avoid memory allocation/deallocation overhead a big chunk of memory
+ *  (which will be the maximum amount of dynamically allocatable memory)
+ *  is allocated upon initialization. This chunk is then filled by the memory
+ *  blocks as in a stack structure. The blocks are automatically aligned to a CACHELINESIZE byte boundary.
+ *  Memory blocks come in two flavours: movable and non-movable. In non-movable
+ *  blocks the starting address is fixed once the block is allocated and cannot be changed.
+ *  Due to the stack structure of the dynamic memory, this implies that the last (non-movable)
+ *  block allocated must be the first block to be deallocated. If this condition is not met,
+ *  an abort condition is triggered. If more flexibility is needed, movable memory blocks can
+ *  be used. In this case, the starting address of the block is again fixed upon allocation
+ *  but the block can be shifted (therefore its initial address changes) according to needs.
+ *  For a movable block to be successfully shifted it is required that all the subsequent allocated
+ *  blocks are movable. Again, an abort condition is triggered if this condition is not met.
+ *  Movable blocks can be deallocated in any order provided that the condition just described holds.
+ *  The gap resulting from the deallocation of a block that is not in
+ *  the last position will be automatically filled by shifting all the blocks coming after the
+ *  deallocated block.
+ */
+
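+/* Editorial sketch (illustrative usage, not part of the original code): because the allocator is a
+ * stack, non-movable blocks must be freed in reverse order of allocation, e.g.
+ *
+ *   double *a = (double *)Mem.mymalloc("a", 100 * sizeof(double));
+ *   int    *b = (int *)Mem.mymalloc("b", 50 * sizeof(int));
+ *   Mem.myfree(b);   // last allocated, first freed
+ *   Mem.myfree(a);
+ *
+ * The movable variants (mymalloc_movable / myrealloc_movable) relax this by allowing later blocks
+ * to be shifted when an earlier movable block is freed or resized. */
+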
+/** \brief Initialize memory manager.
+ *
+ *  This function initializes the memory manager. In particular, it sets
+ *  the global variables of the module to their initial value and allocates
+ *  the memory for the stack.
+ */
+void memory::mymalloc_init(int maxmemsize, enum restart_options restartflag)
+{
+  BlockSize                    = (size_t *)malloc(MAXBLOCKS * sizeof(size_t));
+  Table                        = (char **)malloc(MAXBLOCKS * sizeof(void *));
+  MovableFlag                  = (char *)malloc(MAXBLOCKS * sizeof(char));
+  GenericFlag                  = (char *)malloc(MAXBLOCKS * sizeof(char));
+  BasePointers                 = (char ***)malloc(MAXBLOCKS * sizeof(void **));
+  VarName                      = (char *)malloc(MAXBLOCKS * MAXCHARS * sizeof(char));
+  FunctionName                 = (char *)malloc(MAXBLOCKS * MAXCHARS * sizeof(char));
+  ParentFileName               = (char *)malloc(MAXBLOCKS * MAXCHARS * sizeof(char));
+  FileName                     = (char *)malloc(MAXBLOCKS * MAXCHARS * sizeof(char));
+  LineNumber                   = (int *)malloc(MAXBLOCKS * sizeof(int));
+  HighMarkTabBuf               = (char *)malloc((100 + 4 * MAXCHARS) * (MAXBLOCKS + 10));
+  HighMarkTabBufWithoutGeneric = (char *)malloc((100 + 4 * MAXCHARS) * (MAXBLOCKS + 10));
+
+  memset(VarName, 0, MAXBLOCKS * MAXCHARS);
+  memset(FunctionName, 0, MAXBLOCKS * MAXCHARS);
+  memset(ParentFileName, 0, MAXBLOCKS * MAXCHARS);
+  memset(FileName, 0, MAXBLOCKS * MAXCHARS);
+
+  size_t n = maxmemsize * ((size_t)1024 * 1024);
+  n        = roundup_to_multiple_of_cacheline_size(n);
+
+  // add an extra cache line size to make sure we can guarantee that base returned from MPI_Win_allocate_shared is aligned
+  n += CACHELINESIZE;
+
+  RestartFlag = restartflag;
+
+  MPI_Barrier(MPI_COMM_WORLD);  // wait until both regular and ghost processors are here
+
+  double t0 = Logs.second();
+
+  MPI_Info win_info;
+  MPI_Info_create(&win_info);
+  MPI_Info_set(win_info, "alloc_shared_noncontig", "true");
+
+  if(MPI_Win_allocate_shared(n, 1, win_info, Shmem.SharedMemComm, &Base, &Shmem.SharedMemWin) != MPI_SUCCESS)
+    Terminate("Failed to allocate memory for `Base' (%d Mbytes).\n", All.MaxMemSize);
+
+  /* we now make sure that the allocated local buffer is really aligned, not all MPI libraries guarantee this */
+
+  int off = 0;
+  if((long long)Base / CACHELINESIZE > 0)
+    off = ((long long)Base / CACHELINESIZE + 1) * CACHELINESIZE - (long long)Base;
+
+  /* align our base */
+  Base += off;
+
+  MPI_Info_free(&win_info);
+
+  double t1 = Logs.second();
+  if(Shmem.World_ThisTask == 0)
+    mpi_printf("MALLOC: Allocation of shared memory took %g sec\n", Logs.timediff(t0, t1));
+
+  TotBytes = FreeBytes = n - CACHELINESIZE;
+
+  AllocatedBytes                  = 0;
+  Nblocks                         = 0;
+  HighMarkBytes                   = 0;
+  HighMarkBytesWithoutGeneric     = 0;
+  OldGlobHighMarkMB               = 0;
+  OldGlobHighMarkMBWithoutGeneric = 0;
+
+  char mode[2], buf[MAXLEN_PATH_EXTRA];
+
+  if(All.RestartFlag == RST_BEGIN)
+    strcpy(mode, "w");
+  else
+    strcpy(mode, "a");
+
+  MPI_Bcast(All.OutputDir, sizeof(All.OutputDir), MPI_BYTE, 0, MPI_COMM_WORLD);
+
+  if(Shmem.GhostRank == 0)
+    sprintf(buf, "%s%s", All.OutputDir, "memory.txt");
+  else
+    sprintf(buf, "%s%s", All.OutputDir, "memory_ghostranks.txt");
+
+  if(!(FdMemory = fopen(buf, mode)))
+    Terminate("error in opening file '%s'\n", buf);
+
+  /* also tell the ghost ranks about the total size of the simulation partition */
+  MPI_Bcast(&Shmem.Sim_NTask, 1, MPI_INT, 0, MPI_COMM_WORLD);
+
+  Shmem.GetGhostRankForSimulCommRank = (int *)mymalloc("GetGhostRankForSimulCommRank", Shmem.Sim_NTask * sizeof(int));
+  Shmem.GetShmRankForSimulCommRank   = (int *)mymalloc("GetShmRankForSimulCommRank", Shmem.Sim_NTask * sizeof(int));
+  Shmem.GetNodeIDForSimulCommRank    = (int *)mymalloc("GetNodeIDForSimulCommRank", Shmem.Sim_NTask * sizeof(int));
+
+  if(Shmem.GhostRank == 0)
+    {
+      MPI_Allgather(&Shmem.MyShmRankInGlobal, 1, MPI_INT, Shmem.GetGhostRankForSimulCommRank, 1, MPI_INT, Shmem.SimulationComm);
+      MPI_Allgather(&Shmem.Island_ThisTask, 1, MPI_INT, Shmem.GetShmRankForSimulCommRank, 1, MPI_INT, Shmem.SimulationComm);
+      MPI_Allgather(&Shmem.Island_Smallest_WorldTask, 1, MPI_INT, Shmem.GetNodeIDForSimulCommRank, 1, MPI_INT, Shmem.SimulationComm);
+    }
+
+  // to make sure that also the ghost processors have this table
+  MPI_Bcast(Shmem.GetGhostRankForSimulCommRank, Shmem.Sim_NTask, MPI_INT, 0, MPI_COMM_WORLD);
+  MPI_Bcast(Shmem.GetShmRankForSimulCommRank, Shmem.Sim_NTask, MPI_INT, 0, MPI_COMM_WORLD);
+
+  /* we also need the base offsets of the other MPI ranks in the same shared memory island */
+  Shmem.SharedMemBaseAddr = (void **)mymalloc("SharedMemBaseAddr", Shmem.Island_NTask * sizeof(void *));
+
+  for(int i = 0; i < Shmem.Island_NTask; i++)
+    {
+      MPI_Aint size;
+      int disp_unit;
+      MPI_Win_shared_query(Shmem.SharedMemWin, i, &size, &disp_unit, &Shmem.SharedMemBaseAddr[i]);
+    }
+
+  // now propagate the alignment correction also to the base addresses that all the other processes see
+  int *off_list = (int *)Mem.mymalloc("off_list", Shmem.Island_NTask * sizeof(int));
+
+  MPI_Allgather(&off, 1, MPI_INT, off_list, 1, MPI_INT, Shmem.SharedMemComm);
+
+  for(int i = 0; i < Shmem.Island_NTask; i++)
+    Shmem.SharedMemBaseAddr[i] = (char *)Shmem.SharedMemBaseAddr[i] + off_list[i];
+
+  Mem.myfree(off_list);
+}
+
+void memory::report_memory_usage(int rank, char *tabbuf)
+{
+  int thistask;
+  MPI_Comm_rank(Communicator, &thistask);
+
+  if(thistask == rank)
+    {
+      char *buf = (char *)mymalloc("buf", (100 + 4 * MAXCHARS) * (Nblocks + 10));
+      int cc    = 0;
+      cc += sprintf(buf + cc, "\nMEMORY:  Largest Allocation = %g Mbyte  |  Largest Allocation Without Generic = %g Mbyte\n\n",
+                    OldGlobHighMarkMB, OldGlobHighMarkMBWithoutGeneric);
+
+      cc += sprintf(buf + cc, "%s", tabbuf);
+      if(thistask == 0)
+        {
+          if(RestartFlag == RST_BEGIN || RestartFlag == RST_RESUME || RestartFlag == RST_STARTFROMSNAP)
+            {
+              fprintf(FdMemory, "%s", buf);
+              fflush(FdMemory);
+            }
+        }
+      else
+        {
+          MPI_Send(&cc, 1, MPI_INT, 0, TAG_N, Communicator);
+          MPI_Send(buf, cc + 1, MPI_BYTE, 0, TAG_PDATA, Communicator);
+        }
+      myfree(buf);
+    }
+
+  if(thistask == 0 && rank > 0)
+    {
+      int cc;
+      MPI_Recv(&cc, 1, MPI_INT, rank, TAG_N, Communicator, MPI_STATUS_IGNORE);
+      char *buf = (char *)mymalloc("buf", cc + 1);
+      MPI_Recv(buf, cc + 1, MPI_BYTE, rank, TAG_PDATA, Communicator, MPI_STATUS_IGNORE);
+      if(RestartFlag == RST_BEGIN || RestartFlag == RST_RESUME || RestartFlag == RST_STARTFROMSNAP)
+        {
+          fprintf(FdMemory, "%s", buf);
+          fflush(FdMemory);
+        }
+      myfree(buf);
+    }
+}
+
+/** \brief Output memory usage for the task with the greatest amount of memory allocated.
+ *
+ */
+void memory::report_detailed_memory_usage_of_largest_task(void)
+{
+  int flag = 0;
+  int thistask;
+  MPI_Comm_rank(Communicator, &thistask);
+
+  struct
+  {
+    double mem;
+    int rank;
+  } local, global;
+
+  local.mem  = HighMarkBytes * TO_MBYTE_FAC;
+  local.rank = thistask;
+
+  MPI_Allreduce(&local, &global, 1, MPI_DOUBLE_INT, MPI_MAXLOC, Communicator);
+
+  if(global.mem >= 1.05 * OldGlobHighMarkMB)
+    {
+      OldGlobHighMarkMB = global.mem;
+      flag |= 1;
+    }
+
+  local.mem  = HighMarkBytesWithoutGeneric * TO_MBYTE_FAC;
+  local.rank = thistask;
+
+  MPI_Allreduce(&local, &global, 1, MPI_DOUBLE_INT, MPI_MAXLOC, Communicator);
+
+  if(global.mem >= 1.05 * OldGlobHighMarkMBWithoutGeneric)
+    {
+      OldGlobHighMarkMBWithoutGeneric = global.mem;
+      flag |= 2;
+    }
+
+  if(flag & 2)
+    report_memory_usage(global.rank, HighMarkTabBufWithoutGeneric);
+
+  if(flag & 1)
+    report_memory_usage(global.rank, HighMarkTabBuf);
+}
+
+/** \brief Dump the buffer where the memory information is stored to the standard output.
+ *
+ */
+void memory::dump_memory_table(void)
+{
+  char *buf = (char *)malloc(200 * (Nblocks + 10));
+  dump_memory_table_buffer(buf);
+  printf("%s", buf);
+  free(buf);
+}
+
+/** \brief Fill the output buffer with the memory log.
+ *
+ *  \param p output buffer
+ *  \return the number of characters written to p
+ */
+int memory::dump_memory_table_buffer(char *p)
+{
+  int cc              = 0;
+  size_t totBlocksize = 0;
+  int thistask;
+  MPI_Comm_rank(Communicator, &thistask);
+
+  cc +=
+      sprintf(p + cc, "-------------------------- Allocated Memory Blocks---- ( Step %8d )------------------\n", All.NumCurrentTiStep);
+  cc += sprintf(p + cc, "Task    Nr F                  Variable      MBytes   Cumulative  Function|File|Linenumber\n");
+  cc += sprintf(p + cc, "------------------------------------------------------------------------------------------\n");
+  for(int i = 0; i < Nblocks; i++)
+    {
+      totBlocksize += BlockSize[i];
+
+      cc += sprintf(p + cc, "%4d %5d %d %40s  %10.4f   %10.4f  %s%s()|%s|%d\n", thistask, i, MovableFlag[i], VarName + i * MAXCHARS,
+                    BlockSize[i] * TO_MBYTE_FAC, totBlocksize * TO_MBYTE_FAC, ParentFileName + i * MAXCHARS,
+                    FunctionName + i * MAXCHARS, FileName + i * MAXCHARS, LineNumber[i]);
+    }
+  cc += sprintf(p + cc, "------------------------------------------------------------------------------------------\n");
+
+  return cc;
+}
+
+/** \brief Allocate a movable memory block and store the relative information.
+ *
+ *  \param ptr pointer to the initial memory address of the block
+ *  \param varname name of the variable to be stored in the allocated block
+ *  \param n size of the memory block in bytes
+ *  \param func name of function that has called the allocation routine (usually given by the __FUNCTION__ macro)
+ *  \param file file where the function that has called the allocation routine resides (usually given by the __FILE__ macro)
+ *  \param line line number of file where the allocation routine was called (usually given by the __LINE__ macro)
+ *  \return a pointer to the beginning of the allocated memory block
+ */
+void *memory::mymalloc_movable_fullinfo(void *ptr, const char *varname, size_t n, const char *func, const char *file, int line,
+                                        int movable_flag, int clear_flag, char *callorigin)
+{
+  if((n % CACHELINESIZE) > 0)
+    n = (n / CACHELINESIZE + 1) * CACHELINESIZE;
+
+  if(n < CACHELINESIZE)
+    n = CACHELINESIZE;
+
+  if(Nblocks >= MAXBLOCKS)
+    Terminate("No blocks left in mymalloc_fullinfo() at %s()/%s/line %d. MAXBLOCKS=%d\n", func, file, line, MAXBLOCKS);
+
+  if(n > FreeBytes)
+    {
+      dump_memory_table();
+      Terminate(
+          "\nNot enough memory in mymalloc_fullinfo() to allocate %g MB for variable '%s' at %s()/%s/line %d (FreeBytes=%g MB).\n",
+          n * TO_MBYTE_FAC, varname, func, file, line, FreeBytes * TO_MBYTE_FAC);
+    }
+  Table[Nblocks] = Base + (TotBytes - FreeBytes);
+  FreeBytes -= n;
+
+  strncpy(VarName + Nblocks * MAXCHARS, varname, MAXCHARS - 1);
+  if(callorigin)
+    {
+      strncpy(ParentFileName + Nblocks * MAXCHARS, callorigin, MAXCHARS - 1);
+      GenericFlag[Nblocks] = 1;
+      AllocatedBytesGeneric += n;
+    }
+  else
+    {
+      memset(ParentFileName + Nblocks * MAXCHARS, 0, MAXCHARS);
+      GenericFlag[Nblocks] = 0;
+    }
+  strncpy(FunctionName + Nblocks * MAXCHARS, func, MAXCHARS - 1);
+  strncpy(FileName + Nblocks * MAXCHARS, file, MAXCHARS - 1);
+  LineNumber[Nblocks] = line;
+
+  AllocatedBytes += n;
+  BlockSize[Nblocks]    = n;
+  MovableFlag[Nblocks]  = movable_flag;
+  BasePointers[Nblocks] = (char **)ptr;
+
+  Nblocks += 1;
+
+  if(AllocatedBytes - AllocatedBytesGeneric > HighMarkBytesWithoutGeneric)
+    {
+      HighMarkBytesWithoutGeneric = AllocatedBytes - AllocatedBytesGeneric;
+      dump_memory_table_buffer(HighMarkTabBufWithoutGeneric);
+    }
+
+  if(AllocatedBytes > HighMarkBytes)
+    {
+      HighMarkBytes = AllocatedBytes;
+      dump_memory_table_buffer(HighMarkTabBuf);
+    }
+
+  if(clear_flag)
+    memset(Table[Nblocks - 1], 0, n);
+
+  return Table[Nblocks - 1];
+}
+
+size_t memory::roundup_to_multiple_of_cacheline_size(size_t n)
+{
+  if((n % CACHELINESIZE) > 0)
+    n = (n / CACHELINESIZE + 1) * CACHELINESIZE;
+
+  return n;
+}
+
+void *memory::myfree_query_last_block(void)
+{
+  if(Nblocks == 0)
+    Terminate("no allocated blocks that could be returned");
+
+  return Table[Nblocks - 1];
+}
+
+/** \brief Deallocate a movable memory block.
+ *
+ *  For this operation to be successful all the blocks allocated after the block that has to be freed must be of movable type.
+ *
+ *  \param p pointer to the memory block to be deallocated
+ *  \param func name of function that has called the deallocation routine (usually given by the __FUNCTION__ macro)
+ *  \param file file where the function that has called the deallocation routine resides (usually given by the __FILE__ macro)
+ *  \param line line number of file where the deallocation routine was called (usually given by the __LINE__ macro)
+ */
+void memory::myfree_movable_fullinfo(void *p, const char *func, const char *file, int line, int movable_flag)
+{
+  if(Nblocks == 0)
+    Terminate("no allocated blocks that could be freed");
+
+  /* first, let's find the block */
+  int nr;
+
+  if(movable_flag)
+    {
+      for(nr = Nblocks - 1; nr >= 0; nr--)
+        if(p == Table[nr])
+          break;
+
+      if(nr < 0)
+        {
+          dump_memory_table();
+          Terminate("Wrong call of myfree_movable() from %s()/%s/line %d - this block has not been allocated!\n", func, file, line);
+        }
+
+      if(nr < Nblocks - 1) /* the block is not the last allocated block */
+        {
+          /* check that all subsequent blocks are actually movable */
+          for(int i = nr + 1; i < Nblocks; i++)
+            if(MovableFlag[i] == 0)
+              {
+                dump_memory_table();
+                myflush(stdout);
+                Terminate(
+                    "Wrong call of myfree_movable() from %s()/%s/line %d - behind block=%d there are subsequent non-movable allocated "
+                    "blocks\n",
+                    func, file, line, nr);
+              }
+        }
+    }
+  else
+    {
+      nr = Nblocks - 1;
+      if(p != Table[nr])
+        {
+          dump_memory_table();
+          Terminate("Wrong call of myfree() at %s()/%s/line %d: not the last allocated block!\n", func, file, line);
+        }
+    }
+
+  if(GenericFlag[nr])
+    AllocatedBytesGeneric -= BlockSize[nr];
+
+  AllocatedBytes -= BlockSize[nr];
+  FreeBytes += BlockSize[nr];
+
+  if(BasePointers[nr])
+    *BasePointers[nr] = NULL;
+
+  if(movable_flag)
+    {
+      ptrdiff_t offset = -BlockSize[nr];
+      size_t length    = 0;
+
+      for(int i = nr + 1; i < Nblocks; i++)
+        length += BlockSize[i];
+
+      if(nr < Nblocks - 1)
+        memmove(Table[nr + 1] + offset, Table[nr + 1], length);
+
+      for(int i = nr + 1; i < Nblocks; i++)
+        {
+          Table[i] += offset;
+          *BasePointers[i] = *BasePointers[i] + offset;
+        }
+
+      for(int i = nr + 1; i < Nblocks; i++)
+        {
+          Table[i - 1]        = Table[i];
+          BasePointers[i - 1] = BasePointers[i];
+          BlockSize[i - 1]    = BlockSize[i];
+          MovableFlag[i - 1]  = MovableFlag[i];
+          GenericFlag[i - 1]  = GenericFlag[i];
+
+          memmove(VarName + (i - 1) * MAXCHARS, VarName + i * MAXCHARS, MAXCHARS);
+          memmove(FunctionName + (i - 1) * MAXCHARS, FunctionName + i * MAXCHARS, MAXCHARS);
+          memmove(ParentFileName + (i - 1) * MAXCHARS, ParentFileName + i * MAXCHARS, MAXCHARS);
+          memmove(FileName + (i - 1) * MAXCHARS, FileName + i * MAXCHARS, MAXCHARS);
+          LineNumber[i - 1] = LineNumber[i];
+        }
+    }
+
+  Nblocks -= 1;
+}
+
+/** \brief Reallocate an existing movable memory block.
+ *
+ *  For this operation to be successful all the blocks allocated after the block that has to be reallocated must be of movable type.
+ *
+ *  \param p pointer to the existing memory block to be reallocated
+ *  \param n the new size of the memory block in bytes
+ *  \param func name of function that has called the reallocation routine (usually given by the __FUNCTION__ macro)
+ *  \param file file where the function that has called the reallocation routine resides (usually given by the __FILE__ macro)
+ *  \param line line number of file where the reallocation routine was called (usually given by the __LINE__ macro)
+ *  \return a pointer to the beginning of the newly allocated memory block
+ */
+void *memory::myrealloc_movable_fullinfo(void *p, size_t n, const char *func, const char *file, int line, int movable_flag)
+{
+  if((n % CACHELINESIZE) > 0)
+    n = (n / CACHELINESIZE + 1) * CACHELINESIZE;
+
+  if(n < CACHELINESIZE)
+    n = CACHELINESIZE;
+
+  if(Nblocks == 0)
+    Terminate("no allocated blocks that could be reallocated");
+
+  /* first, let's find the block */
+  int nr;
+
+  if(movable_flag)
+    {
+      for(nr = Nblocks - 1; nr >= 0; nr--)
+        if(p == Table[nr])
+          break;
+
+      if(nr < 0)
+        {
+          dump_memory_table();
+          Terminate("Wrong call of myrealloc_movable() from %s()/%s/line %d - this block has not been allocated!\n", func, file, line);
+        }
+
+      if(nr < Nblocks - 1) /* the block is not the last allocated block */
+        {
+          /* check that all subsequent blocks are actually movable */
+          for(int i = nr + 1; i < Nblocks; i++)
+            if(MovableFlag[i] == 0)
+              {
+                dump_memory_table();
+                Terminate(
+                    "Wrong call of myrealloc_movable() from %s()/%s/line %d - behind block=%d there are subsequent non-movable "
+                    "allocated blocks\n",
+                    func, file, line, nr);
+              }
+        }
+    }
+  else
+    {
+      nr = Nblocks - 1;
+
+      if(p != Table[nr])
+        {
+          dump_memory_table();
+          Terminate("Wrong call of myrealloc() at %s()/%s/line %d - not the last allocated block!\n", func, file, line);
+        }
+    }
+
+  if(GenericFlag[nr])
+    Terminate("unexpected");
+
+  AllocatedBytes -= BlockSize[nr];
+  FreeBytes += BlockSize[nr];
+
+  if(n > FreeBytes)
+    {
+      dump_memory_table();
+      Terminate("At %s()/%s/line %d: Not enough memory in myremalloc_movable(n=%g MB). previous=%g FreeBytes=%g MB\n", func, file,
+                line, n * TO_MBYTE_FAC, BlockSize[nr] * TO_MBYTE_FAC, FreeBytes * TO_MBYTE_FAC);
+    }
+
+  ptrdiff_t offset = n - BlockSize[nr];
+  size_t length    = 0;
+
+  for(int i = nr + 1; i < Nblocks; i++)
+    length += BlockSize[i];
+
+  if(nr < Nblocks - 1)
+    memmove(Table[nr + 1] + offset, Table[nr + 1], length);
+
+  for(int i = nr + 1; i < Nblocks; i++)
+    {
+      Table[i] += offset;
+
+      *BasePointers[i] = *BasePointers[i] + offset;
+    }
+
+  FreeBytes -= n;
+  AllocatedBytes += n;
+  BlockSize[nr] = n;
+
+  if(AllocatedBytes > HighMarkBytes)
+    {
+      HighMarkBytes = AllocatedBytes;
+      dump_memory_table_buffer(HighMarkTabBuf);
+    }
+
+  return Table[nr];
+}
+
+void memory::check_maxmemsize_setting(int maxmemsize)
+{
+  int errflag = 0, errflag_tot;
+
+  if(maxmemsize > (MemoryOnNode / 1024.0 / TasksInThisNode) && RankInThisNode == 0)
+    {
+      char name[MPI_MAX_PROCESSOR_NAME];
+      int len;
+      MPI_Get_processor_name(name, &len);
+
+      printf("On node '%s', we have %d MPI ranks and at most %g MB available. This is not enough space for MaxMemSize = %g MB\n", name,
+             TasksInThisNode, MemoryOnNode / 1024.0, (double)maxmemsize);
+      errflag = 1;
+      fflush(stdout);
+    }
+
+  if(maxmemsize > (SharedMemoryOnNode / 1024.0 / TasksInThisNode) && RankInThisNode == 0)
+    {
+      char name[MPI_MAX_PROCESSOR_NAME];
+      int len;
+      MPI_Get_processor_name(name, &len);
+
+      printf(
+          "On node '%s', we have %d MPI ranks and at most %g MB of *shared* memory available. This is not enough space for MaxMemSize "
+          "= %g MB\n",
+          name, TasksInThisNode, SharedMemoryOnNode / 1024.0, (double)maxmemsize);
+      errflag = 1;
+      fflush(stdout);
+    }
+
+  MPI_Allreduce(&errflag, &errflag_tot, 1, MPI_INT, MPI_MAX, Communicator);
+
+  if(errflag_tot)
+    Terminate("At least one node has insufficient memory");
+}
diff --git a/src/data/mymalloc.h b/src/data/mymalloc.h
new file mode 100644
index 0000000000000000000000000000000000000000..86a05bdae2f76370af92e8b2e8daf3c8c8fd36ba
--- /dev/null
+++ b/src/data/mymalloc.h
@@ -0,0 +1,140 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file mymalloc.h
+ *
+ *  \brief declares class that organizes the dynamic memory allocation
+ */
+
+#ifndef MYMALLOC_H
+#define MYMALLOC_H
+
+#include <stdio.h>
+
+#define CACHELINESIZE 64
+
+#define MAXBLOCKS 5000
+#define MAXCHARS 40
+
+#define LOC __FUNCTION__, __FILE__, __LINE__
+#define MMM(x, y) (x, #x, y, __FUNCTION__, __FILE__, __LINE__)
+#define DDD(x) (x, __FUNCTION__, __FILE__, __LINE__)
+
+#define mymalloc(x, y) mymalloc_movable_fullinfo(NULL, x, y, __FUNCTION__, __FILE__, __LINE__, 0, 0, NULL)
+#define mymalloc_clear(x, y) mymalloc_movable_fullinfo(NULL, x, y, __FUNCTION__, __FILE__, __LINE__, 0, 1, NULL)
+#define mymalloc_g(x, y) mymalloc_movable_fullinfo(NULL, x, y, __FUNCTION__, __FILE__, __LINE__, 0, 0, callorigin)
+
+#define mymalloc_movable(x, y, z) mymalloc_movable_fullinfo(x, y, z, __FUNCTION__, __FILE__, __LINE__, 1, 0, NULL)
+#define mymalloc_movable_clear(x, y, z) mymalloc_movable_fullinfo(x, y, z, __FUNCTION__, __FILE__, __LINE__, 1, 1, NULL)
+#define mymalloc_movable_g(x, y, z) mymalloc_movable_fullinfo(x, y, z, __FUNCTION__, __FILE__, __LINE__, 1, 0, callorigin)
+
+#define myfree(x) myfree_movable_fullinfo(x, __FUNCTION__, __FILE__, __LINE__, 0)
+#define myfree_movable(x) myfree_movable_fullinfo(x, __FUNCTION__, __FILE__, __LINE__, 1)
+
+#define myrealloc(x, y) myrealloc_movable_fullinfo(x, y, __FUNCTION__, __FILE__, __LINE__, 0)
+#define myrealloc_movable(x, y) myrealloc_movable_fullinfo(x, y, __FUNCTION__, __FILE__, __LINE__, 1)
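+
+/* Illustrative usage sketch (comment only): the macros above record the call site
+ * automatically via __FUNCTION__, __FILE__ and __LINE__, e.g.
+ *
+ *   int *buf = (int *)Mem.mymalloc("buf", 100 * sizeof(int));   // ordinary (non-movable) block
+ *   Mem.myfree(buf);                                            // must be the last allocated block
+ *
+ *   P = (particle_data *)Mem.mymalloc_movable(&P, "P", MaxPart * sizeof(particle_data));
+ *   Mem.myfree_movable(P);   // all blocks allocated after P must be movable
+ */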
+
+#include "../mpi_utils/setcomm.h"
+
+class memory : public setcomm
+{
+ public:
+  memory() : setcomm("delayed init") {}
+
+  size_t AllocatedBytes;
+  size_t FreeBytes;
+  char *Base; /**< Base pointer (initial memory address) of the stack. */
+
+  void mymalloc_init(int maxmemsize, enum restart_options restartflag);
+
+  void *mymalloc_movable_fullinfo(void *ptr, const char *varname, size_t n, const char *func, const char *file, int line,
+                                  int movable_flag, int clear_flag, char *callorigin);
+
+  void *myrealloc_movable_fullinfo(void *p, size_t n, const char *func, const char *file, int line, int movable_flag);
+
+  void myfree_movable_fullinfo(void *p, const char *func, const char *file, int line, int movable_flag);
+
+  void *myfree_query_last_block(void);
+
+  size_t roundup_to_multiple_of_cacheline_size(size_t n);
+
+  void report_detailed_memory_usage_of_largest_task(void);
+
+  void check_maxmemsize_setting(int maxmemsize);
+
+  inline double getAllocatedBytesInMB(void) { return AllocatedBytes * TO_MBYTE_FAC; }
+
+  template <typename T>
+  inline T *alloc(T *&ptr, const char *varname, size_t n, const char *func, const char *file, int linenr)
+  {
+    return static_cast<T *>(mymalloc_movable_fullinfo(&ptr, varname, n * sizeof(T), func, file, linenr, 0, 0, NULL));
+  }
+
+  template <typename T>
+  inline T *alloc_movable(T *&ptr, const char *varname, size_t n, const char *func, const char *file, int linenr)
+  {
+    return static_cast<T *>(mymalloc_movable_fullinfo(&ptr, varname, n * sizeof(T), func, file, linenr, 1, 0, NULL));
+  }
+
+  template <typename T>
+  inline T *realloc(T *&ptr, const char *varname, size_t n, const char *func, const char *file, int linenr)
+  {
+    return static_cast<T *>(myrealloc_movable_fullinfo(&ptr, n * sizeof(T), func, file, linenr, 0));
+  }
+
+  template <typename T>
+  inline T *realloc_movable(T *&ptr, const char *varname, size_t n, const char *func, const char *file, int linenr)
+  {
+    return static_cast<T *>(myrealloc_movable_fullinfo(&ptr, n * sizeof(T), func, file, linenr, 1));
+  }
+
+  void dealloc(void *ptr, const char *func, const char *file, int linenr) { myfree_movable_fullinfo(ptr, func, file, linenr, 0); }
+
+  void dealloc_movable(void *ptr, const char *func, const char *file, int linenr)
+  {
+    myfree_movable_fullinfo(ptr, func, file, linenr, 1);
+  }
+
+  void dump_memory_table(void);
+
+ private:
+  size_t AllocatedBytesGeneric;
+
+  size_t HighMarkBytes;
+  size_t HighMarkBytesWithoutGeneric;
+
+  double OldGlobHighMarkMB;
+  double OldGlobHighMarkMBWithoutGeneric;
+
+  FILE *FdMemory; /**< file handle for memory.txt log-file. */
+
+  size_t TotBytes; /**< The total size (in bytes) of dynamic memory available to the current task. */
+  int Nblocks;     /**< The current number of allocated memory blocks. */
+
+  char **Table;         /**< Table containing the initial addresses of the allocated memory blocks.*/
+  size_t *BlockSize;    /**< Array containing the size (in bytes) of all the allocated memory blocks. */
+  char *MovableFlag;    /**< Identifies whether a block is movable. */
+  char *GenericFlag;    /**< Identifies whether a block was allocated by one of the generic allocation routines. */
+  char ***BasePointers; /**< Base pointers containing the initial addresses of movable memory blocks */
+  char *VarName;        /**< The name of the variable with which the block has been allocated. */
+  char *FunctionName;   /**< The name of the function that allocated the memory block. */
+  char *ParentFileName; /**< The location from which the generic allocation routines were called. */
+  char *FileName;       /**< The name of the file in which the allocation call resides. */
+  int *LineNumber;      /**< The line number in FileName where the function that allocated the block has been called. */
+  char *HighMarkTabBuf; /**< This is a buffer that holds the log-file output corresponding to the largest memory use that has occurred
+                           on this task */
+  char *HighMarkTabBufWithoutGeneric; /**< This is a buffer that holds the log-file output corresponding to the largest memory use that
+                                         has occurred on this task */
+  enum restart_options RestartFlag;
+
+  int dump_memory_table_buffer(char *p);
+
+  void report_memory_usage(int rank, char *tabbuf);
+};
+
+extern memory Mem;
+
+#endif
diff --git a/src/data/particle_data.h b/src/data/particle_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..6cf01efef46a15e66fc03edf24a2784b2aa37512
--- /dev/null
+++ b/src/data/particle_data.h
@@ -0,0 +1,305 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file particle_data.h
+ *
+ *  \brief declares a structure that holds the data stored for a single particle
+ */
+
+#ifndef PARTDATA_H
+#define PARTDATA_H
+
+#include "gadgetconfig.h"
+
+#include <atomic>
+#include <climits>
+
+#include "../data/constants.h"
+#include "../data/dtypes.h"
+#include "../data/idstorage.h"
+#include "../data/intposconvert.h"
+#include "../data/macros.h"
+#include "../data/mymalloc.h"
+#include "../data/symtensors.h"
+#include "../mpi_utils/setcomm.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+
+/** This structure holds all the information that is
+ * stored for each particle of the simulation.
+ */
+struct particle_data
+{
+  // we use memcpy to implement our own copy constructor and assignment operator because the
+  // atomic_flag member of particle_data has an implicitly deleted copy operator, which makes the
+  // implicitly generated copy functions unavailable. But we know what we are doing here, and this
+  // ugly hack is at the moment the easiest way to work around this protection, which is
+  // unnecessary in our case
+
+  particle_data() {}
+
+  // declare our own copy constructor
+  particle_data(particle_data& other) { memcpy(static_cast<void*>(this), static_cast<void*>(&other), sizeof(particle_data)); }
+
+  // declare our own assignment operator
+  particle_data& operator=(particle_data& other)
+  {
+    memcpy(static_cast<void*>(this), static_cast<void*>(&other), sizeof(particle_data));
+    return *this;
+  }
+
+  MyIntPosType IntPos[3];    /**< particle position at its current time, stored as an integer type */
+  MyFloat Vel[3];            /**< particle velocity at its current time */
+  vector<MyFloat> GravAccel; /**< particle acceleration due to gravity */
+#if defined(PMGRID) && defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+  MyFloat GravPM[3]; /**< particle acceleration due to long-range PM gravity force */
+#endif
+
+  std::atomic<integertime> Ti_Current; /**< current time on integer timeline */
+  float OldAcc;                        /**< magnitude of old gravitational force. Used in relative opening criterion */
+  int GravCost;                        /**< weight factors used for balancing the work-load */
+
+#ifndef LEAN
+ private:
+  MyDouble Mass; /**< particle mass */
+ public:
+#endif
+
+  MyIDStorage ID;           // 6-byte
+  signed char TimeBinGrav;  // 1-byte
+#ifndef LEAN
+  signed char TimeBinHydro;
+#endif
+#if defined(MERGERTREE) && defined(SUBFIND)
+  compactrank_t PrevRankInSubhalo;  // 1-byte
+  MyHaloNrType PrevSubhaloNr;       // 6-byte
+  approxlen PrevSizeOfSubhalo;      // 2-byte
+#endif
+
+#ifndef LEAN
+ private:
+  unsigned char Type; /**< flags particle type.  0=gas, 1=halo, 2=disk, 3=bulge, 4=stars, 5=bndry */
+ public:
+#endif
+
+#ifndef LEAN
+  std::atomic_flag access;
+#endif
+
+#ifdef REARRANGE_OPTION
+  unsigned long long TreeID;
+#endif
+
+#if NSOFTCLASSES > 1
+ private:
+  unsigned char
+      SofteningClass : 7; /* we use only 7 bits here so that we can stuff 1 bit for ActiveFlag into it in the Tree_Points structure */
+ public:
+#endif
+
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+  unsigned char InsideOutsideFlag : 1;
+#endif
+
+#ifdef FORCETEST
+  MyFloat GravAccelDirect[3]; /*!< particle acceleration calculated by direct summation */
+  MyFloat PotentialDirect;
+  MyFloat DistToID1;
+#ifdef PMGRID
+  MyFloat GravAccelShortRange[3];
+  MyFloat PotentialShortRange;
+#ifdef PLACEHIGHRESREGION
+  MyFloat GravAccelVeryShortRange[3];
+  MyFloat PotentialVeryShortRange;
+  MyFloat PotentialHPM;
+  MyFloat GravAccelHPM[3];
+#endif
+#endif
+#ifdef FORCETEST_FIXEDPARTICLESET
+  bool SelectedFlag;
+#endif
+#endif
+
+#if defined(EVALPOTENTIAL) || defined(OUTPUT_POTENTIAL)
+  MyFloat Potential; /**< gravitational potential */
+#if defined(PMGRID)
+  MyFloat PM_Potential;
+#endif
+#ifdef EXTERNALGRAVITY
+  MyFloat ExtPotential;
+#endif
+#endif
+
+#ifdef STARFORMATION
+  MyFloat StellarAge;  /**< formation time of star particle */
+  MyFloat Metallicity; /**< metallicity of gas or star particle */
+#endif
+
+  inline unsigned char getType(void)
+  {
+#ifdef LEAN
+    return 1;
+#else
+    return Type;
+#endif
+  }
+
+  inline unsigned char getTimeBinHydro(void)
+  {
+#ifndef LEAN
+    return TimeBinHydro;
+#else
+    return 0;
+#endif
+  }
+
+  inline void setTimeBinHydro(unsigned char bin)
+  {
+#ifndef LEAN
+    TimeBinHydro = bin;
+#endif
+  }
+
+  inline void setType(unsigned char type)
+  {
+#ifndef LEAN
+    Type = type;
+#endif
+  }
+
+  inline float getOldAcc(void) { return OldAcc; }
+
+  inline int getGravCost(void) { return GravCost; }
+
+  inline MyDouble getMass(void)
+  {
+#ifdef LEAN
+    return All.PartMass;
+#else
+    return Mass;
+#endif
+  }
+
+  inline void setMass(MyDouble mass)
+  {
+#ifndef LEAN
+    Mass = mass;
+#endif
+  }
+
+  inline integertime get_Ti_Current(void) { return Ti_Current; }
+
+  inline signed char getTimeBinGrav(void) { return TimeBinGrav; }
+
+  inline unsigned char getSofteningClass(void)
+  {
+#if NSOFTCLASSES > 1
+    return SofteningClass;
+#else
+    return 0;
+#endif
+  }
+
+  inline void setSofteningClass(unsigned char softclass)
+  {
+#if NSOFTCLASSES > 1
+    SofteningClass = softclass;
+#endif
+  }
+
+  inline double getAscale(void) { return All.Time; }
+
+#if defined(LIGHTCONE_PARTICLES_GROUPS)
+  inline void setFlagSaveDistance(void) {}
+  inline void clearFlagSaveDistance(void) {}
+
+  inline bool getFlagSaveDistance(void) { return true; }
+#endif
+};
+
+struct subfind_data
+{
+  MyHaloNrType GroupNr;
+#if defined(MERGERTREE)
+  MyHaloNrType SubhaloNr;
+  approxlen SizeOfSubhalo;
+  compactrank_t RankInSubhalo;
+#endif
+  char DomainFlag;
+
+  int OriginIndex, OriginTask;
+  int TargetIndex, TargetTask;
+
+#ifdef SUBFIND
+  int SubRankInGr;
+
+#ifndef SUBFIND_HBT
+  struct nearest_ngb_data
+  {
+    location index[2];
+    int count;
+  };
+
+  nearest_ngb_data nearest;
+
+  int submark;
+  int InvIndex;
+#endif
+
+#ifndef LEAN
+  int Type;
+  MyFloat Utherm;
+#endif
+
+#ifdef SUBFIND_STORE_LOCAL_DENSITY
+  MyFloat SubfindHsml;     // search radius used for SUBFIND dark matter neighborhood
+  MyFloat SubfindDensity;  // total matter density
+  MyFloat SubfindVelDisp;  // 3D dark matter velocity dispersion
+#endif
+
+  union
+  {
+    struct
+    {
+      int originindex, origintask;
+
+      union
+      {
+        MyFloat DM_Density;
+        MyFloat DM_Potential;
+      } u;
+
+    } s;
+
+    peanokey Key;
+  } u;
+
+  union
+  {
+    MyFloat DM_Hsml;
+    MyFloat DM_BindingEnergy;
+  } v;
+#else
+  /* these fields are defined when we have FOF without SUBFIND */
+#ifndef LEAN
+  int Type;
+#endif
+  union
+  {
+    peanokey Key;
+  } u;
+#endif
+};
+
+#ifdef SUBFIND_ORPHAN_TREATMENT
+struct idstoredata
+{
+  int NumPart;
+  MyIDType* ID;
+};
+
+#endif
+
+#endif
diff --git a/src/data/simparticles.h b/src/data/simparticles.h
new file mode 100644
index 0000000000000000000000000000000000000000..2bf084d26fd0611d55b442bc2118024184572148
--- /dev/null
+++ b/src/data/simparticles.h
@@ -0,0 +1,535 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file simparticles.h
+ *
+ *  \brief class for organizing the storage of the actual simulation particles
+ */
+
+#ifndef SIMPART_H
+#define SIMPART_H
+
+#include <math.h>
+
+#include "../data/allvars.h"
+#include "../data/constants.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/macros.h"
+#include "../data/mymalloc.h"
+#include "../data/particle_data.h"
+#include "../data/sph_particle_data.h"
+#include "../main/main.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../mpi_utils/setcomm.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+
+#ifdef LIGHTCONE
+class lightcone;
+#endif
+
+class simparticles : public intposconvert, public setcomm
+{
+ public:
+  simparticles(MPI_Comm comm) : setcomm(comm) {}
+
+  int NumPart; /**< number of particles on the LOCAL processor */
+  int NumGas;  /**< number of gas particles on the LOCAL processor  */
+
+  int MaxPart;    /**< This gives the maximum number of particles that can be stored on one processor. */
+  int MaxPartSph; /**< This gives the maximum number of SPH particles that can be stored on one processor. */
+
+  long long TotNumPart; /**<  total particle numbers (global value) */
+  long long TotNumGas;  /**<  total gas particle number (global value) */
+
+  typedef particle_data pdata;
+
+  /*! This structure holds all the information that is
+   * stored for each particle of the simulation.
+   */
+  particle_data *P; /*!< holds particle data on local processor */
+
+  /* the following structure holds data that is stored for each SPH particle in addition to the collisionless
+   * variables.
+   */
+  sph_particle_data *SphP; /*!< holds SPH particle data on local processor */
+
+  unsigned short int MarkerValue;
+
+  subfind_data *PS;
+
+  inline void copy_particle(particle_data *Ptarget, particle_data *Psource)
+  {
+    // we use memcpy here because the atomic_flag in particle_data has an implicitly deleted copy operator...
+    // but we know what we are doing, and this is at the moment the easiest way to work around this protection, which is unnecessary in our case
+    memcpy(static_cast<void *>(Ptarget), static_cast<void *>(Psource), sizeof(particle_data));
+  }
+
+  static bool inline compare_IDs(const MyIDType &a, const MyIDType &b) { return a < b; }
+
+#if defined(LIGHTCONE_PARTICLES_GROUPS) && defined(FOF)
+  double *DistanceOrigin;
+#endif
+
+#ifdef SUBFIND_ORPHAN_TREATMENT
+  idstoredata IdStore;
+  static inline bool compare_SpP_ID(const particle_data &a, const particle_data &b) { return a.ID.get() < b.ID.get(); }
+#endif
+
+#ifdef LIGHTCONE
+  lightcone *LightCone;
+#endif
+
+#ifdef LGALAXIES
+  parttrees_table PartTreeTable[1];  // need only one entry
+#endif
+
+#ifdef FOF
+  MyIDStorage *MinID;
+  int *Len;  // 32 bits are sufficient here because only the group segments on the local processor are treated
+  int *Head, *Next, *Tail, *MinIDTask;
+  MyFloat *fof_nearest_distance;
+  MyFloat *fof_nearest_hsml;
+
+  struct bit_flags
+  {
+    unsigned char Nonlocal : 2, MinIDChanged : 2, Marked : 2;
+  } * Flags;
+
+  double LinkL;
+
+  inline void link_two_particles(int target, int j)
+  {
+    if(Head[target] != Head[j]) /* only if not yet linked */
+      {
+        int p, s;
+        if(Len[Head[target]] > Len[Head[j]]) /* p group is longer */
+          {
+            p = target;
+            s = j;
+          }
+        else
+          {
+            p = j;
+            s = target;
+          }
+        Next[Tail[Head[p]]] = Head[s];
+
+        Tail[Head[p]] = Tail[Head[s]];
+
+        Len[Head[p]] += Len[Head[s]];
+
+        if(MinID[Head[s]].get() < MinID[Head[p]].get())
+          {
+            MinID[Head[p]]     = MinID[Head[s]];
+            MinIDTask[Head[p]] = MinIDTask[Head[s]];
+          }
+
+        int ss = Head[s];
+        do
+          Head[ss] = Head[p];
+        while((ss = Next[ss]) >= 0);
+      }
+  }
+
+#ifdef SUBFIND
+  struct nearest_r2_data
+  {
+    double dist[2];
+  } * R2Loc;
+
+#endif
+#endif
+
+#ifdef PMGRID
+  double Asmth[2], Rcut[2];
+#endif
+
+#if defined(PMGRID) && (!defined(PERIODIC) || defined(PLACEHIGHRESREGION))
+  double TotalMeshSize[2]; /* this is in integer space but should be double here to protect against overflows */
+  MySignedIntPosType Corner[2][3];
+  MySignedIntPosType Xmintot[2][3], Xmaxtot[2][3];
+  MyIntPosType MeshSize[2][3];
+  MyIntPosType Left[2][3];
+  MyIntPosType OldMeshSize[2];
+  MyIntPosType ReferenceIntPos[2][3];
+  MyIntPosType PlacingMask;
+  MyIntPosType PlacingBlocksize;
+#endif
+
+#ifdef PLACEHIGHRESREGION
+  inline int check_high_res_overlap(MyIntPosType *center, MyIntPosType halflen)
+  {
+    MyIntPosType intleft[3] = {center[0] - halflen - ReferenceIntPos[HIGH_MESH][0],
+                               center[1] - halflen - ReferenceIntPos[HIGH_MESH][1],
+                               center[2] - halflen - ReferenceIntPos[HIGH_MESH][2]};
+
+    MyIntPosType intright[3] = {center[0] + halflen - ReferenceIntPos[HIGH_MESH][0],
+                                center[1] + halflen - ReferenceIntPos[HIGH_MESH][1],
+                                center[2] + halflen - ReferenceIntPos[HIGH_MESH][2]};
+
+    MySignedIntPosType *left  = (MySignedIntPosType *)intleft;
+    MySignedIntPosType *right = (MySignedIntPosType *)intright;
+
+    if(right[0] <= Xmintot[HIGH_MESH][0] || left[0] >= Xmaxtot[HIGH_MESH][0] || right[1] <= Xmintot[HIGH_MESH][1] ||
+       left[1] >= Xmaxtot[HIGH_MESH][1] || right[2] <= Xmintot[HIGH_MESH][2] || left[2] >= Xmaxtot[HIGH_MESH][2])
+      return FLAG_OUTSIDE;
+    else if(right[0] <= Xmaxtot[HIGH_MESH][0] && left[0] >= Xmintot[HIGH_MESH][0] && right[1] <= Xmaxtot[HIGH_MESH][1] &&
+            left[1] >= Xmintot[HIGH_MESH][1] && right[2] <= Xmaxtot[HIGH_MESH][2] && left[2] >= Xmintot[HIGH_MESH][2])
+      return FLAG_INSIDE;
+    else
+      return FLAG_BOUNDARYOVERLAP;
+  }
+
+  inline int check_high_res_point_location(MyIntPosType *intpos)
+  {
+    MyIntPosType relpos[3] = {intpos[0] - ReferenceIntPos[HIGH_MESH][0], intpos[1] - ReferenceIntPos[HIGH_MESH][1],
+                              intpos[2] - ReferenceIntPos[HIGH_MESH][2]};
+
+    MySignedIntPosType *pos = (MySignedIntPosType *)relpos;
+
+    if(pos[0] < Xmintot[HIGH_MESH][0] || pos[0] >= Xmaxtot[HIGH_MESH][0] || pos[1] < Xmintot[HIGH_MESH][1] ||
+       pos[1] >= Xmaxtot[HIGH_MESH][1] || pos[2] < Xmintot[HIGH_MESH][2] || pos[2] >= Xmaxtot[HIGH_MESH][2])
+      return FLAG_OUTSIDE;
+    else
+      return FLAG_INSIDE;
+  }
+
+#endif
+
+  int TimeBinSynchronized[TIMEBINS];
+  TimeBinData TimeBinsHydro;
+  TimeBinData TimeBinsGravity;
+
+  int nsource;
+  int *indexlist;
+
+#ifdef STARFORMATION
+  double TimeBinSfr[TIMEBINS];
+#endif
+
+  inline int getTimeBinSynchronized(int bin) { return TimeBinSynchronized[bin]; }
+
+#ifdef REARRANGE_OPTION
+  static bool compare_TreeID_ID(const particle_data &a, const particle_data &b)
+  {
+    if(a.TreeID < b.TreeID)
+      return true;
+
+    if(a.TreeID > b.TreeID)
+      return false;
+
+    return a.ID.get() < b.ID.get();
+  }
+
+  static bool compare_ID(const particle_data &a, const particle_data &b) { return a.ID.get() < b.ID.get(); }
+#endif
+
+  inline MyFloat get_DtHsml(int i) { return SphP[i].DtHsml; }
+
+  inline MyFloat get_Csnd(int i) { return SphP[i].Csnd; }
+
+  inline MyFloat get_OldAcc(int i) { return P[i].OldAcc; }
+
+  /* returns the internal energy per unit mass of particle i, computed from its entropy */
+  inline double get_utherm_from_entropy(int i)
+  {
+#ifdef ISOTHERM_EQS
+    return SphP[i].Entropy;
+#else
+    double fact_entropy_to_u = pow(SphP[i].Density * All.cf_a3inv, GAMMA_MINUS1) / GAMMA_MINUS1;
+    return SphP[i].Entropy * fact_entropy_to_u;
+#endif
+  }
+
+  /* sets the entropy of particle i from its internal energy per unit mass */
+  inline void set_entropy_from_utherm(double utherm, int i)
+  {
+    double fact_u_to_entropy = GAMMA_MINUS1 / pow(SphP[i].Density * All.cf_a3inv, GAMMA_MINUS1);
+    SphP[i].Entropy          = utherm * fact_u_to_entropy;
+    SphP[i].EntropyPred      = SphP[i].Entropy;
+
+#ifdef PRESSURE_ENTROPY_SPH
+    SphP[i].EntropyToInvGammaPred = pow(SphP[i].EntropyPred, 1.0 / GAMMA);
+#endif
+  }
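+
+  /* Illustrative note (comment only): in the entropy formulation used here, the entropic
+   * function A and the thermal energy per unit mass u are related through
+   *
+   *   u = A * rho_phys^(GAMMA-1) / (GAMMA-1),   with  rho_phys = Density * All.cf_a3inv,
+   *
+   * which is exactly the conversion applied by get_utherm_from_entropy() and inverted by
+   * set_entropy_from_utherm() above.
+   */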
+
+  void fill_active_gravity_list_with_all_particles(void)
+  {
+    TimeBinsGravity.NActiveParticles = 0;
+
+    for(int i = 0; i < NumPart; i++)
+      TimeBinsGravity.ActiveParticleList[TimeBinsGravity.NActiveParticles++] = i;
+  }
+
+  /* This routine allocates memory for
+   * particle storage, both the collisionless and the SPH particles.
+   * The memory for the ordered binary tree of the timeline
+   * is also allocated.
+   */
+  void allocate_memory(void)
+  {
+    /* Note: P and SphP are initialized to zero */
+    P    = (particle_data *)Mem.mymalloc_movable_clear(&P, "P", MaxPart * sizeof(particle_data));
+    SphP = (sph_particle_data *)Mem.mymalloc_movable_clear(&SphP, "SphP", MaxPartSph * sizeof(sph_particle_data));
+
+    TimeBinsHydro.timebins_allocate();
+    TimeBinsGravity.timebins_allocate();
+  }
+
+  void free_memory(void)
+  {
+    TimeBinsGravity.timebins_free();
+    TimeBinsHydro.timebins_free();
+
+    Mem.myfree(SphP);
+    Mem.myfree(P);
+  }
+
+  void reallocate_memory_maxpart(int maxpartNew)
+  {
+    mpi_printf("ALLOCATE: Changing to MaxPart = %d\n", maxpartNew);
+
+    P = (particle_data *)Mem.myrealloc_movable(P, maxpartNew * sizeof(particle_data));
+    if(maxpartNew > MaxPart)
+      memset(((char *)P) + MaxPart * sizeof(particle_data), 0, (maxpartNew - MaxPart) * sizeof(particle_data));
+    MaxPart = maxpartNew;
+
+    TimeBinsGravity.timebins_reallocate();
+  }
+
+  void reallocate_memory_maxpartsph(int maxpartsphNew)
+  {
+    mpi_printf("ALLOCATE: Changing to MaxPartSph = %d\n", maxpartsphNew);
+
+    SphP = (sph_particle_data *)Mem.myrealloc_movable(SphP, maxpartsphNew * sizeof(sph_particle_data));
+    if(maxpartsphNew > MaxPartSph)
+      memset(((char *)SphP) + MaxPartSph * sizeof(sph_particle_data), 0, (maxpartsphNew - MaxPartSph) * sizeof(sph_particle_data));
+    MaxPartSph = maxpartsphNew;
+
+    TimeBinsHydro.timebins_reallocate();
+  }
+
+  /*! This function dumps some of the basic particle data to a file. In case
+   *  the tree construction fails, this is called just before the run
+   *  terminates with an error message. Examination of the generated file may
+   *  then give clues to what caused the problem.
+   */
+  void dump_particles(void)
+  {
+    /*
+    FILE *fd;
+    char buffer[200];
+    sprintf(buffer, "particles_%d.dat", ThisTask);
+    if((fd = fopen(buffer, "w")))
+      {
+        my_fwrite(&NumPart, 1, sizeof(int), fd);
+        for(int i = 0; i < NumPart; i++)
+          my_fwrite(&P[i].IntPos[0], 3, sizeof(MyIntPosType), fd);
+        for(int i = 0; i < NumPart; i++)
+          my_fwrite(&P[i].Vel[0], 3, sizeof(MyFloat), fd);
+        for(int i = 0; i < NumPart; i++)
+          my_fwrite(&P[i].ID, 1, sizeof(int), fd);
+        fclose(fd);
+      }
+
+      */
+  }
+
+  /** \brief Print information relative to a particle / cell to standard output.
+   *
+   *  \param i particle / cell index
+   */
+  void print_particle_info(int i)
+  {
+    MyReal pos[3];
+    intpos_to_pos(P[i].IntPos, pos); /* converts the integer coordinates to floating point */
+
+    printf("Task=%d, ID=%llu, Type=%d, TimeBinGrav=%d, TimeBinHydro=%d, Mass=%g, pos=%g|%g|%g, vel=%g|%g|%g, OldAcc=%g\n", ThisTask,
+           (unsigned long long)P[i].ID.get(), P[i].getType(), P[i].TimeBinGrav, P[i].getTimeBinHydro(), P[i].getMass(), pos[0], pos[1],
+           pos[2], P[i].Vel[0], P[i].Vel[1], P[i].Vel[2], P[i].OldAcc);
+#if defined(PMGRID) && defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+    printf("GravAccel=%g|%g|%g, GravPM=%g|%g|%g, Soft=%g, SoftClass=%d\n", P[i].GravAccel[0], P[i].GravAccel[1], P[i].GravAccel[2],
+           P[i].GravPM[0], P[i].GravPM[1], P[i].GravPM[2], All.ForceSoftening[P[i].getSofteningClass()], P[i].getSofteningClass());
+#else
+#ifndef LEAN
+    printf("GravAccel=%g|%g|%g, Soft=%g, SoftType=%d\n", P[i].GravAccel[0], P[i].GravAccel[1], P[i].GravAccel[2],
+           All.ForceSoftening[P[i].getSofteningClass()], P[i].getSofteningClass());
+#endif
+#endif
+
+    if(P[i].getType() == 0)
+      {
+        printf("rho=%g, hsml=%g, entr=%g, csnd=%g\n", SphP[i].Density, SphP[i].Hsml, SphP[i].Entropy, SphP[i].get_sound_speed());
+        printf("ID=%llu SphP[p].CurrentMaxTiStep=%g\n", (unsigned long long)P[i].ID.get(), SphP[i].CurrentMaxTiStep);
+      }
+
+    myflush(stdout);
+  }
+
+  /** \brief Print information relative to a particle / cell to standard output given its ID.
+   *
+   *  \param ID particle / cell ID
+   */
+  void print_particle_info_from_ID(MyIDType ID)
+  {
+    for(int i = 0; i < NumPart; i++)
+      if(P[i].ID.get() == ID)
+        print_particle_info(i);
+  }
+
+ public:
+  inline int get_active_index(int idx)
+  {
+#ifdef HIERARCHICAL_GRAVITY
+    return TimeBinsGravity.ActiveParticleList[idx];
+#else
+    return idx;
+#endif
+  }
+
+  void reconstruct_timebins(void);
+  integertime find_next_sync_point(void);
+  void mark_active_timebins(void);
+  void drift_all_particles(void);
+  int drift_particle(particle_data *P, sph_particle_data *SphP, integertime time1, bool ignore_light_cone = false);
+  void make_list_of_active_particles(void);
+  integertime get_timestep_grav(int p);
+  integertime get_timestep_hydro(int p);
+
+#if defined(PMGRID) && !defined(TREEPM_NOTIMESPLIT)
+  integertime get_timestep_pm(void);
+#endif
+
+#if defined(PMGRID) && defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+  void find_long_range_step_constraint(void);
+#endif
+
+  void timebins_get_bin_and_do_validity_checks(integertime ti_step, int *bin_new, int bin_old);
+
+  void assign_hydro_timesteps(void);
+  void timebin_cleanup_list_of_active_particles(void);
+
+  int test_if_grav_timestep_is_too_large(int p, int bin);
+  int get_timestep_bin(integertime ti_step);
+
+ private:
+#ifdef ADAPTIVE_HYDRO_SOFTENING
+  int get_softeningtype_for_hydro_particle(int i)
+  {
+    double soft = All.GasSoftFactor * SphP[i].Hsml;
+
+    if(soft <= All.ForceSoftening[NSOFTCLASSES])
+      return NSOFTCLASSES;
+
+    int k = 0.5 + log(soft / All.ForceSoftening[NSOFTCLASSES]) / log(All.AdaptiveHydroSofteningSpacing);
+    if(k >= NSOFTCLASSES_HYDRO)
+      k = NSOFTCLASSES_HYDRO - 1;
+
+    return NSOFTCLASSES + k;
+  }
+#endif
+
+#ifdef INDIVIDUAL_GRAVITY_SOFTENING
+
+ public:
+#if(INDIVIDUAL_GRAVITY_SOFTENING) & 2
+#error "INDIVIDUAL_GRAVITY_SOFTENING may not include particle type 1 which is used as a reference point"
+#endif
+
+#if((INDIVIDUAL_GRAVITY_SOFTENING)&1) && defined(ADAPTIVE_HYDRO_SOFTENING)
+#error "INDIVIDUAL_GRAVITY_SOFTENING may not include particle type 0 when ADAPTIVE_HYDRO_SOFTENING is used"
+#endif
+
+  int get_softening_type_from_mass(double mass)
+  {
+    int min_type   = -1;
+    double eps     = get_desired_softening_from_mass(mass);
+    double min_dln = MAX_FLOAT_NUMBER;
+
+    for(int i = 0; i < NSOFTCLASSES; i++)
+      {
+        if(All.ForceSoftening[i] > 0)
+          {
+            double dln = fabs(log(eps) - log(All.ForceSoftening[i]));
+
+            if(dln < min_dln)
+              {
+                min_dln  = dln;
+                min_type = i;
+              }
+          }
+      }
+
+    if(min_type < 0)
+      Terminate("min_type < 0");
+
+    return min_type;
+  }
+
+  /*! \brief Returns the desired softening length depending on the particle mass with type 1 as a reference point
+   *
+   * \param mass particle mass
+   * \return softening length for a particle of mass #mass
+   */
+  double get_desired_softening_from_mass(double mass)
+  {
+    return All.ForceSoftening[All.SofteningClassOfPartType[1]] * pow(mass / All.AvgType1Mass, 1.0 / 3);
+  }
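+
+  /* Illustrative example (comment only): because the softening scales with the cube root of
+   * the mass, a particle with 8 times the average type-1 mass gets twice the type-1 softening:
+   *
+   *   get_desired_softening_from_mass(8.0 * All.AvgType1Mass)
+   *     == 2.0 * All.ForceSoftening[All.SofteningClassOfPartType[1]]
+   */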
+
+  /*! \brief Initializes the mass dependent softening calculation for Type 1 particles
+   *
+   * The average mass of Type 1 particles is calculated.
+   */
+  void init_individual_softenings(void)
+  {
+    int ndm     = 0;
+    double mass = 0, masstot, massmin = MAX_DOUBLE_NUMBER, massmax = 0;
+    long long ndmtot;
+
+    for(int i = 0; i < NumPart; i++)
+      if(P[i].getType() == 1)
+        {
+          ndm++;
+          mass += P[i].getMass();
+
+          if(massmin > P[i].getMass())
+            massmin = P[i].getMass();
+
+          if(massmax < P[i].getMass())
+            massmax = P[i].getMass();
+        }
+
+    sumup_large_ints(1, &ndm, &ndmtot, Communicator);
+    MPI_Allreduce(&mass, &masstot, 1, MPI_DOUBLE, MPI_SUM, Communicator);
+
+    MPI_Allreduce(MPI_IN_PLACE, &massmin, 1, MPI_DOUBLE, MPI_MIN, Communicator);
+    MPI_Allreduce(MPI_IN_PLACE, &massmax, 1, MPI_DOUBLE, MPI_MAX, Communicator);
+
+    All.AvgType1Mass = masstot / ndmtot;
+
+    mpi_printf("INIT: AvgType1Mass = %g   (min=%g max=%g) Ndm1tot=%lld\n", All.AvgType1Mass, massmin, massmax, ndmtot);
+
+    if(massmax > 1.00001 * massmin)
+      Terminate("Strange: Should use constant mass type-1 particles if INDIVIDUAL_GRAVITY_SOFTENING is used\n");
+
+    if(All.ComovingIntegrationOn)
+      {
+        double rhomean_dm = (All.Omega0 - All.OmegaBaryon) * (3 * All.Hubble * All.Hubble / (8 * M_PI * All.G));
+
+        mpi_printf("INIT: For this AvgType1Mass, the mean particle spacing is %g and the assigned softening is %g\n",
+                   pow(All.AvgType1Mass / rhomean_dm, 1.0 / 3), All.SofteningTable[All.SofteningClassOfPartType[1]]);
+      }
+
+    for(int i = 0; i < NumPart; i++)
+      if(((1 << P[i].getType()) & (INDIVIDUAL_GRAVITY_SOFTENING)))
+        P[i].setSofteningClass(get_softening_type_from_mass(P[i].getMass()));
+  }
+#endif
+};
+
+#endif
diff --git a/src/data/sph_particle_data.h b/src/data/sph_particle_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..e1c6bd8c24f440c84e9dfa602efce17a52dab17f
--- /dev/null
+++ b/src/data/sph_particle_data.h
@@ -0,0 +1,160 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file sph_particle_data.h
+ *
+ *  \brief defines the structure holding the extra hydrodynamic data for a single SPH particle
+ */
+
+#ifndef SPHPARTDATA_H
+#define SPHPARTDATA_H
+
+#include "../data/constants.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/macros.h"
+#include "../data/mymalloc.h"
+#include "../mpi_utils/setcomm.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+
+/* in this structure, all SPH variables are put that are needed for passive
+ * particles in the hydro force calculation. Only this part will be sent
+ * to other nodes if needed
+ */
+struct sph_particle_data_hydrocore
+{
+  MyFloat Hsml;               /*!< current smoothing length */
+  MyFloat DhsmlDensityFactor; /*!< correction factor needed in entropy formulation of SPH */
+  MyFloat VelPred[3];         /*!< predicted SPH particle velocity at the current time, needed if particle is inactive */
+
+  MyFloat DivVel;  /*!< local velocity divergence */
+  MyFloat CurlVel; /*!< local velocity curl */
+  MyFloat Csnd;
+
+  MyFloat Density;  /*!< current baryonic mass density of particle */
+  MyFloat Pressure; /*!< current pressure */
+
+#ifdef TIMEDEP_ART_VISC
+  MyFloat Alpha; /*!< time-dependent viscosity parameter */
+#endif
+#ifdef PRESSURE_ENTROPY_SPH
+  MyFloat EntropyToInvGammaPred;     /*!< current entropy function A to the power 1 / gamma */
+  MyFloat DhsmlDerivedDensityFactor; /*!< additional correction factor needed for pressure formulation of SPH */
+  MyFloat PressureSphDensity;        /* current density derived from the pressure estimate */
+#endif
+};
+
+/** Holds data that is stored for each sph particle in addition to
+    the collisionless variables.
+ */
+struct sph_particle_data : public sph_particle_data_hydrocore
+{
+  MyFloat Entropy;     /*!< value of the entropic function */
+  MyFloat EntropyPred; /*!< predicted entropy at current time, needed if the particle is inactive */
+
+  MyFloat HydroAccel[3]; /*!< acceleration due to hydrodynamical forces */
+#ifdef HIERARCHICAL_GRAVITY
+  MyFloat FullGravAccel[3]; /*!< most recent full calculation of gravitational acceleration, used to advance VelPred */
+#endif
+  MyFloat DtEntropy; /*!< rate of change of entropy */
+  MyFloat DtDensity; /*!< rate of change of density, needed to predict densities for passive particles */
+  MyFloat DtHsml;    /*!< rate of change of smoothing length, needed to predict hsml for passive particles */
+
+  MyFloat NumNgb; /*!< effective number of neighbours used in density estimation loop (note: this could be changed to a temporary
+                     variable in density) */
+
+  MyFloat Rot[3]; /*!< local velocity curl */
+
+  MyFloat MaxSignalVel; /*!< maximum signal velocity */
+  MyFloat CurrentMaxTiStep;
+
+#ifdef PRESSURE_ENTROPY_SPH
+  MyFloat
+      DtPressureSphDensity; /*!< rate of change of the pressure derived density, needed to predict densities for passive particles */
+#endif
+
+#ifdef TIMEDEP_ART_VISC
+  MyFloat DivVelOld; /* local velocity divergence from the previous time step */
+  MyFloat decayVel;  /* decay velocity for the viscosity parameter */
+#endif
+
+#ifdef IMPROVED_VELOCITY_GRADIENTS
+  struct
+  {
+    MyFloat dx_dx;
+    MyFloat dx_dy;
+    MyFloat dx_dz;
+    MyFloat dy_dy;
+    MyFloat dy_dz;
+    MyFloat dz_dz;
+  } dpos; /* contains the matrix elements needed for the improved gradient estimate */
+
+  MyFloat dvel[NUMDIMS][NUMDIMS]; /* contains the velocity gradients */
+#endif
+
+#ifdef STARFORMATION
+  MyFloat Metallicity;
+  MyFloat MassMetallicity;
+#endif
+
+#ifdef COOLING
+  MyFloat Ne; /*!< free electron fraction, expressed as the local electron number density normalized to the hydrogen number density;
+                 indirectly gives the mean molecular weight. */
+#endif
+
+#ifdef OUTPUT_COOLHEAT
+  MyFloat CoolHeat;
+#endif
+
+#ifdef STARFORMATION
+  MyFloat Sfr;
+#endif
+
+  inline MyFloat get_sound_speed(void)
+  {
+    MyFloat csnd;
+
+    if(Density > 0)
+      csnd = sqrt(static_cast<MyReal>(GAMMA) * Pressure / Density);
+    else
+      csnd = 0;
+
+    return csnd;
+  }
+
+  /* compute the pressure of particle i */
+  inline MyFloat get_pressure(void)
+  {
+#ifndef PRESSURE_ENTROPY_SPH
+    return EntropyPred * pow(Density, (MyFloat)GAMMA);
+#else
+    return pow(EntropyToInvGammaPred * PressureSphDensity, GAMMA);
+#endif
+  }
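+
+  /* Illustrative note (comment only): in the standard entropy formulation the pressure follows
+   * from the entropic function A as P = A * rho^GAMMA, and the sound speed computed above is
+   * c_s = sqrt(GAMMA * P / rho). With PRESSURE_ENTROPY_SPH, the pressure-based density estimate
+   * PressureSphDensity is used in place of Density.
+   */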
+
+  inline void set_thermodynamic_variables(void)
+  {
+    Pressure = get_pressure();
+
+    if(Pressure < 0)
+      Terminate("Pressure=%g  rho=%g  entr=%g entrpred=%g\n", Pressure, Density, Entropy, EntropyPred);
+
+    Csnd = get_sound_speed();
+  }
+
+  inline MyFloat get_Hsml() { return Hsml; }
+
+#ifdef IMPROVED_VELOCITY_GRADIENTS
+  void set_velocity_gradients(void);
+#endif
+
+#ifdef TIMEDEP_ART_VISC
+  void set_viscosity_coefficient(double dt);
+#endif
+};
+
+#endif
diff --git a/src/data/symtensor_indices.h b/src/data/symtensor_indices.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce536230bfc27f6df1f28e7a01da39678c4fb39a
--- /dev/null
+++ b/src/data/symtensor_indices.h
@@ -0,0 +1,485 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file symtensor_indices.h
+ *
+ *  \brief defines some symbols for accessing the elements of (storage-optimized) symmetric tensors
+ */
+
+#ifndef SRC_DATA_SYMTENSOR_INDICES_H_
+#define SRC_DATA_SYMTENSOR_INDICES_H_
+
+/* 2-tensor element mapping, symmetric 3x3 */
+#define qXX 0
+#define qXY 1
+#define qXZ 2
+#define qYX qXY
+#define qYY 3
+#define qYZ 4
+#define qZX qXZ
+#define qZY qYZ
+#define qZZ 5
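+
+/* Illustrative note (comment only): a symmetric 3x3 tensor is stored as a flat array of its
+ * 6 independent components in the order XX, XY, XZ, YY, YZ, ZZ, so an element is accessed as
+ *
+ *   double q[6];
+ *   q[qXY] = 1.5;   // refers to the same storage location as q[qYX]
+ *
+ * The higher-rank mappings below follow the same scheme, with 10, 15, 21, 28 and 36
+ * independent components for the symmetric tensors of rank 3 to 7, respectively.
+ */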
+
+/* 3-tensor element mapping, symmetric 3x3x3 */
+#define dXXX 0
+#define dXXY 1
+#define dXXZ 2
+
+#define dXYX dXXY
+#define dXYY 3
+#define dXYZ 4
+
+#define dXZX dXXZ
+#define dXZY dXYZ
+#define dXZZ 5
+
+#define dYXX dXXY
+#define dYXY dXYY
+#define dYXZ dXYZ
+
+#define dYYX dXYY
+#define dYYY 6
+#define dYYZ 7
+
+#define dYZX dXYZ
+#define dYZY dYYZ
+#define dYZZ 8
+
+#define dZXX dXXZ
+#define dZXY dXYZ
+#define dZXZ dXZZ
+
+#define dZYX dXYZ
+#define dZYY dYYZ
+#define dZYZ dYZZ
+
+#define dZZX dXZZ
+#define dZZY dYZZ
+#define dZZZ 9
+
+/* 4-tensor element mapping, symmetric 3x3x3x3 */
+#define sXXXX 0
+#define sXXXY 1
+#define sXXXZ 2
+
+#define sXXYX sXXXY
+#define sXXYY 3
+#define sXXYZ 4
+
+#define sXXZX sXXXZ
+#define sXXZY sXXYZ
+#define sXXZZ 5
+
+#define sXYXX sXXXY
+#define sXYXY sXXYY
+#define sXYXZ sXXYZ
+
+#define sXYYX sXXYY
+#define sXYYY 6
+#define sXYYZ 7
+
+#define sXYZX sXXYZ
+#define sXYZY sXYYZ
+#define sXYZZ 8
+
+#define sXZXX sXXXZ
+#define sXZXY sXXYZ
+#define sXZXZ sXXZZ
+
+#define sXZYX sXXYZ
+#define sXZYY sXYYZ
+#define sXZYZ sXYZZ
+
+#define sXZZX sXXZZ
+#define sXZZY sXYZZ
+#define sXZZZ 9
+//-----------------
+#define sYXXX sXXXY
+#define sYXXY sXXYY
+#define sYXXZ sXXYZ
+
+#define sYXYX sXXYY
+#define sYXYY sXYYY
+#define sYXYZ sXYYZ
+
+#define sYXZX sXXYZ
+#define sYXZY sXYYZ
+#define sYXZZ sXYZZ
+
+#define sYYXX sXXYY
+#define sYYXY sXYYY
+#define sYYXZ sXYYZ
+
+#define sYYYX sXYYY
+#define sYYYY 10
+#define sYYYZ 11
+
+#define sYYZX sXYYZ
+#define sYYZY sYYYZ
+#define sYYZZ 12
+
+#define sYZXX sXXYZ
+#define sYZXY sXYYZ
+#define sYZXZ sXYZZ
+
+#define sYZYX sXYYZ
+#define sYZYY sYYYZ
+#define sYZYZ sYYZZ
+
+#define sYZZX sXYZZ
+#define sYZZY sYYZZ
+#define sYZZZ 13
+//-----------------
+#define sZXXX sXXXZ
+#define sZXXY sXXYZ
+#define sZXXZ sXXZZ
+
+#define sZXYX sXXYZ
+#define sZXYY sXYYZ
+#define sZXYZ sXYZZ
+
+#define sZXZX sXXZZ
+#define sZXZY sXYZZ
+#define sZXZZ sXZZZ
+
+#define sZYXX sXXYZ
+#define sZYXY sXYYZ
+#define sZYXZ sXYZZ
+
+#define sZYYX sXYYZ
+#define sZYYY sYYYZ
+#define sZYYZ sYYZZ
+
+#define sZYZX sXYZZ
+#define sZYZY sYYZZ
+#define sZYZZ sYZZZ
+
+#define sZZXX sXXZZ
+#define sZZXY sXYZZ
+#define sZZXZ sXZZZ
+
+#define sZZYX sXYZZ
+#define sZZYY sYYZZ
+#define sZZYZ sYZZZ
+
+#define sZZZX sXZZZ
+#define sZZZY sYZZZ
+#define sZZZZ 14
+
+/* 5-tensor element mapping, symmetric 3x3x3x3x3 */
+#define rXXXXX 0
+#define rXXXXY 1
+#define rXXXXZ 2
+#define rXXXYX rXXXXY
+#define rXXXYY 3
+#define rXXXYZ 4
+#define rXXXZX rXXXXZ
+#define rXXXZY rXXXYZ
+#define rXXXZZ 5
+#define rXXYXX rXXXXY
+#define rXXYXY rXXXYY
+#define rXXYXZ rXXXYZ
+#define rXXYYX rXXXYY
+#define rXXYYY 6
+#define rXXYYZ 7
+#define rXXYZX rXXXYZ
+#define rXXYZY rXXYYZ
+#define rXXYZZ 8
+#define rXXZXX rXXXXZ
+#define rXXZXY rXXXYZ
+#define rXXZXZ rXXXZZ
+#define rXXZYX rXXXYZ
+#define rXXZYY rXXYYZ
+#define rXXZYZ rXXYZZ
+#define rXXZZX rXXXZZ
+#define rXXZZY rXXYZZ
+#define rXXZZZ 9
+#define rXYXXX rXXXXY
+#define rXYXXY rXXXYY
+#define rXYXXZ rXXXYZ
+#define rXYXYX rXXXYY
+#define rXYXYY rXXYYY
+#define rXYXYZ rXXYYZ
+#define rXYXZX rXXXYZ
+#define rXYXZY rXXYYZ
+#define rXYXZZ rXXYZZ
+#define rXYYXX rXXXYY
+#define rXYYXY rXXYYY
+#define rXYYXZ rXXYYZ
+#define rXYYYX rXXYYY
+#define rXYYYY 10
+#define rXYYYZ 11
+#define rXYYZX rXXYYZ
+#define rXYYZY rXYYYZ
+#define rXYYZZ 12
+#define rXYZXX rXXXYZ
+#define rXYZXY rXXYYZ
+#define rXYZXZ rXXYZZ
+#define rXYZYX rXXYYZ
+#define rXYZYY rXYYYZ
+#define rXYZYZ rXYYZZ
+#define rXYZZX rXXYZZ
+#define rXYZZY rXYYZZ
+#define rXYZZZ 13
+#define rXZXXX rXXXXZ
+#define rXZXXY rXXXYZ
+#define rXZXXZ rXXXZZ
+#define rXZXYX rXXXYZ
+#define rXZXYY rXXYYZ
+#define rXZXYZ rXXYZZ
+#define rXZXZX rXXXZZ
+#define rXZXZY rXXYZZ
+#define rXZXZZ rXXZZZ
+#define rXZYXX rXXXYZ
+#define rXZYXY rXXYYZ
+#define rXZYXZ rXXYZZ
+#define rXZYYX rXXYYZ
+#define rXZYYY rXYYYZ
+#define rXZYYZ rXYYZZ
+#define rXZYZX rXXYZZ
+#define rXZYZY rXYYZZ
+#define rXZYZZ rXYZZZ
+#define rXZZXX rXXXZZ
+#define rXZZXY rXXYZZ
+#define rXZZXZ rXXZZZ
+#define rXZZYX rXXYZZ
+#define rXZZYY rXYYZZ
+#define rXZZYZ rXYZZZ
+#define rXZZZX rXXZZZ
+#define rXZZZY rXYZZZ
+#define rXZZZZ 14
+#define rYXXXX rXXXXY
+#define rYXXXY rXXXYY
+#define rYXXXZ rXXXYZ
+#define rYXXYX rXXXYY
+#define rYXXYY rXXYYY
+#define rYXXYZ rXXYYZ
+#define rYXXZX rXXXYZ
+#define rYXXZY rXXYYZ
+#define rYXXZZ rXXYZZ
+#define rYXYXX rXXXYY
+#define rYXYXY rXXYYY
+#define rYXYXZ rXXYYZ
+#define rYXYYX rXXYYY
+#define rYXYYY rXYYYY
+#define rYXYYZ rXYYYZ
+#define rYXYZX rXXYYZ
+#define rYXYZY rXYYYZ
+#define rYXYZZ rXYYZZ
+#define rYXZXX rXXXYZ
+#define rYXZXY rXXYYZ
+#define rYXZXZ rXXYZZ
+#define rYXZYX rXXYYZ
+#define rYXZYY rXYYYZ
+#define rYXZYZ rXYYZZ
+#define rYXZZX rXXYZZ
+#define rYXZZY rXYYZZ
+#define rYXZZZ rXYZZZ
+#define rYYXXX rXXXYY
+#define rYYXXY rXXYYY
+#define rYYXXZ rXXYYZ
+#define rYYXYX rXXYYY
+#define rYYXYY rXYYYY
+#define rYYXYZ rXYYYZ
+#define rYYXZX rXXYYZ
+#define rYYXZY rXYYYZ
+#define rYYXZZ rXYYZZ
+#define rYYYXX rXXYYY
+#define rYYYXY rXYYYY
+#define rYYYXZ rXYYYZ
+#define rYYYYX rXYYYY
+#define rYYYYY 15
+#define rYYYYZ 16
+#define rYYYZX rXYYYZ
+#define rYYYZY rYYYYZ
+#define rYYYZZ 17
+#define rYYZXX rXXYYZ
+#define rYYZXY rXYYYZ
+#define rYYZXZ rXYYZZ
+#define rYYZYX rXYYYZ
+#define rYYZYY rYYYYZ
+#define rYYZYZ rYYYZZ
+#define rYYZZX rXYYZZ
+#define rYYZZY rYYYZZ
+#define rYYZZZ 18
+#define rYZXXX rXXXYZ
+#define rYZXXY rXXYYZ
+#define rYZXXZ rXXYZZ
+#define rYZXYX rXXYYZ
+#define rYZXYY rXYYYZ
+#define rYZXYZ rXYYZZ
+#define rYZXZX rXXYZZ
+#define rYZXZY rXYYZZ
+#define rYZXZZ rXYZZZ
+#define rYZYXX rXXYYZ
+#define rYZYXY rXYYYZ
+#define rYZYXZ rXYYZZ
+#define rYZYYX rXYYYZ
+#define rYZYYY rYYYYZ
+#define rYZYYZ rYYYZZ
+#define rYZYZX rXYYZZ
+#define rYZYZY rYYYZZ
+#define rYZYZZ rYYZZZ
+#define rYZZXX rXXYZZ
+#define rYZZXY rXYYZZ
+#define rYZZXZ rXYZZZ
+#define rYZZYX rXYYZZ
+#define rYZZYY rYYYZZ
+#define rYZZYZ rYYZZZ
+#define rYZZZX rXYZZZ
+#define rYZZZY rYYZZZ
+#define rYZZZZ 19
+#define rZXXXX rXXXXZ
+#define rZXXXY rXXXYZ
+#define rZXXXZ rXXXZZ
+#define rZXXYX rXXXYZ
+#define rZXXYY rXXYYZ
+#define rZXXYZ rXXYZZ
+#define rZXXZX rXXXZZ
+#define rZXXZY rXXYZZ
+#define rZXXZZ rXXZZZ
+#define rZXYXX rXXXYZ
+#define rZXYXY rXXYYZ
+#define rZXYXZ rXXYZZ
+#define rZXYYX rXXYYZ
+#define rZXYYY rXYYYZ
+#define rZXYYZ rXYYZZ
+#define rZXYZX rXXYZZ
+#define rZXYZY rXYYZZ
+#define rZXYZZ rXYZZZ
+#define rZXZXX rXXXZZ
+#define rZXZXY rXXYZZ
+#define rZXZXZ rXXZZZ
+#define rZXZYX rXXYZZ
+#define rZXZYY rXYYZZ
+#define rZXZYZ rXYZZZ
+#define rZXZZX rXXZZZ
+#define rZXZZY rXYZZZ
+#define rZXZZZ rXZZZZ
+#define rZYXXX rXXXYZ
+#define rZYXXY rXXYYZ
+#define rZYXXZ rXXYZZ
+#define rZYXYX rXXYYZ
+#define rZYXYY rXYYYZ
+#define rZYXYZ rXYYZZ
+#define rZYXZX rXXYZZ
+#define rZYXZY rXYYZZ
+#define rZYXZZ rXYZZZ
+#define rZYYXX rXXYYZ
+#define rZYYXY rXYYYZ
+#define rZYYXZ rXYYZZ
+#define rZYYYX rXYYYZ
+#define rZYYYY rYYYYZ
+#define rZYYYZ rYYYZZ
+#define rZYYZX rXYYZZ
+#define rZYYZY rYYYZZ
+#define rZYYZZ rYYZZZ
+#define rZYZXX rXXYZZ
+#define rZYZXY rXYYZZ
+#define rZYZXZ rXYZZZ
+#define rZYZYX rXYYZZ
+#define rZYZYY rYYYZZ
+#define rZYZYZ rYYZZZ
+#define rZYZZX rXYZZZ
+#define rZYZZY rYYZZZ
+#define rZYZZZ rYZZZZ
+#define rZZXXX rXXXZZ
+#define rZZXXY rXXYZZ
+#define rZZXXZ rXXZZZ
+#define rZZXYX rXXYZZ
+#define rZZXYY rXYYZZ
+#define rZZXYZ rXYZZZ
+#define rZZXZX rXXZZZ
+#define rZZXZY rXYZZZ
+#define rZZXZZ rXZZZZ
+#define rZZYXX rXXYZZ
+#define rZZYXY rXYYZZ
+#define rZZYXZ rXYZZZ
+#define rZZYYX rXYYZZ
+#define rZZYYY rYYYZZ
+#define rZZYYZ rYYZZZ
+#define rZZYZX rXYZZZ
+#define rZZYZY rYYZZZ
+#define rZZYZZ rYZZZZ
+#define rZZZXX rXXZZZ
+#define rZZZXY rXYZZZ
+#define rZZZXZ rXZZZZ
+#define rZZZYX rXYZZZ
+#define rZZZYY rYYZZZ
+#define rZZZYZ rYZZZZ
+#define rZZZZX rXZZZZ
+#define rZZZZY rYZZZZ
+#define rZZZZZ 20
+
+/* 6-tensor element mapping, symmetric 3x3x3x3x3x3 */
+#define pXXXXXX 0
+#define pXXXXXY 1
+#define pXXXXXZ 2
+#define pXXXXYY 3
+#define pXXXXYZ 4
+#define pXXXXZZ 5
+#define pXXXYYY 6
+#define pXXXYYZ 7
+#define pXXXYZZ 8
+#define pXXXZZZ 9
+#define pXXYYYY 10
+#define pXXYYYZ 11
+#define pXXYYZZ 12
+#define pXXYZZZ 13
+#define pXXZZZZ 14
+#define pXYYYYY 15
+#define pXYYYYZ 16
+#define pXYYYZZ 17
+#define pXYYZZZ 18
+#define pXYZZZZ 19
+#define pXZZZZZ 20
+#define pYYYYYY 21
+#define pYYYYYZ 22
+#define pYYYYZZ 23
+#define pYYYZZZ 24
+#define pYYZZZZ 25
+#define pYZZZZZ 26
+#define pZZZZZZ 27
+
+/* 7-tensor element mapping, symmetric 3x3x3x3x3x3x3 */
+#define tXXXXXXX 0
+#define tXXXXXXY 1
+#define tXXXXXXZ 2
+#define tXXXXXYY 3
+#define tXXXXXYZ 4
+#define tXXXXXZZ 5
+#define tXXXXYYY 6
+#define tXXXXYYZ 7
+#define tXXXXYZZ 8
+#define tXXXXZZZ 9
+#define tXXXYYYY 10
+#define tXXXYYYZ 11
+#define tXXXYYZZ 12
+#define tXXXYZZZ 13
+#define tXXXZZZZ 14
+#define tXXYYYYY 15
+#define tXXYYYYZ 16
+#define tXXYYYZZ 17
+#define tXXYYZZZ 18
+#define tXXYZZZZ 19
+#define tXXZZZZZ 20
+#define tXYYYYYY 21
+#define tXYYYYYZ 22
+#define tXYYYYZZ 23
+#define tXYYYZZZ 24
+#define tXYYZZZZ 25
+#define tXYZZZZZ 26
+#define tXZZZZZZ 27
+#define tYYYYYYY 28
+#define tYYYYYYZ 29
+#define tYYYYYZZ 30
+#define tYYYYZZZ 31
+#define tYYYZZZZ 32
+#define tYYZZZZZ 33
+#define tYZZZZZZ 34
+#define tZZZZZZZ 35
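+
+/* Note: a fully symmetric rank-k tensor in three dimensions has (k+1)(k+2)/2
+ * independent components (6, 10, 15, 21, 28, 36 for k = 2..7). The enumerations
+ * above index exactly these components; for the lower ranks, additional aliases
+ * map arbitrary index orderings onto their sorted representative. */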
+
+#endif /* SRC_DATA_SYMTENSOR_INDICES_H_ */
diff --git a/src/data/symtensors.h b/src/data/symtensors.h
new file mode 100644
index 0000000000000000000000000000000000000000..a64609ba283a848814d2f2acf55f39ec25774f56
--- /dev/null
+++ b/src/data/symtensors.h
@@ -0,0 +1,2189 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file symtensors.h
+ *
+ *  \brief defines symmetric tensors of different rank and basic operations for them
+ */
+
+#ifndef SYMTENSORS_H
+#define SYMTENSORS_H
+
+#include "symtensor_indices.h"
+
+void symtensor_test(void);
+
+template <typename T1, typename T2>
+struct which_return;
+
+template <typename T>
+struct which_return<T, T>
+{
+  typedef T type;
+};
+
+template <>
+struct which_return<float, double>
+{
+  typedef double type;
+};
+
+template <>
+struct which_return<double, float>
+{
+  typedef double type;
+};
+
+template <>
+struct which_return<int, float>
+{
+  typedef float type;
+};
+
+template <>
+struct which_return<float, int>
+{
+  typedef float type;
+};
+
+template <>
+struct which_return<int, double>
+{
+  typedef double type;
+};
+
+template <>
+struct which_return<double, int>
+{
+  typedef double type;
+};
+
+/* with the above construction, the expression
+ *
+ *    typename which_return<T1, T2>::type
+ *
+ * now gives us the wider (more accurate) type when mixed precisions are used for T1 and T2
+ */
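+
+/* Illustrative sketch (not part of the build): with <type_traits> available, the
+ * promotion rules above could be spelled out as
+ *
+ *   static_assert(std::is_same<which_return<float, double>::type, double>::value, "float op double -> double");
+ *   static_assert(std::is_same<which_return<int, float>::type, float>::value,    "int op float   -> float");
+ */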
+
+template <typename T>
+struct compl_return;
+
+template <typename T>
+struct compl_return
+{
+  typedef double type;
+};
+
+template <>
+struct compl_return<float>
+{
+  typedef double type;
+};
+
+template <>
+struct compl_return<double>
+{
+  typedef float type;
+};
+
+/* with the above construction, the expression
+ *
+ *    typename compl_return<T>::type
+ *
+ * gives us the type float if T is double, and the type double if T is float (i.e. the complementary type).
+ * If T is another type, we'll get the type double
+ * We'll use this to define some implicit type casts.
+ */
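+
+/* Illustrative sketch (not part of the build): the complementary type is what the
+ * implicit conversion operators of the classes below convert to, e.g.
+ *
+ *   compl_return<float>::type    ->  double
+ *   compl_return<double>::type   ->  float
+ *   compl_return<int>::type      ->  double   (fallback of the unspecialized template)
+ */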
+
+// vector
+template <typename T>
+class vector
+{
+ public:
+  T da[3];
+
+  vector() {} /* constructor */
+
+  inline vector(const T x) /* constructor  */
+  {
+    da[0] = x;
+    da[1] = x;
+    da[2] = x;
+  }
+
+  inline vector(const T x, const T y, const T z) /* constructor  */
+  {
+    da[0] = x;
+    da[1] = y;
+    da[2] = z;
+  }
+
+  inline vector(const float *x) /* constructor  */
+  {
+    da[0] = x[0];
+    da[1] = x[1];
+    da[2] = x[2];
+  }
+
+  inline vector(const double *x) /* constructor  */
+  {
+    da[0] = x[0];
+    da[1] = x[1];
+    da[2] = x[2];
+  }
+
+  /* implicit conversion operator to type double or float as needed */
+  typedef typename compl_return<T>::type float_or_double;
+  operator vector<float_or_double>() const { return vector<float_or_double>(da); }
+
+  inline vector &operator+=(const vector<double> &right)
+  {
+    da[0] += right.da[0];
+    da[1] += right.da[1];
+    da[2] += right.da[2];
+
+    return *this;
+  }
+
+  inline vector &operator+=(const vector<float> &right)
+  {
+    da[0] += right.da[0];
+    da[1] += right.da[1];
+    da[2] += right.da[2];
+
+    return *this;
+  }
+
+  inline vector &operator-=(const vector<double> &right)
+  {
+    da[0] -= right.da[0];
+    da[1] -= right.da[1];
+    da[2] -= right.da[2];
+
+    return *this;
+  }
+
+  inline vector &operator-=(const vector<float> &right)
+  {
+    da[0] -= right.da[0];
+    da[1] -= right.da[1];
+    da[2] -= right.da[2];
+
+    return *this;
+  }
+
+  inline vector &operator*=(const T fac)
+  {
+    da[0] *= fac;
+    da[1] *= fac;
+    da[2] *= fac;
+
+    return *this;
+  }
+
+  inline T r2(void) { return da[0] * da[0] + da[1] * da[1] + da[2] * da[2]; }
+
+  inline double norm(void) { return sqrt(da[0] * da[0] + da[1] * da[1] + da[2] * da[2]); }
+
+  inline T &operator[](const size_t index) { return da[index]; }
+};
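+
+/* Usage sketch (illustrative only), assuming a translation unit that includes this header:
+ *
+ *   vector<double> a(1.0, 2.0, 3.0);
+ *   vector<float>  b(0.5f);        // (0.5, 0.5, 0.5)
+ *   double r2  = a.r2();           // squared length
+ *   double len = a.norm();         // Euclidean length
+ *   a += b;                        // mixed float/double operands are accepted
+ */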
+
+// fully symmetric 2-tensor (i.e. symmetric 3x3 matrix)
+template <typename T>
+class symtensor2
+{
+ public:
+  T da[6];
+
+  symtensor2() {} /* constructor */
+
+  inline symtensor2(const T x) /* constructor  */
+  {
+    da[0] = x;
+    da[1] = x;
+    da[2] = x;
+    da[3] = x;
+    da[4] = x;
+    da[5] = x;
+  }
+
+  inline symtensor2(const float *x) /* constructor  */
+  {
+    da[0] = x[0];
+    da[1] = x[1];
+    da[2] = x[2];
+    da[3] = x[3];
+    da[4] = x[4];
+    da[5] = x[5];
+  }
+
+  inline symtensor2(const double *x) /* constructor  */
+  {
+    da[0] = x[0];
+    da[1] = x[1];
+    da[2] = x[2];
+    da[3] = x[3];
+    da[4] = x[4];
+    da[5] = x[5];
+  }
+
+  /* implicit conversion operator to type float or double as needed */
+  typedef typename compl_return<T>::type float_or_double;
+  operator symtensor2<float_or_double>() const { return symtensor2<float_or_double>(da); }
+
+  inline symtensor2(const vector<T> &v, const vector<T> &w) /* constructor based on the outer product of two vectors */
+  {
+    da[qXX] = v.da[0] * w.da[0];
+    da[qYY] = v.da[1] * w.da[1];
+    da[qZZ] = v.da[2] * w.da[2];
+    da[qXY] = v.da[0] * w.da[1];
+    da[qXZ] = v.da[0] * w.da[2];
+    da[qYZ] = v.da[1] * w.da[2];
+  }
+
+  inline symtensor2 &operator+=(const symtensor2 &right)
+  {
+    da[0] += right.da[0];
+    da[1] += right.da[1];
+    da[2] += right.da[2];
+    da[3] += right.da[3];
+    da[4] += right.da[4];
+    da[5] += right.da[5];
+
+    return *this;
+  }
+
+  inline symtensor2 &operator-=(const symtensor2 &right)
+  {
+    da[0] -= right.da[0];
+    da[1] -= right.da[1];
+    da[2] -= right.da[2];
+    da[3] -= right.da[3];
+    da[4] -= right.da[4];
+    da[5] -= right.da[5];
+
+    return *this;
+  }
+
+  inline symtensor2 &operator*=(const T fac)
+  {
+    da[0] *= fac;
+    da[1] *= fac;
+    da[2] *= fac;
+    da[3] *= fac;
+    da[4] *= fac;
+    da[5] *= fac;
+
+    return *this;
+  }
+
+  inline T &operator[](const size_t index) { return da[index]; }
+
+  inline T trace(void) { return da[qXX] + da[qYY] + da[qZZ]; }
+
+  inline double norm(void)
+  {
+    double sum2 = 0;
+    for(int i = 0; i < 6; i++)
+      sum2 += da[i] * da[i];
+
+    return sqrt(sum2 / 6);
+  }
+};
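+
+/* Usage sketch (illustrative only): the two-vector constructor fills the six stored
+ * components with the outer product, e.g.
+ *
+ *   vector<double> v(1.0, 2.0, 3.0);
+ *   symtensor2<double> Q(v, v);    // Q[qXY] == v[0] * v[1], etc.
+ *   double tr = Q.trace();         // sum of the diagonal components qXX + qYY + qZZ
+ */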
+
+// fully symmetric 3-tensor (3x3x3)
+template <typename T>
+class symtensor3
+{
+ public:
+  T da[10];
+
+  symtensor3() {} /* constructor */
+
+  inline symtensor3(const T x) /* constructor  */
+  {
+    da[0] = x;
+    da[1] = x;
+    da[2] = x;
+    da[3] = x;
+    da[4] = x;
+    da[5] = x;
+    da[6] = x;
+    da[7] = x;
+    da[8] = x;
+    da[9] = x;
+  }
+
+  inline symtensor3(const float *x) /* constructor  */
+  {
+    da[0] = x[0];
+    da[1] = x[1];
+    da[2] = x[2];
+    da[3] = x[3];
+    da[4] = x[4];
+    da[5] = x[5];
+    da[6] = x[6];
+    da[7] = x[7];
+    da[8] = x[8];
+    da[9] = x[9];
+  }
+
+  inline symtensor3(const double *x) /* constructor  */
+  {
+    da[0] = x[0];
+    da[1] = x[1];
+    da[2] = x[2];
+    da[3] = x[3];
+    da[4] = x[4];
+    da[5] = x[5];
+    da[6] = x[6];
+    da[7] = x[7];
+    da[8] = x[8];
+    da[9] = x[9];
+  }
+
+  /* implicit conversion operator to type float or double as needed */
+  typedef typename compl_return<T>::type float_or_double;
+  operator symtensor3<float_or_double>() const { return symtensor3<float_or_double>(da); }
+
+  /* constructor based on the outer product  */
+  inline symtensor3(const vector<T> &v, const symtensor2<T> &D)
+  {
+    da[dXXX] = D.da[qXX] * v.da[0];
+    da[dXXY] = D.da[qXX] * v.da[1];
+    da[dXXZ] = D.da[qXX] * v.da[2];
+    da[dXYY] = D.da[qXY] * v.da[1];
+    da[dXYZ] = D.da[qXY] * v.da[2];
+    da[dXZZ] = D.da[qXZ] * v.da[2];
+    da[dYYY] = D.da[qYY] * v.da[1];
+    da[dYYZ] = D.da[qYY] * v.da[2];
+    da[dYZZ] = D.da[qYZ] * v.da[2];
+    da[dZZZ] = D.da[qZZ] * v.da[2];
+  }
+
+  inline symtensor3 &operator+=(const symtensor3 &right)
+  {
+    da[0] += right.da[0];
+    da[1] += right.da[1];
+    da[2] += right.da[2];
+    da[3] += right.da[3];
+    da[4] += right.da[4];
+    da[5] += right.da[5];
+    da[6] += right.da[6];
+    da[7] += right.da[7];
+    da[8] += right.da[8];
+    da[9] += right.da[9];
+
+    return *this;
+  }
+
+  inline symtensor3 &operator-=(const symtensor3 &right)
+  {
+    da[0] -= right.da[0];
+    da[1] -= right.da[1];
+    da[2] -= right.da[2];
+    da[3] -= right.da[3];
+    da[4] -= right.da[4];
+    da[5] -= right.da[5];
+    da[6] -= right.da[6];
+    da[7] -= right.da[7];
+    da[8] -= right.da[8];
+    da[9] -= right.da[9];
+
+    return *this;
+  }
+
+  inline symtensor3 &operator*=(const T fac)
+  {
+    da[0] *= fac;
+    da[1] *= fac;
+    da[2] *= fac;
+    da[3] *= fac;
+    da[4] *= fac;
+    da[5] *= fac;
+    da[6] *= fac;
+    da[7] *= fac;
+    da[8] *= fac;
+    da[9] *= fac;
+
+    return *this;
+  }
+
+  inline T &operator[](const size_t index) { return da[index]; }
+
+  inline double norm(void)
+  {
+    double sum2 = 0;
+    for(int i = 0; i < 10; i++)
+      sum2 += da[i] * da[i];
+
+    return sqrt(sum2 / 10);
+  }
+};
+
+// fully symmetric 4-tensor (3x3x3x3)
+template <typename T>
+class symtensor4
+{
+ public:
+  T da[15];
+
+  symtensor4() {} /* constructor */
+
+  inline symtensor4(const T x) /* constructor  */
+  {
+    for(int i = 0; i < 15; i++)
+      da[i] = x;
+  }
+
+  inline symtensor4(const float *x) /* constructor  */
+  {
+    for(int i = 0; i < 15; i++)
+      da[i] = x[i];
+  }
+
+  inline symtensor4(const double *x) /* constructor  */
+  {
+    for(int i = 0; i < 15; i++)
+      da[i] = x[i];
+  }
+
+  /* implicit conversion operator to type float or double as needed */
+  typedef typename compl_return<T>::type float_or_double;
+  operator symtensor4<float_or_double>() const { return symtensor4<float_or_double>(da); }
+
+  /* constructor based on an outer product  */
+  inline symtensor4(const vector<T> &v, const symtensor3<T> &D)
+  {
+    da[sXXXX] = D.da[dXXX] * v.da[0];
+    da[sXXXY] = D.da[dXXX] * v.da[1];
+    da[sXXXZ] = D.da[dXXX] * v.da[2];
+    da[sXXYY] = D.da[dXXY] * v.da[1];
+    da[sXXYZ] = D.da[dXXY] * v.da[2];
+    da[sXXZZ] = D.da[dXXZ] * v.da[2];
+    da[sXYYY] = D.da[dXYY] * v.da[1];
+    da[sXYYZ] = D.da[dXYY] * v.da[2];
+    da[sXYZZ] = D.da[dXYZ] * v.da[2];
+    da[sXZZZ] = D.da[dXZZ] * v.da[2];
+    da[sYYYY] = D.da[dYYY] * v.da[1];
+    da[sYYYZ] = D.da[dYYY] * v.da[2];
+    da[sYYZZ] = D.da[dYYZ] * v.da[2];
+    da[sYZZZ] = D.da[dYZZ] * v.da[2];
+    da[sZZZZ] = D.da[dZZZ] * v.da[2];
+  }
+
+  inline symtensor4 &operator+=(const symtensor4 &right)
+  {
+    for(int i = 0; i < 15; i++)
+      da[i] += right.da[i];
+
+    return *this;
+  }
+
+  inline symtensor4 &operator-=(const symtensor4 &right)
+  {
+    for(int i = 0; i < 15; i++)
+      da[i] -= right.da[i];
+
+    return *this;
+  }
+
+  inline symtensor4 &operator*=(const T fac)
+  {
+    for(int i = 0; i < 15; i++)
+      da[i] *= fac;
+
+    return *this;
+  }
+
+  inline T &operator[](const size_t index) { return da[index]; }
+
+  inline double norm(void)
+  {
+    double sum2 = 0;
+    for(int i = 0; i < 15; i++)
+      sum2 += da[i] * da[i];
+
+    return sqrt(sum2 / 15);
+  }
+};
+
+// fully symmetric 5-tensor (3x3x3x3x3)
+template <typename T>
+class symtensor5
+{
+ public:
+  T da[21];
+
+  symtensor5() {} /* constructor */
+
+  inline symtensor5(const T x) /* constructor  */
+  {
+    for(int i = 0; i < 21; i++)
+      da[i] = x;
+  }
+
+  inline symtensor5(const float *x) /* constructor  */
+  {
+    for(int i = 0; i < 21; i++)
+      da[i] = x[i];
+  }
+
+  inline symtensor5(const double *x) /* constructor  */
+  {
+    for(int i = 0; i < 21; i++)
+      da[i] = x[i];
+  }
+
+  /* implicit conversion operator to type float or double as needed */
+  typedef typename compl_return<T>::type float_or_double;
+  operator symtensor5<float_or_double>() const { return symtensor5<float_or_double>(da); }
+
+  /* constructor based on an outer product  */
+  inline symtensor5(const vector<T> &v, const symtensor4<T> &D)
+  {
+    da[rXXXXX] = D.da[sXXXX] * v.da[0];
+    da[rXXXXY] = D.da[sXXXX] * v.da[1];
+    da[rXXXXZ] = D.da[sXXXX] * v.da[2];
+    da[rXXXYY] = D.da[sXXXY] * v.da[1];
+    da[rXXXYZ] = D.da[sXXXY] * v.da[2];
+    da[rXXXZZ] = D.da[sXXXZ] * v.da[2];
+    da[rXXYYY] = D.da[sXXYY] * v.da[1];
+    da[rXXYYZ] = D.da[sXXYY] * v.da[2];
+    da[rXXYZZ] = D.da[sXXYZ] * v.da[2];
+    da[rXXZZZ] = D.da[sXXZZ] * v.da[2];
+    da[rXYYYY] = D.da[sXYYY] * v.da[1];
+    da[rXYYYZ] = D.da[sXYYY] * v.da[2];
+    da[rXYYZZ] = D.da[sXYYZ] * v.da[2];
+    da[rXYZZZ] = D.da[sXYZZ] * v.da[2];
+    da[rXZZZZ] = D.da[sXZZZ] * v.da[2];
+    da[rYYYYY] = D.da[sYYYY] * v.da[1];
+    da[rYYYYZ] = D.da[sYYYY] * v.da[2];
+    da[rYYYZZ] = D.da[sYYYZ] * v.da[2];
+    da[rYYZZZ] = D.da[sYYZZ] * v.da[2];
+    da[rYZZZZ] = D.da[sYZZZ] * v.da[2];
+    da[rZZZZZ] = D.da[sZZZZ] * v.da[2];
+  }
+
+  inline symtensor5 &operator+=(const symtensor5 &right)
+  {
+    for(int i = 0; i < 21; i++)
+      da[i] += right.da[i];
+
+    return *this;
+  }
+
+  inline symtensor5 &operator-=(const symtensor5 &right)
+  {
+    for(int i = 0; i < 21; i++)
+      da[i] -= right.da[i];
+
+    return *this;
+  }
+
+  inline symtensor5 &operator*=(const T fac)
+  {
+    for(int i = 0; i < 21; i++)
+      da[i] *= fac;
+
+    return *this;
+  }
+
+  inline T &operator[](const size_t index) { return da[index]; }
+
+  inline double norm(void)
+  {
+    double sum2 = 0;
+    for(int i = 0; i < 21; i++)
+      sum2 += da[i] * da[i];
+
+    return sqrt(sum2 / 21);
+  }
+};
+
+// fully symmetric 6-tensor (3x3x3x3x3x3)
+template <typename T>
+class symtensor6
+{
+ public:
+  T da[28];
+
+  symtensor6() {} /* constructor */
+
+  inline symtensor6(const T x) /* constructor  */
+  {
+    for(int i = 0; i < 28; i++)
+      da[i] = x;
+  }
+
+  inline symtensor6(const float *x) /* constructor  */
+  {
+    for(int i = 0; i < 28; i++)
+      da[i] = x[i];
+  }
+
+  inline symtensor6(const double *x) /* constructor  */
+  {
+    for(int i = 0; i < 28; i++)
+      da[i] = x[i];
+  }
+
+  /* implicit conversion operator to type float or double as needed */
+  typedef typename compl_return<T>::type float_or_double;
+  operator symtensor6<float_or_double>() const { return symtensor6<float_or_double>(da); }
+
+  /* constructor based on an outer product  */
+  inline symtensor6(const vector<T> &v, const symtensor5<T> &D)
+  {
+    da[pXXXXXX] = D.da[rXXXXX] * v.da[0];
+    da[pXXXXXY] = D.da[rXXXXX] * v.da[1];
+    da[pXXXXXZ] = D.da[rXXXXX] * v.da[2];
+    da[pXXXXYY] = D.da[rXXXXY] * v.da[1];
+    da[pXXXXYZ] = D.da[rXXXXY] * v.da[2];
+    da[pXXXXZZ] = D.da[rXXXXZ] * v.da[2];
+    da[pXXXYYY] = D.da[rXXXYY] * v.da[1];
+    da[pXXXYYZ] = D.da[rXXXYY] * v.da[2];
+    da[pXXXYZZ] = D.da[rXXXYZ] * v.da[2];
+    da[pXXXZZZ] = D.da[rXXXZZ] * v.da[2];
+    da[pXXYYYY] = D.da[rXXYYY] * v.da[1];
+    da[pXXYYYZ] = D.da[rXXYYY] * v.da[2];
+    da[pXXYYZZ] = D.da[rXXYYZ] * v.da[2];
+    da[pXXYZZZ] = D.da[rXXYZZ] * v.da[2];
+    da[pXXZZZZ] = D.da[rXXZZZ] * v.da[2];
+    da[pXYYYYY] = D.da[rXYYYY] * v.da[1];
+    da[pXYYYYZ] = D.da[rXYYYY] * v.da[2];
+    da[pXYYYZZ] = D.da[rXYYYZ] * v.da[2];
+    da[pXYYZZZ] = D.da[rXYYZZ] * v.da[2];
+    da[pXYZZZZ] = D.da[rXYZZZ] * v.da[2];
+    da[pXZZZZZ] = D.da[rXZZZZ] * v.da[2];
+    da[pYYYYYY] = D.da[rYYYYY] * v.da[1];
+    da[pYYYYYZ] = D.da[rYYYYY] * v.da[2];
+    da[pYYYYZZ] = D.da[rYYYYZ] * v.da[2];
+    da[pYYYZZZ] = D.da[rYYYZZ] * v.da[2];
+    da[pYYZZZZ] = D.da[rYYZZZ] * v.da[2];
+    da[pYZZZZZ] = D.da[rYZZZZ] * v.da[2];
+    da[pZZZZZZ] = D.da[rZZZZZ] * v.da[2];
+  }
+
+  inline symtensor6 &operator+=(const symtensor6 &right)
+  {
+    for(int i = 0; i < 28; i++)
+      da[i] += right.da[i];
+
+    return *this;
+  }
+
+  inline symtensor6 &operator-=(const symtensor6 &right)
+  {
+    for(int i = 0; i < 28; i++)
+      da[i] -= right.da[i];
+
+    return *this;
+  }
+
+  inline symtensor6 &operator*=(const T fac)
+  {
+    for(int i = 0; i < 28; i++)
+      da[i] *= fac;
+
+    return *this;
+  }
+
+  inline T &operator[](const size_t index) { return da[index]; }
+
+  inline double norm(void)
+  {
+    double sum2 = 0;
+    for(int i = 0; i < 28; i++)
+      sum2 += da[i] * da[i];
+
+    return sqrt(sum2 / 28);
+  }
+};
+
+// fully symmetric 7-tensor (3x3x3x3x3x3x3)
+template <typename T>
+class symtensor7
+{
+ public:
+  T da[36];
+
+  symtensor7() {} /* constructor */
+
+  inline symtensor7(const T x) /* constructor  */
+  {
+    for(int i = 0; i < 36; i++)
+      da[i] = x;
+  }
+
+  inline symtensor7(const float *x) /* constructor  */
+  {
+    for(int i = 0; i < 36; i++)
+      da[i] = x[i];
+  }
+
+  inline symtensor7(const double *x) /* constructor  */
+  {
+    for(int i = 0; i < 36; i++)
+      da[i] = x[i];
+  }
+
+  /* implicit conversion operator to type float or double as needed */
+  typedef typename compl_return<T>::type float_or_double;
+  operator symtensor7<float_or_double>() const { return symtensor7<float_or_double>(da); }
+
+  inline symtensor7 &operator+=(const symtensor7 &right)
+  {
+    for(int i = 0; i < 36; i++)
+      da[i] += right.da[i];
+
+    return *this;
+  }
+
+  inline symtensor7 &operator-=(const symtensor7 &right)
+  {
+    for(int i = 0; i < 36; i++)
+      da[i] -= right.da[i];
+
+    return *this;
+  }
+
+  inline symtensor7 &operator*=(const T fac)
+  {
+    for(int i = 0; i < 36; i++)
+      da[i] *= fac;
+
+    return *this;
+  }
+
+  inline T &operator[](const size_t index) { return da[index]; }
+
+  inline double norm(void)
+  {
+    double sum2 = 0;
+    for(int i = 0; i < 36; i++)
+      sum2 += da[i] * da[i];
+
+    return sqrt(sum2 / 36);
+  }
+};
+
+//-------------  let's define additions of these tensors
+
+// add two vectors
+template <typename T1, typename T2>
+inline vector<typename which_return<T1, T2>::type> operator+(const vector<T1> &left, const vector<T2> &right)
+{
+  vector<typename which_return<T1, T2>::type> res;
+
+  res.da[0] = left.da[0] + right.da[0];
+  res.da[1] = left.da[1] + right.da[1];
+  res.da[2] = left.da[2] + right.da[2];
+
+  return res;
+}
+
+// add two 2-tensors
+template <typename T1, typename T2>
+inline symtensor2<typename which_return<T1, T2>::type> operator+(const symtensor2<T1> &left, const symtensor2<T2> &right)
+{
+  symtensor2<typename which_return<T1, T2>::type> res;
+
+  for(int i = 0; i < 6; i++)
+    res.da[i] = left.da[i] + right.da[i];
+
+  return res;
+}
+
+// add two 3-tensors
+template <typename T1, typename T2>
+inline symtensor3<typename which_return<T1, T2>::type> operator+(const symtensor3<T1> &left, const symtensor3<T2> &right)
+{
+  symtensor3<typename which_return<T1, T2>::type> res;
+
+  for(int i = 0; i < 10; i++)
+    res.da[i] = left.da[i] + right.da[i];
+
+  return res;
+}
+
+// add two 4-tensors
+template <typename T1, typename T2>
+inline symtensor4<typename which_return<T1, T2>::type> operator+(const symtensor4<T1> &left, const symtensor4<T2> &right)
+{
+  symtensor4<typename which_return<T1, T2>::type> res;
+
+  for(int i = 0; i < 15; i++)
+    res.da[i] = left.da[i] + right.da[i];
+
+  return res;
+}
+
+// add two 5-tensors
+template <typename T1, typename T2>
+inline symtensor5<typename which_return<T1, T2>::type> operator+(const symtensor5<T1> &left, const symtensor5<T2> &right)
+{
+  symtensor5<typename which_return<T1, T2>::type> res;
+
+  for(int i = 0; i < 21; i++)
+    res.da[i] = left.da[i] + right.da[i];
+
+  return res;
+}
+
+// add two 6-tensors
+template <typename T1, typename T2>
+inline symtensor6<typename which_return<T1, T2>::type> operator+(const symtensor6<T1> &left, const symtensor6<T2> &right)
+{
+  symtensor6<typename which_return<T1, T2>::type> res;
+
+  for(int i = 0; i < 28; i++)
+    res.da[i] = left.da[i] + right.da[i];
+
+  return res;
+}
+
+// add two 7-tensors
+template <typename T1, typename T2>
+inline symtensor7<typename which_return<T1, T2>::type> operator+(const symtensor7<T1> &left, const symtensor7<T2> &right)
+{
+  symtensor7<typename which_return<T1, T2>::type> res;
+
+  for(int i = 0; i < 36; i++)
+    res.da[i] = left.da[i] + right.da[i];
+
+  return res;
+}
+
+//-------------  let's define subtractions of these tensors
+
+// subtract two vectors
+template <typename T1, typename T2>
+inline vector<typename which_return<T1, T2>::type> operator-(const vector<T1> &left, const vector<T2> &right)
+{
+  vector<typename which_return<T1, T2>::type> res;
+
+  res.da[0] = left.da[0] - right.da[0];
+  res.da[1] = left.da[1] - right.da[1];
+  res.da[2] = left.da[2] - right.da[2];
+
+  return res;
+}
+
+// subtract two 2-tensors
+template <typename T1, typename T2>
+inline symtensor2<typename which_return<T1, T2>::type> operator-(const symtensor2<T1> &left, const symtensor2<T2> &right)
+{
+  symtensor2<typename which_return<T1, T2>::type> res;
+
+  for(int i = 0; i < 6; i++)
+    res.da[i] = left.da[i] - right.da[i];
+
+  return res;
+}
+
+// subtract two 3-tensors
+template <typename T1, typename T2>
+inline symtensor3<typename which_return<T1, T2>::type> operator-(const symtensor3<T1> &left, const symtensor3<T2> &right)
+{
+  symtensor3<typename which_return<T1, T2>::type> res;
+
+  for(int i = 0; i < 10; i++)
+    res.da[i] = left.da[i] - right.da[i];
+
+  return res;
+}
+
+// subtract two 4-tensors
+template <typename T1, typename T2>
+inline symtensor4<typename which_return<T1, T2>::type> operator-(const symtensor4<T1> &left, const symtensor4<T2> &right)
+{
+  symtensor4<typename which_return<T1, T2>::type> res;
+
+  for(int i = 0; i < 15; i++)
+    res.da[i] = left.da[i] - right.da[i];
+
+  return res;
+}
+
+// subtract two 5-tensors
+template <typename T1, typename T2>
+inline symtensor5<typename which_return<T1, T2>::type> operator-(const symtensor5<T1> &left, const symtensor5<T2> &right)
+{
+  symtensor5<typename which_return<T1, T2>::type> res;
+
+  for(int i = 0; i < 21; i++)
+    res.da[i] = left.da[i] - right.da[i];
+
+  return res;
+}
+
+// subtract two 6-tensors
+template <typename T1, typename T2>
+inline symtensor6<typename which_return<T1, T2>::type> operator-(const symtensor6<T1> &left, const symtensor6<T2> &right)
+{
+  symtensor6<typename which_return<T1, T2>::type> res;
+
+  for(int i = 0; i < 28; i++)
+    res.da[i] = left.da[i] - right.da[i];
+
+  return res;
+}
+
+// subtract two 7-tensors
+template <typename T1, typename T2>
+inline symtensor7<typename which_return<T1, T2>::type> operator-(const symtensor7<T1> &left, const symtensor7<T2> &right)
+{
+  symtensor7<typename which_return<T1, T2>::type> res;
+
+  for(int i = 0; i < 36; i++)
+    res.da[i] = left.da[i] - right.da[i];
+
+  return res;
+}
+
+//-------------  let's define multiplications with a scalar
+
+// scalar times vector multiply
+template <typename T1, typename T2>
+inline vector<typename which_return<T1, T2>::type> operator*(const T1 fac, const vector<T2> &v)
+{
+  vector<typename which_return<T1, T2>::type> res;
+  for(int i = 0; i < 3; i++)
+    res.da[i] = fac * v.da[i];
+
+  return res;
+}
+
+// scalar times 2-tensor multiply
+template <typename T1, typename T2>
+inline symtensor2<typename which_return<T1, T2>::type> operator*(const T1 fac, const symtensor2<T2> &v)
+{
+  symtensor2<typename which_return<T1, T2>::type> res;
+  for(int i = 0; i < 6; i++)
+    res.da[i] = fac * v.da[i];
+
+  return res;
+}
+
+// scalar times 3-tensor multiply
+template <typename T1, typename T2>
+inline symtensor3<typename which_return<T1, T2>::type> operator*(const T1 fac, const symtensor3<T2> &v)
+{
+  symtensor3<typename which_return<T1, T2>::type> res;
+  for(int i = 0; i < 10; i++)
+    res.da[i] = fac * v.da[i];
+
+  return res;
+}
+
+// scalar times 4-tensor multiply
+template <typename T1, typename T2>
+inline symtensor4<typename which_return<T1, T2>::type> operator*(const T1 fac, const symtensor4<T2> &v)
+{
+  symtensor4<typename which_return<T1, T2>::type> res;
+  for(int i = 0; i < 15; i++)
+    res.da[i] = fac * v.da[i];
+
+  return res;
+}
+
+// scalar times 5-tensor multiply
+template <typename T1, typename T2>
+inline symtensor5<typename which_return<T1, T2>::type> operator*(const T1 fac, const symtensor5<T2> &v)
+{
+  symtensor5<typename which_return<T1, T2>::type> res;
+  for(int i = 0; i < 21; i++)
+    res.da[i] = fac * v.da[i];
+
+  return res;
+}
+
+// scalar times 6-tensor multiply
+template <typename T1, typename T2>
+inline symtensor6<typename which_return<T1, T2>::type> operator*(const T1 fac, const symtensor6<T2> &v)
+{
+  symtensor6<typename which_return<T1, T2>::type> res;
+  for(int i = 0; i < 28; i++)
+    res.da[i] = fac * v.da[i];
+
+  return res;
+}
+
+// scalar times 7-tensor multiply
+template <typename T1, typename T2>
+inline symtensor7<typename which_return<T1, T2>::type> operator*(const T1 fac, const symtensor7<T2> &v)
+{
+  symtensor7<typename which_return<T1, T2>::type> res;
+  for(int i = 0; i < 36; i++)
+    res.da[i] = fac * v.da[i];
+
+  return res;
+}
+
+//-------------  let's define contractions of these tensors
+
+// 2-tensor contraction with a vector (ordinary matrix-vector multiplication)
+template <typename T1, typename T2>
+inline vector<typename which_return<T1, T2>::type> operator*(const symtensor2<T1> &D, const vector<T2> &v)
+{
+  vector<typename which_return<T1, T2>::type> res;
+
+  res.da[0] = D.da[qXX] * v.da[0] + D.da[qXY] * v.da[1] + D.da[qXZ] * v.da[2];
+  res.da[1] = D.da[qYX] * v.da[0] + D.da[qYY] * v.da[1] + D.da[qYZ] * v.da[2];
+  res.da[2] = D.da[qZX] * v.da[0] + D.da[qZY] * v.da[1] + D.da[qZZ] * v.da[2];
+
+  return res;
+}
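+
+/* Usage sketch (illustrative only): despite the compressed symmetric storage this is
+ * the ordinary matrix-vector product, with the result type promoted via which_return:
+ *
+ *   symtensor2<double> Q(1.0);      // all six components set to 1
+ *   vector<float> x(1.0f, 2.0f, 3.0f);
+ *   vector<double> y = Q * x;
+ */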
+
+// scalar product of two vectors
+template <typename T1, typename T2>
+inline typename which_return<T1, T2>::type operator*(const vector<T1> &v, const vector<T2> &w)
+{
+  return v.da[0] * w.da[0] + v.da[1] * w.da[1] + v.da[2] * w.da[2];
+}
+
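+/* Note on the integer prefactors in the contractions below: only the distinct index
+ * combinations of a fully symmetric tensor are stored, so each stored product is
+ * weighted by the number of permutations of the summed indices it represents, i.e.
+ * the multinomial coefficient k!/(nx! ny! nz!) for k summed indices. For example, in
+ * the full contraction of two 4-tensors the XXYZ term carries 4!/(2!1!1!) = 12. */
+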
+// contract two 2-tensors to a scalar
+template <typename T1, typename T2>
+inline typename which_return<T1, T2>::type operator*(const symtensor2<T1> &D, const symtensor2<T2> &Q)
+{
+  return (D.da[qXX] * Q.da[qXX] + D.da[qYY] * Q.da[qYY] + D.da[qZZ] * Q.da[qZZ]) +
+         2 * (D.da[qXY] * Q.da[qXY] + D.da[qXZ] * Q.da[qXZ] + D.da[qYZ] * Q.da[qYZ]);
+}
+
+// contract two 3-tensors to yield a scalar
+template <typename T1, typename T2>
+inline typename which_return<T1, T2>::type operator*(const symtensor3<T1> &D, const symtensor3<T2> &Q)
+{
+  return D.da[dXXX] * Q.da[dXXX] + D.da[dYYY] * Q.da[dYYY] + D.da[dZZZ] * Q.da[dZZZ] +
+         3 * (D.da[dYZZ] * Q.da[dYZZ] + D.da[dYYZ] * Q.da[dYYZ] + D.da[dXZZ] * Q.da[dXZZ] + D.da[dXYY] * Q.da[dXYY] +
+              D.da[dXXY] * Q.da[dXXY] + D.da[dXXZ] * Q.da[dXXZ]) +
+         6 * D.da[dXYZ] * Q.da[dXYZ];
+}
+
+// contract a 4-tensor with a 4-tensor to yield a scalar
+template <typename T1, typename T2>
+inline typename which_return<T1, T2>::type operator*(const symtensor4<T1> &D, const symtensor4<T2> &Q)  // checked
+{
+  return D.da[sZZZZ] * Q.da[sZZZZ] + D.da[sYYYY] * Q.da[sYYYY] + D.da[sXXXX] * Q.da[sXXXX] +
+         6 * (D.da[sYYZZ] * Q.da[sYYZZ] + D.da[sXXZZ] * Q.da[sXXZZ] + D.da[sXXYY] * Q.da[sXXYY]) +
+         4 * (D.da[sYZZZ] * Q.da[sYZZZ] + D.da[sYYYZ] * Q.da[sYYYZ] + D.da[sXZZZ] * Q.da[sXZZZ] + D.da[sXYYY] * Q.da[sXYYY] +
+              D.da[sXXXZ] * Q.da[sXXXZ] + D.da[sXXXY] * Q.da[sXXXY]) +
+         12 * (D.da[sXYZZ] * Q.da[sXYZZ] + D.da[sXYYZ] * Q.da[sXYYZ] + D.da[sXXYZ] * Q.da[sXXYZ]);
+}
+
+// contract a 3-tensor with a vector to yield a 2-tensor
+template <typename T1, typename T2>
+inline symtensor2<typename which_return<T1, T2>::type> operator*(const symtensor3<T1> &D, const vector<T2> &v)
+{
+  symtensor2<typename which_return<T1, T2>::type> res;
+
+  res.da[qXX] = D.da[dXXX] * v.da[0] + D.da[dXXY] * v.da[1] + D.da[dXXZ] * v.da[2];
+  res.da[qYY] = D.da[dYYX] * v.da[0] + D.da[dYYY] * v.da[1] + D.da[dYYZ] * v.da[2];
+  res.da[qZZ] = D.da[dZZX] * v.da[0] + D.da[dZZY] * v.da[1] + D.da[dZZZ] * v.da[2];
+  res.da[qXY] = D.da[dXYX] * v.da[0] + D.da[dXYY] * v.da[1] + D.da[dXYZ] * v.da[2];
+  res.da[qXZ] = D.da[dXZX] * v.da[0] + D.da[dXZY] * v.da[1] + D.da[dXZZ] * v.da[2];
+  res.da[qYZ] = D.da[dYZX] * v.da[0] + D.da[dYZY] * v.da[1] + D.da[dYZZ] * v.da[2];
+
+  return res;
+}
+
+// contract a 3-tensor with a 2-tensor to yield a vector
+template <typename T1, typename T2>
+inline vector<typename which_return<T1, T2>::type> operator*(const symtensor3<T1> &D, const symtensor2<T2> &Q)
+{
+  vector<typename which_return<T1, T2>::type> res;
+
+  res.da[0] = (D.da[dXXX] * Q.da[qXX] + D.da[dXYY] * Q.da[qYY] + D.da[dXZZ] * Q.da[qZZ]) +
+              2 * (D.da[dXXY] * Q.da[qXY] + D.da[dXXZ] * Q.da[qXZ] + D.da[dXYZ] * Q.da[qYZ]);
+  res.da[1] = (D.da[dYXX] * Q.da[qXX] + D.da[dYYY] * Q.da[qYY] + D.da[dYZZ] * Q.da[qZZ]) +
+              2 * (D.da[dYXY] * Q.da[qXY] + D.da[dYXZ] * Q.da[qXZ] + D.da[dYYZ] * Q.da[qYZ]);
+  res.da[2] = (D.da[dZXX] * Q.da[qXX] + D.da[dZYY] * Q.da[qYY] + D.da[dZZZ] * Q.da[qZZ]) +
+              2 * (D.da[dZXY] * Q.da[qXY] + D.da[dZXZ] * Q.da[qXZ] + D.da[dZYZ] * Q.da[qYZ]);
+
+  return res;
+}
+
+// contract a 4-tensor with a 3-tensor to yield a vector
+template <typename T1, typename T2>
+inline vector<typename which_return<T1, T2>::type> operator*(const symtensor4<T1> &D, const symtensor3<T2> &Q)
+{
+  vector<typename which_return<T1, T2>::type> res;
+
+  res[0] = D.da[sXXXX] * Q.da[dXXX] + D.da[sXYYY] * Q.da[dYYY] + D.da[sXZZZ] * Q.da[dZZZ] +
+           3 * (D.da[sXYZZ] * Q.da[dYZZ] + D.da[sXYYZ] * Q.da[dYYZ] + D.da[sXXZZ] * Q.da[dXZZ] + D.da[sXXYY] * Q.da[dXYY] +
+                D.da[sXXXY] * Q.da[dXXY] + D.da[sXXXZ] * Q.da[dXXZ]) +
+           6 * D.da[sXXYZ] * Q.da[dXYZ];
+
+  res[1] = D.da[sYXXX] * Q.da[dXXX] + D.da[sYYYY] * Q.da[dYYY] + D.da[sYZZZ] * Q.da[dZZZ] +
+           3 * (D.da[sYYZZ] * Q.da[dYZZ] + D.da[sYYYZ] * Q.da[dYYZ] + D.da[sYXZZ] * Q.da[dXZZ] + D.da[sYXYY] * Q.da[dXYY] +
+                D.da[sYXXY] * Q.da[dXXY] + D.da[sYXXZ] * Q.da[dXXZ]) +
+           6 * D.da[sYXYZ] * Q.da[dXYZ];
+
+  res[2] = D.da[sZXXX] * Q.da[dXXX] + D.da[sZYYY] * Q.da[dYYY] + D.da[sZZZZ] * Q.da[dZZZ] +
+           3 * (D.da[sZYZZ] * Q.da[dYZZ] + D.da[sZYYZ] * Q.da[dYYZ] + D.da[sZXZZ] * Q.da[dXZZ] + D.da[sZXYY] * Q.da[dXYY] +
+                D.da[sZXXY] * Q.da[dXXY] + D.da[sZXXZ] * Q.da[dXXZ]) +
+           6 * D.da[sZXYZ] * Q.da[dXYZ];
+
+  return res;
+}
+
+// contract a 4-tensor with a vector to yield a 3-tensor
+template <typename T1, typename T2>
+inline symtensor3<typename which_return<T1, T2>::type> operator*(const symtensor4<T1> &D, const vector<T2> &v)  // checked
+{
+  symtensor3<typename which_return<T1, T2>::type> res;
+
+  res.da[dXXX] = D.da[sXXXX] * v.da[0] + D.da[sXXXY] * v.da[1] + D.da[sXXXZ] * v.da[2];
+  res.da[dYYY] = D.da[sYYYX] * v.da[0] + D.da[sYYYY] * v.da[1] + D.da[sYYYZ] * v.da[2];
+  res.da[dZZZ] = D.da[sZZZX] * v.da[0] + D.da[sZZZY] * v.da[1] + D.da[sZZZZ] * v.da[2];
+  res.da[dXYY] = D.da[sXYYX] * v.da[0] + D.da[sXYYY] * v.da[1] + D.da[sXYYZ] * v.da[2];
+  res.da[dXZZ] = D.da[sXZZX] * v.da[0] + D.da[sXZZY] * v.da[1] + D.da[sXZZZ] * v.da[2];
+  res.da[dYXX] = D.da[sYXXX] * v.da[0] + D.da[sYXXY] * v.da[1] + D.da[sYXXZ] * v.da[2];
+  res.da[dYZZ] = D.da[sYZZX] * v.da[0] + D.da[sYZZY] * v.da[1] + D.da[sYZZZ] * v.da[2];
+  res.da[dZXX] = D.da[sZXXX] * v.da[0] + D.da[sZXXY] * v.da[1] + D.da[sZXXZ] * v.da[2];
+  res.da[dZYY] = D.da[sZYYX] * v.da[0] + D.da[sZYYY] * v.da[1] + D.da[sZYYZ] * v.da[2];
+  res.da[dXYZ] = D.da[sXYZX] * v.da[0] + D.da[sXYZY] * v.da[1] + D.da[sXYZZ] * v.da[2];
+
+  return res;
+}
+
+// contract a 5-tensor with a 4-tensor to yield a vector
+template <typename T1, typename T2>
+inline vector<typename which_return<T1, T2>::type> operator*(const symtensor5<T1> &D, const symtensor4<T2> &Q)
+{
+  vector<typename which_return<T1, T2>::type> res;
+
+  res.da[0] = D.da[rXZZZZ] * Q.da[sZZZZ] + D.da[rXYYYY] * Q.da[sYYYY] + D.da[rXXXXX] * Q.da[sXXXX] +
+              6 * (D.da[rXYYZZ] * Q.da[sYYZZ] + D.da[rXXXZZ] * Q.da[sXXZZ] + D.da[rXXXYY] * Q.da[sXXYY]) +
+              4 * (D.da[rXYZZZ] * Q.da[sYZZZ] + D.da[rXYYYZ] * Q.da[sYYYZ] + D.da[rXXZZZ] * Q.da[sXZZZ] + D.da[rXXYYY] * Q.da[sXYYY] +
+                   D.da[rXXXXZ] * Q.da[sXXXZ] + D.da[rXXXXY] * Q.da[sXXXY]) +
+              12 * (D.da[rXXYZZ] * Q.da[sXYZZ] + D.da[rXXYYZ] * Q.da[sXYYZ] + D.da[rXXXYZ] * Q.da[sXXYZ]);
+
+  res.da[1] = D.da[rYZZZZ] * Q.da[sZZZZ] + D.da[rYYYYY] * Q.da[sYYYY] + D.da[rYXXXX] * Q.da[sXXXX] +
+              6 * (D.da[rYYYZZ] * Q.da[sYYZZ] + D.da[rYXXZZ] * Q.da[sXXZZ] + D.da[rYXXYY] * Q.da[sXXYY]) +
+              4 * (D.da[rYYZZZ] * Q.da[sYZZZ] + D.da[rYYYYZ] * Q.da[sYYYZ] + D.da[rYXZZZ] * Q.da[sXZZZ] + D.da[rYXYYY] * Q.da[sXYYY] +
+                   D.da[rYXXXZ] * Q.da[sXXXZ] + D.da[rYXXXY] * Q.da[sXXXY]) +
+              12 * (D.da[rYXYZZ] * Q.da[sXYZZ] + D.da[rYXYYZ] * Q.da[sXYYZ] + D.da[rYXXYZ] * Q.da[sXXYZ]);
+
+  res.da[2] = D.da[rZZZZZ] * Q.da[sZZZZ] + D.da[rZYYYY] * Q.da[sYYYY] + D.da[rZXXXX] * Q.da[sXXXX] +
+              6 * (D.da[rZYYZZ] * Q.da[sYYZZ] + D.da[rZXXZZ] * Q.da[sXXZZ] + D.da[rZXXYY] * Q.da[sXXYY]) +
+              4 * (D.da[rZYZZZ] * Q.da[sYZZZ] + D.da[rZYYYZ] * Q.da[sYYYZ] + D.da[rZXZZZ] * Q.da[sXZZZ] + D.da[rZXYYY] * Q.da[sXYYY] +
+                   D.da[rZXXXZ] * Q.da[sXXXZ] + D.da[rZXXXY] * Q.da[sXXXY]) +
+              12 * (D.da[rZXYZZ] * Q.da[sXYZZ] + D.da[rZXYYZ] * Q.da[sXYYZ] + D.da[rZXXYZ] * Q.da[sXXYZ]);
+
+  return res;
+}
+
+// contract a 4-tensor with a 2-tensor to yield a 2-tensor
+template <typename T1, typename T2>
+inline symtensor2<typename which_return<T1, T2>::type> operator*(const symtensor4<T1> &D, const symtensor2<T2> &Q)  // checked
+{
+  symtensor2<typename which_return<T1, T2>::type> res;
+
+  res.da[qXX] = D.da[sXXXX] * Q.da[qXX] + D.da[sXXYY] * Q.da[qYY] + D.da[sXXZZ] * Q.da[qZZ] +
+                2 * (D.da[sXXXY] * Q.da[qXY] + D.da[sXXXZ] * Q.da[qXZ] + D.da[sXXYZ] * Q.da[qYZ]);
+  res.da[qYY] = D.da[sYYXX] * Q.da[qXX] + D.da[sYYYY] * Q.da[qYY] + D.da[sYYZZ] * Q.da[qZZ] +
+                2 * (D.da[sYYXY] * Q.da[qXY] + D.da[sYYXZ] * Q.da[qXZ] + D.da[sYYYZ] * Q.da[qYZ]);
+  res.da[qZZ] = D.da[sZZXX] * Q.da[qXX] + D.da[sZZYY] * Q.da[qYY] + D.da[sZZZZ] * Q.da[qZZ] +
+                2 * (D.da[sZZXY] * Q.da[qXY] + D.da[sZZXZ] * Q.da[qXZ] + D.da[sZZYZ] * Q.da[qYZ]);
+  res.da[qXY] = D.da[sXYXX] * Q.da[qXX] + D.da[sXYYY] * Q.da[qYY] + D.da[sXYZZ] * Q.da[qZZ] +
+                2 * (D.da[sXYXY] * Q.da[qXY] + D.da[sXYXZ] * Q.da[qXZ] + D.da[sXYYZ] * Q.da[qYZ]);
+  res.da[qXZ] = D.da[sXZXX] * Q.da[qXX] + D.da[sXZYY] * Q.da[qYY] + D.da[sXZZZ] * Q.da[qZZ] +
+                2 * (D.da[sXZXY] * Q.da[qXY] + D.da[sXZXZ] * Q.da[qXZ] + D.da[sXZYZ] * Q.da[qYZ]);
+  res.da[qYZ] = D.da[sYZXX] * Q.da[qXX] + D.da[sYZYY] * Q.da[qYY] + D.da[sYZZZ] * Q.da[qZZ] +
+                2 * (D.da[sYZXY] * Q.da[qXY] + D.da[sYZXZ] * Q.da[qXZ] + D.da[sYZYZ] * Q.da[qYZ]);
+
+  return res;
+}
+
+// contract a 5-tensor with a 3-tensor to yield a 2-tensor
+template <typename T1, typename T2>
+inline symtensor2<typename which_return<T1, T2>::type> operator*(const symtensor5<T1> &D, const symtensor3<T2> &Q)
+{
+  symtensor2<typename which_return<T1, T2>::type> res;
+
+  res.da[qXX] = D.da[rXXXXX] * Q.da[dXXX] + D.da[rXXYYY] * Q.da[dYYY] + D.da[rXXZZZ] * Q.da[dZZZ] +
+                3 * (D.da[rXXYZZ] * Q.da[dYZZ] + D.da[rXXYYZ] * Q.da[dYYZ] + D.da[rXXXZZ] * Q.da[dXZZ] + D.da[rXXXYY] * Q.da[dXYY] +
+                     D.da[rXXXXY] * Q.da[dXXY] + D.da[rXXXXZ] * Q.da[dXXZ]) +
+                6 * D.da[rXXXYZ] * Q.da[dXYZ];
+
+  res.da[qYY] = D.da[rYYXXX] * Q.da[dXXX] + D.da[rYYYYY] * Q.da[dYYY] + D.da[rYYZZZ] * Q.da[dZZZ] +
+                3 * (D.da[rYYYZZ] * Q.da[dYZZ] + D.da[rYYYYZ] * Q.da[dYYZ] + D.da[rYYXZZ] * Q.da[dXZZ] + D.da[rYYXYY] * Q.da[dXYY] +
+                     D.da[rYYXXY] * Q.da[dXXY] + D.da[rYYXXZ] * Q.da[dXXZ]) +
+                6 * D.da[rYYXYZ] * Q.da[dXYZ];
+
+  res.da[qZZ] = D.da[rZZXXX] * Q.da[dXXX] + D.da[rZZYYY] * Q.da[dYYY] + D.da[rZZZZZ] * Q.da[dZZZ] +
+                3 * (D.da[rZZYZZ] * Q.da[dYZZ] + D.da[rZZYYZ] * Q.da[dYYZ] + D.da[rZZXZZ] * Q.da[dXZZ] + D.da[rZZXYY] * Q.da[dXYY] +
+                     D.da[rZZXXY] * Q.da[dXXY] + D.da[rZZXXZ] * Q.da[dXXZ]) +
+                6 * D.da[rZZXYZ] * Q.da[dXYZ];
+
+  res.da[qXY] = D.da[rXYXXX] * Q.da[dXXX] + D.da[rXYYYY] * Q.da[dYYY] + D.da[rXYZZZ] * Q.da[dZZZ] +
+                3 * (D.da[rXYYZZ] * Q.da[dYZZ] + D.da[rXYYYZ] * Q.da[dYYZ] + D.da[rXYXZZ] * Q.da[dXZZ] + D.da[rXYXYY] * Q.da[dXYY] +
+                     D.da[rXYXXY] * Q.da[dXXY] + D.da[rXYXXZ] * Q.da[dXXZ]) +
+                6 * D.da[rXYXYZ] * Q.da[dXYZ];
+
+  res.da[qXZ] = D.da[rXZXXX] * Q.da[dXXX] + D.da[rXZYYY] * Q.da[dYYY] + D.da[rXZZZZ] * Q.da[dZZZ] +
+                3 * (D.da[rXZYZZ] * Q.da[dYZZ] + D.da[rXZYYZ] * Q.da[dYYZ] + D.da[rXZXZZ] * Q.da[dXZZ] + D.da[rXZXYY] * Q.da[dXYY] +
+                     D.da[rXZXXY] * Q.da[dXXY] + D.da[rXZXXZ] * Q.da[dXXZ]) +
+                6 * D.da[rXZXYZ] * Q.da[dXYZ];
+
+  res.da[qYZ] = D.da[rYZXXX] * Q.da[dXXX] + D.da[rYZYYY] * Q.da[dYYY] + D.da[rYZZZZ] * Q.da[dZZZ] +
+                3 * (D.da[rYZYZZ] * Q.da[dYZZ] + D.da[rYZYYZ] * Q.da[dYYZ] + D.da[rYZXZZ] * Q.da[dXZZ] + D.da[rYZXYY] * Q.da[dXYY] +
+                     D.da[rYZXXY] * Q.da[dXXY] + D.da[rYZXXZ] * Q.da[dXXZ]) +
+                6 * D.da[rYZXYZ] * Q.da[dXYZ];
+
+  return res;
+}
+
+// contract a 5-tensor with a 2-tensor to yield a 3-tensor
+template <typename T1, typename T2>
+inline symtensor3<typename which_return<T1, T2>::type> operator*(const symtensor5<T1> &D, const symtensor2<T2> &Q)
+{
+  symtensor3<typename which_return<T1, T2>::type> res;
+
+  res.da[dXXX] = (D.da[rXXXXX] * Q.da[qXX] + D.da[rXXXYY] * Q.da[qYY] + D.da[rXXXZZ] * Q.da[qZZ]) +
+                 2 * (D.da[rXXXXY] * Q.da[qXY] + D.da[rXXXXZ] * Q.da[qXZ] + D.da[rXXXYZ] * Q.da[qYZ]);
+
+  res.da[dYYY] = (D.da[rYYYXX] * Q.da[qXX] + D.da[rYYYYY] * Q.da[qYY] + D.da[rYYYZZ] * Q.da[qZZ]) +
+                 2 * (D.da[rYYYXY] * Q.da[qXY] + D.da[rYYYXZ] * Q.da[qXZ] + D.da[rYYYYZ] * Q.da[qYZ]);
+
+  res.da[dZZZ] = (D.da[rZZZXX] * Q.da[qXX] + D.da[rZZZYY] * Q.da[qYY] + D.da[rZZZZZ] * Q.da[qZZ]) +
+                 2 * (D.da[rZZZXY] * Q.da[qXY] + D.da[rZZZXZ] * Q.da[qXZ] + D.da[rZZZYZ] * Q.da[qYZ]);
+
+  res.da[dXYY] = (D.da[rXYYXX] * Q.da[qXX] + D.da[rXYYYY] * Q.da[qYY] + D.da[rXYYZZ] * Q.da[qZZ]) +
+                 2 * (D.da[rXYYXY] * Q.da[qXY] + D.da[rXYYXZ] * Q.da[qXZ] + D.da[rXYYYZ] * Q.da[qYZ]);
+
+  res.da[dXZZ] = (D.da[rXZZXX] * Q.da[qXX] + D.da[rXZZYY] * Q.da[qYY] + D.da[rXZZZZ] * Q.da[qZZ]) +
+                 2 * (D.da[rXZZXY] * Q.da[qXY] + D.da[rXZZXZ] * Q.da[qXZ] + D.da[rXZZYZ] * Q.da[qYZ]);
+
+  res.da[dYXX] = (D.da[rYXXXX] * Q.da[qXX] + D.da[rYXXYY] * Q.da[qYY] + D.da[rYXXZZ] * Q.da[qZZ]) +
+                 2 * (D.da[rYXXXY] * Q.da[qXY] + D.da[rYXXXZ] * Q.da[qXZ] + D.da[rYXXYZ] * Q.da[qYZ]);
+
+  res.da[dYZZ] = (D.da[rYZZXX] * Q.da[qXX] + D.da[rYZZYY] * Q.da[qYY] + D.da[rYZZZZ] * Q.da[qZZ]) +
+                 2 * (D.da[rYZZXY] * Q.da[qXY] + D.da[rYZZXZ] * Q.da[qXZ] + D.da[rYZZYZ] * Q.da[qYZ]);
+
+  res.da[dZXX] = (D.da[rZXXXX] * Q.da[qXX] + D.da[rZXXYY] * Q.da[qYY] + D.da[rZXXZZ] * Q.da[qZZ]) +
+                 2 * (D.da[rZXXXY] * Q.da[qXY] + D.da[rZXXXZ] * Q.da[qXZ] + D.da[rZXXYZ] * Q.da[qYZ]);
+
+  res.da[dZYY] = (D.da[rZYYXX] * Q.da[qXX] + D.da[rZYYYY] * Q.da[qYY] + D.da[rZYYZZ] * Q.da[qZZ]) +
+                 2 * (D.da[rZYYXY] * Q.da[qXY] + D.da[rZYYXZ] * Q.da[qXZ] + D.da[rZYYYZ] * Q.da[qYZ]);
+
+  res.da[dXYZ] = (D.da[rXYZXX] * Q.da[qXX] + D.da[rXYZYY] * Q.da[qYY] + D.da[rXYZZZ] * Q.da[qZZ]) +
+                 2 * (D.da[rXYZXY] * Q.da[qXY] + D.da[rXYZXZ] * Q.da[qXZ] + D.da[rXYZYZ] * Q.da[qYZ]);
+
+  return res;
+}
+
+// contract a 5-tensor with a vector to yield a 4-tensor
+template <typename T1, typename T2>
+inline symtensor4<typename which_return<T1, T2>::type> operator*(const symtensor5<T1> &D, const vector<T2> &v)
+{
+  symtensor4<typename which_return<T1, T2>::type> res;
+
+  res.da[sXXXX] = D.da[rXXXXX] * v.da[0] + D.da[rXXXXY] * v.da[1] + D.da[rXXXXZ] * v.da[2];
+  res.da[sXXXY] = D.da[rXXXYX] * v.da[0] + D.da[rXXXYY] * v.da[1] + D.da[rXXXYZ] * v.da[2];
+  res.da[sXXXZ] = D.da[rXXXZX] * v.da[0] + D.da[rXXXZY] * v.da[1] + D.da[rXXXZZ] * v.da[2];
+  res.da[sXXYY] = D.da[rXXYYX] * v.da[0] + D.da[rXXYYY] * v.da[1] + D.da[rXXYYZ] * v.da[2];
+  res.da[sXXYZ] = D.da[rXXYZX] * v.da[0] + D.da[rXXYZY] * v.da[1] + D.da[rXXYZZ] * v.da[2];
+  res.da[sXXZZ] = D.da[rXXZZX] * v.da[0] + D.da[rXXZZY] * v.da[1] + D.da[rXXZZZ] * v.da[2];
+  res.da[sXYYY] = D.da[rXYYYX] * v.da[0] + D.da[rXYYYY] * v.da[1] + D.da[rXYYYZ] * v.da[2];
+  res.da[sXYYZ] = D.da[rXYYZX] * v.da[0] + D.da[rXYYZY] * v.da[1] + D.da[rXYYZZ] * v.da[2];
+  res.da[sXYZZ] = D.da[rXYZZX] * v.da[0] + D.da[rXYZZY] * v.da[1] + D.da[rXYZZZ] * v.da[2];
+  res.da[sXZZZ] = D.da[rXZZZX] * v.da[0] + D.da[rXZZZY] * v.da[1] + D.da[rXZZZZ] * v.da[2];
+  res.da[sYYYY] = D.da[rYYYYX] * v.da[0] + D.da[rYYYYY] * v.da[1] + D.da[rYYYYZ] * v.da[2];
+  res.da[sYYYZ] = D.da[rYYYZX] * v.da[0] + D.da[rYYYZY] * v.da[1] + D.da[rYYYZZ] * v.da[2];
+  res.da[sYYZZ] = D.da[rYYZZX] * v.da[0] + D.da[rYYZZY] * v.da[1] + D.da[rYYZZZ] * v.da[2];
+  res.da[sYZZZ] = D.da[rYZZZX] * v.da[0] + D.da[rYZZZY] * v.da[1] + D.da[rYZZZZ] * v.da[2];
+  res.da[sZZZZ] = D.da[rZZZZX] * v.da[0] + D.da[rZZZZY] * v.da[1] + D.da[rZZZZZ] * v.da[2];
+
+  return res;
+}
+
+// contract a 5-tensor with a 5-tensor to yield a scalar
+template <typename T1, typename T2>
+inline typename which_return<T1, T2>::type operator*(const symtensor5<T1> &D, const symtensor5<T2> &Q)
+{
+  return D.da[rXXXXX] * Q.da[rXXXXX] + D.da[rYYYYY] * Q.da[rYYYYY] + D.da[rZZZZZ] * Q.da[rZZZZZ] +
+         5 * (D.da[rYZZZZ] * Q.da[rYZZZZ] + D.da[rXYYYY] * Q.da[rXYYYY] + D.da[rYYYYZ] * Q.da[rYYYYZ] + D.da[rXXXXZ] * Q.da[rXXXXZ] +
+              D.da[rXXXXY] * Q.da[rXXXXY] + D.da[rXZZZZ] * Q.da[rXZZZZ]) +
+         10 * (D.da[rXXZZZ] * Q.da[rXXZZZ] + D.da[rYYZZZ] * Q.da[rYYZZZ] + D.da[rYYYZZ] * Q.da[rYYYZZ] + D.da[rXXXYY] * Q.da[rXXXYY] +
+               D.da[rXXYYY] * Q.da[rXXYYY] + D.da[rXXXZZ] * Q.da[rXXXZZ]) +
+         20 * (D.da[rXYYYZ] * Q.da[rXYYYZ] + D.da[rXYZZZ] * Q.da[rXYZZZ] + D.da[rXXXYZ] * Q.da[rXXXYZ]) +
+         30 * (D.da[rXYYZZ] * Q.da[rXYYZZ] + D.da[rXXYYZ] * Q.da[rXXYYZ] + D.da[rXXYZZ] * Q.da[rXXYZZ]);
+}
+
+template <typename T1, typename T2>
+inline vector<typename which_return<T1, T2>::type> contract_twice(const symtensor3<T1> &D, const vector<T2> &v)
+{
+  typedef typename which_return<T1, T2>::type T;
+
+  symtensor2<T> Dv = D * v;
+
+  vector<T> res = Dv * v;
+
+  return res;
+}
+
+template <typename T1, typename T2>
+inline vector<typename which_return<T1, T2>::type> contract_thrice(const symtensor4<T1> &D, const vector<T2> &v)
+{
+  typedef typename which_return<T1, T2>::type T;
+
+  symtensor3<T> Dv  = D * v;
+  symtensor2<T> Dvv = Dv * v;
+
+  vector<T> res = Dvv * v;
+
+  return res;
+}
+
+template <typename T1, typename T2>
+inline vector<typename which_return<T1, T2>::type> contract_fourtimes(const symtensor5<T1> &D, const vector<T2> &v)
+{
+  typedef typename which_return<T1, T2>::type T;
+
+  symtensor4<T> Dv   = D * v;
+  symtensor3<T> Dvv  = Dv * v;
+  symtensor2<T> Dvvv = Dvv * v;
+
+  vector<T> res = Dvvv * v;
+
+  return res;
+}
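+
+/* Usage sketch (illustrative only): the contract_* helpers apply the same vector
+ * repeatedly, lowering the rank by one per contraction until a vector remains:
+ *
+ *   symtensor4<double> D4(0.0);                    // placeholder values
+ *   vector<double> dx(0.1, 0.2, 0.3);
+ *   vector<double> a = contract_thrice(D4, dx);    // equivalent to ((D4 * dx) * dx) * dx
+ */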
+
+// contract a 6-tensor with a vector to yield a 5-tensor
+template <typename T1, typename T2>
+inline symtensor5<typename which_return<T1, T2>::type> operator*(const symtensor6<T1> &D, const vector<T2> &v)  // checked
+{
+  symtensor5<typename which_return<T1, T2>::type> res;
+
+  res.da[rXXXXX] = D.da[pXXXXXX] * v.da[0] + D.da[pXXXXXY] * v.da[1] + D.da[pXXXXXZ] * v.da[2];
+  res.da[rXXXXY] = D.da[pXXXXXY] * v.da[0] + D.da[pXXXXYY] * v.da[1] + D.da[pXXXXYZ] * v.da[2];
+  res.da[rXXXXZ] = D.da[pXXXXXZ] * v.da[0] + D.da[pXXXXYZ] * v.da[1] + D.da[pXXXXZZ] * v.da[2];
+  res.da[rXXXYY] = D.da[pXXXXYY] * v.da[0] + D.da[pXXXYYY] * v.da[1] + D.da[pXXXYYZ] * v.da[2];
+  res.da[rXXXYZ] = D.da[pXXXXYZ] * v.da[0] + D.da[pXXXYYZ] * v.da[1] + D.da[pXXXYZZ] * v.da[2];
+  res.da[rXXXZZ] = D.da[pXXXXZZ] * v.da[0] + D.da[pXXXYZZ] * v.da[1] + D.da[pXXXZZZ] * v.da[2];
+  res.da[rXXYYY] = D.da[pXXXYYY] * v.da[0] + D.da[pXXYYYY] * v.da[1] + D.da[pXXYYYZ] * v.da[2];
+  res.da[rXXYYZ] = D.da[pXXXYYZ] * v.da[0] + D.da[pXXYYYZ] * v.da[1] + D.da[pXXYYZZ] * v.da[2];
+  res.da[rXXYZZ] = D.da[pXXXYZZ] * v.da[0] + D.da[pXXYYZZ] * v.da[1] + D.da[pXXYZZZ] * v.da[2];
+  res.da[rXXZZZ] = D.da[pXXXZZZ] * v.da[0] + D.da[pXXYZZZ] * v.da[1] + D.da[pXXZZZZ] * v.da[2];
+  res.da[rXYYYY] = D.da[pXXYYYY] * v.da[0] + D.da[pXYYYYY] * v.da[1] + D.da[pXYYYYZ] * v.da[2];
+  res.da[rXYYYZ] = D.da[pXXYYYZ] * v.da[0] + D.da[pXYYYYZ] * v.da[1] + D.da[pXYYYZZ] * v.da[2];
+  res.da[rXYYZZ] = D.da[pXXYYZZ] * v.da[0] + D.da[pXYYYZZ] * v.da[1] + D.da[pXYYZZZ] * v.da[2];
+  res.da[rXYZZZ] = D.da[pXXYZZZ] * v.da[0] + D.da[pXYYZZZ] * v.da[1] + D.da[pXYZZZZ] * v.da[2];
+  res.da[rXZZZZ] = D.da[pXXZZZZ] * v.da[0] + D.da[pXYZZZZ] * v.da[1] + D.da[pXZZZZZ] * v.da[2];
+  res.da[rYYYYY] = D.da[pXYYYYY] * v.da[0] + D.da[pYYYYYY] * v.da[1] + D.da[pYYYYYZ] * v.da[2];
+  res.da[rYYYYZ] = D.da[pXYYYYZ] * v.da[0] + D.da[pYYYYYZ] * v.da[1] + D.da[pYYYYZZ] * v.da[2];
+  res.da[rYYYZZ] = D.da[pXYYYZZ] * v.da[0] + D.da[pYYYYZZ] * v.da[1] + D.da[pYYYZZZ] * v.da[2];
+  res.da[rYYZZZ] = D.da[pXYYZZZ] * v.da[0] + D.da[pYYYZZZ] * v.da[1] + D.da[pYYZZZZ] * v.da[2];
+  res.da[rYZZZZ] = D.da[pXYZZZZ] * v.da[0] + D.da[pYYZZZZ] * v.da[1] + D.da[pYZZZZZ] * v.da[2];
+  res.da[rZZZZZ] = D.da[pXZZZZZ] * v.da[0] + D.da[pYZZZZZ] * v.da[1] + D.da[pZZZZZZ] * v.da[2];
+
+  return res;
+}
+
+// contract a 7-tensor with a vector to yield a 6-tensor
+template <typename T1, typename T2>
+inline symtensor6<typename which_return<T1, T2>::type> operator*(const symtensor7<T1> &D, const vector<T2> &v)  // checked
+{
+  symtensor6<typename which_return<T1, T2>::type> res;
+
+  res.da[pXXXXXX] = D.da[tXXXXXXX] * v.da[0] + D.da[tXXXXXXY] * v.da[1] + D.da[tXXXXXXZ] * v.da[2];
+  res.da[pXXXXXY] = D.da[tXXXXXXY] * v.da[0] + D.da[tXXXXXYY] * v.da[1] + D.da[tXXXXXYZ] * v.da[2];
+  res.da[pXXXXXZ] = D.da[tXXXXXXZ] * v.da[0] + D.da[tXXXXXYZ] * v.da[1] + D.da[tXXXXXZZ] * v.da[2];
+  res.da[pXXXXYY] = D.da[tXXXXXYY] * v.da[0] + D.da[tXXXXYYY] * v.da[1] + D.da[tXXXXYYZ] * v.da[2];
+  res.da[pXXXXYZ] = D.da[tXXXXXYZ] * v.da[0] + D.da[tXXXXYYZ] * v.da[1] + D.da[tXXXXYZZ] * v.da[2];
+  res.da[pXXXXZZ] = D.da[tXXXXXZZ] * v.da[0] + D.da[tXXXXYZZ] * v.da[1] + D.da[tXXXXZZZ] * v.da[2];
+  res.da[pXXXYYY] = D.da[tXXXXYYY] * v.da[0] + D.da[tXXXYYYY] * v.da[1] + D.da[tXXXYYYZ] * v.da[2];
+  res.da[pXXXYYZ] = D.da[tXXXXYYZ] * v.da[0] + D.da[tXXXYYYZ] * v.da[1] + D.da[tXXXYYZZ] * v.da[2];
+  res.da[pXXXYZZ] = D.da[tXXXXYZZ] * v.da[0] + D.da[tXXXYYZZ] * v.da[1] + D.da[tXXXYZZZ] * v.da[2];
+  res.da[pXXXZZZ] = D.da[tXXXXZZZ] * v.da[0] + D.da[tXXXYZZZ] * v.da[1] + D.da[tXXXZZZZ] * v.da[2];
+  res.da[pXXYYYY] = D.da[tXXXYYYY] * v.da[0] + D.da[tXXYYYYY] * v.da[1] + D.da[tXXYYYYZ] * v.da[2];
+  res.da[pXXYYYZ] = D.da[tXXXYYYZ] * v.da[0] + D.da[tXXYYYYZ] * v.da[1] + D.da[tXXYYYZZ] * v.da[2];
+  res.da[pXXYYZZ] = D.da[tXXXYYZZ] * v.da[0] + D.da[tXXYYYZZ] * v.da[1] + D.da[tXXYYZZZ] * v.da[2];
+  res.da[pXXYZZZ] = D.da[tXXXYZZZ] * v.da[0] + D.da[tXXYYZZZ] * v.da[1] + D.da[tXXYZZZZ] * v.da[2];
+  res.da[pXXZZZZ] = D.da[tXXXZZZZ] * v.da[0] + D.da[tXXYZZZZ] * v.da[1] + D.da[tXXZZZZZ] * v.da[2];
+  res.da[pXYYYYY] = D.da[tXXYYYYY] * v.da[0] + D.da[tXYYYYYY] * v.da[1] + D.da[tXYYYYYZ] * v.da[2];
+  res.da[pXYYYYZ] = D.da[tXXYYYYZ] * v.da[0] + D.da[tXYYYYYZ] * v.da[1] + D.da[tXYYYYZZ] * v.da[2];
+  res.da[pXYYYZZ] = D.da[tXXYYYZZ] * v.da[0] + D.da[tXYYYYZZ] * v.da[1] + D.da[tXYYYZZZ] * v.da[2];
+  res.da[pXYYZZZ] = D.da[tXXYYZZZ] * v.da[0] + D.da[tXYYYZZZ] * v.da[1] + D.da[tXYYZZZZ] * v.da[2];
+  res.da[pXYZZZZ] = D.da[tXXYZZZZ] * v.da[0] + D.da[tXYYZZZZ] * v.da[1] + D.da[tXYZZZZZ] * v.da[2];
+  res.da[pXZZZZZ] = D.da[tXXZZZZZ] * v.da[0] + D.da[tXYZZZZZ] * v.da[1] + D.da[tXZZZZZZ] * v.da[2];
+  res.da[pYYYYYY] = D.da[tXYYYYYY] * v.da[0] + D.da[tYYYYYYY] * v.da[1] + D.da[tYYYYYYZ] * v.da[2];
+  res.da[pYYYYYZ] = D.da[tXYYYYYZ] * v.da[0] + D.da[tYYYYYYZ] * v.da[1] + D.da[tYYYYYZZ] * v.da[2];
+  res.da[pYYYYZZ] = D.da[tXYYYYZZ] * v.da[0] + D.da[tYYYYYZZ] * v.da[1] + D.da[tYYYYZZZ] * v.da[2];
+  res.da[pYYYZZZ] = D.da[tXYYYZZZ] * v.da[0] + D.da[tYYYYZZZ] * v.da[1] + D.da[tYYYZZZZ] * v.da[2];
+  res.da[pYYZZZZ] = D.da[tXYYZZZZ] * v.da[0] + D.da[tYYYZZZZ] * v.da[1] + D.da[tYYZZZZZ] * v.da[2];
+  res.da[pYZZZZZ] = D.da[tXYZZZZZ] * v.da[0] + D.da[tYYZZZZZ] * v.da[1] + D.da[tYZZZZZZ] * v.da[2];
+  res.da[pZZZZZZ] = D.da[tXZZZZZZ] * v.da[0] + D.da[tYZZZZZZ] * v.da[1] + D.da[tZZZZZZZ] * v.da[2];
+
+  return res;
+}
+
+//-------------  let's define some outer products
+
+/* produce a vector as the cross product of two vectors */
+template <typename T1, typename T2>
+inline vector<typename which_return<T1, T2>::type> operator^(const vector<T1> &v, const vector<T2> &w)
+{
+  vector<typename which_return<T1, T2>::type> res;
+
+  res.da[0] = v.da[1] * w.da[2] - v.da[2] * w.da[1];
+  res.da[1] = v.da[2] * w.da[0] - v.da[0] * w.da[2];
+  res.da[2] = v.da[0] * w.da[1] - v.da[1] * w.da[0];
+
+  return res;
+}
+
+/* produce a 2-tensor from the outer product of two vectors */
+template <typename T1, typename T2>
+inline symtensor2<typename which_return<T1, T2>::type> operator%(const vector<T1> &v, const vector<T2> &w)
+{
+  symtensor2<typename which_return<T1, T2>::type> res;
+
+  res.da[qXX] = v.da[0] * w.da[0];
+  res.da[qYY] = v.da[1] * w.da[1];
+  res.da[qZZ] = v.da[2] * w.da[2];
+  res.da[qXY] = v.da[0] * w.da[1];
+  res.da[qXZ] = v.da[0] * w.da[2];
+  res.da[qYZ] = v.da[1] * w.da[2];
+
+  return res;
+}
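+
+/* Usage sketch (illustrative only): note the non-standard operator choices, '^' for
+ * the vector cross product and '%' for outer products; the usual C++ precedence of
+ * these operators applies, so parenthesize compound expressions:
+ *
+ *   vector<double> v(1.0, 0.0, 0.0), w(0.0, 1.0, 0.0);
+ *   vector<double> n = v ^ w;        // cross product, here (0, 0, 1)
+ *   symtensor2<double> Q = v % w;    // outer product stored as a 2-tensor
+ */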
+
+/* produce a 3-tensor from the outer product of a vector and a 2-tensor */
+template <typename T1, typename T2>
+inline symtensor3<typename which_return<T1, T2>::type> operator%(const vector<T1> &v, const symtensor2<T2> &D)
+{
+  symtensor3<typename which_return<T1, T2>::type> res;
+
+  res.da[dXXX] = D.da[qXX] * v.da[0];
+  res.da[dXXY] = D.da[qXX] * v.da[1];
+  res.da[dXXZ] = D.da[qXX] * v.da[2];
+  res.da[dXYY] = D.da[qXY] * v.da[1];
+  res.da[dXYZ] = D.da[qXY] * v.da[2];
+  res.da[dXZZ] = D.da[qXZ] * v.da[2];
+  res.da[dYYY] = D.da[qYY] * v.da[1];
+  res.da[dYYZ] = D.da[qYY] * v.da[2];
+  res.da[dYZZ] = D.da[qYZ] * v.da[2];
+  res.da[dZZZ] = D.da[qZZ] * v.da[2];
+
+  return res;
+}
+
+/* produce a 4-tensor from the outer product of a vector and a 3-tensor */
+template <typename T1, typename T2>
+inline symtensor4<typename which_return<T1, T2>::type> operator%(const vector<T1> &v, const symtensor3<T2> &D)
+{
+  symtensor4<typename which_return<T1, T2>::type> res;
+
+  res.da[sXXXX] = D.da[dXXX] * v.da[0];
+  res.da[sXXXY] = D.da[dXXX] * v.da[1];
+  res.da[sXXXZ] = D.da[dXXX] * v.da[2];
+  res.da[sXXYY] = D.da[dXXY] * v.da[1];
+  res.da[sXXYZ] = D.da[dXXY] * v.da[2];
+  res.da[sXXZZ] = D.da[dXXZ] * v.da[2];
+  res.da[sXYYY] = D.da[dXYY] * v.da[1];
+  res.da[sXYYZ] = D.da[dXYY] * v.da[2];
+  res.da[sXYZZ] = D.da[dXYZ] * v.da[2];
+  res.da[sXZZZ] = D.da[dXZZ] * v.da[2];
+  res.da[sYYYY] = D.da[dYYY] * v.da[1];
+  res.da[sYYYZ] = D.da[dYYY] * v.da[2];
+  res.da[sYYZZ] = D.da[dYYZ] * v.da[2];
+  res.da[sYZZZ] = D.da[dYZZ] * v.da[2];
+  res.da[sZZZZ] = D.da[dZZZ] * v.da[2];
+
+  return res;
+}
+
+/* produce a 5-tensor from the outer product of a vector and a 4-tensor */
+template <typename T1, typename T2>
+inline symtensor5<typename which_return<T1, T2>::type> operator%(const vector<T1> &v, const symtensor4<T2> &D)
+{
+  symtensor5<typename which_return<T1, T2>::type> res;
+
+  res.da[rXXXXX] = D.da[sXXXX] * v.da[0];
+  res.da[rXXXXY] = D.da[sXXXX] * v.da[1];
+  res.da[rXXXXZ] = D.da[sXXXX] * v.da[2];
+  res.da[rXXXYY] = D.da[sXXXY] * v.da[1];
+  res.da[rXXXYZ] = D.da[sXXXY] * v.da[2];
+  res.da[rXXXZZ] = D.da[sXXXZ] * v.da[2];
+  res.da[rXXYYY] = D.da[sXXYY] * v.da[1];
+  res.da[rXXYYZ] = D.da[sXXYY] * v.da[2];
+  res.da[rXXYZZ] = D.da[sXXYZ] * v.da[2];
+  res.da[rXXZZZ] = D.da[sXXZZ] * v.da[2];
+  res.da[rXYYYY] = D.da[sXYYY] * v.da[1];
+  res.da[rXYYYZ] = D.da[sXYYY] * v.da[2];
+  res.da[rXYYZZ] = D.da[sXYYZ] * v.da[2];
+  res.da[rXYZZZ] = D.da[sXYZZ] * v.da[2];
+  res.da[rXZZZZ] = D.da[sXZZZ] * v.da[2];
+  res.da[rYYYYY] = D.da[sYYYY] * v.da[1];
+  res.da[rYYYYZ] = D.da[sYYYY] * v.da[2];
+  res.da[rYYYZZ] = D.da[sYYYZ] * v.da[2];
+  res.da[rYYZZZ] = D.da[sYYZZ] * v.da[2];
+  res.da[rYZZZZ] = D.da[sYZZZ] * v.da[2];
+  res.da[rZZZZZ] = D.da[sZZZZ] * v.da[2];
+
+  return res;
+}
+
+/* produce a 6-tensor from the outer product of a vector and a 5-tensor */
+template <typename T1, typename T2>
+inline symtensor6<typename which_return<T1, T2>::type> operator%(const vector<T1> &v, const symtensor5<T2> &D)
+{
+  symtensor6<typename which_return<T1, T2>::type> res;
+
+  res.da[pXXXXXX] = D.da[rXXXXX] * v.da[0];
+  res.da[pXXXXXY] = D.da[rXXXXX] * v.da[1];
+  res.da[pXXXXXZ] = D.da[rXXXXX] * v.da[2];
+  res.da[pXXXXYY] = D.da[rXXXXY] * v.da[1];
+  res.da[pXXXXYZ] = D.da[rXXXXY] * v.da[2];
+  res.da[pXXXXZZ] = D.da[rXXXXZ] * v.da[2];
+  res.da[pXXXYYY] = D.da[rXXXYY] * v.da[1];
+  res.da[pXXXYYZ] = D.da[rXXXYY] * v.da[2];
+  res.da[pXXXYZZ] = D.da[rXXXYZ] * v.da[2];
+  res.da[pXXXZZZ] = D.da[rXXXZZ] * v.da[2];
+  res.da[pXXYYYY] = D.da[rXXYYY] * v.da[1];
+  res.da[pXXYYYZ] = D.da[rXXYYY] * v.da[2];
+  res.da[pXXYYZZ] = D.da[rXXYYZ] * v.da[2];
+  res.da[pXXYZZZ] = D.da[rXXYZZ] * v.da[2];
+  res.da[pXXZZZZ] = D.da[rXXZZZ] * v.da[2];
+  res.da[pXYYYYY] = D.da[rXYYYY] * v.da[1];
+  res.da[pXYYYYZ] = D.da[rXYYYY] * v.da[2];
+  res.da[pXYYYZZ] = D.da[rXYYYZ] * v.da[2];
+  res.da[pXYYZZZ] = D.da[rXYYZZ] * v.da[2];
+  res.da[pXYZZZZ] = D.da[rXYZZZ] * v.da[2];
+  res.da[pXZZZZZ] = D.da[rXZZZZ] * v.da[2];
+  res.da[pYYYYYY] = D.da[rYYYYY] * v.da[1];
+  res.da[pYYYYYZ] = D.da[rYYYYY] * v.da[2];
+  res.da[pYYYYZZ] = D.da[rYYYYZ] * v.da[2];
+  res.da[pYYYZZZ] = D.da[rYYYZZ] * v.da[2];
+  res.da[pYYZZZZ] = D.da[rYYZZZ] * v.da[2];
+  res.da[pYZZZZZ] = D.da[rYZZZZ] * v.da[2];
+  res.da[pZZZZZZ] = D.da[rZZZZZ] * v.da[2];
+
+  return res;
+}
+
+/* produce a 7-tensor from the outer product of a vector and a 6-tensor */
+template <typename T1, typename T2>
+inline symtensor7<typename which_return<T1, T2>::type> operator%(const vector<T1> &v, const symtensor6<T2> &D)
+{
+  symtensor7<typename which_return<T1, T2>::type> res;
+
+  res.da[tXXXXXXX] = D.da[pXXXXXX] * v.da[0];
+  res.da[tXXXXXXY] = D.da[pXXXXXX] * v.da[1];
+  res.da[tXXXXXXZ] = D.da[pXXXXXX] * v.da[2];
+  res.da[tXXXXXYY] = D.da[pXXXXXY] * v.da[1];
+  res.da[tXXXXXYZ] = D.da[pXXXXXY] * v.da[2];
+  res.da[tXXXXXZZ] = D.da[pXXXXXZ] * v.da[2];
+  res.da[tXXXXYYY] = D.da[pXXXXYY] * v.da[1];
+  res.da[tXXXXYYZ] = D.da[pXXXXYY] * v.da[2];
+  res.da[tXXXXYZZ] = D.da[pXXXXYZ] * v.da[2];
+  res.da[tXXXXZZZ] = D.da[pXXXXZZ] * v.da[2];
+  res.da[tXXXYYYY] = D.da[pXXXYYY] * v.da[1];
+  res.da[tXXXYYYZ] = D.da[pXXXYYY] * v.da[2];
+  res.da[tXXXYYZZ] = D.da[pXXXYYZ] * v.da[2];
+  res.da[tXXXYZZZ] = D.da[pXXXYZZ] * v.da[2];
+  res.da[tXXXZZZZ] = D.da[pXXXZZZ] * v.da[2];
+  res.da[tXXYYYYY] = D.da[pXXYYYY] * v.da[1];
+  res.da[tXXYYYYZ] = D.da[pXXYYYY] * v.da[2];
+  res.da[tXXYYYZZ] = D.da[pXXYYYZ] * v.da[2];
+  res.da[tXXYYZZZ] = D.da[pXXYYZZ] * v.da[2];
+  res.da[tXXYZZZZ] = D.da[pXXYZZZ] * v.da[2];
+  res.da[tXXZZZZZ] = D.da[pXXZZZZ] * v.da[2];
+  res.da[tXYYYYYY] = D.da[pXYYYYY] * v.da[1];
+  res.da[tXYYYYYZ] = D.da[pXYYYYY] * v.da[2];
+  res.da[tXYYYYZZ] = D.da[pXYYYYZ] * v.da[2];
+  res.da[tXYYYZZZ] = D.da[pXYYYZZ] * v.da[2];
+  res.da[tXYYZZZZ] = D.da[pXYYZZZ] * v.da[2];
+  res.da[tXYZZZZZ] = D.da[pXYZZZZ] * v.da[2];
+  res.da[tXZZZZZZ] = D.da[pXZZZZZ] * v.da[2];
+  res.da[tYYYYYYY] = D.da[pYYYYYY] * v.da[1];
+  res.da[tYYYYYYZ] = D.da[pYYYYYY] * v.da[2];
+  res.da[tYYYYYZZ] = D.da[pYYYYYZ] * v.da[2];
+  res.da[tYYYYZZZ] = D.da[pYYYYZZ] * v.da[2];
+  res.da[tYYYZZZZ] = D.da[pYYYZZZ] * v.da[2];
+  res.da[tYYZZZZZ] = D.da[pYYZZZZ] * v.da[2];
+  res.da[tYZZZZZZ] = D.da[pYZZZZZ] * v.da[2];
+  res.da[tZZZZZZZ] = D.da[pZZZZZZ] * v.da[2];
+
+  return res;
+}
+
+/* compute the sum of the three possible outer products of a 2-tensor and a vector, yielding a symmetric 3-tensor */
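+/* (convention: the (i,j,k) component receives D_ij v_k + D_ik v_j + D_jk v_i, i.e. the un-normalized symmetrized outer product) */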
+template <typename T1, typename T2>
+inline symtensor3<typename which_return<T1, T2>::type> outer_prod_sum(const symtensor2<T1> &D, const vector<T2> &v)
+{
+  symtensor3<typename which_return<T1, T2>::type> res;
+
+  res.da[dXXX] = 3 * D.da[qXX] * v.da[0];
+  res.da[dYYY] = 3 * D.da[qYY] * v.da[1];
+  res.da[dZZZ] = 3 * D.da[qZZ] * v.da[2];
+  res.da[dXYY] = 2 * D.da[qXY] * v.da[1] + D.da[qYY] * v.da[0];
+  res.da[dXZZ] = 2 * D.da[qXZ] * v.da[2] + D.da[qZZ] * v.da[0];
+  res.da[dYXX] = 2 * D.da[qXY] * v.da[0] + D.da[qXX] * v.da[1];
+  res.da[dYZZ] = 2 * D.da[qYZ] * v.da[2] + D.da[qZZ] * v.da[1];
+  res.da[dZXX] = 2 * D.da[qXZ] * v.da[0] + D.da[qXX] * v.da[2];
+  res.da[dZYY] = 2 * D.da[qYZ] * v.da[1] + D.da[qYY] * v.da[2];
+  res.da[dXYZ] = D.da[qXY] * v.da[2] + D.da[qYZ] * v.da[0] + D.da[qZX] * v.da[1];
+
+  return res;
+}
+
+/* compute the sum of the four possible outer products of a 3-tensor and a vector, yielding a symmetric 4-tensor */
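+/* (convention: the (i,j,k,l) component receives D_ijk v_l + D_ijl v_k + D_ikl v_j + D_jkl v_i) */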
+template <typename T1, typename T2>
+inline symtensor4<typename which_return<T1, T2>::type> outer_prod_sum(const symtensor3<T1> &D, const vector<T2> &v)
+{
+  symtensor4<typename which_return<T1, T2>::type> res;
+
+  res.da[sXXXX] = 4 * D.da[dXXX] * v.da[0];
+  res.da[sZZZZ] = 4 * D.da[dZZZ] * v.da[2];
+  res.da[sYYYY] = 4 * D.da[dYYY] * v.da[1];
+  res.da[sXXXY] = 3 * D.da[dXXY] * v.da[0] + D.da[dXXX] * v.da[1];
+  res.da[sXXXZ] = 3 * D.da[dXXZ] * v.da[0] + D.da[dXXX] * v.da[2];
+  res.da[sXYYY] = 3 * D.da[dXYY] * v.da[1] + D.da[dYYY] * v.da[0];
+  res.da[sYYYZ] = 3 * D.da[dYYZ] * v.da[1] + D.da[dYYY] * v.da[2];
+  res.da[sXZZZ] = 3 * D.da[dXZZ] * v.da[2] + D.da[dZZZ] * v.da[0];
+  res.da[sYZZZ] = 3 * D.da[dYZZ] * v.da[2] + D.da[dZZZ] * v.da[1];
+  res.da[sXXYY] = 2 * D.da[dXXY] * v.da[1] + 2 * D.da[dXYY] * v.da[0];
+  res.da[sXXZZ] = 2 * D.da[dXXZ] * v.da[2] + 2 * D.da[dXZZ] * v.da[0];
+  res.da[sYYZZ] = 2 * D.da[dYYZ] * v.da[2] + 2 * D.da[dYZZ] * v.da[1];
+  res.da[sXXYZ] = 2 * D.da[dXYZ] * v.da[0] + D.da[dXXY] * v.da[2] + D.da[dXXZ] * v.da[1];
+  res.da[sXYYZ] = 2 * D.da[dXYZ] * v.da[1] + D.da[dXYY] * v.da[2] + D.da[dYYZ] * v.da[0];
+  res.da[sXYZZ] = 2 * D.da[dXYZ] * v.da[2] + D.da[dXZZ] * v.da[1] + D.da[dYZZ] * v.da[0];
+
+  return res;
+}
+
+/* compute the sum of the six possible outer products of a 2-tensor with another 2-tensor, yielding a symmetric 4-tensor */
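+/* (convention: the four-index component receives the sum of D_ab S_cd over the six ways of splitting the indices between D and S) */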
+template <typename T1, typename T2>
+inline symtensor4<typename which_return<T1, T2>::type> outer_prod_sum(const symtensor2<T1> &D, const symtensor2<T2> &S)
+{
+  symtensor4<typename which_return<T1, T2>::type> res;
+
+  res.da[sXXXX] = 6 * D.da[qXX] * S.da[qXX];
+  res.da[sYYYY] = 6 * D.da[qYY] * S.da[qYY];
+  res.da[sZZZZ] = 6 * D.da[qZZ] * S.da[qZZ];
+  res.da[sXXXY] = 3 * D.da[qXX] * S.da[qXY] + 3 * D.da[qXY] * S.da[qXX];
+  res.da[sXXXZ] = 3 * D.da[qXX] * S.da[qXZ] + 3 * D.da[qXZ] * S.da[qXX];
+  res.da[sXYYY] = 3 * D.da[qXY] * S.da[qYY] + 3 * D.da[qYY] * S.da[qXY];
+  res.da[sXZZZ] = 3 * D.da[qZZ] * S.da[qXZ] + 3 * D.da[qXZ] * S.da[qZZ];
+  res.da[sYYYZ] = 3 * D.da[qYY] * S.da[qYZ] + 3 * D.da[qYZ] * S.da[qYY];
+  res.da[sYZZZ] = 3 * D.da[qZZ] * S.da[qYZ] + 3 * D.da[qYZ] * S.da[qZZ];
+  res.da[sXXYY] = D.da[qXX] * S.da[qYY] + D.da[qYY] * S.da[qXX] + 4 * D.da[qXY] * S.da[qXY];
+  res.da[sXXZZ] = D.da[qXX] * S.da[qZZ] + D.da[qZZ] * S.da[qXX] + 4 * D.da[qXZ] * S.da[qXZ];
+  res.da[sYYZZ] = D.da[qYY] * S.da[qZZ] + D.da[qZZ] * S.da[qYY] + 4 * D.da[qYZ] * S.da[qYZ];
+  res.da[sXXYZ] = D.da[qXX] * S.da[qYZ] + 2 * D.da[qXY] * S.da[qXZ] + 2 * D.da[qXZ] * S.da[qXY] + D.da[qYZ] * S.da[qXX];
+  res.da[sXYZZ] = D.da[qZZ] * S.da[qXY] + 2 * D.da[qXZ] * S.da[qYZ] + 2 * D.da[qYZ] * S.da[qXZ] + D.da[qXY] * S.da[qZZ];
+  res.da[sXYYZ] = D.da[qYY] * S.da[qXZ] + 2 * D.da[qXY] * S.da[qYZ] + 2 * D.da[qYZ] * S.da[qXY] + D.da[qXZ] * S.da[qYY];
+
+  return res;
+}
+
+/* compute the sum of the five possible outer products of a 4-tensor and a vector, yielding a symmetric 5-tensor */
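+/* (convention: the five-index component receives the sum over the five possible placements of v, e.g. D_jklm v_i + ... + D_ijkl v_m) */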
+template <typename T1, typename T2>
+inline symtensor5<typename which_return<T1, T2>::type> outer_prod_sum(const symtensor4<T1> &D, const vector<T2> &v)
+{
+  symtensor5<typename which_return<T1, T2>::type> res;
+
+  res.da[rXXXXX] = 5 * D.da[sXXXX] * v.da[0];
+  res.da[rXXXXY] = 4 * D.da[sXXXY] * v.da[0] + D.da[sXXXX] * v.da[1];
+  res.da[rXXXXZ] = 4 * D.da[sXXXZ] * v.da[0] + D.da[sXXXX] * v.da[2];
+  res.da[rXXXYX] = 3 * D.da[sXXYX] * v.da[0] + D.da[sXXXX] * v.da[1] + D.da[sXXXY] * v.da[0];
+  res.da[rXXXYY] = 3 * D.da[sXXYY] * v.da[0] + 2 * D.da[sXXXY] * v.da[1];
+  res.da[rXXXYZ] = 3 * D.da[sXXYZ] * v.da[0] + D.da[sXXXZ] * v.da[1] + D.da[sXXXY] * v.da[2];
+  res.da[rXXXZX] = 3 * D.da[sXXZX] * v.da[0] + D.da[sXXXX] * v.da[2] + D.da[sXXXZ] * v.da[0];
+  res.da[rXXXZY] = 3 * D.da[sXXZY] * v.da[0] + D.da[sXXXY] * v.da[2] + D.da[sXXXZ] * v.da[1];
+  res.da[rXXXZZ] = 3 * D.da[sXXZZ] * v.da[0] + 2 * D.da[sXXXZ] * v.da[2];
+  res.da[rXXYYY] = 2 * D.da[sXYYY] * v.da[0] + 3 * D.da[sXXYY] * v.da[1];
+  res.da[rXXYYZ] = 2 * D.da[sXYYZ] * v.da[0] + 2 * D.da[sXXYZ] * v.da[1] + D.da[sXXYY] * v.da[2];
+  res.da[rXXYZY] = 2 * D.da[sXYZY] * v.da[0] + D.da[sXXZY] * v.da[1] + D.da[sXXYY] * v.da[2] + D.da[sXXYZ] * v.da[1];
+  res.da[rXXYZZ] = 2 * D.da[sXYZZ] * v.da[0] + D.da[sXXZZ] * v.da[1] + 2 * D.da[sXXYZ] * v.da[2];
+  res.da[rXXZZZ] = 2 * D.da[sXZZZ] * v.da[0] + 3 * D.da[sXXZZ] * v.da[2];
+  res.da[rXYYYY] = D.da[sYYYY] * v.da[0] + 4 * D.da[sXYYY] * v.da[1];
+  res.da[rXYYYZ] = D.da[sYYYZ] * v.da[0] + 3 * D.da[sXYYZ] * v.da[1] + D.da[sXYYY] * v.da[2];
+  res.da[rXYYZY] = D.da[sYYZY] * v.da[0] + 2 * D.da[sXYZY] * v.da[1] + D.da[sXYYY] * v.da[2] + D.da[sXYYZ] * v.da[1];
+  res.da[rXYYZZ] = D.da[sYYZZ] * v.da[0] + 2 * D.da[sXYZZ] * v.da[1] + 2 * D.da[sXYYZ] * v.da[2];
+  res.da[rXYZZZ] = D.da[sYZZZ] * v.da[0] + D.da[sXZZZ] * v.da[1] + 3 * D.da[sXYZZ] * v.da[2];
+  res.da[rXZZZZ] = D.da[sZZZZ] * v.da[0] + 4 * D.da[sXZZZ] * v.da[2];
+  res.da[rYYYYY] = 5 * D.da[sYYYY] * v.da[1];
+  res.da[rYYYYZ] = 4 * D.da[sYYYZ] * v.da[1] + D.da[sYYYY] * v.da[2];
+  res.da[rYYYZY] = 3 * D.da[sYYZY] * v.da[1] + D.da[sYYYY] * v.da[2] + D.da[sYYYZ] * v.da[1];
+  res.da[rYYYZZ] = 3 * D.da[sYYZZ] * v.da[1] + 2 * D.da[sYYYZ] * v.da[2];
+  res.da[rYYZZZ] = 2 * D.da[sYZZZ] * v.da[1] + 3 * D.da[sYYZZ] * v.da[2];
+  res.da[rYZZZZ] = D.da[sZZZZ] * v.da[1] + 4 * D.da[sYZZZ] * v.da[2];
+  res.da[rZZZZZ] = 5 * D.da[sZZZZ] * v.da[2];
+
+  return res;
+}
+
+/* compute the sum of the 10 possible outer products of a 3-tensor with a 2-tensor, yielding a symmetric 5-tensor */
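+/* (convention: the five-index component receives the sum of D_abc S_de over the ten ways of splitting the indices between D and S) */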
+template <typename T1, typename T2>
+inline symtensor5<typename which_return<T1, T2>::type> outer_prod_sum(const symtensor3<T1> &D, const symtensor2<T2> &S)
+{
+  symtensor5<typename which_return<T1, T2>::type> res;
+
+  res.da[rXXXXX] = 10 * D.da[dXXX] * S.da[qXX];
+  res.da[rXXXXY] = 6 * D.da[dXXY] * S.da[qXX] + 4 * D.da[dXXX] * S.da[qXY];
+  res.da[rXXXXZ] = 6 * D.da[dXXZ] * S.da[qXX] + 4 * D.da[dXXX] * S.da[qXZ];
+  res.da[rXXXYX] = 3 * D.da[dXYX] * S.da[qXX] + 3 * D.da[dXXX] * S.da[qXY] + 3 * D.da[dXXY] * S.da[qXX] + D.da[dXXX] * S.da[qYX];
+  res.da[rXXXYY] = 3 * D.da[dXYY] * S.da[qXX] + 6 * D.da[dXXY] * S.da[qXY] + D.da[dXXX] * S.da[qYY];
+  res.da[rXXXYZ] = 3 * D.da[dXYZ] * S.da[qXX] + 3 * D.da[dXXZ] * S.da[qXY] + 3 * D.da[dXXY] * S.da[qXZ] + D.da[dXXX] * S.da[qYZ];
+  res.da[rXXXZX] = 3 * D.da[dXZX] * S.da[qXX] + 3 * D.da[dXXX] * S.da[qXZ] + 3 * D.da[dXXZ] * S.da[qXX] + D.da[dXXX] * S.da[qZX];
+  res.da[rXXXZY] = 3 * D.da[dXZY] * S.da[qXX] + 3 * D.da[dXXY] * S.da[qXZ] + 3 * D.da[dXXZ] * S.da[qXY] + D.da[dXXX] * S.da[qZY];
+  res.da[rXXXZZ] = 3 * D.da[dXZZ] * S.da[qXX] + 6 * D.da[dXXZ] * S.da[qXZ] + D.da[dXXX] * S.da[qZZ];
+  res.da[rXXYYY] = D.da[dYYY] * S.da[qXX] + 6 * D.da[dXYY] * S.da[qXY] + 3 * D.da[dXXY] * S.da[qYY];
+  res.da[rXXYYZ] = D.da[dYYZ] * S.da[qXX] + 4 * D.da[dXYZ] * S.da[qXY] + 2 * D.da[dXYY] * S.da[qXZ] + D.da[dXXZ] * S.da[qYY] +
+                   2 * D.da[dXXY] * S.da[qYZ];
+  res.da[rXXYZY] = D.da[dYZY] * S.da[qXX] + 2 * D.da[dXZY] * S.da[qXY] + 2 * D.da[dXYY] * S.da[qXZ] + 2 * D.da[dXYZ] * S.da[qXY] +
+                   D.da[dXXY] * S.da[qYZ] + D.da[dXXZ] * S.da[qYY] + D.da[dXXY] * S.da[qZY];
+  res.da[rXXYZZ] = D.da[dYZZ] * S.da[qXX] + 2 * D.da[dXZZ] * S.da[qXY] + 4 * D.da[dXYZ] * S.da[qXZ] + 2 * D.da[dXXZ] * S.da[qYZ] +
+                   D.da[dXXY] * S.da[qZZ];
+  res.da[rXXZZZ] = D.da[dZZZ] * S.da[qXX] + 6 * D.da[dXZZ] * S.da[qXZ] + 3 * D.da[dXXZ] * S.da[qZZ];
+  res.da[rXYYYY] = 4 * D.da[dYYY] * S.da[qXY] + 6 * D.da[dXYY] * S.da[qYY];
+  res.da[rXYYYZ] = 3 * D.da[dYYZ] * S.da[qXY] + D.da[dYYY] * S.da[qXZ] + 3 * D.da[dXYZ] * S.da[qYY] + 3 * D.da[dXYY] * S.da[qYZ];
+  res.da[rXYYZY] = 2 * D.da[dYZY] * S.da[qXY] + D.da[dYYY] * S.da[qXZ] + D.da[dYYZ] * S.da[qXY] + D.da[dXZY] * S.da[qYY] +
+                   2 * D.da[dXYY] * S.da[qYZ] + 2 * D.da[dXYZ] * S.da[qYY] + D.da[dXYY] * S.da[qZY];
+  res.da[rXYYZZ] = 2 * D.da[dYZZ] * S.da[qXY] + 2 * D.da[dYYZ] * S.da[qXZ] + D.da[dXZZ] * S.da[qYY] + 4 * D.da[dXYZ] * S.da[qYZ] +
+                   D.da[dXYY] * S.da[qZZ];
+  res.da[rXYZZZ] = D.da[dZZZ] * S.da[qXY] + 3 * D.da[dYZZ] * S.da[qXZ] + 3 * D.da[dXZZ] * S.da[qYZ] + 3 * D.da[dXYZ] * S.da[qZZ];
+  res.da[rXZZZZ] = 4 * D.da[dZZZ] * S.da[qXZ] + 6 * D.da[dXZZ] * S.da[qZZ];
+  res.da[rYYYYY] = 10 * D.da[dYYY] * S.da[qYY];
+  res.da[rYYYYZ] = 6 * D.da[dYYZ] * S.da[qYY] + 4 * D.da[dYYY] * S.da[qYZ];
+  res.da[rYYYZY] = 3 * D.da[dYZY] * S.da[qYY] + 3 * D.da[dYYY] * S.da[qYZ] + 3 * D.da[dYYZ] * S.da[qYY] + D.da[dYYY] * S.da[qZY];
+  res.da[rYYYZZ] = 3 * D.da[dYZZ] * S.da[qYY] + 6 * D.da[dYYZ] * S.da[qYZ] + D.da[dYYY] * S.da[qZZ];
+  res.da[rYYZZZ] = D.da[dZZZ] * S.da[qYY] + 6 * D.da[dYZZ] * S.da[qYZ] + 3 * D.da[dYYZ] * S.da[qZZ];
+  res.da[rYZZZZ] = 4 * D.da[dZZZ] * S.da[qYZ] + 6 * D.da[dYZZ] * S.da[qZZ];
+  res.da[rZZZZZ] = 10 * D.da[dZZZ] * S.da[qZZ];
+
+  return res;
+}
+
+enum setup_options
+{
+  INIT,
+  ADD
+};
+
+template <typename T, typename TypeGfac>
+inline void setup_D3(enum setup_options opt, symtensor3<T> &D3, vector<T> &dxyz, symtensor2<T> &aux2, symtensor3<T> &aux3, TypeGfac g2,
+                     TypeGfac g3)
+{
+  // Note: dxyz and aux2 are input parameters, whereas aux3 is an output parameter!
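+  // Sketch of what is accumulated (assuming aux2 = dxyz % dxyz, with x = dxyz):
+  //   D3_ijk += g3 * x_i x_j x_k + g2 * (delta_ij x_k + delta_ik x_j + delta_jk x_i)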
+
+  aux3 = dxyz % aux2;  // construct outer product of the vector with the 2-tensor
+  if(opt == INIT)
+    D3 = g3 * aux3;
+  else
+    D3 += g3 * aux3;
+
+  vector<T> g2_dxyz = g2 * dxyz;
+
+  D3[dXXX] += 3 * g2_dxyz[0];
+  D3[dYYY] += 3 * g2_dxyz[1];
+  D3[dZZZ] += 3 * g2_dxyz[2];
+
+  D3[dXXY] += g2_dxyz[1];
+  D3[dXXZ] += g2_dxyz[2];
+  D3[dXYY] += g2_dxyz[0];
+  D3[dXZZ] += g2_dxyz[0];
+  D3[dYYZ] += g2_dxyz[2];
+  D3[dYZZ] += g2_dxyz[1];
+}
+
+template <typename T, typename TypeGfac>
+inline void setup_D4(enum setup_options opt, symtensor4<T> &D4, vector<T> &dxyz, symtensor2<T> &aux2, symtensor3<T> &aux3,
+                     symtensor4<T> &aux4, TypeGfac g2, TypeGfac g3, TypeGfac g4)
+{
+  // Note: dxyz, aux2, and aux3 are input parameters, whereas aux4 is an output parameter!
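+  // Sketch of what is accumulated (assuming aux2 and aux3 are the 2-fold and 3-fold outer products of x = dxyz):
+  //   D4 += g4 * x^(4)  +  g3 * sym(delta, x^(2))  +  g2 * sym(delta, delta)
+  // where x^(n) denotes the n-fold outer product of x and sym() sums over all distinct index placements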
+
+  aux4 = dxyz % aux3;  // construct outer product
+  if(opt == INIT)
+    D4 = g4 * aux4;
+  else
+    D4 += g4 * aux4;
+
+  D4[sXXXX] += 3 * g2;
+  D4[sYYYY] += 3 * g2;
+  D4[sZZZZ] += 3 * g2;
+  D4[sXXYY] += g2;
+  D4[sXXZZ] += g2;
+  D4[sYYZZ] += g2;
+
+  symtensor2<T> g3aux2 = g3 * aux2;
+
+  D4[sXXXX] += 6 * g3aux2[qXX];
+  D4[sYYYY] += 6 * g3aux2[qYY];
+  D4[sZZZZ] += 6 * g3aux2[qZZ];
+
+  D4[sXXXY] += 3 * g3aux2[qXY];
+  D4[sXYYY] += 3 * g3aux2[qXY];
+  D4[sXXXZ] += 3 * g3aux2[qXZ];
+  D4[sXZZZ] += 3 * g3aux2[qXZ];
+  D4[sYYYZ] += 3 * g3aux2[qYZ];
+  D4[sYZZZ] += 3 * g3aux2[qYZ];
+
+  D4[sXXYY] += g3aux2[qXX] + g3aux2[qYY];
+  D4[sXXZZ] += g3aux2[qXX] + g3aux2[qZZ];
+  D4[sYYZZ] += g3aux2[qYY] + g3aux2[qZZ];
+
+  D4[sXXYZ] += g3aux2[qYZ];
+  D4[sXYYZ] += g3aux2[qXZ];
+  D4[sXYZZ] += g3aux2[qXY];
+}
+
+template <typename T, typename TypeGfac>
+inline void setup_D5(enum setup_options opt, symtensor5<T> &D5, vector<T> &dxyz, symtensor3<T> &aux3, symtensor4<T> &aux4,
+                     symtensor5<T> &aux5, TypeGfac g3, TypeGfac g4, TypeGfac g5)
+{
+  // Note: dxyz, aux3, and aux4 are input parameters, whereas aux5 is an output parameter!
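+  // Sketch of what is accumulated (assuming aux3 and aux4 are the 3-fold and 4-fold outer products of x = dxyz):
+  //   D5 += g5 * x^(5)  +  g4 * sym(delta, x^(3))  +  g3 * sym(delta, delta, x)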
+
+  aux5 = dxyz % aux4;  // construct outer product
+  if(opt == INIT)
+    D5 = g5 * aux5;
+  else
+    D5 += g5 * aux5;
+
+  vector<T> g3_dxyz = g3 * dxyz;
+
+  D5[rXXXXX] += 15 * g3_dxyz[0];
+  D5[rYYYYY] += 15 * g3_dxyz[1];
+  D5[rZZZZZ] += 15 * g3_dxyz[2];
+
+  D5[rXXXXY] += 3 * g3_dxyz[1];
+  D5[rXXXXZ] += 3 * g3_dxyz[2];
+  D5[rXYYYY] += 3 * g3_dxyz[0];
+  D5[rXZZZZ] += 3 * g3_dxyz[0];
+  D5[rYYYYZ] += 3 * g3_dxyz[2];
+  D5[rYZZZZ] += 3 * g3_dxyz[1];
+
+  D5[rXXXYY] += 3 * g3_dxyz[0];
+  D5[rXXXZZ] += 3 * g3_dxyz[0];
+  D5[rXXYYY] += 3 * g3_dxyz[1];
+  D5[rXXZZZ] += 3 * g3_dxyz[2];
+  D5[rYYYZZ] += 3 * g3_dxyz[1];
+  D5[rYYZZZ] += 3 * g3_dxyz[2];
+
+  D5[rXXYZZ] += g3_dxyz[1];
+  D5[rXXYYZ] += g3_dxyz[2];
+  D5[rXYYZZ] += g3_dxyz[0];
+
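+  // these mixed components receive no g3 contribution; the explicit zero additions merely keep the enumeration complete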
+  D5[rXXXYZ] += 0;
+  D5[rXYYYZ] += 0;
+  D5[rXYZZZ] += 0;
+
+  // now add the g4 * aux3 contributions
+  symtensor3<T> g4aux3 = g4 * aux3;
+
+  D5[rXXXXX] += 10 * g4aux3[dXXX];
+  D5[rYYYYY] += 10 * g4aux3[dYYY];
+  D5[rZZZZZ] += 10 * g4aux3[dZZZ];
+
+  D5[rXXXXY] += 6 * g4aux3[dXXY];
+  D5[rXXXXZ] += 6 * g4aux3[dXXZ];
+  D5[rXYYYY] += 6 * g4aux3[dYYX];
+  D5[rXZZZZ] += 6 * g4aux3[dZZX];
+  D5[rYYYYZ] += 6 * g4aux3[dYYZ];
+  D5[rYZZZZ] += 6 * g4aux3[dZZY];
+
+  D5[rXXXYY] += g4aux3[dXXX] + 3 * g4aux3[dXYY];
+  D5[rXXXZZ] += g4aux3[dXXX] + 3 * g4aux3[dXZZ];
+  D5[rXXYYY] += g4aux3[dYYY] + 3 * g4aux3[dYXX];
+  D5[rXXZZZ] += g4aux3[dZZZ] + 3 * g4aux3[dZXX];
+  D5[rYYYZZ] += g4aux3[dYYY] + 3 * g4aux3[dYZZ];
+  D5[rYYZZZ] += g4aux3[dZZZ] + 3 * g4aux3[dZYY];
+
+  D5[rXXYZZ] += g4aux3[dYZZ] + g4aux3[dXXY];
+  D5[rXXYYZ] += g4aux3[dYYZ] + g4aux3[dXXZ];
+  D5[rXYYZZ] += g4aux3[dXZZ] + g4aux3[dXYY];
+
+  D5[rXXXYZ] += 3 * g4aux3[dXYZ];
+  D5[rXYYYZ] += 3 * g4aux3[dXYZ];
+  D5[rXYZZZ] += 3 * g4aux3[dXYZ];
+}
+
+template <typename T, typename TypeGfac>
+inline void setup_D6(enum setup_options opt, symtensor6<T> &D6, vector<T> &dxyz, TypeGfac g3, TypeGfac g4, TypeGfac g5, TypeGfac g6)
+{
+#define X 0
+#define Y 1
+#define Z 2
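+
+  // Sketch of what is accumulated below (with x = dxyz):
+  //   D6 += g6 * x^(6)  +  g5 * sym(delta, x^(4))  +  g4 * sym(delta, delta, x^(2))  +  g3 * sym(delta, delta, delta)
+  // where x^(n) denotes the n-fold outer product of x and sym() sums over all distinct index placements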
+
+  if(opt == INIT)
+    D6 = static_cast<T>(0);
+
+  D6[pXXXXXX] += 15 * g3 + g4 * (45 * dxyz[X] * dxyz[X]) + g5 * (15 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X]) +
+                 g6 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X];
+
+  D6[pXXXXXY] += g4 * (15 * dxyz[X] * dxyz[Y]) + g5 * (10 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y]) +
+                 g6 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y];
+
+  D6[pXXXXXZ] += g4 * (15 * dxyz[X] * dxyz[Z]) + g5 * (10 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Z]) +
+                 g6 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Z];
+
+  D6[pXXXXYY] += 3 * g3 + g4 * (6 * dxyz[X] * dxyz[X] + 3 * dxyz[Y] * dxyz[Y]) +
+                 g5 * (6 * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y] + dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X]) +
+                 g6 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y];
+
+  D6[pXXXXYZ] += g4 * (3 * dxyz[Y] * dxyz[Z]) + g5 * (6 * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Z]) +
+                 g6 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Z];
+
+  D6[pXXXXZZ] += 3 * g3 + g4 * (6 * dxyz[X] * dxyz[X] + 3 * dxyz[Z] * dxyz[Z]) +
+                 g5 * (6 * dxyz[X] * dxyz[X] * dxyz[Z] * dxyz[Z] + dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X]) +
+                 g6 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Z] * dxyz[Z];
+
+  D6[pXXXYYY] += g4 * (9 * dxyz[X] * dxyz[Y]) +
+                 g5 * (3 * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y] + 3 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y]) +
+                 g6 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y];
+
+  D6[pXXXYYZ] += g4 * (3 * dxyz[X] * dxyz[Z]) +
+                 g5 * (3 * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Z] + dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Z]) +
+                 g6 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Z];
+
+  D6[pXXXYZZ] += g4 * (3 * dxyz[X] * dxyz[Y]) +
+                 g5 * (3 * dxyz[X] * dxyz[Y] * dxyz[Z] * dxyz[Z] + dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y]) +
+                 g6 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Z] * dxyz[Z];
+
+  D6[pXXXZZZ] += g4 * (9 * dxyz[X] * dxyz[Z]) +
+                 g5 * (3 * dxyz[X] * dxyz[Z] * dxyz[Z] * dxyz[Z] + 3 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Z]) +
+                 g6 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Z] * dxyz[Z] * dxyz[Z];
+
+  D6[pXXYYYY] += 3 * g3 + g4 * (3 * dxyz[X] * dxyz[X] + 6 * dxyz[Y] * dxyz[Y]) +
+                 g5 * (dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] + 6 * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y]) +
+                 g6 * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y];
+
+  D6[pXXYYYZ] += g4 * (3 * dxyz[Y] * dxyz[Z]) +
+                 g5 * (dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z] + 3 * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Z]) +
+                 g6 * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z];
+
+  D6[pXXYYZZ] +=
+      1 * g3 + g4 * (dxyz[X] * dxyz[X] + dxyz[Y] * dxyz[Y] + dxyz[Z] * dxyz[Z]) +
+      g5 * (dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z] + dxyz[X] * dxyz[X] * dxyz[Z] * dxyz[Z] + dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y]) +
+      g6 * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z];
+
+  D6[pXXYZZZ] += g4 * (3 * dxyz[Y] * dxyz[Z]) +
+                 g5 * (dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z] + 3 * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Z]) +
+                 g6 * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z];
+
+  D6[pXXZZZZ] += 3 * g3 + g4 * (3 * dxyz[X] * dxyz[X] + 6 * dxyz[Z] * dxyz[Z]) +
+                 g5 * (dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z] + 6 * dxyz[X] * dxyz[X] * dxyz[Z] * dxyz[Z]) +
+                 g6 * dxyz[X] * dxyz[X] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z];
+
+  D6[pXYYYYY] += g4 * (15 * dxyz[X] * dxyz[Y]) + g5 * (10 * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y]) +
+                 g6 * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y];
+
+  D6[pXYYYYZ] += g4 * (3 * dxyz[X] * dxyz[Z]) + g5 * (6 * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Z]) +
+                 g6 * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z];
+
+  D6[pXYYYZZ] += g4 * (3 * dxyz[X] * dxyz[Y]) +
+                 g5 * (3 * dxyz[X] * dxyz[Y] * dxyz[Z] * dxyz[Z] + dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y]) +
+                 g6 * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z];
+
+  D6[pXYYZZZ] += g4 * (3 * dxyz[X] * dxyz[Z]) +
+                 g5 * (dxyz[X] * dxyz[Z] * dxyz[Z] * dxyz[Z] + 3 * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Z]) +
+                 g6 * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z];
+
+  D6[pXYZZZZ] += g4 * (3 * dxyz[X] * dxyz[Y]) + g5 * (6 * dxyz[X] * dxyz[Y] * dxyz[Z] * dxyz[Z]) +
+                 g6 * dxyz[X] * dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z];
+
+  D6[pXZZZZZ] += g4 * (15 * dxyz[X] * dxyz[Z]) + g5 * (10 * dxyz[X] * dxyz[Z] * dxyz[Z] * dxyz[Z]) +
+                 g6 * dxyz[X] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z];
+
+  D6[pYYYYYY] += 15 * g3 + g4 * (45 * dxyz[Y] * dxyz[Y]) + g5 * (15 * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y]) +
+                 g6 * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y];
+
+  D6[pYYYYYZ] += g4 * (15 * dxyz[Y] * dxyz[Z]) + g5 * (10 * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z]) +
+                 g6 * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z];
+
+  D6[pYYYYZZ] += 3 * g3 + g4 * (6 * dxyz[Y] * dxyz[Y] + 3 * dxyz[Z] * dxyz[Z]) +
+                 g5 * (6 * dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z] + dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y]) +
+                 g6 * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z];
+
+  D6[pYYYZZZ] += g4 * (9 * dxyz[Y] * dxyz[Z]) +
+                 g5 * (3 * dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z] + 3 * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z]) +
+                 g6 * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z];
+
+  D6[pYYZZZZ] += 3 * g3 + g4 * (3 * dxyz[Y] * dxyz[Y] + 6 * dxyz[Z] * dxyz[Z]) +
+                 g5 * (dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z] + 6 * dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z]) +
+                 g6 * dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z];
+
+  D6[pYZZZZZ] += g4 * (15 * dxyz[Y] * dxyz[Z]) + g5 * (10 * dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z]) +
+                 g6 * dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z];
+
+  D6[pZZZZZZ] += 15 * g3 + g4 * (45 * dxyz[Z] * dxyz[Z]) + g5 * (15 * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z]) +
+                 g6 * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z];
+
+#undef X
+#undef Y
+#undef Z
+}
+
+template <typename T, typename TypeGfac>
+inline void setup_D7(enum setup_options opt, symtensor7<T> &D7, vector<T> &dxyz, TypeGfac g4, TypeGfac g5, TypeGfac g6, TypeGfac g7)
+{
+#define X 0
+#define Y 1
+#define Z 2
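+
+  // Sketch of what is accumulated below (with x = dxyz):
+  //   D7 += g7 * x^(7)  +  g6 * sym(delta, x^(5))  +  g5 * sym(delta, delta, x^(3))  +  g4 * sym(delta, delta, delta, x)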
+
+  if(opt == INIT)
+    D7 = static_cast<T>(0);
+
+  D7[tXXXXXXX] += g4 * (105 * dxyz[X]) + g5 * (105 * dxyz[X] * dxyz[X] * dxyz[X]) +
+                  g6 * (21 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X]) +
+                  g7 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X];
+
+  D7[tXXXXXXY] += g4 * (15 * dxyz[Y]) + g5 * (45 * dxyz[X] * dxyz[X] * dxyz[Y]) +
+                  g6 * (15 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y]) +
+                  g7 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y];
+
+  D7[tXXXXXXZ] += g4 * (15 * dxyz[Z]) + g5 * (45 * dxyz[X] * dxyz[X] * dxyz[Z]) +
+                  g6 * (15 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Z]) +
+                  g7 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Z];
+
+  D7[tXXXXXYY] += g4 * (15 * dxyz[X]) + g5 * (10 * dxyz[X] * dxyz[X] * dxyz[X] + 15 * dxyz[X] * dxyz[Y] * dxyz[Y]) +
+                  g6 * (10 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y] + dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X]) +
+                  g7 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y];
+
+  D7[tXXXXXYZ] += g5 * (15 * dxyz[X] * dxyz[Y] * dxyz[Z]) + g6 * (10 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Z]) +
+                  g7 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Z];
+
+  D7[tXXXXXZZ] += g4 * (15 * dxyz[X]) + g5 * (10 * dxyz[X] * dxyz[X] * dxyz[X] + 15 * dxyz[X] * dxyz[Z] * dxyz[Z]) +
+                  g6 * (10 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Z] * dxyz[Z] + dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X]) +
+                  g7 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Z] * dxyz[Z];
+
+  D7[tXXXXYYY] += g4 * (9 * dxyz[Y]) + g5 * (18 * dxyz[X] * dxyz[X] * dxyz[Y] + 3 * dxyz[Y] * dxyz[Y] * dxyz[Y]) +
+                  g6 * (6 * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y] + 3 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y]) +
+                  g7 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y];
+
+  D7[tXXXXYYZ] += g4 * (3 * dxyz[Z]) + g5 * (6 * dxyz[X] * dxyz[X] * dxyz[Z] + 3 * dxyz[Y] * dxyz[Y] * dxyz[Z]) +
+                  g6 * (6 * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Z] + dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Z]) +
+                  g7 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Z];
+
+  D7[tXXXXYZZ] += g4 * (3 * dxyz[Y]) + g5 * (6 * dxyz[X] * dxyz[X] * dxyz[Y] + 3 * dxyz[Y] * dxyz[Z] * dxyz[Z]) +
+                  g6 * (6 * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Z] * dxyz[Z] + dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y]) +
+                  g7 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Z] * dxyz[Z];
+
+  D7[tXXXXZZZ] += g4 * (9 * dxyz[Z]) + g5 * (18 * dxyz[X] * dxyz[X] * dxyz[Z] + 3 * dxyz[Z] * dxyz[Z] * dxyz[Z]) +
+                  g6 * (6 * dxyz[X] * dxyz[X] * dxyz[Z] * dxyz[Z] * dxyz[Z] + 3 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Z]) +
+                  g7 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Z] * dxyz[Z] * dxyz[Z];
+
+  D7[tXXXYYYY] += g4 * (9 * dxyz[X]) + g5 * (3 * dxyz[X] * dxyz[X] * dxyz[X] + 18 * dxyz[X] * dxyz[Y] * dxyz[Y]) +
+                  g6 * (3 * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] + 6 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y]) +
+                  g7 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y];
+
+  D7[tXXXYYYZ] += g5 * (9 * dxyz[X] * dxyz[Y] * dxyz[Z]) +
+                  g6 * (3 * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z] + 3 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Z]) +
+                  g7 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z];
+
+  D7[tXXXYYZZ] += g4 * (3 * dxyz[X]) +
+                  g5 * (dxyz[X] * dxyz[X] * dxyz[X] + 3 * dxyz[X] * dxyz[Y] * dxyz[Y] + 3 * dxyz[X] * dxyz[Z] * dxyz[Z]) +
+                  g6 * (3 * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z] + dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Z] * dxyz[Z] +
+                        dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y]) +
+                  g7 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z];
+
+  D7[tXXXYZZZ] += g5 * (9 * dxyz[X] * dxyz[Y] * dxyz[Z]) +
+                  g6 * (3 * dxyz[X] * dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z] + 3 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Z]) +
+                  g7 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z];
+
+  D7[tXXXZZZZ] += g4 * (9 * dxyz[X]) + g5 * (3 * dxyz[X] * dxyz[X] * dxyz[X] + 18 * dxyz[X] * dxyz[Z] * dxyz[Z]) +
+                  g6 * (3 * dxyz[X] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z] + 6 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Z] * dxyz[Z]) +
+                  g7 * dxyz[X] * dxyz[X] * dxyz[X] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z];
+
+  D7[tXXYYYYY] += g4 * (15 * dxyz[Y]) + g5 * (15 * dxyz[X] * dxyz[X] * dxyz[Y] + 10 * dxyz[Y] * dxyz[Y] * dxyz[Y]) +
+                  g6 * (dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] + 10 * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y]) +
+                  g7 * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y];
+
+  D7[tXXYYYYZ] += g4 * (3 * dxyz[Z]) + g5 * (3 * dxyz[X] * dxyz[X] * dxyz[Z] + 6 * dxyz[Y] * dxyz[Y] * dxyz[Z]) +
+                  g6 * (dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z] + 6 * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Z]) +
+                  g7 * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z];
+
+  D7[tXXYYYZZ] += g4 * (3 * dxyz[Y]) +
+                  g5 * (3 * dxyz[X] * dxyz[X] * dxyz[Y] + dxyz[Y] * dxyz[Y] * dxyz[Y] + 3 * dxyz[Y] * dxyz[Z] * dxyz[Z]) +
+                  g6 * (dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z] + 3 * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Z] * dxyz[Z] +
+                        dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y]) +
+                  g7 * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z];
+
+  D7[tXXYYZZZ] += g4 * (3 * dxyz[Z]) +
+                  g5 * (3 * dxyz[X] * dxyz[X] * dxyz[Z] + 3 * dxyz[Y] * dxyz[Y] * dxyz[Z] + dxyz[Z] * dxyz[Z] * dxyz[Z]) +
+                  g6 * (dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z] + dxyz[X] * dxyz[X] * dxyz[Z] * dxyz[Z] * dxyz[Z] +
+                        3 * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Z]) +
+                  g7 * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z];
+
+  D7[tXXYZZZZ] += g4 * (3 * dxyz[Y]) + g5 * (3 * dxyz[X] * dxyz[X] * dxyz[Y] + 6 * dxyz[Y] * dxyz[Z] * dxyz[Z]) +
+                  g6 * (dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z] + 6 * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Z] * dxyz[Z]) +
+                  g7 * dxyz[X] * dxyz[X] * dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z];
+
+  D7[tXXZZZZZ] += g4 * (15 * dxyz[Z]) + g5 * (15 * dxyz[X] * dxyz[X] * dxyz[Z] + 10 * dxyz[Z] * dxyz[Z] * dxyz[Z]) +
+                  g6 * (dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z] + 10 * dxyz[X] * dxyz[X] * dxyz[Z] * dxyz[Z] * dxyz[Z]) +
+                  g7 * dxyz[X] * dxyz[X] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z];
+
+  D7[tXYYYYYY] += g4 * (15 * dxyz[X]) + g5 * (45 * dxyz[X] * dxyz[Y] * dxyz[Y]) +
+                  g6 * (15 * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y]) +
+                  g7 * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y];
+
+  D7[tXYYYYYZ] += g5 * (15 * dxyz[X] * dxyz[Y] * dxyz[Z]) + g6 * (10 * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z]) +
+                  g7 * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z];
+
+  D7[tXYYYYZZ] += g4 * (3 * dxyz[X]) + g5 * (6 * dxyz[X] * dxyz[Y] * dxyz[Y] + 3 * dxyz[X] * dxyz[Z] * dxyz[Z]) +
+                  g6 * (6 * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z] + dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y]) +
+                  g7 * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z];
+
+  D7[tXYYYZZZ] += g5 * (9 * dxyz[X] * dxyz[Y] * dxyz[Z]) +
+                  g6 * (3 * dxyz[X] * dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z] + 3 * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z]) +
+                  g7 * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z];
+
+  D7[tXYYZZZZ] += g4 * (3 * dxyz[X]) + g5 * (3 * dxyz[X] * dxyz[Y] * dxyz[Y] + 6 * dxyz[X] * dxyz[Z] * dxyz[Z]) +
+                  g6 * (dxyz[X] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z] + 6 * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z]) +
+                  g7 * dxyz[X] * dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z];
+
+  D7[tXYZZZZZ] += g5 * (15 * dxyz[X] * dxyz[Y] * dxyz[Z]) + g6 * (10 * dxyz[X] * dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z]) +
+                  g7 * dxyz[X] * dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z];
+
+  D7[tXZZZZZZ] += g4 * (15 * dxyz[X]) + g5 * (45 * dxyz[X] * dxyz[Z] * dxyz[Z]) +
+                  g6 * (15 * dxyz[X] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z]) +
+                  g7 * dxyz[X] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z];
+
+  D7[tYYYYYYY] += g4 * (105 * dxyz[Y]) + g5 * (105 * dxyz[Y] * dxyz[Y] * dxyz[Y]) +
+                  g6 * (21 * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y]) +
+                  g7 * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y];
+
+  D7[tYYYYYYZ] += g4 * (15 * dxyz[Z]) + g5 * (45 * dxyz[Y] * dxyz[Y] * dxyz[Z]) +
+                  g6 * (15 * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z]) +
+                  g7 * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z];
+
+  D7[tYYYYYZZ] += g4 * (15 * dxyz[Y]) + g5 * (10 * dxyz[Y] * dxyz[Y] * dxyz[Y] + 15 * dxyz[Y] * dxyz[Z] * dxyz[Z]) +
+                  g6 * (10 * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z] + dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y]) +
+                  g7 * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z];
+
+  D7[tYYYYZZZ] += g4 * (9 * dxyz[Z]) + g5 * (18 * dxyz[Y] * dxyz[Y] * dxyz[Z] + 3 * dxyz[Z] * dxyz[Z] * dxyz[Z]) +
+                  g6 * (6 * dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z] + 3 * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z]) +
+                  g7 * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z];
+
+  D7[tYYYZZZZ] += g4 * (9 * dxyz[Y]) + g5 * (3 * dxyz[Y] * dxyz[Y] * dxyz[Y] + 18 * dxyz[Y] * dxyz[Z] * dxyz[Z]) +
+                  g6 * (3 * dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z] + 6 * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z]) +
+                  g7 * dxyz[Y] * dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z];
+
+  D7[tYYZZZZZ] += g4 * (15 * dxyz[Z]) + g5 * (15 * dxyz[Y] * dxyz[Y] * dxyz[Z] + 10 * dxyz[Z] * dxyz[Z] * dxyz[Z]) +
+                  g6 * (dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z] + 10 * dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z]) +
+                  g7 * dxyz[Y] * dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z];
+
+  D7[tYZZZZZZ] += g4 * (15 * dxyz[Y]) + g5 * (45 * dxyz[Y] * dxyz[Z] * dxyz[Z]) +
+                  g6 * (15 * dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z]) +
+                  g7 * dxyz[Y] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z];
+
+  D7[tZZZZZZZ] += g4 * (105 * dxyz[Z]) + g5 * (105 * dxyz[Z] * dxyz[Z] * dxyz[Z]) +
+                  g6 * (21 * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z]) +
+                  g7 * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z] * dxyz[Z];
+
+#undef X
+#undef Y
+#undef Z
+}
+
+#endif
diff --git a/src/data/test_symtensors.cc b/src/data/test_symtensors.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7a2ecf43c98b7a85eb4be9173fb93920d7e08f00
--- /dev/null
+++ b/src/data/test_symtensors.cc
@@ -0,0 +1,352 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file test_symtensors.cc
+ *
+ *  \brief some test routines for the symmetric tensor implementation
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef DEBUG_SYMTENSORS
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/symtensors.h"
+#include "../sort/cxxsort.h"
+#include "../system/system.h"
+
+static bool compare_list(const int &a, const int &b) { return a < b; }
+
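+/* Each test below builds an index map from full Cartesian indices to the compressed symmetric storage,
+ * fills the tensors involved with random numbers, and compares the result of the symtensor contraction
+ * operator against a brute-force contraction of the fully expanded arrays. The two numbers printed per
+ * component should agree. */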
+static void symtensor_test_tensor4_contraction_with_tensor1(void)
+{
+  symtensor4<double> T4;
+  vector<double> T1;
+
+  int I4[3][3][3][3];
+  int I3[3][3][3];
+  int I1[3] = {0, 1, 2};
+
+  int count = 0;
+
+  for(int i = 0; i < 3; i++)
+    for(int j = 0; j < 3; j++)
+      for(int k = 0; k < 3; k++)
+        for(int l = 0; l < 3; l++)
+          {
+            if(i <= j && j <= k && k <= l)
+              {
+                I4[i][j][k][l] = count++;
+              }
+            else
+              {
+                int li[] = {i, j, k, l};
+
+                mycxxsort(li, li + 4, compare_list);
+
+                I4[i][j][k][l] = I4[li[0]][li[1]][li[2]][li[3]];
+              }
+
+            // printf("%c%c%c%c:  %d\n", 'X' + i, 'X' + j, 'X' + k, 'X' + l, I4[i][j][k][l]);
+          }
+
+  count = 0;
+
+  for(int i = 0; i < 3; i++)
+    for(int j = 0; j < 3; j++)
+      for(int k = 0; k < 3; k++)
+        {
+          if(i <= j && j <= k)
+            {
+              I3[i][j][k] = count++;
+            }
+          else
+            {
+              int li[] = {i, j, k};
+
+              mycxxsort(li, li + 3, compare_list);
+
+              I3[i][j][k] = I3[li[0]][li[1]][li[2]];
+            }
+
+          // printf("%c%c:  %d\n", 'X' + i, 'X' + j, I2[i][j]);
+        }
+
+  for(int i = 0; i < 15; i++)
+    T4[i] = get_random_number();
+
+  for(int i = 0; i < 3; i++)
+    T1[i] = get_random_number();
+
+  /* now fill in the whole general matrix based on the symmetric one */
+  double M4[3][3][3][3];
+  for(int i = 0; i < 3; i++)
+    for(int j = 0; j < 3; j++)
+      for(int k = 0; k < 3; k++)
+        for(int l = 0; l < 3; l++)
+          M4[i][j][k][l] = T4[I4[i][j][k][l]];
+
+  /* copy over the vector components */
+  double M1[3];
+  for(int i = 0; i < 3; i++)
+    M1[i] = T1[I1[i]];
+
+  /* result of full matrix reduction */
+  double R3[3][3][3];
+  for(int i = 0; i < 3; i++)
+    for(int j = 0; j < 3; j++)
+      for(int k = 0; k < 3; k++)
+        {
+          R3[i][j][k] = 0;
+
+          for(int l = 0; l < 3; l++)
+            R3[i][j][k] += M4[i][j][k][l] * M1[l];
+        }
+
+  /* now let's compare the result */
+
+  symtensor3<double> S3 = T4 * T1;  // reduction of symmetric 4-tensor with a vector
+
+  printf("Result comparison:\n");
+
+  for(int i = 0; i < 3; i++)
+    for(int j = 0; j < 3; j++)
+      for(int k = 0; k < 3; k++)
+        {
+          if(i <= j && j <= k)
+            {
+              printf("%c%c%c:  %g   %g\n", 'X' + i, 'X' + j, 'X' + k, S3[I3[i][j][k]], R3[i][j][k]);
+            }
+        }
+
+  printf("\n");
+}
+
+static void symtensor_test_tensor4_contraction_with_tensor2(void)
+{
+  symtensor4<double> T4;
+  symtensor2<double> T2;
+
+  int I4[3][3][3][3];
+  int I2[3][3];
+
+  int count = 0;
+
+  for(int i = 0; i < 3; i++)
+    for(int j = 0; j < 3; j++)
+      for(int k = 0; k < 3; k++)
+        for(int l = 0; l < 3; l++)
+          {
+            if(i <= j && j <= k && k <= l)
+              {
+                I4[i][j][k][l] = count++;
+              }
+            else
+              {
+                int li[] = {i, j, k, l};
+
+                mycxxsort(li, li + 4, compare_list);
+
+                I4[i][j][k][l] = I4[li[0]][li[1]][li[2]][li[3]];
+              }
+
+            // printf("%c%c%c%c:  %d\n", 'X' + i, 'X' + j, 'X' + k, 'X' + l, I4[i][j][k][l]);
+          }
+
+  // printf("count=%d\n\n", count);
+
+  count = 0;
+
+  for(int i = 0; i < 3; i++)
+    for(int j = 0; j < 3; j++)
+      {
+        if(i <= j)
+          {
+            I2[i][j] = count++;
+          }
+        else
+          {
+            int li[] = {i, j};
+
+            mycxxsort(li, li + 2, compare_list);
+
+            I2[i][j] = I2[li[0]][li[1]];
+          }
+
+        // printf("%c%c:  %d\n", 'X' + i, 'X' + j, I2[i][j]);
+      }
+
+  // printf("count=%d\n\n", count);
+
+  for(int i = 0; i < 15; i++)
+    T4[i] = get_random_number();
+
+  for(int i = 0; i < 6; i++)
+    T2[i] = get_random_number();
+
+  /* now fill in the whole general matrix based on the symmetric one */
+  double M4[3][3][3][3];
+  for(int i = 0; i < 3; i++)
+    for(int j = 0; j < 3; j++)
+      for(int k = 0; k < 3; k++)
+        for(int l = 0; l < 3; l++)
+          M4[i][j][k][l] = T4[I4[i][j][k][l]];
+
+  /* now fill in the whole general matrix based on the symmetric one */
+  double M2[3][3];
+  for(int i = 0; i < 3; i++)
+    for(int j = 0; j < 3; j++)
+      M2[i][j] = T2[I2[i][j]];
+
+  /* result of full matrix reduction */
+  double R2[3][3];
+  for(int i = 0; i < 3; i++)
+    for(int j = 0; j < 3; j++)
+      {
+        R2[i][j] = 0;
+
+        for(int k = 0; k < 3; k++)
+          for(int l = 0; l < 3; l++)
+            R2[i][j] += M4[i][j][k][l] * M2[k][l];
+      }
+
+  /* now let's compare the result */
+
+  symtensor2<double> S2 = T4 * T2;  // reduction of symmetric 4-tensor with symmetric 2-tensor
+
+  printf("Result comparison:\n");
+
+  for(int i = 0; i < 3; i++)
+    for(int j = 0; j < 3; j++)
+      {
+        if(i <= j)
+          {
+            printf("%c%c:  %g   %g\n", 'X' + i, 'X' + j, S2[I2[i][j]], R2[i][j]);
+          }
+      }
+
+  printf("\n");
+}
+
+static void symtensor_test_tensor4_contraction_with_tensor3(void)
+{
+  symtensor4<double> T4;
+  symtensor3<double> T3;
+
+  int I4[3][3][3][3];
+  int I3[3][3][3];
+
+  int count = 0;
+
+  for(int i = 0; i < 3; i++)
+    for(int j = 0; j < 3; j++)
+      for(int k = 0; k < 3; k++)
+        for(int l = 0; l < 3; l++)
+          {
+            if(i <= j && j <= k && k <= l)
+              {
+                I4[i][j][k][l] = count++;
+              }
+            else
+              {
+                int li[] = {i, j, k, l};
+
+                mycxxsort(li, li + 4, compare_list);
+
+                I4[i][j][k][l] = I4[li[0]][li[1]][li[2]][li[3]];
+              }
+
+            // printf("%c%c%c%c:  %d\n", 'X' + i, 'X' + j, 'X' + k, 'X' + l, I4[i][j][k][l]);
+          }
+
+  // printf("count=%d\n\n", count);
+
+  count = 0;
+
+  for(int i = 0; i < 3; i++)
+    for(int j = 0; j < 3; j++)
+      for(int k = 0; k < 3; k++)
+        {
+          if(i <= j && j <= k)
+            {
+              I3[i][j][k] = count++;
+            }
+          else
+            {
+              int li[] = {i, j, k};
+
+              mycxxsort(li, li + 3, compare_list);
+
+              I3[i][j][k] = I3[li[0]][li[1]][li[2]];
+            }
+
+          // printf("%c%c:  %d\n", 'X' + i, 'X' + j, I2[i][j]);
+        }
+
+  // printf("count=%d\n\n", count);
+
+  for(int i = 0; i < 15; i++)
+    T4[i] = get_random_number();
+
+  for(int i = 0; i < 10; i++)
+    T3[i] = get_random_number();
+
+  /* now fill in the whole general matrix based on the symmetric one */
+  double M4[3][3][3][3];
+  for(int i = 0; i < 3; i++)
+    for(int j = 0; j < 3; j++)
+      for(int k = 0; k < 3; k++)
+        for(int l = 0; l < 3; l++)
+          M4[i][j][k][l] = T4[I4[i][j][k][l]];
+
+  /* now fill in the whole general matrix based on the symmetric one */
+  double M3[3][3][3];
+  for(int i = 0; i < 3; i++)
+    for(int j = 0; j < 3; j++)
+      for(int k = 0; k < 3; k++)
+        M3[i][j][k] = T3[I3[i][j][k]];
+
+  /* result of full matrix reduction */
+  double R1[3];
+  for(int i = 0; i < 3; i++)
+    {
+      R1[i] = 0;
+
+      for(int j = 0; j < 3; j++)
+        for(int k = 0; k < 3; k++)
+          for(int l = 0; l < 3; l++)
+            R1[i] += M4[i][j][k][l] * M3[j][k][l];
+    }
+
+  /* now let's compare the result */
+
+  vector<double> S1 = T4 * T3;  // reduction of symmetric 4-tensor with a symmetric 3-tensor
+
+  printf("Result comparison:\n");
+
+  for(int i = 0; i < 3; i++)
+    printf("%c:  %g   %g\n", 'X' + i, S1[i], R1[i]);
+
+  printf("\n");
+}
+
+void symtensor_test(void)
+{
+  symtensor_test_tensor4_contraction_with_tensor1();
+  symtensor_test_tensor4_contraction_with_tensor2();
+  symtensor_test_tensor4_contraction_with_tensor3();
+
+  Terminate("Done with test");
+}
+
+#endif
diff --git a/src/debug_md5/Md5.cc b/src/debug_md5/Md5.cc
new file mode 100644
index 0000000000000000000000000000000000000000..16649ea3a726a5cc7f0241973028530acd4daa41
--- /dev/null
+++ b/src/debug_md5/Md5.cc
@@ -0,0 +1,296 @@
+#include "gadgetconfig.h"
+
+/*! \file Md5.cc
+ *
+ *  \brief implementation code of MD5 checksum computation for blocks of memory
+ */
+
+#ifdef DEBUG_MD5
+
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+
+/*
+ **********************************************************************
+ ** md5.c                                                            **
+ ** RSA Data Security, Inc. MD5 Message Digest Algorithm             **
+ ** Created: 2/17/90 RLR                                             **
+ ** Revised: 1/91 SRD,AJ,BSK,JT Reference C Version                  **
+ ** Revised: 09/2018 by V. Springel to allow C++ compilation         **
+ **********************************************************************
+ */
+
+/*
+ **********************************************************************
+ ** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. **
+ **                                                                  **
+ ** License to copy and use this software is granted provided that   **
+ ** it is identified as the "RSA Data Security, Inc. MD5 Message     **
+ ** Digest Algorithm" in all material mentioning or referencing this **
+ ** software or this function.                                       **
+ **                                                                  **
+ ** License is also granted to make and use derivative works         **
+ ** provided that such works are identified as "derived from the RSA **
+ ** Data Security, Inc. MD5 Message Digest Algorithm" in all         **
+ ** material mentioning or referencing the derived work.             **
+ **                                                                  **
+ ** RSA Data Security, Inc. makes no representations concerning      **
+ ** either the merchantability of this software or the suitability   **
+ ** of this software for any particular purpose.  It is provided "as **
+ ** is" without express or implied warranty of any kind.             **
+ **                                                                  **
+ ** These notices must be retained in any copies of any part of this **
+ ** documentation and/or software.                                   **
+ **********************************************************************
+ */
+
+/* -- include the following line if the md5.h header file is separate -- */
+#include "Md5.h"
+
+/* forward declaration */
+static void Transform(UINT4 *buf, UINT4 *in);
+static void MD5Update(MD5_CTX *mdContext, unsigned char *inBuf, unsigned int inLen);
+
+static unsigned char PADDING[64] = {0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                                    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                                    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                                    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+
+/* F, G and H are basic MD5 functions: selection, majority, parity */
+#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
+#define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+#define I(x, y, z) ((y) ^ ((x) | (~z)))
+
+/* ROTATE_LEFT rotates x left n bits */
+#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
+
+/* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4 */
+/* Rotation is separate from addition to prevent recomputation */
+#define FF(a, b, c, d, x, s, ac)                 \
+  {                                              \
+    (a) += F((b), (c), (d)) + (x) + (UINT4)(ac); \
+    (a) = ROTATE_LEFT((a), (s));                 \
+    (a) += (b);                                  \
+  }
+#define GG(a, b, c, d, x, s, ac)                 \
+  {                                              \
+    (a) += G((b), (c), (d)) + (x) + (UINT4)(ac); \
+    (a) = ROTATE_LEFT((a), (s));                 \
+    (a) += (b);                                  \
+  }
+#define HH(a, b, c, d, x, s, ac)                 \
+  {                                              \
+    (a) += H((b), (c), (d)) + (x) + (UINT4)(ac); \
+    (a) = ROTATE_LEFT((a), (s));                 \
+    (a) += (b);                                  \
+  }
+#define II(a, b, c, d, x, s, ac)                 \
+  {                                              \
+    (a) += I((b), (c), (d)) + (x) + (UINT4)(ac); \
+    (a) = ROTATE_LEFT((a), (s));                 \
+    (a) += (b);                                  \
+  }
+
+void MD5Init(MD5_CTX *mdContext)
+{
+  mdContext->i[0] = mdContext->i[1] = (UINT4)0;
+
+  /* Load magic initialization constants.
+   */
+  mdContext->buf[0] = (UINT4)0x67452301;
+  mdContext->buf[1] = (UINT4)0xefcdab89;
+  mdContext->buf[2] = (UINT4)0x98badcfe;
+  mdContext->buf[3] = (UINT4)0x10325476;
+}
+
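+/* feed a buffer that may be longer than what fits into an unsigned int to MD5Update in chunks of at most 2^28 bytes */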
+void MD5UpdateLong(MD5_CTX *mdContext, unsigned char *inBuf, unsigned long long inLenLong)
+{
+  while(inLenLong > 0)
+    {
+      unsigned int inLen = 0x10000000;
+      if(inLen > inLenLong)
+        inLen = inLenLong;
+      MD5Update(mdContext, inBuf, inLen);
+      inBuf += inLen;
+      inLenLong -= inLen;
+    }
+}
+
+void MD5Update(MD5_CTX *mdContext, unsigned char *inBuf, unsigned int inLen)
+{
+  UINT4 in[16];
+  int mdi;
+  unsigned int i, ii;
+
+  /* compute number of bytes mod 64 */
+  mdi = (int)((mdContext->i[0] >> 3) & 0x3F);
+
+  /* update number of bits */
+  if((mdContext->i[0] + ((UINT4)inLen << 3)) < mdContext->i[0])
+    mdContext->i[1]++;
+  mdContext->i[0] += ((UINT4)inLen << 3);
+  mdContext->i[1] += ((UINT4)inLen >> 29);
+
+  while(inLen--)
+    {
+      /* add new character to buffer, increment mdi */
+      mdContext->in[mdi++] = *inBuf++;
+
+      /* transform if necessary */
+      if(mdi == 0x40)
+        {
+          for(i = 0, ii = 0; i < 16; i++, ii += 4)
+            in[i] = (((UINT4)mdContext->in[ii + 3]) << 24) | (((UINT4)mdContext->in[ii + 2]) << 16) |
+                    (((UINT4)mdContext->in[ii + 1]) << 8) | ((UINT4)mdContext->in[ii]);
+          Transform(mdContext->buf, in);
+          mdi = 0;
+        }
+    }
+}
+
+void MD5Final(MD5_CTX *mdContext)
+{
+  UINT4 in[16];
+  int mdi;
+  unsigned int i, ii;
+  unsigned int padLen;
+
+  /* save number of bits */
+  in[14] = mdContext->i[0];
+  in[15] = mdContext->i[1];
+
+  /* compute number of bytes mod 64 */
+  mdi = (int)((mdContext->i[0] >> 3) & 0x3F);
+
+  /* pad out to 56 mod 64 */
+  padLen = (mdi < 56) ? (56 - mdi) : (120 - mdi);
+  MD5Update(mdContext, PADDING, padLen);
+
+  /* append length in bits and transform */
+  for(i = 0, ii = 0; i < 14; i++, ii += 4)
+    in[i] = (((UINT4)mdContext->in[ii + 3]) << 24) | (((UINT4)mdContext->in[ii + 2]) << 16) | (((UINT4)mdContext->in[ii + 1]) << 8) |
+            ((UINT4)mdContext->in[ii]);
+  Transform(mdContext->buf, in);
+
+  /* store buffer in digest */
+  for(i = 0, ii = 0; i < 4; i++, ii += 4)
+    {
+      mdContext->digest[ii]     = (unsigned char)(mdContext->buf[i] & 0xFF);
+      mdContext->digest[ii + 1] = (unsigned char)((mdContext->buf[i] >> 8) & 0xFF);
+      mdContext->digest[ii + 2] = (unsigned char)((mdContext->buf[i] >> 16) & 0xFF);
+      mdContext->digest[ii + 3] = (unsigned char)((mdContext->buf[i] >> 24) & 0xFF);
+    }
+}
+
+/* Basic MD5 step. Transform buf based on in.
+ */
+static void Transform(UINT4 *buf, UINT4 *in)
+{
+  UINT4 a = buf[0], b = buf[1], c = buf[2], d = buf[3];
+
+  /* Round 1 */
+#define S11 7
+#define S12 12
+#define S13 17
+#define S14 22
+  FF(a, b, c, d, in[0], S11, 3614090360);  /* 1 */
+  FF(d, a, b, c, in[1], S12, 3905402710);  /* 2 */
+  FF(c, d, a, b, in[2], S13, 606105819);   /* 3 */
+  FF(b, c, d, a, in[3], S14, 3250441966);  /* 4 */
+  FF(a, b, c, d, in[4], S11, 4118548399);  /* 5 */
+  FF(d, a, b, c, in[5], S12, 1200080426);  /* 6 */
+  FF(c, d, a, b, in[6], S13, 2821735955);  /* 7 */
+  FF(b, c, d, a, in[7], S14, 4249261313);  /* 8 */
+  FF(a, b, c, d, in[8], S11, 1770035416);  /* 9 */
+  FF(d, a, b, c, in[9], S12, 2336552879);  /* 10 */
+  FF(c, d, a, b, in[10], S13, 4294925233); /* 11 */
+  FF(b, c, d, a, in[11], S14, 2304563134); /* 12 */
+  FF(a, b, c, d, in[12], S11, 1804603682); /* 13 */
+  FF(d, a, b, c, in[13], S12, 4254626195); /* 14 */
+  FF(c, d, a, b, in[14], S13, 2792965006); /* 15 */
+  FF(b, c, d, a, in[15], S14, 1236535329); /* 16 */
+
+  /* Round 2 */
+#define S21 5
+#define S22 9
+#define S23 14
+#define S24 20
+  GG(a, b, c, d, in[1], S21, 4129170786);  /* 17 */
+  GG(d, a, b, c, in[6], S22, 3225465664);  /* 18 */
+  GG(c, d, a, b, in[11], S23, 643717713);  /* 19 */
+  GG(b, c, d, a, in[0], S24, 3921069994);  /* 20 */
+  GG(a, b, c, d, in[5], S21, 3593408605);  /* 21 */
+  GG(d, a, b, c, in[10], S22, 38016083);   /* 22 */
+  GG(c, d, a, b, in[15], S23, 3634488961); /* 23 */
+  GG(b, c, d, a, in[4], S24, 3889429448);  /* 24 */
+  GG(a, b, c, d, in[9], S21, 568446438);   /* 25 */
+  GG(d, a, b, c, in[14], S22, 3275163606); /* 26 */
+  GG(c, d, a, b, in[3], S23, 4107603335);  /* 27 */
+  GG(b, c, d, a, in[8], S24, 1163531501);  /* 28 */
+  GG(a, b, c, d, in[13], S21, 2850285829); /* 29 */
+  GG(d, a, b, c, in[2], S22, 4243563512);  /* 30 */
+  GG(c, d, a, b, in[7], S23, 1735328473);  /* 31 */
+  GG(b, c, d, a, in[12], S24, 2368359562); /* 32 */
+
+  /* Round 3 */
+#define S31 4
+#define S32 11
+#define S33 16
+#define S34 23
+  HH(a, b, c, d, in[5], S31, 4294588738);  /* 33 */
+  HH(d, a, b, c, in[8], S32, 2272392833);  /* 34 */
+  HH(c, d, a, b, in[11], S33, 1839030562); /* 35 */
+  HH(b, c, d, a, in[14], S34, 4259657740); /* 36 */
+  HH(a, b, c, d, in[1], S31, 2763975236);  /* 37 */
+  HH(d, a, b, c, in[4], S32, 1272893353);  /* 38 */
+  HH(c, d, a, b, in[7], S33, 4139469664);  /* 39 */
+  HH(b, c, d, a, in[10], S34, 3200236656); /* 40 */
+  HH(a, b, c, d, in[13], S31, 681279174);  /* 41 */
+  HH(d, a, b, c, in[0], S32, 3936430074);  /* 42 */
+  HH(c, d, a, b, in[3], S33, 3572445317);  /* 43 */
+  HH(b, c, d, a, in[6], S34, 76029189);    /* 44 */
+  HH(a, b, c, d, in[9], S31, 3654602809);  /* 45 */
+  HH(d, a, b, c, in[12], S32, 3873151461); /* 46 */
+  HH(c, d, a, b, in[15], S33, 530742520);  /* 47 */
+  HH(b, c, d, a, in[2], S34, 3299628645);  /* 48 */
+
+  /* Round 4 */
+#define S41 6
+#define S42 10
+#define S43 15
+#define S44 21
+  II(a, b, c, d, in[0], S41, 4096336452);  /* 49 */
+  II(d, a, b, c, in[7], S42, 1126891415);  /* 50 */
+  II(c, d, a, b, in[14], S43, 2878612391); /* 51 */
+  II(b, c, d, a, in[5], S44, 4237533241);  /* 52 */
+  II(a, b, c, d, in[12], S41, 1700485571); /* 53 */
+  II(d, a, b, c, in[3], S42, 2399980690);  /* 54 */
+  II(c, d, a, b, in[10], S43, 4293915773); /* 55 */
+  II(b, c, d, a, in[1], S44, 2240044497);  /* 56 */
+  II(a, b, c, d, in[8], S41, 1873313359);  /* 57 */
+  II(d, a, b, c, in[15], S42, 4264355552); /* 58 */
+  II(c, d, a, b, in[6], S43, 2734768916);  /* 59 */
+  II(b, c, d, a, in[13], S44, 1309151649); /* 60 */
+  II(a, b, c, d, in[4], S41, 4149444226);  /* 61 */
+  II(d, a, b, c, in[11], S42, 3174756917); /* 62 */
+  II(c, d, a, b, in[2], S43, 718787259);   /* 63 */
+  II(b, c, d, a, in[9], S44, 3951481745);  /* 64 */
+
+  buf[0] += a;
+  buf[1] += b;
+  buf[2] += c;
+  buf[3] += d;
+}
+
+/*
+ **********************************************************************
+ ** End of md5.c                                                     **
+ ******************************* (cut) ********************************
+ */
+
+#endif
diff --git a/src/debug_md5/Md5.h b/src/debug_md5/Md5.h
new file mode 100644
index 0000000000000000000000000000000000000000..88865ab4d0fe7cb3e4f9846b9a235ff4415d099a
--- /dev/null
+++ b/src/debug_md5/Md5.h
@@ -0,0 +1,67 @@
+
+/*! \file Md5.h
+ *
+ *  \brief definition and prototypes for MD5 routines
+ */
+
+/*
+ **********************************************************************
+ ** md5.h -- Header file for implementation of MD5                   **
+ ** RSA Data Security, Inc. MD5 Message Digest Algorithm             **
+ ** Created: 2/17/90 RLR                                             **
+ ** Revised: 12/27/90 SRD,AJ,BSK,JT Reference C version              **
+ ** Revised (for MD5): RLR 4/27/91                                   **
+ **   -- G modified to have y&~z instead of y&z                      **
+ **   -- FF, GG, HH modified to add in last register done            **
+ **   -- Access pattern: round 2 works mod 5, round 3 works mod 3    **
+ **   -- distinct additive constant for each step                    **
+ **   -- round 4 added, working mod 7                                **
+ **********************************************************************
+ */
+
+/*
+ **********************************************************************
+ ** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. **
+ **                                                                  **
+ ** License to copy and use this software is granted provided that   **
+ ** it is identified as the "RSA Data Security, Inc. MD5 Message     **
+ ** Digest Algorithm" in all material mentioning or referencing this **
+ ** software or this function.                                       **
+ **                                                                  **
+ ** License is also granted to make and use derivative works         **
+ ** provided that such works are identified as "derived from the RSA **
+ ** Data Security, Inc. MD5 Message Digest Algorithm" in all         **
+ ** material mentioning or referencing the derived work.             **
+ **                                                                  **
+ ** RSA Data Security, Inc. makes no representations concerning      **
+ ** either the merchantability of this software or the suitability   **
+ ** of this software for any particular purpose.  It is provided "as **
+ ** is" without express or implied warranty of any kind.             **
+ **                                                                  **
+ ** These notices must be retained in any copies of any part of this **
+ ** documentation and/or software.                                   **
+ **********************************************************************
+ */
+
+/* typedef a 32 bit type (note: on LP64 platforms 'unsigned long int' is 64 bits wide;
+ * a fixed-width type such as uint32_t would match the stated intent) */
+typedef unsigned long int UINT4;
+
+/* Data structure for MD5 (Message Digest) computation */
+struct MD5_CTX
+{
+  UINT4 i[2];               /* number of _bits_ handled mod 2^64 */
+  UINT4 buf[4];             /* scratch buffer */
+  unsigned char in[64];     /* input buffer */
+  unsigned char digest[16]; /* actual digest after MD5Final call */
+};
+
+void MD5Final(MD5_CTX *mdContext);
+// void MD5Update(MD5_CTX * mdContext, unsigned char *inBuf, unsigned int inLen);
+void MD5UpdateLong(MD5_CTX *mdContext, unsigned char *inBuf, unsigned long long inLenLong);
+void MD5Init(MD5_CTX *mdContext);
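+
+/* Illustrative usage sketch (not compiled; the names 'data' and 'nbytes' are placeholders):
+ * the digest of an arbitrary memory block is obtained with the three calls declared above, i.e.
+ *
+ *   MD5_CTX ctx;
+ *   MD5Init(&ctx);
+ *   MD5UpdateLong(&ctx, (unsigned char *)data, nbytes);
+ *   MD5Final(&ctx);    // the 16-byte result is now in ctx.digest[]
+ */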
+
+/*
+ **********************************************************************
+ ** End of md5.h                                                     **
+ ******************************* (cut) ********************************
+ */
diff --git a/src/debug_md5/calc_checksum.cc b/src/debug_md5/calc_checksum.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e86d077d77004af83467df39b4ce06f1586689a6
--- /dev/null
+++ b/src/debug_md5/calc_checksum.cc
@@ -0,0 +1,117 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file calc_checksum.cc
+ *
+ *  \brief auxiliary routines to compute MD5 checksums for blocks of memory
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef DEBUG_MD5
+
+#include <mpi.h>
+#include <stdio.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../debug_md5/Md5.h"
+#include "../logs/logs.h"
+#include "../main/simulation.h"
+
+void logs::block_checksum(void *base, size_t bytes, int res[4])
+{
+  MD5_CTX sum;
+  union
+  {
+    unsigned char digest[16];
+    int val[4];
+  } u;
+
+  MD5Init(&sum);
+  MD5UpdateLong(&sum, (unsigned char *)base, bytes);
+  MD5Final(&sum);
+
+  for(int i = 0; i < 16; i++)
+    u.digest[i] = sum.digest[i];
+
+  for(int i = 0; i < 4; i++)
+    res[i] = u.val[i];
+}
+
+void logs::calc_memory_checksum(const char *msg, void *base, size_t bytes)
+{
+  MD5_CTX sum;
+  union
+  {
+    unsigned char digest[16];
+    int val[4];
+  } u, uglob;
+
+  MD5Init(&sum);
+  MD5UpdateLong(&sum, (unsigned char *)base, bytes);
+  MD5Final(&sum);
+
+  for(int i = 0; i < 16; i++)
+    u.digest[i] = sum.digest[i];
+
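+  /* summing the four digest words element-wise over all ranks gives a cheap, order-independent
+   * fingerprint of the distributed data (note that this is not an MD5 digest of the concatenated buffers) */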
+  MPI_Allreduce(u.val, uglob.val, 4, MPI_INT, MPI_SUM, Communicator);
+
+  if(ThisTask == 0)
+    {
+      fprintf(FdDebug, "\n");
+      fprintf(FdDebug, "Step=%8d  P[]     %s:   ", All.NumCurrentTiStep, msg);
+      for(int i = 0; i < 16; i++)
+        fprintf(FdDebug, "%02x", uglob.digest[i]);
+      fprintf(FdDebug, "\n");
+    }
+}
+
+void logs::log_debug_md5(const char *msg)
+{
+  MD5_CTX sum;
+  union
+  {
+    unsigned char digest[16];
+    int val[4];
+  } u, uglob_P, uglob_SphP;
+
+  MD5Init(&sum);
+  MD5UpdateLong(&sum, (unsigned char *)Sp->P, Sp->NumPart * sizeof(particle_data));
+  MD5Final(&sum);
+
+  for(int i = 0; i < 16; i++)
+    u.digest[i] = sum.digest[i];
+
+  MPI_Allreduce(u.val, uglob_P.val, 4, MPI_INT, MPI_SUM, Communicator);
+
+  MD5Init(&sum);
+  MD5UpdateLong(&sum, (unsigned char *)Sp->SphP, Sp->NumGas * sizeof(sph_particle_data));
+  MD5Final(&sum);
+
+  for(int i = 0; i < 16; i++)
+    u.digest[i] = sum.digest[i];
+
+  MPI_Allreduce(u.val, uglob_SphP.val, 4, MPI_INT, MPI_SUM, Communicator);
+
+  if(ThisTask == 0)
+    {
+      fprintf(FdDebug, "\n");
+      fprintf(FdDebug, "Step=%8d  P[]     %s:   ", All.NumCurrentTiStep, msg);
+      for(int i = 0; i < 16; i++)
+        fprintf(FdDebug, "%02x", uglob_P.digest[i]);
+      fprintf(FdDebug, "\n");
+
+      fprintf(FdDebug, "               SphP[]  %s:   ", msg);
+      for(int i = 0; i < 16; i++)
+        fprintf(FdDebug, "%02x", uglob_SphP.digest[i]);
+      fprintf(FdDebug, "\n");
+
+      fflush(FdDebug);
+    }
+}
+
+#endif
diff --git a/src/domain/domain.cc b/src/domain/domain.cc
new file mode 100644
index 0000000000000000000000000000000000000000..79f86661e458faea095e98e64f445160ff904a5b
--- /dev/null
+++ b/src/domain/domain.cc
@@ -0,0 +1,550 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file domain.cc
+ *
+ *  \brief code for work-load balanced domain decomposition
+ *
+ *  This class contains the code for the domain decomposition of the
+ *  simulation volume.  The domains are constructed from disjoint subsets
+ *  of leaves of a fiducial top-level tree that covers the full
+ *  simulation volume. Domain boundaries hence run along tree-node
+ *  divisions of a fiducial global BH oct-tree. As a result of this method, the
+ *  gravity forces are in principle strictly independent of the way the domains
+ *  are cut. The domain decomposition can be carried out for an arbitrary
+ *  number of CPUs. Individual domain pieces are not cubical, but spatially
+ *  coherent since the leaves are traversed in a Peano-Hilbert order and
+ *  individual domains form segments along this order.  This also ensures
+ *  that each domain has a small surface to volume ratio, which reduces
+ *  communication.
+ */
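+
+/* A minimal sketch of the underlying idea (illustrative only, not the routine used below):
+ * once the top-level leaves are ordered along the Peano-Hilbert curve, a domain is simply a
+ * contiguous range of leaves whose accumulated cost is close to the average cost per segment.
+ * The names leaf_cost, nleaves, nsegments and first_leaf_of_segment are placeholders, and
+ * nleaves >= nsegments >= 1 is assumed.
+ */
+#if 0
+static void split_leaves_into_segments(const double *leaf_cost, int nleaves, int nsegments, int *first_leaf_of_segment)
+{
+  double total = 0;
+  for(int i = 0; i < nleaves; i++)
+    total += leaf_cost[i];
+
+  double target = total / nsegments; /* average cost each segment should receive */
+  double acc    = 0;
+  int seg       = 0;
+
+  first_leaf_of_segment[0] = 0;
+
+  for(int i = 0; i < nleaves; i++)
+    {
+      acc += leaf_cost[i];
+
+      int leaves_left   = nleaves - (i + 1);
+      int segments_left = nsegments - (seg + 1);
+
+      /* close the current segment once it has received its share of the cost, or when
+       * only just enough leaves remain to give every remaining segment one leaf */
+      if(seg < nsegments - 1 && (acc >= target || leaves_left == segments_left))
+        {
+          seg++;
+          first_leaf_of_segment[seg] = i + 1;
+          acc                        = 0;
+        }
+    }
+}
+#endif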
+
+#include "gadgetconfig.h"
+
+#include <mpi.h>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../logs/logs.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../sort/peano.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+
+/*! This is the main routine for the domain decomposition.  It acts as a
+ *  driver routine that allocates various temporary buffers, maps the
+ *  particles back onto the periodic box if needed, and then does the
+ *  domain decomposition, followed by a final Peano-Hilbert order of all particles
+ *  as a tuning measure.
+ */
+template <typename partset>
+void domain<partset>::domain_decomposition(domain_options mode)
+{
+  Mode = mode;
+
+  TIMER_START(CPU_DOMAIN);
+
+  double t0 = Logs.second();
+
+  domain_printf("DOMAIN: Begin domain decomposition (sync-point %d).\n", All.NumCurrentTiStep);
+
+  /* map the particles by a shift vector back if desired */
+  do_box_wrapping();
+
+  /* determine which time bins need to be balanced */
+  domain_init_sum_cost();
+
+  /* find total cost factors, including a determination of MultipleDomains */
+  domain_find_total_cost();
+
+  /* allocate some fields that need to stay */
+  domain_allocate();
+
+  /* allocate some arrays we are going to use */
+  domain_key = (peanokey *)Mem.mymalloc_movable(&domain_key, "domain_key", (sizeof(peanokey) * Tp->MaxPart));
+  domain_leaf_cost =
+      (domain_cost_data *)Mem.mymalloc_movable(&domain_leaf_cost, "domain_leaf_cost", (MaxTopNodes * sizeof(domain_cost_data)));
+
+  topNodes = (local_topnode_data *)Mem.mymalloc_movable(&topNodes, "topNodes", (MaxTopNodes * sizeof(local_topnode_data)));
+
+  /* determine top-level tree */
+  domain_determineTopTree();
+
+  /* combine on each MPI task several of the domains (namely the number MultipleDomains) */
+  domain_combine_multipledomains();
+
+  Mem.myfree(topNodes);
+  Mem.myfree(domain_leaf_cost);
+
+  /* move stars out of gas block if present */
+  domain_rearrange_particle_sequence();
+
+  /* finally, carry out the actual particle exchange */
+  if(Mode == STANDARD)
+    domain_exchange();
+  else if(Mode == COLL_SUBFIND)
+    domain_coll_subfind_prepare_exchange();
+
+  double t1 = Logs.second();
+
+  domain_printf("DOMAIN: domain decomposition done. (took in total %g sec)\n", Logs.timediff(t0, t1));
+
+  if(Mode == STANDARD)
+    {
+      TIMER_STOPSTART(CPU_DOMAIN, CPU_PEANO);
+
+      peano_hilbert_order(domain_key);
+
+      TIMER_STOPSTART(CPU_PEANO, CPU_DOMAIN);
+    }
+
+  Mem.myfree(domain_key);
+
+  Mem.myfree(ListOfTopleaves);
+
+  TaskOfLeaf = (int *)Mem.myrealloc_movable(TaskOfLeaf, NTopleaves * sizeof(int));
+  TopNodes   = (topnode_data *)Mem.myrealloc_movable(TopNodes, NTopnodes * sizeof(topnode_data));
+
+  ListOfTopleaves = (int *)Mem.mymalloc_movable(&ListOfTopleaves, "ListOfTopleaves", (NTopleaves * sizeof(int)));
+
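+  /* build, for every task, the list of the top-level leaves assigned to it: first count the
+   * leaves per task, then compute the per-task offsets, and finally scatter the leaf indices
+   * into ListOfTopleaves (effectively a counting sort over TaskOfLeaf) */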
+  memset(NumTopleafOfTask, 0, NTask * sizeof(int));
+
+  for(int i = 0; i < NTopleaves; i++)
+    NumTopleafOfTask[TaskOfLeaf[i]]++;
+
+  FirstTopleafOfTask[0] = 0;
+  for(int i = 1; i < NTask; i++)
+    FirstTopleafOfTask[i] = FirstTopleafOfTask[i - 1] + NumTopleafOfTask[i - 1];
+
+  memset(NumTopleafOfTask, 0, NTask * sizeof(int));
+
+  for(int i = 0; i < NTopleaves; i++)
+    {
+      int task             = TaskOfLeaf[i];
+      int off              = FirstTopleafOfTask[task] + NumTopleafOfTask[task]++;
+      ListOfTopleaves[off] = i;
+    }
+
+  if(Mode == STANDARD)
+    {
+      /* the following will reconstruct the timebins and report the balance in the
+       * case of simparticles; for lcparticles nothing will be done
+       */
+      domain_report_balance();
+    }
+
+#ifdef DOMAIN_SPECIAL_CHECK
+  if(ThisTask == 0 && All.NumCurrentTiStep == 4)
+    Terminate("stop");
+#endif
+
+  TIMER_STOP(CPU_DOMAIN);
+}
+
+/*! This function allocates all the stuff that will be required for the tree-construction/walk later on */
+template <typename partset>
+void domain<partset>::domain_allocate(int maxtopnodes)
+{
+  MaxTopNodes = maxtopnodes;
+
+  if(FirstTopleafOfTask)
+    Terminate("domain storage already allocated");
+
+  FirstTopleafOfTask = (int *)Mem.mymalloc_movable(&FirstTopleafOfTask, "FirstTopleafOfTask", NTask * sizeof(int));
+  NumTopleafOfTask   = (int *)Mem.mymalloc_movable(&NumTopleafOfTask, "NumTopleafOfTask", NTask * sizeof(int));
+  TopNodes           = (topnode_data *)Mem.mymalloc_movable(&TopNodes, "TopNodes", (MaxTopNodes * sizeof(topnode_data)));
+  TaskOfLeaf         = (int *)Mem.mymalloc_movable(&TaskOfLeaf, "TaskOfLeaf", (MaxTopNodes * sizeof(int)));
+  ListOfTopleaves    = (int *)Mem.mymalloc_movable(&ListOfTopleaves, "DomainListOfLocalTopleaves", (MaxTopNodes * sizeof(int)));
+}
+
+/*! This overload determines the default maximum number of top-level tree nodes and then allocates the domain storage */
+template <typename partset>
+void domain<partset>::domain_allocate(void)
+{
+  int maxtopnodes = All.TopNodeAllocFactor * std::max<int>(All.TopNodeFactor * MultipleDomains * NTask, BASENUMBER);
+  domain_allocate(maxtopnodes);
+}
+
+template <typename partset>
+void domain<partset>::domain_free(void)
+{
+  if(!FirstTopleafOfTask)
+    Terminate("domain storage not allocated");
+
+  Mem.myfree_movable(ListOfTopleaves);
+  Mem.myfree_movable(TaskOfLeaf);
+  Mem.myfree_movable(TopNodes);
+  Mem.myfree_movable(NumTopleafOfTask);
+  Mem.myfree_movable(FirstTopleafOfTask);
+
+  ListOfTopleaves    = NULL;
+  TaskOfLeaf         = NULL;
+  TopNodes           = NULL;
+  NumTopleafOfTask   = NULL;
+  FirstTopleafOfTask = NULL;
+}
+
+template <typename partset>
+void domain<partset>::domain_printf(char *buf)
+{
+  if(All.RestartFlag == RST_BEGIN || All.RestartFlag == RST_RESUME || All.RestartFlag == RST_STARTFROMSNAP)
+    {
+      fprintf(Logs.FdDomain, "%s", buf);
+    }
+}
+
+template <>
+void domain<simparticles>::domain_find_total_cost(void)
+{
+  /* for each timebin that should be balanced, collect the gravity cost of
+   * the particles active in that timebin
+   */
+
+  for(int n = 0; n < NumTimeBinsToBeBalanced; n++)
+    {
+      GravCostPerListedTimeBin[n]    = 0.0;
+      MaxGravCostPerListedTimeBin[n] = 0.0;
+      /* do the same for the hydrodynamics */
+      HydroCostPerListedTimeBin[n] = 0.0;
+    }
+
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+      if(Tp->P[i].GravCost == 0)
+        Tp->P[i].GravCost = 1;
+
+      for(int n = 0; n < NumTimeBinsToBeBalanced; n++)
+        {
+          int bin = ListOfTimeBinsToBeBalanced[n];
+
+          if(bin >= Tp->P[i].TimeBinGrav)
+            {
+              GravCostPerListedTimeBin[n] += Tp->P[i].GravCost;
+              if(MaxGravCostPerListedTimeBin[n] < Tp->P[i].GravCost)
+                MaxGravCostPerListedTimeBin[n] = Tp->P[i].GravCost;
+            }
+
+          if(Tp->P[i].getType() == 0)
+            {
+#ifdef SUBFIND
+              if(Mode == COLL_SUBFIND)
+                {
+                  if(Tp->PS[i].DomainFlag)
+                    HydroCostPerListedTimeBin[n] += 1.0;
+                }
+              else
+#endif
+                {
+                  if(bin >= Tp->P[i].getTimeBinHydro())
+                    HydroCostPerListedTimeBin[n] += 1.0;
+                }
+            }
+        }
+    }
+
+  long long sum[2] = {Tp->NumPart, Tp->NumGas};
+
+  MPI_Allreduce(MPI_IN_PLACE, sum, 2, MPI_LONG_LONG, MPI_SUM, Communicator);
+
+  NormFactorLoad    = 1.0 / sum[0];
+  NormFactorLoadSph = sum[1] > 0.0 ? 1.0 / sum[1] : 0.0;
+
+  MultipleDomains = 0;
+  TotalCost       = 0.0;
+
+  if(NormFactorLoad > 0.0)
+    {
+      MultipleDomains += 1;
+      TotalCost += 1.0;
+    }
+
+  if(NormFactorLoadSph > 0.0)
+    {
+      MultipleDomains += 1;
+      TotalCost += 1.0;
+    }
+
+  MPI_Allreduce(MPI_IN_PLACE, GravCostPerListedTimeBin, NumTimeBinsToBeBalanced, MPI_DOUBLE, MPI_SUM, Communicator);
+  MPI_Allreduce(MPI_IN_PLACE, HydroCostPerListedTimeBin, NumTimeBinsToBeBalanced, MPI_DOUBLE, MPI_SUM, Communicator);
+
+  MPI_Allreduce(MPI_IN_PLACE, MaxGravCostPerListedTimeBin, NumTimeBinsToBeBalanced, MPI_DOUBLE, MPI_MAX, Communicator);
+
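+  /* cap the normalized cost contribution of the single most expensive particle at the granularity
+   * of one domain piece, so that an individual particle cannot dominate the balancing */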
+  double limit = 1.0 / (All.TopNodeFactor * MultipleDomains * NTask);
+
+  for(int n = 0; n < NumTimeBinsToBeBalanced; n++)
+    {
+      if(GravCostPerListedTimeBin[n] > 0.0)
+        MultipleDomains += 1;
+
+      if(HydroCostPerListedTimeBin[n] > 0.0)
+        MultipleDomains += 1;
+
+      GravCostNormFactors[n]  = GravCostPerListedTimeBin[n] > 0.0 ? 1.0 / GravCostPerListedTimeBin[n] : 0.0;
+      HydroCostNormFactors[n] = HydroCostPerListedTimeBin[n] > 0.0 ? 1.0 / HydroCostPerListedTimeBin[n] : 0.0;
+
+      if(MaxGravCostPerListedTimeBin[n] * GravCostNormFactors[n] > limit)
+        GravCostNormFactors[n] = limit / MaxGravCostPerListedTimeBin[n];
+
+      if(HydroCostNormFactors[n] > limit)
+        HydroCostNormFactors[n] = limit;
+
+      TotalCost += GravCostPerListedTimeBin[n] * GravCostNormFactors[n];
+      TotalCost += HydroCostPerListedTimeBin[n] * HydroCostNormFactors[n];
+    }
+}
+
+#ifdef LIGHTCONE_PARTICLES
+template <>
+void domain<lcparticles>::domain_find_total_cost(void)
+{
+  long long sum[2] = {Tp->NumPart, Tp->NumGas};
+
+  MPI_Allreduce(MPI_IN_PLACE, sum, 2, MPI_LONG_LONG, MPI_SUM, Communicator);
+
+  NormFactorLoad    = 1.0 / sum[0];
+  NormFactorLoadSph = sum[1] > 0.0 ? 1.0 / sum[1] : 0.0;
+
+  MultipleDomains = 0;
+  TotalCost       = 0.0;
+
+  if(NormFactorLoad > 0.0)
+    {
+      MultipleDomains += 1;
+      TotalCost += 1.0;
+    }
+
+  if(NormFactorLoadSph > 0.0)
+    {
+      MultipleDomains += 1;
+      TotalCost += 1.0;
+    }
+
+  NumTimeBinsToBeBalanced = 0;
+}
+#endif
+
+template <>
+void domain<simparticles>::domain_rearrange_particle_sequence(void)
+{
+  if(Mode != STANDARD)
+    return;
+
+  for(int i = 0; i < Tp->NumGas; i++)
+    if(Tp->P[i].getType() != 0) /* if not a gas particle, swap it to the end of the gas block */
+      {
+        particle_data psave = Tp->P[i];
+        peanokey key        = domain_key[i];
+
+        Tp->P[i]      = Tp->P[Tp->NumGas - 1];
+        Tp->SphP[i]   = Tp->SphP[Tp->NumGas - 1];
+        domain_key[i] = domain_key[Tp->NumGas - 1];
+
+        Tp->P[Tp->NumGas - 1]      = psave;
+        domain_key[Tp->NumGas - 1] = key;
+
+        Tp->NumGas--;
+        i--;
+      }
+  /* Now that we have rearranged the particles, we don't need to do it again
+   * unless additional stars are created */
+}
+
+template <>
+void domain<simparticles>::domain_report_balance(void)
+{
+  Tp->reconstruct_timebins();
+
+  TIMER_STOPSTART(CPU_DOMAIN, CPU_LOGS);
+
+  if(Mode != STANDARD)
+    return;
+
+  /* now execute code to report balance */
+
+  /* get total particle counts */
+  long long loc_count[2 * TIMEBINS], glob_count[2 * TIMEBINS];
+
+  for(int i = 0; i < TIMEBINS; i++)
+    {
+      loc_count[i]            = Tp->TimeBinsGravity.TimeBinCount[i];
+      loc_count[TIMEBINS + i] = Tp->TimeBinsHydro.TimeBinCount[i];
+    }
+
+  MPI_Reduce(loc_count, glob_count, 2 * TIMEBINS, MPI_LONG_LONG_INT, MPI_SUM, 0, Communicator);
+
+  double loc_max_data[2 * TIMEBINS + 3], glob_max_data[2 * TIMEBINS + 3];
+  loc_max_data[2 * TIMEBINS + 0] = Tp->NumPart;
+  loc_max_data[2 * TIMEBINS + 1] = Tp->NumGas;
+  loc_max_data[2 * TIMEBINS + 2] = Tp->NumPart - Tp->NumGas;
+
+  double glob_sum_data[2 * TIMEBINS];
+
+  double *loc_HydroCost  = &loc_max_data[0];
+  double *loc_GravCost   = &loc_max_data[TIMEBINS];
+  double *max_HydroCost  = &glob_max_data[0];
+  double *max_GravCost   = &glob_max_data[TIMEBINS];
+  double *glob_HydroCost = &glob_sum_data[0];
+  double *glob_GravCost  = &glob_sum_data[TIMEBINS];
+
+  for(int i = 0; i < TIMEBINS; i++)
+    {
+      loc_GravCost[i]  = 0;
+      loc_HydroCost[i] = 0;
+    }
+
+#ifdef SELFGRAVITY
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+      for(int bin = Tp->P[i].TimeBinGrav; bin <= All.HighestOccupiedTimeBin; bin++)
+        {
+          loc_GravCost[bin] += MIN_FLOAT_NUMBER + domain_grav_weight[bin] * Tp->P[i].GravCost;
+        }
+    }
+#endif
+
+  for(int i = 0; i < Tp->NumPart; i++)
+    if(Tp->P[i].getType() == 0)
+      loc_HydroCost[Tp->P[i].getTimeBinHydro()] += 1.0;
+
+  /* now determine the cumulative cost for the hydrodynamics */
+  for(int i = 1; i <= All.HighestOccupiedTimeBin; i++)
+    loc_HydroCost[i] += loc_HydroCost[i - 1];
+
+  MPI_Reduce(loc_max_data, glob_sum_data, 2 * TIMEBINS, MPI_DOUBLE, MPI_SUM, 0, Communicator);
+  MPI_Reduce(loc_max_data, glob_max_data, 2 * TIMEBINS + 3, MPI_DOUBLE, MPI_MAX, 0, Communicator);
+
+  if(ThisTask == 0)
+    {
+      double max_tot = glob_max_data[2 * TIMEBINS + 0];
+      double max_sph = glob_max_data[2 * TIMEBINS + 1];
+      double max_dm  = glob_max_data[2 * TIMEBINS + 2];
+
+      long long *tot_count     = &glob_count[0];
+      long long *tot_count_sph = &glob_count[TIMEBINS];
+
+      long long tot_cumulative[TIMEBINS];
+      tot_cumulative[0] = tot_count[0];
+
+      for(int i = 1; i < TIMEBINS; i++)
+        tot_cumulative[i] = tot_count[i] + tot_cumulative[i - 1];
+
+      double tot_gravcost = 0, max_gravcost = 0, tot_hydrocost = 0, max_hydrocost = 0;
+
+      for(int i = 0; i < TIMEBINS; i++)
+        {
+          tot_gravcost += domain_to_be_balanced[i] * glob_GravCost[i] / NTask;
+          max_gravcost += domain_to_be_balanced[i] * max_GravCost[i];
+
+          tot_hydrocost += domain_to_be_balanced[i] * glob_HydroCost[i] / NTask;
+          max_hydrocost += domain_to_be_balanced[i] * max_HydroCost[i];
+        }
+
+      double bal_grav_bin[TIMEBINS], bal_grav_bin_rel[TIMEBINS];
+      double bal_hydro_bin[TIMEBINS], bal_hydro_bin_rel[TIMEBINS];
+
+      for(int i = 0; i < TIMEBINS; i++)
+        {
+          if(tot_count[i] > 0)
+            {
+              bal_grav_bin[i] = max_GravCost[i] / (glob_GravCost[i] / NTask + SMALLNUM);
+              bal_grav_bin_rel[i] =
+                  (tot_gravcost + domain_to_be_balanced[i] * (max_GravCost[i] - glob_GravCost[i] / NTask)) / (tot_gravcost + SMALLNUM);
+            }
+          else
+            {
+              bal_grav_bin[i]     = 0.0;
+              bal_grav_bin_rel[i] = 0.0;
+            }
+
+          if(tot_count_sph[i] > 0)
+            {
+              bal_hydro_bin[i]     = max_HydroCost[i] / (glob_HydroCost[i] / NTask + SMALLNUM);
+              bal_hydro_bin_rel[i] = (tot_hydrocost + domain_to_be_balanced[i] * (max_HydroCost[i] - glob_HydroCost[i] / NTask)) /
+                                     (tot_hydrocost + SMALLNUM);
+            }
+          else
+            {
+              bal_hydro_bin[i]     = 0.0;
+              bal_hydro_bin_rel[i] = 0.0;
+            }
+        }
+
+      char buf[MAXLEN_PATH];
+      sprintf(buf, "\nDOMAIN BALANCE, Sync-Point %d, Time: %g\n", All.NumCurrentTiStep, All.Time);
+      domain_printf(buf);
+      sprintf(buf, "Timebins:       Gravity       Hydro  cumulative      grav-balance       hydro-balance\n");
+      domain_printf(buf);
+
+      long long tot = 0, tot_sph = 0;
+
+      for(int i = TIMEBINS - 1; i >= 0; i--)
+        {
+#if(defined(SELFGRAVITY) || defined(EXTERNALGRAVITY))
+          if(tot_count_sph[i] > 0 || tot_count[i] > 0)
+#else
+          if(tot_count[i] > 0)
+            tot += tot_count[i];
+
+          if(tot_count_sph[i] > 0)
+#endif
+            {
+              char buf[MAXLEN_PATH];
+              sprintf(buf, "%c%cbin=%2d     %10llu  %10llu  %10llu    %6.3f |%6.3f  %c   %6.3f |%6.3f\n",
+                      i == All.HighestActiveTimeBin ? '>' : ' ', i >= All.SmallestTimeBinWithDomainDecomposition ? '|' : ' ', i,
+                      tot_count[i], tot_count_sph[i], tot_cumulative[i], bal_grav_bin[i], bal_grav_bin_rel[i],
+                      domain_to_be_balanced[i] > 0 ? '*' : ' ', bal_hydro_bin[i], bal_hydro_bin_rel[i]);
+              domain_printf(buf);
+
+              tot += tot_count[i];
+              tot_sph += tot_count_sph[i];
+            }
+        }
+
+      sprintf(buf, "-------------------------------------------------------------------------------------\n");
+      domain_printf(buf);
+      sprintf(buf, "BALANCE,  LOAD:  %6.3f      %6.3f      %6.3f  WORK:     %6.3f              %6.3f\n",
+              max_dm / (tot - tot_sph + SMALLNUM) * NTask, max_sph / (tot_sph + SMALLNUM) * NTask, max_tot / (tot + SMALLNUM) * NTask,
+              max_gravcost / (tot_gravcost + SMALLNUM), max_hydrocost / (tot_hydrocost + SMALLNUM));
+      domain_printf(buf);
+      sprintf(buf, "-------------------------------------------------------------------------------------\n");
+      domain_printf(buf);
+      sprintf(buf, "\n");
+      domain_printf(buf);
+      myflush(Logs.FdDomain);
+    }
+
+  TIMER_STOPSTART(CPU_LOGS, CPU_DOMAIN);
+}
+
+#ifdef LIGHTCONE_PARTICLES
+
+template <>
+void domain<lcparticles>::domain_rearrange_particle_sequence(void)
+{
+}
+
+template <>
+void domain<lcparticles>::domain_report_balance(void)
+{
+}
+
+#endif
+
+#include "../data/simparticles.h"
+template class domain<simparticles>;
+
+#ifdef LIGHTCONE_PARTICLES
+#include "../data/lcparticles.h"
+template class domain<lcparticles>;
+#endif
diff --git a/src/domain/domain.h b/src/domain/domain.h
new file mode 100644
index 0000000000000000000000000000000000000000..58d42041cad5112fb46031cd9dbf4b432bce7279
--- /dev/null
+++ b/src/domain/domain.h
@@ -0,0 +1,284 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file domain.h
+ *
+ *  \brief declares the class used for the domain decomposition
+ */
+
+#ifndef ALLVARS_H
+#include "../data/allvars.h"
+#endif
+#ifndef DOMAIN_H
+#define DOMAIN_H
+
+#include "../data/dtypes.h"
+#include "../mpi_utils/setcomm.h"
+
+enum domain_options
+{
+  STANDARD,
+  COLL_SUBFIND,
+  SERIAL_SUBFIND
+};
+
+template <typename partset> /* partset will either be the 'simparticles' or the 'lcparticles' class; its nested typedef
+                               'pdata' is the matching particle_data struct */
+class domain : public setcomm
+{
+ private:
+  partset *Tp;
+
+ public:
+  domain(MPI_Comm comm, partset *Tp_ptr) : setcomm(comm) /* constructor */
+  {
+    Tp = Tp_ptr;
+
+    ListOfTopleaves    = NULL;
+    TaskOfLeaf         = NULL;
+    TopNodes           = NULL;
+    NumTopleafOfTask   = NULL;
+    FirstTopleafOfTask = NULL;
+  }
+
+  typedef typename partset::pdata pdata;
+
+ public:
+  domain_options Mode;
+
+  int NTopnodes;
+  int NTopleaves;
+
+  int MultipleDomains = 1;
+
+  int MaxTopNodes; /**< Maximum number of nodes in the top-level tree used for domain decomposition */
+
+  int *ListOfTopleaves;
+  int *NumTopleafOfTask;
+  int *FirstTopleafOfTask;
+
+  int *TaskOfLeaf;
+
+  /** The top node structure is an octree used for encoding the domain
+      decomposition. Its leaf nodes are the units into which the domain
+      is decomposed. */
+  struct topnode_data
+  {
+    int Daughter; /*!< index of first daughter cell (out of 8) of top-level node */
+    int Leaf;     /*!< if the node is a leaf, this gives its number when all leaves are traversed in Peano-Hilbert order */
+  };
+
+  topnode_data *TopNodes;
+
+  /** Array of task numbers holding the respective top-level nodes. For
+      the topnodes entries, it is indexed by the Leaf member, for
+      pseudoparticles it is indexed by the node
+      number-MaxPart-MaxNodes.  */
+
+  void domain_decomposition(domain_options mode);
+  void domain_allocate(void);
+  void domain_allocate(int maxtopnodes);
+  void domain_free(void);
+  void domain_resize_storage(int count_get, int count_get_sph, int option_flag);
+
+  size_t domain_sizeof_topnode_data(void) { return sizeof(topnode_data); }
+
+ private:
+  struct local_topnode_data
+  {
+    double Cost;
+    long long CountTot; /*!< counts the global number of particles in this top-level node */
+    peanokey StartKey;  /*!< first Peano-Hilbert key in top-level node */
+    int Count;          /*!< counts the local number of particles in this top-level node */
+    int Level;          /*!< encodes side-length of the node, level 0 is the root node */
+    int PIndex;         /*!< first particle in node */
+  };
+
+  struct domain_peano_hilbert_data
+  {
+    peanokey key;
+    float cost;
+  } * mp;
+
+  struct domain_cost_data
+  {
+    double Cost;
+  };
+
+  struct domain_count_data
+  {
+    int task;
+    int count;
+    int origintask;
+  };
+
+  struct domain_segments_data
+  {
+    int task, start, end, used;
+    double bin_GravCost[TIMEBINS];
+    double bin_HydroCost[TIMEBINS];
+    double load;
+    double loadsph;
+  };
+
+  // domain_segments_data *domainAssign;
+
+  domain_cost_data *domain_leaf_cost;
+
+  local_topnode_data *topNodes;
+
+  peanokey *domain_key;
+
+  int NumTimeBinsToBeBalanced;
+  int ListOfTimeBinsToBeBalanced[TIMEBINS];
+  double GravCostPerListedTimeBin[TIMEBINS];
+  double MaxGravCostPerListedTimeBin[TIMEBINS];
+  double GravCostNormFactors[TIMEBINS];
+  double HydroCostPerListedTimeBin[TIMEBINS];
+  double HydroCostNormFactors[TIMEBINS];
+  double NormFactorLoad;
+  double NormFactorLoadSph;
+  double TotalCost;
+
+  int domain_grav_weight[TIMEBINS];
+  int domain_hydro_weight[TIMEBINS];
+  int domain_to_be_balanced[TIMEBINS];
+
+  void domain_find_total_cost(void);
+  void domain_report_balance(void);
+  void domain_combine_multipledomains(void);
+  void domain_init_sum_cost(void);
+  void domain_countToGo(int *toGoDM, int *toGoSph);
+  void domain_exchange(void);
+  void domain_determineTopTree(void);
+
+  void domain_printf(char *buf);
+  void domain_rearrange_particle_sequence(void);
+
+  void do_box_wrapping(void);
+  void domain_coll_subfind_prepare_exchange(void);
+  void domain_do_local_refine(int n, int *list);
+  void domain_walktoptree(int no);
+  double domain_get_cost_summed_over_timebins(int i);
+
+  /* note: static is needed here to suppress the hidden 'this' argument so that these can be passed as comparators to sort */
+  static bool domain_compare_count(const domain_count_data &a, const domain_count_data &b) { return a.count > b.count; }
+
+  static bool domain_compare_key(const domain_peano_hilbert_data &a, const domain_peano_hilbert_data &b) { return a.key < b.key; }
+
+  static bool domain_sort_candidates(const int &a, const int &b) { return a < b; }
+
+  struct cost_queue_data
+  {
+    double value;
+#ifdef SIMPLE_DOMAIN_AGGREGATION
+    double aggregated_value;
+#endif
+    int index;
+  };
+
+  static bool domain_sort_cost_queue_data(const cost_queue_data &a, const cost_queue_data &b)
+  {
+#ifdef SIMPLE_DOMAIN_AGGREGATION
+    if(a.aggregated_value > b.aggregated_value)
+      return true;
+    else if(a.aggregated_value < b.aggregated_value)
+      return false;
+#endif
+    return a.value > b.value;
+  }
+
+  void domain_determinate_aggregated_value(cost_queue_data *data, int ndomains);
+
+  inline int n_to_no(int n)
+  {
+    int no       = 0;
+    peanokey key = domain_key[n];
+
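+    /* walk down from the root of the top-level tree: at each level the top 3 bits of the
+     * high word of the Peano key select one of the 8 daughter nodes, and the key is then
+     * shifted left by 3 bits across its (hs, is, ls) segments */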
+    while(TopNodes[no].Daughter >= 0)
+      {
+        unsigned int off = ((key.hs & (~((~((MyIntPosType)0)) >> 3))) >> (BITS_FOR_POSITIONS - 3));
+
+        no = TopNodes[no].Daughter + off;
+
+        key.hs <<= 3;
+        key.hs |= (key.is & (~((~((MyIntPosType)0)) >> 3))) >> (BITS_FOR_POSITIONS - 3);
+
+        key.is <<= 3;
+        key.is |= (key.ls & (~((~((MyIntPosType)0)) >> 3))) >> (BITS_FOR_POSITIONS - 3);
+
+        key.ls <<= 3;
+      }
+
+    return TopNodes[no].Leaf;
+  }
+
+  struct peano_hilbert_data
+  {
+    peanokey key;
+    int index;
+  };
+
+  static bool compare_peano_hilbert_data(const peano_hilbert_data &a, const peano_hilbert_data &b) { return a.key < b.key; }
+
+  void peano_hilbert_order(peanokey *key);
+
+  struct balance_try_data
+  {
+    int nextra;
+    double try_balance;
+  };
+
+  static bool domain_compare_trybalance(const balance_try_data &a, const balance_try_data &b) { return a.try_balance < b.try_balance; }
+
+ public:
+  void reorder_particles(int *Id, int Nstart, int N);
+  void reorder_gas(int *Id);
+  void reorder_PS(int *Id, int Nstart, int N);
+  void reorder_P_and_PS(int *Id);
+  void reorder_P_PS(int NumGas, int NumPart);
+
+  void particle_exchange_based_on_PS(MPI_Comm Communicator);
+
+ private:
+  struct local_sort_data
+  {
+    int targetindex;
+    int index;
+  };
+
+  static inline bool compare_local_sort_data_targetindex(const local_sort_data &a, const local_sort_data &b)
+  {
+    return a.targetindex < b.targetindex;
+  }
+
+#if defined(PLACEHIGHRESREGION) || defined(RANDOMIZE_DOMAINCENTER)
+  MyIntPosType domainInnersize;
+  MyIntPosType domainReferenceIntPos[3];
+  MySignedIntPosType domainXmintot[3], domainXmaxtot[3];
+
+  void domain_find_type_extension(void);
+  int domain_type_extension_overlap(int j);
+#endif
+
+#ifdef DOMAIN_SPECIAL_CHECK
+  void domain_special_check(int mode, int ndomains);
+#endif
+
+  void domain_printf(const char *fmt, ...)
+  {
+    if((Mode == STANDARD && ThisTask == 0))  // || (Mode == COLL_SUBFIND))
+      {
+        va_list l;
+        va_start(l, fmt);
+        vprintf(fmt, l);
+        //        myflush(stdout);
+        va_end(l);
+      }
+  }
+};
+
+#endif
diff --git a/src/domain/domain_balance.cc b/src/domain/domain_balance.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b769ba77f04e10d0301f9bd7a4c573758bfb287e
--- /dev/null
+++ b/src/domain/domain_balance.cc
@@ -0,0 +1,763 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file domain_balance.cc
+ *
+ *  \brief contains routines to improve the domain balance by combining several patches per MPI rank
+ */
+
+#include "gadgetconfig.h"
+
+#include <mpi.h>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/cxxsort.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+
+/** This function uses the cumulative cost function (which weights work-load and memory-load equally) to subdivide
+ *  the list of top-level leaf nodes into pieces that are (approximately) equal in size.
+ */
+
+#ifdef DOMAIN_SPECIAL_CHECK
+
+template <typename partset>
+void domain<partset>::domain_special_check(int mode, int ndomains)
+{
+  double *cost_data = (double *)Mem.mymalloc_clear("cost_data", sizeof(double) * 2 * ndomains * (NumTimeBinsToBeBalanced + 1));
+
+  double *load         = cost_data;
+  double *loadsph      = cost_data + ndomains;
+  double *binGravCost  = cost_data + 2 * ndomains;
+  double *binHydroCost = cost_data + 2 * ndomains + ndomains * NumTimeBinsToBeBalanced;
+
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+      int no = n_to_no(i);
+
+      int n;
+
+      if(mode == 0)
+        n = no;
+      else
+        n = TaskOfLeaf[no];
+
+      if(n < 0 || n >= ndomains)
+        Terminate("strange");
+
+#ifdef SELFGRAVITY
+      for(int k = 0; k < NumTimeBinsToBeBalanced; k++)
+        {
+          int bin = ListOfTimeBinsToBeBalanced[k];
+
+          if(bin >= Tp->P[i].getTimeBinGrav())
+            binGravCost[k * ndomains + n] += GravCostNormFactors[k] * Tp->P[i].getGravCost();
+        }
+#endif
+
+      load[n] += NormFactorLoad;
+
+      if(Tp->P[i].getType() == 0)
+        {
+          for(int k = 0; k < NumTimeBinsToBeBalanced; k++)
+            {
+              int bin = ListOfTimeBinsToBeBalanced[k];
+
+              if(bin >= Tp->P[i].getTimeBinHydro())
+                binHydroCost[k * ndomains + n] += HydroCostNormFactors[k];
+            }
+
+          loadsph[n] += NormFactorLoadSph;
+        }
+    }
+
+  MPI_Allreduce(MPI_IN_PLACE, cost_data, 2 * ndomains * (NumTimeBinsToBeBalanced + 1), MPI_DOUBLE, MPI_SUM, Communicator);
+
+  if(All.NumCurrentTiStep == 0 || All.NumCurrentTiStep == 2 || All.NumCurrentTiStep == 4)
+    {
+      if(ThisTask == 0)
+        {
+          char buf[1000];
+          sprintf(buf, "%s/domain_data_%d_step%d.txt", All.OutputDir, mode, All.NumCurrentTiStep);
+          FILE *fd = fopen(buf, "w");
+          fprintf(fd, "%d %d\n", ndomains, NumTimeBinsToBeBalanced);
+          for(int n = 0; n < ndomains; n++)
+            {
+              fprintf(fd, "%g  ", load[n]);
+              for(int k = 0; k < NumTimeBinsToBeBalanced; k++)
+                fprintf(fd, "%g  ", binGravCost[k * ndomains + n]);
+              fprintf(fd, "\n");
+            }
+          fclose(fd);
+        }
+    }
+
+  Mem.myfree(cost_data);
+}
+
+#endif
+
+/** This function assigns the domain pieces to individual MPI tasks with the goal of balancing the work-load
+ *  across the different timebins. The algorithm works as follows:
+ *
+ *  The domains are assigned to the CPUs in order of decreasing "effective load", which is a simple combined measure of
+ *  relative total gravity, hydro and memory load. For each assignment, a number of possible target CPUs are evaluated, and
+ *  the assignment leading to the lowest total runtime is adopted. The set of target CPUs tested in each step consists of
+ *  those CPUs that currently carry the lowest load among the primary tasks under consideration.
+ */
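+
+/* A minimal sketch of the greedy idea (illustrative only): pieces are handed out one by one,
+ * each going to the task that currently carries the smallest accumulated cost. The routine
+ * below does this simultaneously for several cost measures (per-timebin gravity, hydro and
+ * memory load), draws candidates from sorted queues, and retries with different imbalance
+ * limits. The names piece_cost, npieces, ntask, task_of_piece and task_cost are placeholders.
+ */
+#if 0
+static void greedy_assign(const double *piece_cost, int npieces, int ntask, int *task_of_piece, double *task_cost)
+{
+  for(int t = 0; t < ntask; t++)
+    task_cost[t] = 0.0;
+
+  /* assumes the pieces are already sorted by decreasing cost */
+  for(int n = 0; n < npieces; n++)
+    {
+      int best = 0; /* task with the smallest accumulated cost so far */
+      for(int t = 1; t < ntask; t++)
+        if(task_cost[t] < task_cost[best])
+          best = t;
+
+      task_of_piece[n] = best;
+      task_cost[best] += piece_cost[n];
+    }
+}
+#endif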
+template <typename partset>
+void domain<partset>::domain_combine_multipledomains(void)
+{
+  double t0 = Logs.second();
+
+  /* we first determine the detailed cost of all the domain pieces (which are the top leaves in the tree), so that we can combine them
+   * later on efficiently for different choices of nextra
+   */
+
+  double *cost_data = (double *)Mem.mymalloc_clear("cost_data", sizeof(double) * 2 * NTopleaves * (NumTimeBinsToBeBalanced + 1));
+
+  double *load         = cost_data;
+  double *loadsph      = cost_data + NTopleaves;
+  double *binGravCost  = cost_data + 2 * NTopleaves;
+  double *binHydroCost = cost_data + 2 * NTopleaves + NTopleaves * NumTimeBinsToBeBalanced;
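+  /* layout of cost_data: NTopleaves entries of memory load, then NTopleaves entries of SPH load,
+   * followed by NumTimeBinsToBeBalanced blocks of NTopleaves gravity costs and the same number
+   * of blocks of hydro costs */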
+
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+      int no = n_to_no(i);  // get the top-level leaf node this particle falls into
+
+#ifdef SELFGRAVITY
+      for(int k = 0; k < NumTimeBinsToBeBalanced; k++)
+        {
+          int bin = ListOfTimeBinsToBeBalanced[k];
+
+          if(bin >= Tp->P[i].getTimeBinGrav())
+            binGravCost[k * NTopleaves + no] += GravCostNormFactors[k] * Tp->P[i].getGravCost();
+        }
+#endif
+
+      load[no] += NormFactorLoad;
+
+      if(Tp->P[i].getType() == 0)
+        {
+          for(int k = 0; k < NumTimeBinsToBeBalanced; k++)
+            {
+              int bin = ListOfTimeBinsToBeBalanced[k];
+
+              if(bin >= Tp->P[i].getTimeBinHydro())
+                binHydroCost[k * NTopleaves + no] += HydroCostNormFactors[k];
+            }
+
+          loadsph[no] += NormFactorLoadSph;
+        }
+    }
+
+  allreduce_sum<double>(cost_data, 2 * NTopleaves * (NumTimeBinsToBeBalanced + 1), Communicator);
+  /*
+  MPI_Allreduce(MPI_IN_PLACE, cost_data, 2 * NTopleaves * (NumTimeBinsToBeBalanced + 1), MPI_DOUBLE, MPI_SUM, Communicator);
+*/
+
+#ifdef DOMAIN_SPECIAL_CHECK
+  if(All.NumCurrentTiStep == 0 || All.NumCurrentTiStep == 2 || All.NumCurrentTiStep == 4)
+    {
+      if(ThisTask == 0)
+        {
+          char buf[1000];
+          sprintf(buf, "%s/domain_data_0_step%d.txt", All.OutputDir, All.NumCurrentTiStep);
+          FILE *fd = fopen(buf, "w");
+          fprintf(fd, "%d %d\n", NTopleaves, NumTimeBinsToBeBalanced);
+          for(int n = 0; n < NTopleaves; n++)
+            {
+              fprintf(fd, "%g  ", load[n]);
+              for(int k = 0; k < NumTimeBinsToBeBalanced; k++)
+                fprintf(fd, "%g  ", binGravCost[k * NTopleaves + n]);
+              fprintf(fd, "\n");
+            }
+          fclose(fd);
+        }
+    }
+#endif
+
+  /* let's now find the optimum combination */
+
+  /* first, enumerate the possibilities that we are going to try */
+
+  int cnt_try                   = 0;
+  balance_try_data *balance_try = NULL;
+
+  for(int rep = 0; rep < 2; rep++)  // we repeat this twice, first just counting, then allocating and filling the list
+    {
+      cnt_try = 0;
+
+      double fac      = 0;
+      int nextra      = 0;
+      int nextra_prev = -1;
+
+      while(nextra <= NTask)
+        {
+          if(nextra != nextra_prev)
+            {
+              double base_balance = ((double)(MultipleDomains * NTask)) / (MultipleDomains * NTask - nextra);
+
+              double excess_balance = 0.01;
+
+              while(base_balance + excess_balance < 2.5)
+                {
+                  if(rep == 1)
+                    {
+                      balance_try[cnt_try].nextra      = nextra;
+                      balance_try[cnt_try].try_balance = base_balance + excess_balance;
+                    }
+
+                  cnt_try++;
+
+                  excess_balance *= 1.25;
+                }
+
+              nextra_prev = nextra;
+            }
+
+          if(fac == 0)
+            fac = 0.01;
+          else
+            fac *= 1.25;
+
+          nextra = fac * NTask;
+        }
+
+      if(rep == 0)
+        balance_try = (balance_try_data *)Mem.mymalloc("balance_try", cnt_try * sizeof(balance_try_data));
+    }
+
+  if(NumNodes == 0)
+    determine_compute_nodes();
+
+  domain_printf("DOMAIN: we are going to try at most %d different settings for combining the domains on tasks=%d, nnodes=%d\n",
+                cnt_try, NTask, NumNodes);
+
+  /* sort the combinations such that we first try those yielding a lower imbalance */
+  mycxxsort(balance_try, balance_try + cnt_try, domain_compare_trybalance);
+
+  int start_try = 0;
+  int completed = 0;
+  int niter     = 0;
+
+  while(completed == 0 && start_try < cnt_try)
+    {
+      double glob_max_cost = 0;
+
+      if(start_try + ThisTask < cnt_try)
+        {
+          int nextra         = balance_try[start_try + ThisTask].nextra;
+          double try_balance = balance_try[start_try + ThisTask].try_balance;
+
+          int ndomains = MultipleDomains * NTask - nextra;
+
+          domain_segments_data *domainAssign =
+              (domain_segments_data *)Mem.mymalloc_clear("domainAssign", ndomains * sizeof(domain_segments_data));
+
+          /* consolidate the finely split domain pieces into larger chunks, exactly ndomains of them */
+          {
+            double max_cost       = 0;
+            double costhalfnode   = (0.5 * TotalCost) / NTopleaves;
+            double costavg        = TotalCost / ndomains;
+            double cost_before    = 0;
+            double costavg_before = 0;
+            int start             = 0;
+
+            double total_cost = 0, total_load = 0;
+
+            for(int n = 0; n < ndomains; n++)
+              {
+                int end = start;
+
+                double cost = domain_leaf_cost[end].Cost;
+
+                while((cost + cost_before + (end + 1 < NTopleaves ? domain_leaf_cost[end + 1].Cost : 0) <
+                       costavg + costavg_before + costhalfnode) ||
+                      (n == ndomains - 1 && end < NTopleaves - 1))
+                  {
+                    if((NTopleaves - end) > (ndomains - n))
+                      end++;
+                    else
+                      break;
+
+                    cost += domain_leaf_cost[end].Cost;
+                  }
+
+                domainAssign[n].start = start;
+                domainAssign[n].end   = end;
+
+                /* let's also determine the grav-cost and hydro-cost separately for each timebin of all the domain-pieces */
+                for(int no = domainAssign[n].start; no <= domainAssign[n].end; no++)
+                  {
+                    domainAssign[n].load += load[no];
+                    domainAssign[n].loadsph += loadsph[no];
+
+                    total_load += load[no] + loadsph[no];
+                    total_cost += load[no] + loadsph[no];
+
+                    for(int i = 0; i < NumTimeBinsToBeBalanced; i++)
+                      {
+                        domainAssign[n].bin_GravCost[i] += binGravCost[i * NTopleaves + no];
+                        domainAssign[n].bin_HydroCost[i] += binHydroCost[i * NTopleaves + no];
+
+                        total_cost += binGravCost[i * NTopleaves + no] + binHydroCost[i * NTopleaves + no];
+                      }
+                  }
+
+                cost_before += cost;
+                costavg_before += costavg;
+
+                start = end + 1;
+
+                if(max_cost < cost)
+                  max_cost = cost;
+              }
+
+            domain_printf("DOMAIN: total_cost=%g  total_load=%g\n", total_cost, total_load);
+          }
+
+          /* now start to map the domain pieces onto the different tasks */
+
+          struct tasklist_data
+          {
+            double bin_GravCost[TIMEBINS];
+            double bin_HydroCost[TIMEBINS];
+            double load;
+            double loadsph;
+          };
+
+          tasklist_data *tasklist = (tasklist_data *)Mem.mymalloc_clear("tasklist", NTask * sizeof(tasklist_data));
+
+          int n_cost_items = 0;
+          cost_queue_data *cost_queues[2 * TIMEBINS + 2];
+          int first_unusued_in_cost_queue[2 * TIMEBINS + 2];
+
+          for(int n = 0; n < NumTimeBinsToBeBalanced; n++)
+            {
+              if(GravCostPerListedTimeBin[n] > 0.0)
+                {
+                  cost_queues[n_cost_items] = (cost_queue_data *)Mem.mymalloc("cost_queues", ndomains * sizeof(cost_queue_data));
+                  for(int i = 0; i < ndomains; i++)
+                    {
+                      cost_queues[n_cost_items][i].value = domainAssign[i].bin_GravCost[n];
+                      cost_queues[n_cost_items][i].index = i;
+                    }
+#ifdef SIMPLE_DOMAIN_AGGREGATION
+                  domain_determinate_aggregated_value(cost_queues[n_cost_items], ndomains);
+#endif
+                  mycxxsort(cost_queues[n_cost_items], cost_queues[n_cost_items] + ndomains, domain_sort_cost_queue_data);
+                  first_unusued_in_cost_queue[n_cost_items] = 0;
+
+                  n_cost_items++;
+                }
+
+              if(HydroCostNormFactors[n] > 0.0)
+                {
+                  cost_queues[n_cost_items] = (cost_queue_data *)Mem.mymalloc("cost_queues", ndomains * sizeof(cost_queue_data));
+                  for(int i = 0; i < ndomains; i++)
+                    {
+                      cost_queues[n_cost_items][i].value = domainAssign[i].bin_HydroCost[n];
+                      cost_queues[n_cost_items][i].index = i;
+                    }
+#ifdef SIMPLE_DOMAIN_AGGREGATION
+                  domain_determinate_aggregated_value(cost_queues[n_cost_items], ndomains);
+#endif
+                  mycxxsort(cost_queues[n_cost_items], cost_queues[n_cost_items] + ndomains, domain_sort_cost_queue_data);
+                  first_unusued_in_cost_queue[n_cost_items] = 0;
+
+                  n_cost_items++;
+                }
+            }
+
+          if(NormFactorLoad > 0.0)
+            {
+              cost_queues[n_cost_items] = (cost_queue_data *)Mem.mymalloc("cost_queues", ndomains * sizeof(cost_queue_data));
+              for(int i = 0; i < ndomains; i++)
+                {
+                  cost_queues[n_cost_items][i].value = domainAssign[i].load;
+                  cost_queues[n_cost_items][i].index = i;
+                }
+#ifdef SIMPLE_DOMAIN_AGGREGATION
+              domain_determinate_aggregated_value(cost_queues[n_cost_items], ndomains);
+#endif
+              mycxxsort(cost_queues[n_cost_items], cost_queues[n_cost_items] + ndomains, domain_sort_cost_queue_data);
+              first_unusued_in_cost_queue[n_cost_items] = 0;
+
+              n_cost_items++;
+            }
+
+          if(NormFactorLoadSph > 0.0)
+            {
+              cost_queues[n_cost_items] = (cost_queue_data *)Mem.mymalloc("cost_queues", ndomains * sizeof(cost_queue_data));
+              for(int i = 0; i < ndomains; i++)
+                {
+                  cost_queues[n_cost_items][i].value = domainAssign[i].loadsph;
+                  cost_queues[n_cost_items][i].index = i;
+                }
+#ifdef SIMPLE_DOMAIN_AGGREGATION
+              domain_determinate_aggregated_value(cost_queues[n_cost_items], ndomains);
+#endif
+              mycxxsort(cost_queues[n_cost_items], cost_queues[n_cost_items] + ndomains, domain_sort_cost_queue_data);
+              first_unusued_in_cost_queue[n_cost_items] = 0;
+
+              n_cost_items++;
+            }
+
+          int nextqueue     = 0;
+          int ndomains_left = ndomains;
+          int target        = 0;
+
+          for(target = 0; target < NTask && ndomains_left > 0; target++)
+            {
+              int count    = 0;  // number of pieces added to this target task
+              int failures = 0;
+
+              while(ndomains_left > 0)
+                {
+                  int k = first_unusued_in_cost_queue[nextqueue];
+                  while(domainAssign[cost_queues[nextqueue][k].index].used)
+                    {
+                      if(k == ndomains - 1)
+                        Terminate("target=%d   nextqueue=%d  ndomains_left=%d  k == ndomains - 1", target, nextqueue, ndomains_left);
+
+                      k++;
+                      first_unusued_in_cost_queue[nextqueue]++;
+                    }
+
+                  /* this is our next candidate for adding */
+                  int n = cost_queues[nextqueue][k].index;
+
+                  nextqueue = (nextqueue + 1) % n_cost_items;
+
+                  // Let's see what imbalance we would get
+                  double max_cost = 0;
+
+                  if(max_cost < tasklist[target].load + domainAssign[n].load)
+                    max_cost = tasklist[target].load + domainAssign[n].load;
+
+                  if(max_cost < tasklist[target].loadsph + domainAssign[n].loadsph)
+                    max_cost = tasklist[target].loadsph + domainAssign[n].loadsph;
+
+                  for(int bin = 0; bin < NumTimeBinsToBeBalanced; bin++)
+                    {
+                      if(max_cost < tasklist[target].bin_GravCost[bin] + domainAssign[n].bin_GravCost[bin])
+                        max_cost = tasklist[target].bin_GravCost[bin] + domainAssign[n].bin_GravCost[bin];
+
+                      if(max_cost < tasklist[target].bin_HydroCost[bin] + domainAssign[n].bin_HydroCost[bin])
+                        max_cost = tasklist[target].bin_HydroCost[bin] + domainAssign[n].bin_HydroCost[bin];
+                    }
+
+                  if(count > 0)
+                    {
+                      if(max_cost * NTask > try_balance)
+                        {
+                          failures++;
+
+                          if(failures > n_cost_items)
+                            break;
+                          else
+                            continue;
+                        }
+                    }
+
+                  if(max_cost > glob_max_cost)
+                    glob_max_cost = max_cost;
+
+                  domainAssign[n].task = target;
+                  domainAssign[n].used = 1;
+
+                  tasklist[target].load += domainAssign[n].load;
+                  tasklist[target].loadsph += domainAssign[n].loadsph;
+                  for(int bin = 0; bin < NumTimeBinsToBeBalanced; bin++)
+                    {
+                      tasklist[target].bin_GravCost[bin] += domainAssign[n].bin_GravCost[bin];
+                      tasklist[target].bin_HydroCost[bin] += domainAssign[n].bin_HydroCost[bin];
+                    }
+
+                  ndomains_left--;
+                  count++;
+
+                  // if fewer pieces remain than tasks still to be filled, there is no point in packing further pieces onto this task
+                  if(ndomains_left < NTask - target)
+                    break;
+                }
+
+              // do an extra skip, so that we do not typically start with the same queue
+              nextqueue = (nextqueue + 1) % n_cost_items;
+            }
+
+          if(ndomains_left == 0)
+            {
+              domain_printf("DOMAIN: combining multiple-domains succeeded, target=%d  NTask=%d\n", target, NTask);
+              completed = 1;
+
+#ifdef DOMAIN_SPECIAL_CHECK
+              if(All.NumCurrentTiStep == 0 || All.NumCurrentTiStep == 2 || All.NumCurrentTiStep == 4)
+                {
+                  char buf[1000];
+                  sprintf(buf, "%s/domain_data_1_step%d_task%d.txt", All.OutputDir, All.NumCurrentTiStep, ThisTask);
+                  FILE *fd = fopen(buf, "w");
+                  fprintf(fd, "%d %d\n", ndomains, NumTimeBinsToBeBalanced);
+                  for(int n = 0; n < ndomains; n++)
+                    {
+                      fprintf(fd, "%g  ", domainAssign[n].load);
+                      for(int k = 0; k < NumTimeBinsToBeBalanced; k++)
+                        fprintf(fd, "%g  ", domainAssign[n].bin_GravCost[k]);
+                      fprintf(fd, "\n");
+                    }
+                  fclose(fd);
+                }
+              if(All.NumCurrentTiStep == 0 || All.NumCurrentTiStep == 2 || All.NumCurrentTiStep == 4)
+                {
+                  char buf[1000];
+                  sprintf(buf, "%s/domain_data_2_step%d_task%d.txt", All.OutputDir, All.NumCurrentTiStep, ThisTask);
+                  FILE *fd = fopen(buf, "w");
+                  fprintf(fd, "%d %d\n", NTask, NumTimeBinsToBeBalanced);
+                  for(int n = 0; n < NTask; n++)
+                    {
+                      fprintf(fd, "%g  ", tasklist[n].load);
+                      for(int k = 0; k < NumTimeBinsToBeBalanced; k++)
+                        fprintf(fd, "%g  ", tasklist[n].bin_GravCost[k]);
+                      fprintf(fd, "\n");
+                    }
+                  fprintf(fd, "%g\n", glob_max_cost * NTask);
+                  fclose(fd);
+                }
+#endif
+
+              /* store the mapping of the topleaves to tasks */
+              for(int n = 0; n < ndomains; n++)
+                {
+                  for(int i = domainAssign[n].start; i <= domainAssign[n].end; i++)
+                    TaskOfLeaf[i] = domainAssign[n].task;
+                }
+            }
+          else
+            {
+              glob_max_cost = MAX_DOUBLE_NUMBER;
+            }
+
+          for(int num = n_cost_items - 1; num >= 0; num--)
+            Mem.myfree(cost_queues[num]);
+
+          Mem.myfree(tasklist);
+          Mem.myfree(domainAssign);
+        }
+      else
+        glob_max_cost = MAX_DOUBLE_NUMBER; /* this processor was not doing anything */
+
+      struct
+      {
+        double cost;
+        int rank;
+      } global = {glob_max_cost, ThisTask};
+
+      MPI_Allreduce(MPI_IN_PLACE, &global, 1, MPI_DOUBLE_INT, MPI_MINLOC, Communicator);
+
+      MPI_Allreduce(MPI_IN_PLACE, &completed, 1, MPI_INT, MPI_MAX, Communicator);
+
+      niter++;
+
+      if(completed)
+        {
+          domain_printf(
+              "DOMAIN: best solution found after %d iterations by task=%d for nextra=%d, reaching maximum imbalance of %g|%g\n", niter,
+              global.rank, balance_try[start_try + global.rank].nextra, global.cost * NTask,
+              balance_try[start_try + global.rank].try_balance);
+
+          MPI_Bcast(TaskOfLeaf, NTopleaves, MPI_INT, global.rank, Communicator);
+          break;
+        }
+
+      start_try += NTask;
+    }
+
+  if(completed == 0)
+    Terminate("domain balancing failed");
+
+  Mem.myfree(balance_try);
+
+  Mem.myfree(cost_data);
+
+  double t1 = Logs.second();
+  domain_printf("DOMAIN: combining multiple-domains took %g sec\n", Logs.timediff(t0, t1));
+}
+
+#ifdef SIMPLE_DOMAIN_AGGREGATION
+template <typename partset>
+void domain<partset>::domain_determinate_aggregated_value(cost_queue_data *data, int ndomains)
+{
+  if(NumNodes < 1)
+    Terminate("NumNodes=%d\n", NumNodes);
+
+  int nbase      = ndomains / NumNodes;
+  int additional = ndomains % NumNodes;
+  int start      = 0;
+  int end        = 0;
+
+  for(int i = 0; i < NumNodes; i++)
+    {
+      start = end;
+      end   = start + nbase;
+
+      if(additional > 0)
+        {
+          end++;
+          additional--;
+        }
+
+      double aggregated_value = 0;
+      for(int n = start; n < end; n++)
+        aggregated_value += data[n].value;
+
+      for(int n = start; n < end; n++)
+        data[n].aggregated_value = aggregated_value;
+    }
+
+  if(end != ndomains)
+    Terminate("end != ndomains");
+}
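+
+/* Worked example (illustrative only): for ndomains = 10 and NumNodes = 3, the loop above
+ * forms chunks of size 4, 3, 3 (nbase = 3, with the one extra domain handed to the first
+ * node), and every domain inside a chunk receives the summed value of that chunk as its
+ * aggregated_value. */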
+#endif
+
+/** This function prepares the measurement of the total cost on each domain.
+ *  In particular, we determine how the timebins are mapped to the explicit measurements
+ *  of the gravity cost stored in the P.GravCost[] array (which in general will only be available for a subset
+ *  of all timebins). For each unmatched timebin, the measured bin that is most similar in terms of particle
+ *  number in the bin is selected. Finally, the routine also determines how often each timebin is executed in
+ *  one cycle associated with the highest occupied timebin.
+ */
+template <>
+void domain<simparticles>::domain_init_sum_cost(void)
+{
+  long long tot_count[TIMEBINS], tot_count_sph[TIMEBINS];
+
+  sumup_large_ints(TIMEBINS, Tp->TimeBinsGravity.TimeBinCount, tot_count, Communicator);
+  sumup_large_ints(TIMEBINS, Tp->TimeBinsHydro.TimeBinCount, tot_count_sph, Communicator);
+
+  for(int i = 0; i < TIMEBINS; i++)
+    {
+      domain_to_be_balanced[i] = 0;
+      domain_grav_weight[i]    = 1;
+      domain_hydro_weight[i]   = 1;
+    }
+
+  domain_to_be_balanced[All.HighestActiveTimeBin] = 1;
+  domain_grav_weight[All.HighestActiveTimeBin]    = 1;
+  domain_hydro_weight[All.HighestActiveTimeBin]   = 1;
+
+  ListOfTimeBinsToBeBalanced[0] = All.HighestActiveTimeBin;
+  NumTimeBinsToBeBalanced       = 1;
+
+  if(Mode == COLL_SUBFIND)
+    return;
+
+#ifdef HIERARCHICAL_GRAVITY
+
+  for(int j = All.HighestActiveTimeBin - 1; j >= All.LowestOccupiedTimeBin; j--)
+    {
+      if(tot_count[j] > 0 || tot_count_sph[j] > 0)
+        {
+          ListOfTimeBinsToBeBalanced[NumTimeBinsToBeBalanced++] = j;
+
+          domain_to_be_balanced[j] = 1;
+        }
+
+      domain_grav_weight[j] += 2;
+    }
+
+  for(int i = All.SmallestTimeBinWithDomainDecomposition - 1, weight = 1; i >= All.LowestOccupiedTimeBin; i--, weight *= 2)
+    {
+      if(tot_count[i] > 0)
+        {
+          domain_grav_weight[i] = weight;
+
+          for(int j = i - 1; j >= All.LowestOccupiedTimeBin; j--)
+            domain_grav_weight[j] += 2 * weight;
+        }
+
+      if(tot_count_sph[i] > 0)
+        domain_hydro_weight[i] = weight;
+    }
+
+#else
+
+  for(int i = All.SmallestTimeBinWithDomainDecomposition - 1, weight = 1; i >= All.LowestOccupiedTimeBin; i--, weight *= 2)
+    {
+      if(tot_count[i] > 0 || tot_count_sph[i] > 0)
+        {
+          ListOfTimeBinsToBeBalanced[NumTimeBinsToBeBalanced++] = i;
+          domain_to_be_balanced[i]                              = 1;
+        }
+
+      if(tot_count[i] > 0)
+        domain_grav_weight[i] = weight;
+
+      if(tot_count_sph[i] > 0)
+        domain_hydro_weight[i] = weight;
+    }
+
+#endif
+}
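+
+/* Illustrative sketch (not part of the code): in the weighting loops above (shown here for the
+ * non-hierarchical branch), the weight doubles for every step down in timebin, reflecting that a
+ * bin is executed twice as often as the bin above it during one cycle of the highest occupied bin.
+ * For example, assuming hypothetically SmallestTimeBinWithDomainDecomposition = 8 and
+ * LowestOccupiedTimeBin = 5, occupied bins 7, 6, 5 would receive gravity weights 1, 2, 4:
+ *
+ *   for(int bin = 7, weight = 1; bin >= 5; bin--, weight *= 2)
+ *     printf("bin=%d weight=%d\n", bin, weight);   // weight = 2^(7 - bin)
+ */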
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES)
+
+/*
+template <>
+double domain<lcparticles>::domain_grav_tot_costfactor(int i)
+{
+  return 1.0;
+}
+
+template <>
+double domain<lcparticles>::domain_hydro_tot_costfactor(int i)
+{
+  return 0;
+}
+*/
+/*
+template <>
+void domain<lcparticles>::domain_combine_multipledomains(void)
+{
+}
+*/
+
+template <>
+void domain<lcparticles>::domain_init_sum_cost(void)
+{
+  for(int i = 0; i < TIMEBINS; i++)
+    {
+      //   domain_bintolevel[i]     = -1;
+      //   domain_refbin[i]         = -1;
+      domain_to_be_balanced[i] = 0;
+      domain_grav_weight[i]    = 1;
+      domain_hydro_weight[i]   = 1;
+    }
+
+  domain_to_be_balanced[0] = 1;
+}
+
+#endif
+
+#include "../data/simparticles.h"
+template class domain<simparticles>;
+
+#ifdef LIGHTCONE_PARTICLES
+#include "../data/lcparticles.h"
+template class domain<lcparticles>;
+#endif
diff --git a/src/domain/domain_box.cc b/src/domain/domain_box.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2c1520c11647ad6b52bab7681ebf355a1d7b7ffc
--- /dev/null
+++ b/src/domain/domain_box.cc
@@ -0,0 +1,322 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file domain_box.cc
+ *
+ *  \brief routines for finding domain extension, random shifting, and periodic wrapping if needed
+ */
+
+#include "gadgetconfig.h"
+
+#include <mpi.h>
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../pm/pm.h"
+#include "../system/system.h"
+
+using namespace std;
+
+/*! \file domain_box.cc
+ *  \brief finds extension of particle set and/or wraps them back into the fundamental periodic box
+ */
+
+/*! This function makes sure that all particle coordinates (Pos) are
+ *  periodically mapped onto the interval [0, BoxSize].  After this function
+ *  has been called, a new domain decomposition should be done, which will
+ *  also force a new tree construction.
+ */
+template <>
+void domain<simparticles>::do_box_wrapping(void)
+{
+  if(Mode != STANDARD)
+    return;
+
+#ifdef RANDOMIZE_DOMAINCENTER
+  /* remove previous shift vector */
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+      for(int j = 0; j < 3; j++)
+        Tp->P[i].IntPos[j] -= Tp->CurrentShiftVector[j];
+
+      Tp->constrain_intpos(Tp->P[i].IntPos);
+    }
+
+  for(int j = 0; j < 3; j++)
+    Tp->CurrentShiftVector[j] = 0;
+#endif
+
+#ifndef PERIODIC
+  /* If we don't use periodic boundaries, check whether we lie outside the central 3/8 of the region chosen for the root node.
+   * This is based on the notion of using 1/4 for the initial region, leaving 1/4 as a safe buffer. Once half of this buffer
+   * is used up, we trigger an adjustment. */
+
+  MyIntPosType leftbound  = 5 * (((MyIntPosType)1) << (BITS_FOR_POSITIONS - 4));  /* 5/16 of full box length */
+  MyIntPosType rightbound = 11 * (((MyIntPosType)1) << (BITS_FOR_POSITIONS - 4)); /* 11/16 of full box length */
+
+  int flag = 0;
+  int iter = 0;
+
+  do
+    {
+      flag = 0;
+      for(int i = 0; i < Tp->NumPart; i++)
+        for(int j = 0; j < 3; j++)
+          {
+            if(Tp->P[i].IntPos[j] < leftbound)
+              flag = 1;
+
+            if(Tp->P[i].IntPos[j] > rightbound)
+              flag = 1;
+          }
+
+      MPI_Allreduce(MPI_IN_PLACE, &flag, 1, MPI_INT, MPI_MAX, Communicator);
+
+      if(flag)
+        {
+          domain_printf("DOMAIN: Simulation region has enlarged, need to adjusted mapping.\n");
+
+          for(int i = 0; i < Tp->NumPart; i++)
+            for(int j = 0; j < 3; j++)
+              Tp->P[i].IntPos[j] = (Tp->P[i].IntPos[j] >> 1) + (((MyIntPosType)1) << (BITS_FOR_POSITIONS - 2));
+
+          Tp->FacCoordToInt *= 0.5;
+          Tp->FacIntToCoord *= 2.0;
+          Tp->RegionLen *= 2.0;
+
+          for(int j = 0; j < 3; j++)
+            Tp->RegionCorner[j] = Tp->RegionCenter[j] - 0.5 * Tp->RegionLen;
+
+#if defined(PMGRID) && (!defined(PERIODIC) || defined(PLACEHIGHRESREGION))
+          /* make sure that we compute new kernels the next time we execute a non-periodic pm calculation */
+          Tp->OldMeshSize[0] = 0;
+          Tp->OldMeshSize[1] = 0;
+#endif
+
+          iter++;
+        }
+
+      if(iter > 5)
+        Terminate("too many iterations");
+    }
+  while(flag);
+
+#endif
+
+#ifdef RANDOMIZE_DOMAINCENTER
+  /* determine new shift vector */
+
+#if defined(PLACEHIGHRESREGION)
+  domain_find_type_extension();
+#endif
+
+  if(ThisTask == 0)
+    {
+#if defined(PLACEHIGHRESREGION)
+      int count = 0;
+#endif
+
+      for(int j = 0; j < 3; j++)
+        {
+          Tp->CurrentShiftVector[j] = get_random_number() * pow(2.0, 32);
+#ifdef POSITIONS_IN_64BIT
+          Tp->CurrentShiftVector[j] <<= 32;
+          Tp->CurrentShiftVector[j] += get_random_number() * pow(2.0, 32);
+#endif
+#ifdef POSITIONS_IN_128BIT
+          Tp->CurrentShiftVector[j] <<= 32;
+          Tp->CurrentShiftVector[j] += get_random_number() * pow(2.0, 32);
+          Tp->CurrentShiftVector[j] <<= 32;
+          Tp->CurrentShiftVector[j] += get_random_number() * pow(2.0, 32);
+          Tp->CurrentShiftVector[j] <<= 32;
+          Tp->CurrentShiftVector[j] += get_random_number() * pow(2.0, 32);
+#endif
+
+#if defined(PLACEHIGHRESREGION)
+          MyIntPosType boxoff   = (Tp->CurrentShiftVector[j] & Tp->PlacingMask);
+          MyIntPosType inboxoff = (Tp->CurrentShiftVector[j] - boxoff) % (Tp->PlacingBlocksize - domainInnersize);
+
+          MyIntPosType off = domainXmintot[j] + domainReferenceIntPos[j] + Tp->CurrentShiftVector[j];
+          Tp->CurrentShiftVector[j] -= off;  // now we have the high-res region aligned with the left box edge
+          Tp->CurrentShiftVector[j] += boxoff + inboxoff;
+
+          if(domain_type_extension_overlap(j))
+            {
+              domain_printf("(Tp->PlacingBlocksize - domainInnersize)=%g  %g\n",
+                            (Tp->PlacingBlocksize - domainInnersize) * Tp->FacIntToCoord, inboxoff * Tp->FacIntToCoord);
+              domain_printf("DOMAIN: Need to draw shift vector again for j=%d\n", j);
+              Terminate("we should not get here anymore\n");
+              j--;  // causes a repeat of the loop for the same index
+              count++;
+              if(count > 1000)
+                Terminate("too many repeats");
+              continue;
+            }
+#endif
+        }
+    }
+
+#ifdef GRAVITY_TALLBOX
+  Tp->CurrentShiftVector[GRAVITY_TALLBOX] = 0;
+#endif
+
+  MPI_Bcast(Tp->CurrentShiftVector, 3 * sizeof(MyIntPosType), MPI_BYTE, 0, Communicator);
+
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+      for(int j = 0; j < 3; j++)
+        Tp->P[i].IntPos[j] += Tp->CurrentShiftVector[j];
+
+      Tp->constrain_intpos(Tp->P[i].IntPos);
+    }
+
+  domain_printf("DOMAIN: New shift vector determined (%g %g %g)\n",
+                ((MySignedIntPosType)Tp->CurrentShiftVector[0]) * Tp->FacIntToCoord,
+                ((MySignedIntPosType)Tp->CurrentShiftVector[1]) * Tp->FacIntToCoord,
+                ((MySignedIntPosType)Tp->CurrentShiftVector[2]) * Tp->FacIntToCoord);
+#endif
+}
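+
+/* Minimal sketch (illustrative assumption, not part of the code): for a periodic box the
+ * unsigned integer positions wrap automatically modulo 2^BITS_FOR_POSITIONS, so applying and
+ * removing a shift vector are exact inverses of plain unsigned additions:
+ *
+ *   MyIntPosType pos = some_position, shift = some_shift;   // placeholder values
+ *   pos += shift;   // apply the shift, wraps around the box boundary
+ *   pos -= shift;   // undo it again, recovering the original position exactly
+ */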
+
+#if defined(PLACEHIGHRESREGION)
+
+template <typename partset>
+int domain<partset>::domain_type_extension_overlap(int j)
+{
+  MyIntPosType *xmintot = (MyIntPosType *)domainXmintot;
+  MyIntPosType *xmaxtot = (MyIntPosType *)domainXmaxtot;
+
+  MyIntPosType xmin = xmintot[j] + domainReferenceIntPos[j] + Tp->CurrentShiftVector[j];
+  MyIntPosType xmax = xmaxtot[j] + domainReferenceIntPos[j] + Tp->CurrentShiftVector[j];
+
+  if((xmin & Tp->PlacingMask) != (xmax & Tp->PlacingMask))
+    return 1;
+  else
+    return 0;
+}
+
+template <typename partset>
+void domain<partset>::domain_find_type_extension(void)
+{
+  /* first, find a reference coordinate by selecting an arbitrary particle in the respective region. For definiteness, we choose the
+   * first particle */
+
+  int have_high_mesh = NTask; /* default is we don't have a particle */
+
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+      if(((1 << Tp->P[i].getType()) & (PLACEHIGHRESREGION)))
+        {
+          for(int j = 0; j < 3; j++)
+            domainReferenceIntPos[j] = Tp->P[i].IntPos[j];
+
+          have_high_mesh = ThisTask;
+          break;
+        }
+    }
+
+  int have_global[2] = {have_high_mesh, ThisTask};
+
+  MPI_Allreduce(MPI_IN_PLACE, have_global, 1, MPI_2INT, MPI_MINLOC, Communicator);
+
+  if(have_global[0] >= NTask)
+    Terminate("have_global[0]=%d  >= NTask=%d: Don't we have any particle?  Note: PLACEHIGHRESREGION=%d is a bitmask", have_global[0],
+              NTask, PLACEHIGHRESREGION);
+
+  MPI_Bcast(domainReferenceIntPos, 3 * sizeof(MyIntPosType), MPI_BYTE, have_global[1], Communicator);
+
+  /* find enclosing rectangle */
+
+  MySignedIntPosType xmin[3], xmax[3];
+
+  for(int j = 0; j < 3; j++)
+    {
+      xmin[j] = 0;
+      xmax[j] = 0;
+    }
+
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+      if(((1 << Tp->P[i].getType()) & (PLACEHIGHRESREGION)))
+        {
+          MyIntPosType diff[3] = {Tp->P[i].IntPos[0] - domainReferenceIntPos[0], Tp->P[i].IntPos[1] - domainReferenceIntPos[1],
+                                  Tp->P[i].IntPos[2] - domainReferenceIntPos[2]};
+
+          MySignedIntPosType *delta = (MySignedIntPosType *)diff;
+
+          for(int j = 0; j < 3; j++)
+            {
+              if(delta[j] > xmax[j])
+                xmax[j] = delta[j];
+              if(delta[j] < xmin[j])
+                xmin[j] = delta[j];
+            }
+        }
+    }
+
+  MPI_Allreduce(xmin, domainXmintot, 3, MPI_MyIntPosType, MPI_MIN_MySignedIntPosType, Communicator);
+  MPI_Allreduce(xmax, domainXmaxtot, 3, MPI_MyIntPosType, MPI_MAX_MySignedIntPosType, Communicator);
+
+  for(int j = 0; j < 3; j++)
+    domainXmaxtot[j] += 1; /* so that all particles fulfill   xmin <= pos < xmax instead of xmin <= pos <= xmax*/
+
+  domainInnersize = domainXmaxtot[0] - domainXmintot[0];
+
+  if((MyIntPosType)(domainXmaxtot[1] - domainXmintot[1]) > domainInnersize)
+    domainInnersize = domainXmaxtot[1] - domainXmintot[1];
+
+  if((MyIntPosType)(domainXmaxtot[2] - domainXmintot[2]) > domainInnersize)
+    domainInnersize = domainXmaxtot[2] - domainXmintot[2];
+
+  domain_printf("DOMAIN: Shrink-wrap region size for PLACEHIGHRESREGION is %g\n", domainInnersize * Tp->FacIntToCoord);
+
+  if(domainInnersize * Tp->FacIntToCoord >= 0.125 * All.BoxSize)
+    Terminate("inappropriately big region selection for PLACEHIGHRESREGION");
+
+  /* increase the region by at least 1/8 of its size to still allow some randomness in placing the particles within the high-res node
+   */
+  MyIntPosType ref_size = domainInnersize + (domainInnersize >> 3);
+
+  Tp->PlacingBlocksize = 1;
+  Tp->PlacingMask      = ~((MyIntPosType)0);
+
+  for(int i = 0; i < BITS_FOR_POSITIONS; i++)
+    {
+      if(Tp->PlacingBlocksize >= ref_size)
+        break;
+
+      Tp->PlacingBlocksize <<= 1;
+      Tp->PlacingMask <<= 1;
+    }
+
+  domain_printf("DOMAIN: We enlarge this to %g    (%g times smaller than boxsize)\n", Tp->PlacingBlocksize * Tp->FacIntToCoord,
+                All.BoxSize / (Tp->PlacingBlocksize * Tp->FacIntToCoord));
+}
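+
+/* Worked example (illustrative only): the final loop rounds ref_size up to the next power of
+ * two and clears the corresponding low bits in the mask. For instance, a hypothetical ref_size
+ * of 5 integer units would give PlacingBlocksize = 8 and PlacingMask = ~7 (all bits set except
+ * the lowest three), so a block-aligned offset can be obtained with a single bitwise AND
+ * against PlacingMask, as done in do_box_wrapping(). */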
+#endif
+
+#ifdef LIGHTCONE_PARTICLES
+template <>
+void domain<lcparticles>::do_box_wrapping(void)
+{
+}
+#endif
+
+#include "../data/simparticles.h"
+template class domain<simparticles>;
+
+#ifdef LIGHTCONE_PARTICLES
+#include "../data/lcparticles.h"
+template class domain<lcparticles>;
+#endif
diff --git a/src/domain/domain_exchange.cc b/src/domain/domain_exchange.cc
new file mode 100644
index 0000000000000000000000000000000000000000..69d7c01a2fea7558f44cf61c9129d2e57b62d015
--- /dev/null
+++ b/src/domain/domain_exchange.cc
@@ -0,0 +1,893 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file domain_exchange.cc
+ *
+ *  \brief routines for moving particle data between MPI ranks
+ */
+
+#include "gadgetconfig.h"
+
+#include <mpi.h>
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../fof/fof.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../ngbtree/ngbtree.h"
+#include "../sort/cxxsort.h"
+#include "../system/system.h"
+
+/*! \file domain_exchange.cc
+ *  \brief exchanges particle data according to the new domain decomposition
+ */
+template <typename partset>
+void domain<partset>::domain_resize_storage(int count_get_total, int count_get_sph, int option_flag)
+{
+  int max_load, load       = count_get_total;
+  int max_sphload, sphload = count_get_sph;
+  MPI_Allreduce(&load, &max_load, 1, MPI_INT, MPI_MAX, Communicator);
+  MPI_Allreduce(&sphload, &max_sphload, 1, MPI_INT, MPI_MAX, Communicator);
+
+  if(max_load > (1.0 - ALLOC_TOLERANCE) * Tp->MaxPart || max_load < (1.0 - 3 * ALLOC_TOLERANCE) * Tp->MaxPart)
+    {
+      Tp->reallocate_memory_maxpart(max_load / (1.0 - 2 * ALLOC_TOLERANCE));
+
+      if(option_flag == 1)
+        domain_key = (peanokey *)Mem.myrealloc_movable(domain_key, sizeof(peanokey) * Tp->MaxPart);
+    }
+
+  if(max_sphload > (1.0 - ALLOC_TOLERANCE) * Tp->MaxPartSph || max_sphload < (1.0 - 3 * ALLOC_TOLERANCE) * Tp->MaxPartSph)
+    {
+      int maxpartsphNew = max_sphload / (1.0 - 2 * ALLOC_TOLERANCE);
+      if(option_flag == 2)
+        {
+          /*
+          if(maxpartsphNew > NgbTree.MaxPart)
+            ngb_treemodifylength(maxpartsphNew - NgbTree.MaxPart);
+          */
+          Terminate("need to reactivate this");
+        }
+      Tp->reallocate_memory_maxpartsph(maxpartsphNew);
+    }
+}
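+
+/* Illustrative note (not part of the code): the resize conditions above implement a hysteresis.
+ * Assuming, say, ALLOC_TOLERANCE = 0.1, the arrays are reallocated whenever the maximum load
+ * exceeds 90% of MaxPart or drops below 70% of it, and the new MaxPart is chosen as
+ * max_load / 0.8, so that the load sits at roughly 80% of capacity afterwards. */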
+
+/*! This function determines how many of the particles currently stored
+ *  on the local CPU have to be moved off according to the domain
+ *  decomposition.
+ */
+template <typename partset>
+void domain<partset>::domain_countToGo(int *toGoDM, int *toGoSph)
+{
+  for(int n = 0; n < NTask; n++)
+    {
+      toGoDM[n] = toGoSph[n] = 0;
+    }
+
+  for(int n = 0; n < Tp->NumPart; n++)
+    {
+      int no = n_to_no(n);
+
+      // FIXME: Why is this check to stay on local domain disabled?
+      //        if(TaskOfLeaf[no] != ThisTask)
+
+      if(Tp->P[n].getType() == 0)
+        toGoSph[TaskOfLeaf[no]]++;
+      else
+        toGoDM[TaskOfLeaf[no]]++;
+    }
+}
+
+template <typename partset>
+void domain<partset>::domain_coll_subfind_prepare_exchange(void)
+{
+#ifdef SUBFIND
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+      int task = TaskOfLeaf[n_to_no(i)];
+
+      Tp->PS[i].TargetTask  = task;
+      Tp->PS[i].TargetIndex = 0; /* unimportant here */
+    }
+#endif
+}
+
+template <typename partset>
+void domain<partset>::domain_exchange(void)
+{
+  double t0 = Logs.second();
+
+  int *toGoDM   = (int *)Mem.mymalloc_movable(&toGoDM, "toGoDM", NTask * sizeof(int));
+  int *toGoSph  = (int *)Mem.mymalloc_movable(&toGoSph, "toGoSph", NTask * sizeof(int));
+  int *toGetDM  = (int *)Mem.mymalloc_movable(&toGetDM, "toGetDM", NTask * sizeof(int));
+  int *toGetSph = (int *)Mem.mymalloc_movable(&toGetSph, "toGetSph", NTask * sizeof(int));
+
+  domain_countToGo(toGoDM, toGoSph);
+
+  int *toGo  = (int *)Mem.mymalloc("toGo", 2 * NTask * sizeof(int));
+  int *toGet = (int *)Mem.mymalloc("toGet", 2 * NTask * sizeof(int));
+
+  for(int i = 0; i < NTask; ++i)
+    {
+      toGo[2 * i]     = toGoDM[i];
+      toGo[2 * i + 1] = toGoSph[i];
+    }
+  MPI_Alltoall(toGo, 2, MPI_INT, toGet, 2, MPI_INT, Communicator);
+  for(int i = 0; i < NTask; ++i)
+    {
+      toGetDM[i]  = toGet[2 * i];
+      toGetSph[i] = toGet[2 * i + 1];
+    }
+  Mem.myfree(toGet);
+  Mem.myfree(toGo);
+
+  int count_togo_dm = 0, count_togo_sph = 0, count_get_dm = 0, count_get_sph = 0;
+  for(int i = 0; i < NTask; i++)
+    {
+      count_togo_dm += toGoDM[i];
+      count_togo_sph += toGoSph[i];
+      count_get_dm += toGetDM[i];
+      count_get_sph += toGetSph[i];
+    }
+
+  long long sumtogo = count_togo_dm;
+  sumup_longs(1, &sumtogo, &sumtogo, Communicator);
+
+  domain_printf("DOMAIN: exchange of %lld particles\n", sumtogo);
+
+  if(Tp->NumPart != count_togo_dm + count_togo_sph)
+    Terminate("NumPart != count_togo");
+
+  int *send_sph_offset = (int *)Mem.mymalloc_movable(&send_sph_offset, "send_sph_offset", NTask * sizeof(int));
+  int *send_dm_offset  = (int *)Mem.mymalloc_movable(&send_dm_offset, "send_dm_offset", NTask * sizeof(int));
+  int *recv_sph_offset = (int *)Mem.mymalloc_movable(&recv_sph_offset, "recv_sph_offset", NTask * sizeof(int));
+  int *recv_dm_offset  = (int *)Mem.mymalloc_movable(&recv_dm_offset, "recv_dm_offset", NTask * sizeof(int));
+
+  send_sph_offset[0] = send_dm_offset[0] = recv_sph_offset[0] = recv_dm_offset[0] = 0;
+  for(int i = 1; i < NTask; i++)
+    {
+      send_sph_offset[i] = send_sph_offset[i - 1] + toGoSph[i - 1];
+      send_dm_offset[i]  = send_dm_offset[i - 1] + toGoDM[i - 1];
+
+      recv_sph_offset[i] = recv_sph_offset[i - 1] + toGetSph[i - 1];
+      recv_dm_offset[i]  = recv_dm_offset[i - 1] + toGetDM[i - 1];
+    }
+
+  for(int i = 0; i < NTask; i++)
+    {
+      send_dm_offset[i] += count_togo_sph;
+      recv_dm_offset[i] += count_get_sph;
+    }
+
+  pdata *partBuf =
+      (typename partset::pdata *)Mem.mymalloc_movable_clear(&partBuf, "partBuf", (count_togo_dm + count_togo_sph) * sizeof(pdata));
+  sph_particle_data *sphBuf =
+      (sph_particle_data *)Mem.mymalloc_movable_clear(&sphBuf, "sphBuf", count_togo_sph * sizeof(sph_particle_data));
+  peanokey *keyBuf = (peanokey *)Mem.mymalloc_movable_clear(&keyBuf, "keyBuf", (count_togo_dm + count_togo_sph) * sizeof(peanokey));
+
+  for(int i = 0; i < NTask; i++)
+    toGoSph[i] = toGoDM[i] = 0;
+
+  for(int n = 0; n < Tp->NumPart; n++)
+    {
+      int off, num;
+      int task = TaskOfLeaf[n_to_no(n)];
+
+      if(Tp->P[n].getType() == 0)
+        {
+          num = toGoSph[task]++;
+
+          off         = send_sph_offset[task] + num;
+          sphBuf[off] = Tp->SphP[n];
+        }
+      else
+        {
+          num = toGoDM[task]++;
+
+          off = send_dm_offset[task] + num;
+        }
+
+      partBuf[off] = Tp->P[n];
+      keyBuf[off]  = domain_key[n];
+    }
+
+  /**** now resize the storage for the P[] and SphP[] arrays if needed ****/
+  domain_resize_storage(count_get_dm + count_get_sph, count_get_sph, 1);
+
+  /*****  space has been created, now we can do the actual exchange *****/
+
+  /* produce a flag if any of the send sizes is above our transfer limit; in this case we will
+   * transfer the data in chunks.
+   */
+
+  int flag_big = 0, flag_big_all;
+  for(int i = 0; i < NTask; i++)
+    {
+      if(toGoSph[i] * sizeof(sph_particle_data) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
+        flag_big = 1;
+
+      if(std::max<int>(toGoSph[i], toGoDM[i]) * sizeof(typename partset::pdata) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
+        flag_big = 1;
+    }
+
+  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
+
+#if 1
+#ifdef USE_MPIALLTOALLV_IN_DOMAINDECOMP
+  int method = 0;
+#else
+#ifndef ISEND_IRECV_IN_DOMAIN /* synchronous communication */
+  int method = 1;
+#else
+  int method = 2; /* asynchronous communication */
+#endif
+#endif
+  MPI_Datatype tp;
+  MPI_Type_contiguous(sizeof(typename partset::pdata), MPI_CHAR, &tp);
+  MPI_Type_commit(&tp);
+  myMPI_Alltoallv_new(partBuf, toGoSph, send_sph_offset, tp, Tp->P, toGetSph, recv_sph_offset, tp, Communicator, method);
+  myMPI_Alltoallv_new(partBuf, toGoDM, send_dm_offset, tp, Tp->P, toGetDM, recv_dm_offset, tp, Communicator, method);
+  MPI_Type_free(&tp);
+  MPI_Type_contiguous(sizeof(sph_particle_data), MPI_CHAR, &tp);
+  MPI_Type_commit(&tp);
+  myMPI_Alltoallv_new(sphBuf, toGoSph, send_sph_offset, tp, Tp->SphP, toGetSph, recv_sph_offset, tp, Communicator, method);
+  MPI_Type_free(&tp);
+  MPI_Type_contiguous(sizeof(peanokey), MPI_CHAR, &tp);
+  MPI_Type_commit(&tp);
+  myMPI_Alltoallv_new(keyBuf, toGoSph, send_sph_offset, tp, domain_key, toGetSph, recv_sph_offset, tp, Communicator, method);
+  myMPI_Alltoallv_new(keyBuf, toGoDM, send_dm_offset, tp, domain_key, toGetDM, recv_dm_offset, tp, Communicator, method);
+  MPI_Type_free(&tp);
+#else
+  my_int_MPI_Alltoallv(partBuf, toGoSph, send_sph_offset, P, toGetSph, recv_sph_offset, sizeof(pdata), flag_big_all, Communicator);
+
+  my_int_MPI_Alltoallv(sphBuf, toGoSph, send_sph_offset, SphP, toGetSph, recv_sph_offset, sizeof(sph_particle_data), flag_big_all,
+                       Communicator);
+
+  my_int_MPI_Alltoallv(keyBuf, toGoSph, send_sph_offset, domain_key, toGetSph, recv_sph_offset, sizeof(peanokey), flag_big_all,
+                       Communicator);
+
+  my_int_MPI_Alltoallv(partBuf, toGoDM, send_dm_offset, P, toGetDM, recv_dm_offset, sizeof(pdata), flag_big_all, Communicator);
+
+  my_int_MPI_Alltoallv(keyBuf, toGoDM, send_dm_offset, domain_key, toGetDM, recv_dm_offset, sizeof(peanokey), flag_big_all,
+                       Communicator);
+#endif
+
+  Tp->NumPart = count_get_dm + count_get_sph;
+  Tp->NumGas  = count_get_sph;
+
+  Mem.myfree(keyBuf);
+  Mem.myfree(sphBuf);
+  Mem.myfree(partBuf);
+
+  Mem.myfree(recv_dm_offset);
+  Mem.myfree(recv_sph_offset);
+  Mem.myfree(send_dm_offset);
+  Mem.myfree(send_sph_offset);
+
+  Mem.myfree(toGetSph);
+  Mem.myfree(toGetDM);
+  Mem.myfree(toGoSph);
+  Mem.myfree(toGoDM);
+
+  double t1 = Logs.second();
+
+  domain_printf("DOMAIN: particle exchange done. (took %g sec)\n", Logs.timediff(t0, t1));
+}
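+
+/* Note (illustrative sketch, not part of the code): wrapping each struct in an MPI datatype via
+ * MPI_Type_contiguous over MPI_CHAR, as done above, lets the Alltoallv counts and offsets stay
+ * in units of particles rather than bytes. The general pattern, with my_struct as a placeholder
+ * type:
+ *
+ *   MPI_Datatype tp;
+ *   MPI_Type_contiguous(sizeof(my_struct), MPI_CHAR, &tp);
+ *   MPI_Type_commit(&tp);
+ *   // ... use tp as the element type in MPI_Alltoallv / MPI_Sendrecv ...
+ *   MPI_Type_free(&tp);
+ */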
+
+template <typename partset>
+void domain<partset>::peano_hilbert_order(peanokey *key)
+{
+  mpi_printf("PEANO: Begin Peano-Hilbert order...\n");
+  double t0 = Logs.second();
+
+  if(Tp->NumGas)
+    {
+      peano_hilbert_data *pmp = (peano_hilbert_data *)Mem.mymalloc("pmp", sizeof(peano_hilbert_data) * Tp->NumGas);
+      int *Id                 = (int *)Mem.mymalloc("Id", sizeof(int) * Tp->NumGas);
+
+      for(int i = 0; i < Tp->NumGas; i++)
+        {
+          pmp[i].index = i;
+          pmp[i].key   = key[i];
+        }
+
+      mycxxsort(pmp, pmp + Tp->NumGas, compare_peano_hilbert_data);
+
+      for(int i = 0; i < Tp->NumGas; i++)
+        Id[pmp[i].index] = i;
+
+      reorder_gas(Id);
+
+      Mem.myfree(Id);
+      Mem.myfree(pmp);
+    }
+
+  if(Tp->NumPart - Tp->NumGas > 0)
+    {
+      peano_hilbert_data *pmp = (peano_hilbert_data *)Mem.mymalloc("pmp", sizeof(peano_hilbert_data) * (Tp->NumPart - Tp->NumGas));
+      int *Id                 = (int *)Mem.mymalloc("Id", sizeof(int) * (Tp->NumPart - Tp->NumGas));
+
+      for(int i = Tp->NumGas; i < Tp->NumPart; i++)
+        {
+          pmp[i - Tp->NumGas].index = i;
+          pmp[i - Tp->NumGas].key   = key[i];
+        }
+
+      mycxxsort(pmp, pmp + Tp->NumPart - Tp->NumGas, compare_peano_hilbert_data);
+
+      for(int i = Tp->NumGas; i < Tp->NumPart; i++)
+        Id[pmp[i - Tp->NumGas].index - Tp->NumGas] = i;
+
+      reorder_particles(Id - Tp->NumGas, Tp->NumGas, Tp->NumPart);
+
+      Mem.myfree(Id);
+      Mem.myfree(pmp);
+    }
+
+  mpi_printf("PEANO: done, took %g sec.\n", Logs.timediff(t0, Logs.second()));
+}
+
+template <typename partset>
+void domain<partset>::reorder_gas(int *Id)
+{
+  for(int i = 0; i < Tp->NumGas; i++)
+    {
+      if(Id[i] != i)
+        {
+          pdata Psource                = Tp->P[i];
+          sph_particle_data SphPsource = Tp->SphP[i];
+
+          int idsource = Id[i];
+          int dest     = Id[i];
+
+          do
+            {
+              pdata Psave                = Tp->P[dest];
+              sph_particle_data SphPsave = Tp->SphP[dest];
+              int idsave                 = Id[dest];
+
+              Tp->P[dest]    = Psource;
+              Tp->SphP[dest] = SphPsource;
+              Id[dest]       = idsource;
+
+              if(dest == i)
+                break;
+
+              Psource    = Psave;
+              SphPsource = SphPsave;
+              idsource   = idsave;
+
+              dest = idsource;
+            }
+          while(1);
+        }
+    }
+}
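+
+/* The reorder_* routines above apply the permutation encoded in Id[] (element i moves to slot
+ * Id[i]) in place by following permutation cycles: the displaced element is carried along until
+ * the cycle closes at the starting index, so no second full-size buffer is needed. A minimal
+ * standalone sketch of the same idea (illustrative only, std::swap from <utility>):
+ *
+ *   void apply_permutation(int *val, int *Id, int n)
+ *   {
+ *     for(int i = 0; i < n; i++)
+ *       while(Id[i] != i)   // rotate the cycle that passes through i
+ *         {
+ *           std::swap(val[i], val[Id[i]]);
+ *           std::swap(Id[i], Id[Id[i]]);
+ *         }
+ *   }
+ */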
+
+template <typename partset>
+void domain<partset>::reorder_particles(int *Id, int Nstart, int N)
+{
+  for(int i = Nstart; i < N; i++)
+    {
+      if(Id[i] != i)
+        {
+          pdata Psource = Tp->P[i];
+          int idsource  = Id[i];
+
+          int dest = Id[i];
+
+          do
+            {
+              pdata Psave = Tp->P[dest];
+              int idsave  = Id[dest];
+
+              Tp->P[dest] = Psource;
+              Id[dest]    = idsource;
+
+              if(dest == i)
+                break;
+
+              Psource  = Psave;
+              idsource = idsave;
+
+              dest = idsource;
+            }
+          while(1);
+        }
+    }
+}
+
+template <typename partset>
+void domain<partset>::reorder_PS(int *Id, int Nstart, int N)
+{
+  for(int i = Nstart; i < N; i++)
+    {
+      if(Id[i] != i)
+        {
+          subfind_data PSsource = Tp->PS[i];
+
+          int idsource = Id[i];
+          int dest     = Id[i];
+
+          do
+            {
+              subfind_data PSsave = Tp->PS[dest];
+              int idsave          = Id[dest];
+
+              Tp->PS[dest] = PSsource;
+              Id[dest]     = idsource;
+
+              if(dest == i)
+                break;
+
+              PSsource = PSsave;
+              idsource = idsave;
+
+              dest = idsource;
+            }
+          while(1);
+        }
+    }
+}
+
+template <typename partset>
+void domain<partset>::reorder_P_and_PS(int *Id)
+{
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+      if(Id[i] != i)
+        {
+          pdata Psource         = Tp->P[i];
+          subfind_data PSsource = Tp->PS[i];
+
+          int idsource = Id[i];
+          int dest     = Id[i];
+
+          do
+            {
+              pdata Psave         = Tp->P[dest];
+              subfind_data PSsave = Tp->PS[dest];
+              int idsave          = Id[dest];
+
+              Tp->P[dest]  = Psource;
+              Tp->PS[dest] = PSsource;
+              Id[dest]     = idsource;
+
+              if(dest == i)
+                break;
+
+              Psource  = Psave;
+              PSsource = PSsave;
+              idsource = idsave;
+
+              dest = idsource;
+            }
+          while(1);
+        }
+    }
+}
+
+template <typename partset>
+void domain<partset>::reorder_P_PS(int loc_numgas, int loc_numpart)
+{
+  local_sort_data *mp = (local_sort_data *)Mem.mymalloc("mp", sizeof(local_sort_data) * (loc_numpart - loc_numgas));
+  mp -= loc_numgas;
+
+  int *Id = (int *)Mem.mymalloc("Id", sizeof(int) * (loc_numpart - loc_numgas));
+  Id -= loc_numgas;
+
+  for(int i = loc_numgas; i < loc_numpart; i++)
+    {
+      mp[i].index       = i;
+      mp[i].targetindex = Tp->PS[i].TargetIndex;
+    }
+
+  mycxxsort(mp + loc_numgas, mp + loc_numpart, compare_local_sort_data_targetindex);
+
+  for(int i = loc_numgas; i < loc_numpart; i++)
+    Id[mp[i].index] = i;
+
+  reorder_particles(Id, loc_numgas, loc_numpart);
+
+  for(int i = loc_numgas; i < loc_numpart; i++)
+    Id[mp[i].index] = i;
+
+  reorder_PS(Id, loc_numgas, loc_numpart);
+
+  Id += loc_numgas;
+  Mem.myfree(Id);
+  mp += loc_numgas;
+  Mem.myfree(mp);
+}
+
+/* This function redistributes the particles according to what is stored in
+ * PS[].TargetTask and PS[].TargetIndex.
+ */
+template <typename partset>
+void domain<partset>::particle_exchange_based_on_PS(MPI_Comm Communicator)
+{
+  int CommThisTask, CommNTask, CommPTask;
+  MPI_Comm_size(Communicator, &CommNTask);
+  MPI_Comm_rank(Communicator, &CommThisTask);
+
+  for(CommPTask = 0; CommNTask > (1 << CommPTask); CommPTask++)
+    ;
+
+  int *Send_count  = (int *)Mem.mymalloc_movable(&Send_count, "Send_count", sizeof(int) * CommNTask);
+  int *Send_offset = (int *)Mem.mymalloc_movable(&Send_offset, "Send_offset", sizeof(int) * CommNTask);
+  int *Recv_count  = (int *)Mem.mymalloc_movable(&Recv_count, "Recv_count", sizeof(int) * CommNTask);
+  int *Recv_offset = (int *)Mem.mymalloc_movable(&Recv_offset, "Recv_offset", sizeof(int) * CommNTask);
+  int nimport = 0, nexport = 0, nstay = 0, nlocal = 0;
+
+  /* for type_select == 0, we process gas particles, otherwise all other particles */
+  for(int type_select = 0; type_select < 2; type_select++)
+    {
+      /* In order to be able to later distribute the PS[] array, we need to temporarily save the particle type array,
+       * and save the old particle number
+       */
+
+      unsigned char *Ptype = (unsigned char *)Mem.mymalloc_movable(&Ptype, "Ptype", sizeof(unsigned char) * Tp->NumPart);
+      int *Ptask           = (int *)Mem.mymalloc_movable(&Ptask, "Ptask", sizeof(int) * Tp->NumPart);
+
+      for(int i = 0; i < Tp->NumPart; i++)
+        {
+          Ptype[i] = Tp->P[i].getType();
+          Ptask[i] = Tp->PS[i].TargetTask;
+
+          if(Ptype[i] == 0 && i >= Tp->NumGas)
+            Terminate("Bummer1");
+
+          if(Ptype[i] != 0 && i < Tp->NumGas)
+            Terminate("Bummer2");
+        }
+
+      int NumPart_saved = Tp->NumPart;
+
+      /* distribute gas particles up front */
+      if(type_select == 0)
+        {
+          sph_particle_data *sphBuf = NULL;
+
+          for(int rep = 0; rep < 2; rep++)
+            {
+              for(int n = 0; n < CommNTask; n++)
+                Send_count[n] = 0;
+
+              nstay = 0;
+
+              for(int n = 0; n < Tp->NumGas; n++)
+                {
+                  int target = Ptask[n];
+
+                  if(rep == 0)
+                    {
+                      if(target != CommThisTask)
+                        Send_count[target]++;
+                      else
+                        nstay++;
+                    }
+                  else
+                    {
+                      if(target != CommThisTask)
+                        sphBuf[Send_offset[target] + Send_count[target]++] = Tp->SphP[n];
+                      else
+                        Tp->SphP[nstay++] = Tp->SphP[n];
+                    }
+                }
+
+              if(rep == 0)
+                {
+                  MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
+
+                  nimport = 0, nexport = 0;
+                  Recv_offset[0] = Send_offset[0] = 0;
+                  for(int j = 0; j < CommNTask; j++)
+                    {
+                      nexport += Send_count[j];
+                      nimport += Recv_count[j];
+
+                      if(j > 0)
+                        {
+                          Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
+                          Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
+                        }
+                    }
+
+                  sphBuf = (sph_particle_data *)Mem.mymalloc_movable(&sphBuf, "sphBuf", nexport * sizeof(sph_particle_data));
+                }
+              else
+                {
+                  Tp->NumGas += (nimport - nexport);
+
+                  int max_loadsph = Tp->NumGas;
+                  MPI_Allreduce(MPI_IN_PLACE, &max_loadsph, 1, MPI_INT, MPI_MAX, Communicator);
+
+                  if(max_loadsph > (1.0 - ALLOC_TOLERANCE) * Tp->MaxPartSph ||
+                     max_loadsph < (1.0 - 3 * ALLOC_TOLERANCE) * Tp->MaxPartSph)
+                    Tp->reallocate_memory_maxpartsph(max_loadsph / (1.0 - 2 * ALLOC_TOLERANCE));
+
+                  for(int ngrp = 1; ngrp < (1 << CommPTask); ngrp++)
+                    {
+                      int target = CommThisTask ^ ngrp;
+
+                      if(target < CommNTask)
+                        {
+                          if(Send_count[target] > 0 || Recv_count[target] > 0)
+                            {
+                              MPI_Sendrecv(sphBuf + Send_offset[target], Send_count[target] * sizeof(sph_particle_data), MPI_BYTE,
+                                           target, TAG_SPHDATA, Tp->SphP + Recv_offset[target] + nstay,
+                                           Recv_count[target] * sizeof(sph_particle_data), MPI_BYTE, target, TAG_SPHDATA, Communicator,
+                                           MPI_STATUS_IGNORE);
+                            }
+                        }
+                    }
+
+                  Mem.myfree(sphBuf);
+                }
+            }
+        }
+
+      pdata *partBuf = NULL;
+
+      for(int rep = 0; rep < 2; rep++)
+        {
+          for(int n = 0; n < CommNTask; n++)
+            Send_count[n] = 0;
+
+          nstay  = 0;
+          nlocal = 0;
+
+          for(int n = 0; n < NumPart_saved; n++)
+            {
+              if(Ptype[n] == type_select || (type_select != 0))
+                {
+                  int target = Ptask[n];
+
+                  if(rep == 0)
+                    {
+                      if(target != CommThisTask)
+                        Send_count[target]++;
+                      else
+                        {
+                          nstay++;
+                          nlocal++;
+                        }
+                    }
+                  else
+                    {
+                      if(target != CommThisTask)
+                        partBuf[Send_offset[target] + Send_count[target]++] = Tp->P[n];
+                      else
+                        {
+                          Tp->P[nstay++] = Tp->P[n];
+                          nlocal++;
+                        }
+                    }
+                }
+              else
+                {
+                  // this is only relevant for type_select == 0
+                  if(rep == 0)
+                    nstay++;
+                  else
+                    Tp->P[nstay++] = Tp->P[n];
+                }
+            }
+
+          if(rep == 0)
+            {
+              MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
+
+              nimport = 0, nexport = 0;
+              Recv_offset[0] = Send_offset[0] = 0;
+              for(int j = 0; j < CommNTask; j++)
+                {
+                  nexport += Send_count[j];
+                  nimport += Recv_count[j];
+
+                  if(j > 0)
+                    {
+                      Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
+                      Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
+                    }
+                }
+
+              partBuf = (pdata *)Mem.mymalloc_movable(&partBuf, "partBuf", nexport * sizeof(pdata));
+            }
+          else
+            {
+              Tp->NumPart += (nimport - nexport);
+
+              int max_load = Tp->NumPart;
+              MPI_Allreduce(MPI_IN_PLACE, &max_load, 1, MPI_INT, MPI_MAX, Communicator);
+
+              if(max_load > (1.0 - ALLOC_TOLERANCE) * Tp->MaxPart || max_load < (1.0 - 3 * ALLOC_TOLERANCE) * Tp->MaxPart)
+                Tp->reallocate_memory_maxpart(max_load / (1.0 - 2 * ALLOC_TOLERANCE));
+
+              if(type_select == 0)
+                {
+                  // create a gap to place the incoming particles at the end of the already present gas particles
+                  memmove(static_cast<void *>(Tp->P + nlocal + nimport), static_cast<void *>(Tp->P + nlocal),
+                          (nstay - nlocal) * sizeof(pdata));
+                }
+
+              for(int ngrp = 1; ngrp < (1 << CommPTask); ngrp++)
+                {
+                  int target = CommThisTask ^ ngrp;
+
+                  if(target < CommNTask)
+                    {
+                      if(Send_count[target] > 0 || Recv_count[target] > 0)
+                        {
+                          MPI_Sendrecv(partBuf + Send_offset[target], Send_count[target] * sizeof(pdata), MPI_BYTE, target, TAG_PDATA,
+                                       Tp->P + Recv_offset[target] + nlocal, Recv_count[target] * sizeof(pdata), MPI_BYTE, target,
+                                       TAG_PDATA, Communicator, MPI_STATUS_IGNORE);
+                        }
+                    }
+                }
+
+              Mem.myfree(partBuf);
+            }
+        }
+
+      /* now deal with subfind data */
+
+      subfind_data *subBuf = NULL;
+      for(int rep = 0; rep < 2; rep++)
+        {
+          for(int n = 0; n < CommNTask; n++)
+            Send_count[n] = 0;
+
+          nstay  = 0;
+          nlocal = 0;
+
+          for(int n = 0; n < NumPart_saved; n++)
+            {
+              if(Ptype[n] == type_select || (type_select != 0))
+                {
+                  int target = Ptask[n];
+
+                  if(rep == 0)
+                    {
+                      if(target != CommThisTask)
+                        Send_count[target]++;
+                      else
+                        {
+                          nstay++;
+                          nlocal++;
+                        }
+                    }
+                  else
+                    {
+                      if(target != CommThisTask)
+                        subBuf[Send_offset[target] + Send_count[target]++] = Tp->PS[n];
+                      else
+                        {
+                          Tp->PS[nstay++] = Tp->PS[n];
+                          nlocal++;
+                        }
+                    }
+                }
+              else
+                {
+                  // this is only relevant for type_select == 0
+                  if(rep == 0)
+                    nstay++;
+                  else
+                    Tp->PS[nstay++] = Tp->PS[n];
+                }
+            }
+
+          if(rep == 0)
+            {
+              MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
+
+              nimport = 0, nexport = 0;
+              Recv_offset[0] = Send_offset[0] = 0;
+              for(int j = 0; j < CommNTask; j++)
+                {
+                  nexport += Send_count[j];
+                  nimport += Recv_count[j];
+
+                  if(j > 0)
+                    {
+                      Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
+                      Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
+                    }
+                }
+
+              subBuf = (subfind_data *)Mem.mymalloc_movable(&subBuf, "subBuf", nexport * sizeof(subfind_data));
+            }
+          else
+            {
+              /* reallocate with new particle number */
+              Tp->PS = (subfind_data *)Mem.myrealloc_movable(Tp->PS, Tp->NumPart * sizeof(subfind_data));
+
+              if(type_select == 0)
+                {
+                  // create a gap to place the incoming particles at the end of the already present gas particles
+                  memmove(Tp->PS + nlocal + nimport, Tp->PS + nlocal, (nstay - nlocal) * sizeof(subfind_data));
+                }
+
+              for(int ngrp = 1; ngrp < (1 << CommPTask); ngrp++)
+                {
+                  int target = CommThisTask ^ ngrp;
+
+                  if(target < CommNTask)
+                    {
+                      if(Send_count[target] > 0 || Recv_count[target] > 0)
+                        {
+                          MPI_Sendrecv(subBuf + Send_offset[target], Send_count[target] * sizeof(subfind_data), MPI_BYTE, target,
+                                       TAG_KEY, Tp->PS + Recv_offset[target] + nlocal, Recv_count[target] * sizeof(subfind_data),
+                                       MPI_BYTE, target, TAG_KEY, Communicator, MPI_STATUS_IGNORE);
+                        }
+                    }
+                }
+
+              Mem.myfree(subBuf);
+            }
+        }
+
+      Mem.myfree(Ptask);
+      Mem.myfree(Ptype);
+    }
+
+  Mem.myfree(Recv_offset);
+  Mem.myfree(Recv_count);
+  Mem.myfree(Send_offset);
+  Mem.myfree(Send_count);
+
+  /* finally, let's also address the desired local order according to PS[].TargetIndex */
+
+  if(Tp->NumGas)
+    {
+      local_sort_data *mp = (local_sort_data *)Mem.mymalloc("mp", sizeof(local_sort_data) * Tp->NumGas);
+      int *Id             = (int *)Mem.mymalloc("Id", sizeof(int) * Tp->NumGas);
+
+      for(int i = 0; i < Tp->NumGas; i++)
+        {
+          mp[i].index       = i;
+          mp[i].targetindex = Tp->PS[i].TargetIndex;
+        }
+
+      mycxxsort(mp, mp + Tp->NumGas, compare_local_sort_data_targetindex);
+
+      for(int i = 0; i < Tp->NumGas; i++)
+        Id[mp[i].index] = i;
+
+      reorder_gas(Id);
+
+      for(int i = 0; i < Tp->NumGas; i++)
+        Id[mp[i].index] = i;
+
+      reorder_PS(Id, 0, Tp->NumGas);
+
+      Mem.myfree(Id);
+      Mem.myfree(mp);
+    }
+
+  if(Tp->NumPart - Tp->NumGas > 0)
+    {
+      reorder_P_PS(Tp->NumGas, Tp->NumPart);
+    }
+}
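+
+/* Note (illustrative, not part of the code): the exchange loops above use the standard hypercube
+ * pairing, where in round ngrp each rank talks to partner rank ^ ngrp. Every pair of ranks meets
+ * exactly once (in the round ngrp equal to the XOR of the two ranks), and partners >= CommNTask
+ * are simply skipped when the task count is not a power of two:
+ *
+ *   for(int ngrp = 1; ngrp < (1 << ptask); ngrp++)
+ *     {
+ *       int partner = myrank ^ ngrp;
+ *       if(partner < ntask)
+ *         ;  // pairwise MPI_Sendrecv with 'partner' goes here
+ *     }
+ */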
+
+#include "../data/simparticles.h"
+template class domain<simparticles>;
+
+#ifdef LIGHTCONE_PARTICLES
+#include "../data/lcparticles.h"
+template class domain<lcparticles>;
+#endif
diff --git a/src/domain/domain_toplevel.cc b/src/domain/domain_toplevel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..062b8f4c330e78fc49260fd5d673ac013a33e577
--- /dev/null
+++ b/src/domain/domain_toplevel.cc
@@ -0,0 +1,301 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file domain_toplevel.cc
+ *
+ *  \brief construction of the top-level tree for subdividing the volume into domains
+ */
+
+#include "gadgetconfig.h"
+
+#include <mpi.h>
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/cxxsort.h"
+#include "../sort/peano.h"
+#include "../system/system.h"
+
+template <typename partset>
+void domain<partset>::domain_do_local_refine(int n, int *list)
+{
+  double *worklist     = (double *)Mem.mymalloc("worklist", 8 * n * sizeof(double));
+  long long *countlist = (long long *)Mem.mymalloc("countlist", 8 * n * sizeof(long long));
+
+  /* create the new nodes */
+  for(int k = 0; k < n; k++)
+    {
+      int i                = list[k];
+      TopNodes[i].Daughter = NTopnodes;
+      NTopnodes += 8;
+      NTopleaves += 7;
+
+      for(int j = 0; j < 8; j++)
+        {
+          int sub = TopNodes[i].Daughter + j;
+
+          TopNodes[sub].Daughter = -1;
+          topNodes[sub].Level    = topNodes[i].Level + 1;
+          topNodes[sub].StartKey = topNodes[i].StartKey + get_peanokey_offset(j, (3 * (BITS_FOR_POSITIONS - topNodes[sub].Level)));
+          topNodes[sub].PIndex   = topNodes[i].PIndex;
+          topNodes[sub].Count    = 0;
+          topNodes[sub].Cost     = 0;
+        }
+
+      int sub = TopNodes[i].Daughter;
+
+      for(int p = topNodes[i].PIndex, j = 0; p < topNodes[i].PIndex + topNodes[i].Count; p++)
+        {
+          if(j < 7)
+            while(mp[p].key >= topNodes[sub + 1].StartKey)
+              {
+                j++;
+                sub++;
+                topNodes[sub].PIndex = p;
+                if(j >= 7)
+                  break;
+              }
+
+          topNodes[sub].Cost += mp[p].cost;
+          topNodes[sub].Count++;
+        }
+
+      for(int j = 0; j < 8; j++)
+        {
+          int sub              = TopNodes[i].Daughter + j;
+          worklist[k * 8 + j]  = topNodes[sub].Cost;
+          countlist[k * 8 + j] = topNodes[sub].Count;
+        }
+    }
+
+  allreduce_sum<double>(worklist, 8 * n, Communicator);
+  allreduce_sum<long long>(countlist, 8 * n, Communicator);
+  /*
+  MPI_Allreduce(MPI_IN_PLACE, worklist, 8 * n, MPI_DOUBLE, MPI_SUM, Communicator);
+  MPI_Allreduce(MPI_IN_PLACE, countlist, 8 * n, MPI_LONG_LONG, MPI_SUM, Communicator);
+*/
+
+  /* store the results in the corresponding top nodes */
+
+  for(int k = 0; k < n; k++)
+    {
+      int i = list[k];
+
+      for(int j = 0; j < 8; j++)
+        {
+          int sub                = TopNodes[i].Daughter + j;
+          topNodes[sub].Cost     = worklist[k * 8 + j];
+          topNodes[sub].CountTot = countlist[k * 8 + j];
+        }
+    }
+
+  Mem.myfree(countlist);
+  Mem.myfree(worklist);
+}
+
+/*! This function walks the global top tree in order to establish the
+ *  number of leaves it has, and to assign the leaf numbers along the
+ *  Peano-Hilbert curve. These leaves are later combined into domain pieces,
+ *  which are distributed to different processors.
+ */
+template <typename partset>
+void domain<partset>::domain_walktoptree(int no)
+{
+  if(TopNodes[no].Daughter == -1)
+    {
+      TopNodes[no].Leaf = NTopleaves;
+
+      domain_leaf_cost[TopNodes[no].Leaf].Cost = topNodes[no].Cost;
+
+      NTopleaves++;
+    }
+  else
+    {
+      for(int i = 0; i < 8; i++)
+        domain_walktoptree(TopNodes[no].Daughter + i);
+    }
+}
+
+template <>
+double domain<simparticles>::domain_get_cost_summed_over_timebins(int i)
+{
+  double cost = 0;
+
+  for(int n = 0; n < NumTimeBinsToBeBalanced; n++)
+    {
+      int bin = ListOfTimeBinsToBeBalanced[n];
+
+      if(bin >= Tp->P[i].TimeBinGrav)
+        cost += GravCostNormFactors[n] * Tp->P[i].GravCost;
+
+      if(Tp->P[i].getType() == 0)
+        {
+#ifdef SUBFIND
+          if(Mode == COLL_SUBFIND)
+            {
+              if(Tp->PS[i].DomainFlag)
+                cost += HydroCostNormFactors[n];
+            }
+          else
+#endif
+            {
+              if(bin >= Tp->P[i].getTimeBinHydro())
+                cost += HydroCostNormFactors[n];
+            }
+        }
+    }
+
+  return cost;
+}
+
+#ifdef LIGHTCONE_PARTICLES
+template <>
+double domain<lcparticles>::domain_get_cost_summed_over_timebins(int i)
+{
+  return 0;
+}
+#endif
+
+/*! This function constructs the global top-level tree that is used
+ *  for the domain decomposition. This is done by considering the string of
+ *  Peano-Hilbert keys of all particles, which is recursively chopped
+ *  into pieces of eight segments until the estimated cost of each segment
+ *  falls below a prescribed limit (or a segment contains at most one particle).
+ */
+template <typename partset>
+void domain<partset>::domain_determineTopTree(void)
+{
+  double t0           = Logs.second();
+  int message_printed = 0;
+
+  mp = (domain_peano_hilbert_data *)Mem.mymalloc_movable(&mp, "mp", sizeof(domain_peano_hilbert_data) * Tp->NumPart);
+
+  int count  = 0;
+  double sum = 0.0;
+
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+      mp[i].key = domain_key[i] = peano_hilbert_key(Tp->P[i].IntPos[0], Tp->P[i].IntPos[1], Tp->P[i].IntPos[2], BITS_FOR_POSITIONS);
+      mp[i].cost                = 0;
+      count++;
+
+      mp[i].cost += domain_get_cost_summed_over_timebins(i);
+
+      mp[i].cost += NormFactorLoad;
+
+      if(Tp->P[i].getType() == 0)
+        mp[i].cost += NormFactorLoadSph;
+
+      sum += mp[i].cost;
+    }
+
+  MPI_Allreduce(MPI_IN_PLACE, &sum, 1, MPI_DOUBLE, MPI_SUM, Communicator);
+  domain_printf("DOMAIN: Sum=%g  TotalCost=%g  NumTimeBinsToBeBalanced=%d  MultipleDomains=%d\n", sum, TotalCost,
+                NumTimeBinsToBeBalanced, MultipleDomains);
+
+  mycxxsort(mp, mp + Tp->NumPart, domain_compare_key);
+
+  NTopnodes            = 1;
+  NTopleaves           = 1;
+  TopNodes[0].Daughter = -1;
+  topNodes[0].Level    = 0;
+  topNodes[0].StartKey = {0, 0, 0};
+  topNodes[0].PIndex   = 0;
+  topNodes[0].Count    = count; /* this is the local number */
+  topNodes[0].CountTot = Tp->TotNumPart;
+  topNodes[0].Cost     = sum;
+
+  /* in list[], we store the node indices that should be refined */
+  int *list = (int *)Mem.mymalloc_movable(&list, "list", MaxTopNodes * sizeof(int));
+
+  double limit = 1.0 / (All.TopNodeFactor * NTask);
+
+  int iter = 0;
+
+  do
+    {
+      count = 0;
+
+      for(int n = 0; n < NTopnodes; n++)
+        if(TopNodes[n].Daughter == -1)  // consider only leaf nodes
+          {
+            if(topNodes[n].CountTot > 1)  // only split nodes containing more than one particle
+              if(topNodes[n].Cost > limit)
+                {
+                  while(NTopnodes + 8 * (count + 1) > MaxTopNodes)
+                    {
+                      domain_printf("DOMAIN: Increasing TopNodeAllocFactor=%g  ", All.TopNodeAllocFactor);
+                      All.TopNodeAllocFactor *= 1.3;
+                      domain_printf("new value=%g\n", All.TopNodeAllocFactor);
+                      if(All.TopNodeAllocFactor > 1000)
+                        Terminate("something seems to be going seriously wrong here. Stopping.\n");
+
+                      MaxTopNodes = All.TopNodeAllocFactor * std::max<int>(All.TopNodeFactor * MultipleDomains * NTask, BASENUMBER);
+
+                      topNodes   = (local_topnode_data *)Mem.myrealloc_movable(topNodes, (MaxTopNodes * sizeof(local_topnode_data)));
+                      TopNodes   = (topnode_data *)Mem.myrealloc_movable(TopNodes, (MaxTopNodes * sizeof(topnode_data)));
+                      TaskOfLeaf = (int *)Mem.myrealloc_movable(TaskOfLeaf, (MaxTopNodes * sizeof(int)));
+                      domain_leaf_cost =
+                          (domain_cost_data *)Mem.myrealloc_movable(domain_leaf_cost, (MaxTopNodes * sizeof(domain_cost_data)));
+                      list = (int *)Mem.myrealloc_movable(list, MaxTopNodes * sizeof(int));
+                    }
+
+                  if(topNodes[n].Level >= BITS_FOR_POSITIONS - 2)
+                    {
+                      if(message_printed == 0)
+                        {
+                          domain_printf(
+                              "DOMAIN: Note: we would like to refine top-tree beyond the level allowed by the selected positional "
+                              "accuracy.\n");
+                          message_printed = 1;
+                        }
+                    }
+                  else
+                    {
+                      list[count] = n;
+                      count++;
+                    }
+                }
+          }
+
+      if(count > 0)
+        {
+          domain_do_local_refine(count, list);
+          iter++;
+        }
+    }
+  while(count > 0);
+
+  Mem.myfree(list);
+  Mem.myfree(mp);
+
+  /* count the number of top leaves */
+  NTopleaves = 0;
+  domain_walktoptree(0);
+
+  double t1 = Logs.second();
+
+  domain_printf("DOMAIN: NTopleaves=%d, determination of top-level tree involved %d iterations and took %g sec\n", NTopleaves, iter,
+                Logs.timediff(t0, t1));
+}
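+
+/* Worked example (illustrative only): the refinement threshold is limit = 1 / (TopNodeFactor * NTask).
+ * Assuming the per-particle costs are normalized so that they sum to roughly unity, and taking, say,
+ * All.TopNodeFactor = 2.5 on NTask = 64 ranks, limit = 1/160 = 0.00625: any top-level leaf carrying
+ * more than ~0.6% of the total cost (and holding more than one particle) keeps being split, so the
+ * final tree has at least of order TopNodeFactor * NTask leaves of comparable cost. */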
+
+#include "../data/simparticles.h"
+template class domain<simparticles>;
+
+#ifdef LIGHTCONE_PARTICLES
+#include "../data/lcparticles.h"
+template class domain<lcparticles>;
+#endif
diff --git a/src/fmm/fmm.cc b/src/fmm/fmm.cc
new file mode 100644
index 0000000000000000000000000000000000000000..07629d9401ecb3432ad5a8a909c1956f5aa9c845
--- /dev/null
+++ b/src/fmm/fmm.cc
@@ -0,0 +1,2033 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file fmm.cc
+ *
+ *  \brief main routines for gravitational force computation with the fast multipole method (FMM)
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef FMM
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../fmm/fmm.h"
+#include "../gravity/ewald.h"
+#include "../gravtree/gravtree.h"
+#include "../logs/logs.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../mpi_utils/shared_mem_handler.h"
+#include "../pm/pm.h"
+#include "../sort/cxxsort.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+
+void fmm::fmm_force_passdown(int no, unsigned char no_shmrank, taylor_data taylor_current)
+{
+  if(no >= MaxPart && no < MaxPart + MaxNodes) /* an internal node  */
+    {
+      /* first, let's add the external field expansion to the local one accumulated for this node */
+#ifdef EVALPOTENTIAL
+      taylor_current.coeff.phi += TaylorCoeff[no].coeff.phi;
+#endif
+      taylor_current.coeff.dphi += TaylorCoeff[no].coeff.dphi;
+#if(MULTIPOLE_ORDER >= 2)
+      taylor_current.coeff.d2phi += TaylorCoeff[no].coeff.d2phi;
+#endif
+#if(MULTIPOLE_ORDER >= 3)
+      taylor_current.coeff.d3phi += TaylorCoeff[no].coeff.d3phi;
+#endif
+#if(MULTIPOLE_ORDER >= 4)
+      taylor_current.coeff.d4phi += TaylorCoeff[no].coeff.d4phi;
+#endif
+#if(MULTIPOLE_ORDER >= 5)
+      taylor_current.coeff.d5phi += TaylorCoeff[no].coeff.d5phi;
+#endif
+
+      taylor_current.coeff.interactions += TaylorCoeff[no].coeff.interactions;
+    }
+  else
+    Terminate("this is not an internal node, which should not happen\n");
+
+  gravnode *node_no = get_nodep(no, no_shmrank);
+
+  int p = node_no->nextnode; /* open cell */
+
+  unsigned char shmrank = node_no->nextnode_shmrank;
+
+  while(p != node_no->sibling || (shmrank != node_no->sibling_shmrank && node_no->sibling >= MaxPart + D->NTopnodes))
+    {
+      if(p < MaxPart || (p >= ImportedNodeOffset && p < EndOfTreePoints)) /* we have found a single particle */
+        {
+          if(shmrank != Shmem.Island_ThisTask)
+            Terminate("odd");
+
+          int m, mp;
+          MyIntPosType *intpos;
+
+          if(p >= ImportedNodeOffset) /* an imported Treepoint particle  */
+            {
+              m      = p - ImportedNodeOffset;
+              intpos = Points[m].IntPos;
+              mp     = -1;
+              p      = get_nextnodep(shmrank)[p - MaxNodes];
+            }
+          else
+            {
+              m      = p;
+              intpos = Tp->P[p].IntPos;
+              mp     = p;
+              p      = get_nextnodep(shmrank)[p];
+            }
+
+          /* apply expansion to particle */
+
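+          /* 'dxyz' is the particle's offset from the node's expansion center (its center of mass);
+           * the local expansion is evaluated there term by term up to the compiled MULTIPOLE_ORDER,
+           * yielding the potential gradient 'dphi' (and, with EVALPOTENTIAL, the potential 'pot') */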
+          vector<MyReal> dxyz;
+          Tp->nearest_image_intpos_to_pos(intpos, node_no->s.da, dxyz.da);
+
+#ifdef EVALPOTENTIAL
+          MyReal pot = taylor_current.coeff.phi + taylor_current.coeff.dphi * dxyz;
+#endif
+          vector<MyReal> dphi = taylor_current.coeff.dphi;
+
+#if(MULTIPOLE_ORDER >= 2)
+          vector<MyReal> d2phidxyz = taylor_current.coeff.d2phi * dxyz;
+          dphi += d2phidxyz;
+#ifdef EVALPOTENTIAL
+          pot += 0.5f * (d2phidxyz * dxyz);
+#endif
+#endif
+#if(MULTIPOLE_ORDER >= 3)
+          vector<MyReal> d3phidxyz2 = contract_twice(taylor_current.coeff.d3phi, dxyz);
+          dphi += 0.5f * d3phidxyz2;
+#ifdef EVALPOTENTIAL
+          pot += static_cast<MyReal>(1.0 / 6) * (dxyz * d3phidxyz2);
+#endif
+#endif
+#if(MULTIPOLE_ORDER >= 4)
+          vector<MyReal> d4phidxyz3 = contract_thrice(taylor_current.coeff.d4phi, dxyz);
+          dphi += static_cast<MyReal>(1.0 / 6) * d4phidxyz3;
+#ifdef EVALPOTENTIAL
+          pot += static_cast<MyReal>(1.0 / 24) * (dxyz * d4phidxyz3);
+#endif
+#endif
+#if(MULTIPOLE_ORDER >= 5)
+          vector<MyReal> d5phidxyz4 = contract_fourtimes(taylor_current.coeff.d5phi, dxyz);
+          dphi += static_cast<MyReal>(1.0 / 24) * d5phidxyz4;
+#ifdef EVALPOTENTIAL
+          pot += static_cast<MyReal>(1.0 / 120) * (dxyz * d5phidxyz4);
+#endif
+#endif
+
+          if(mp >= 0)
+            {
+#ifndef HIERARCHICAL_GRAVITY
+              if(Tp->TimeBinSynchronized[Tp->P[mp].TimeBinGrav])
+#endif
+                {
+                  Tp->P[mp].GravAccel -= dphi;
+#ifdef EVALPOTENTIAL
+                  Tp->P[mp].Potential += pot;
+#endif
+
+                  if(MeasureCostFlag)
+                    Tp->P[mp].GravCost += taylor_current.coeff.interactions;
+
+                  interactioncountEffective += taylor_current.coeff.interactions;
+                }
+            }
+          else
+            {
+#ifndef HIERARCHICAL_GRAVITY
+              if(Points[m].ActiveFlag)
+#endif
+                {
+                  int idx = ResultIndexList[m];
+                  ResultsActiveImported[idx].GravAccel -= dphi;
+#ifdef EVALPOTENTIAL
+                  ResultsActiveImported[idx].Potential += pot;
+#endif
+                  if(MeasureCostFlag)
+                    ResultsActiveImported[idx].GravCost += taylor_current.coeff.interactions;
+
+                  interactioncountEffective += taylor_current.coeff.interactions;
+                }
+            }
+        }
+      else if(p < MaxPart + MaxNodes) /* an internal node  */
+        {
+          gravnode *node_p = get_nodep(p, shmrank);
+
+          if(fmm_depends_on_local_mass(p, shmrank))
+            {
+              taylor_data taylor_sub = taylor_current;
+
+              vector<MyReal> dxyz;
+              Tp->nearest_image_intpos_to_pos(node_p->s.da, node_no->s.da, dxyz.da);
+
+              /* now shift the expansion center */
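+              /* each coefficient of the parent expansion is re-expanded about the daughter node's
+               * center: the higher-order derivative tensors, contracted with the offset vector dxyz
+               * and weighted by inverse factorials, contribute to all lower-order coefficients */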
+
+#ifdef EVALPOTENTIAL
+              taylor_sub.coeff.phi += taylor_current.coeff.dphi * dxyz;
+#endif
+
+#if(MULTIPOLE_ORDER >= 2)
+              vector<MyReal> delta_dphi = taylor_current.coeff.d2phi * dxyz;
+              taylor_sub.coeff.dphi += delta_dphi;
+#ifdef EVALPOTENTIAL
+              taylor_sub.coeff.phi += 0.5f * (delta_dphi * dxyz);
+#endif
+#endif
+#if(MULTIPOLE_ORDER >= 3)
+              symtensor2<MyReal> delta_d2phi = taylor_current.coeff.d3phi * dxyz;
+
+              taylor_sub.coeff.d2phi += delta_d2phi;
+
+              delta_dphi = delta_d2phi * dxyz;
+
+              taylor_sub.coeff.dphi += 0.5f * delta_dphi;
+#ifdef EVALPOTENTIAL
+              taylor_sub.coeff.phi += static_cast<MyReal>(1.0 / 6) * (delta_dphi * dxyz);
+#endif
+#endif
+#if(MULTIPOLE_ORDER >= 4)
+              symtensor3<MyReal> delta_d3phi = taylor_current.coeff.d4phi * dxyz;
+              taylor_sub.coeff.d3phi += delta_d3phi;
+
+              delta_d2phi = delta_d3phi * dxyz;
+              taylor_sub.coeff.d2phi += 0.5f * delta_d2phi;
+
+              delta_dphi = delta_d2phi * dxyz;
+              taylor_sub.coeff.dphi += static_cast<MyReal>(1.0 / 6) * delta_dphi;
+#ifdef EVALPOTENTIAL
+              taylor_sub.coeff.phi += static_cast<MyReal>(1.0 / 24) * (delta_dphi * dxyz);
+#endif
+#endif
+#if(MULTIPOLE_ORDER >= 5)
+              symtensor4<MyReal> delta_d4phi = taylor_current.coeff.d5phi * dxyz;
+              taylor_sub.coeff.d4phi += delta_d4phi;
+
+              delta_d3phi = delta_d4phi * dxyz;
+              taylor_sub.coeff.d3phi += 0.5f * delta_d3phi;
+
+              delta_d2phi = delta_d3phi * dxyz;
+              taylor_sub.coeff.d2phi += static_cast<MyReal>(1.0 / 6) * delta_d2phi;
+
+              delta_dphi = delta_d2phi * dxyz;
+              taylor_sub.coeff.dphi += static_cast<MyReal>(1.0 / 24) * delta_dphi;
+#ifdef EVALPOTENTIAL
+              taylor_sub.coeff.phi += static_cast<MyReal>(1.0 / 120) * (delta_dphi * dxyz);
+#endif
+#endif
+              fmm_force_passdown(p, shmrank, taylor_sub);
+            }
+
+          p       = node_p->sibling;
+          shmrank = node_p->sibling_shmrank;
+        }
+      else if(p >= EndOfTreePoints && p < EndOfForeignNodes) /* an imported tree node */
+        {
+          Terminate("unexpected: encountered an imported tree node in fmm_force_passdown");
+          gravnode *nop = get_nodep(p, shmrank);
+          p             = nop->sibling;
+          shmrank       = nop->sibling_shmrank;
+        }
+      else if(p >= EndOfForeignNodes) /* an imported particle below an imported tree node */
+        {
+          Terminate("unexpected: encountered an imported particle in fmm_force_passdown");
+          foreign_gravpoint_data *foreignpoint = get_foreignpointsp(p - EndOfForeignNodes, shmrank);
+          p                                    = foreignpoint->Nextnode;
+          shmrank                              = foreignpoint->Nextnode_shmrank;
+        }
+      else
+        {
+          /* a pseudo point */
+          Terminate(
+              "should not happen: p=%d MaxPart=%d MaxNodes=%d  ImportedNodeOffset=%d  EndOfTreePoints=%d  EndOfForeignNodes=%d "
+              "shmrank=%d",
+              p, MaxPart, MaxNodes, ImportedNodeOffset, EndOfTreePoints, EndOfForeignNodes, shmrank);
+        }
+    }
+}
+
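+/*! \brief Opens the sink and the source node simultaneously and lets all pairs of their children
+ *  interact with each other. For a self-interaction (noptr_sink == noptr_source), each unordered
+ *  pair is considered only once by requiring p_source >= p_sink. Node-particle pairs are swapped
+ *  such that the particle always appears on the sink side of fmm_force_interact().
+ */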
+inline void fmm::fmm_open_both(gravnode *noptr_sink, gravnode *noptr_source, int mintopleafnode, int committed)
+{
+  int self_flag = 0;
+  if(noptr_sink == noptr_source)
+    self_flag = 1;
+
+  /* open node */
+  int p_sink                 = noptr_sink->nextnode;
+  unsigned char shmrank_sink = noptr_sink->nextnode_shmrank;
+
+  while(p_sink != noptr_sink->sibling ||
+        (shmrank_sink != noptr_sink->sibling_shmrank && noptr_sink->sibling >= MaxPart + D->NTopnodes))
+    {
+      int next_sink;
+      unsigned char next_shmrank_sink;
+      char type_sink;
+
+      if(p_sink < MaxPart) /* a local particle */
+        {
+          /* note: here shmrank cannot change */
+          next_sink         = get_nextnodep(shmrank_sink)[p_sink];
+          next_shmrank_sink = shmrank_sink;
+          type_sink         = NODE_TYPE_LOCAL_PARTICLE;
+        }
+      else if(p_sink < MaxPart + MaxNodes) /* an internal node  */
+        {
+          gravnode *nop     = get_nodep(p_sink, shmrank_sink);
+          next_sink         = nop->sibling;
+          next_shmrank_sink = nop->sibling_shmrank;
+          type_sink         = NODE_TYPE_LOCAL_NODE;
+        }
+      else if(p_sink >= ImportedNodeOffset && p_sink < EndOfTreePoints) /* an imported Treepoint particle */
+        {
+          /* note: here shmrank cannot change */
+          next_sink         = get_nextnodep(shmrank_sink)[p_sink - MaxNodes];
+          next_shmrank_sink = shmrank_sink;
+          type_sink         = NODE_TYPE_TREEPOINT_PARTICLE;
+        }
+      else if(p_sink >= EndOfTreePoints && p_sink < EndOfForeignNodes) /* an imported tree node */
+        {
+          gravnode *nop     = get_nodep(p_sink, shmrank_sink);
+          next_sink         = nop->sibling;
+          next_shmrank_sink = nop->sibling_shmrank;
+          type_sink         = NODE_TYPE_FETCHED_NODE;
+        }
+      else if(p_sink >= EndOfForeignNodes)
+        {
+          foreign_gravpoint_data *foreignpoint = get_foreignpointsp(p_sink - EndOfForeignNodes, shmrank_sink);
+          next_sink                            = foreignpoint->Nextnode;
+          next_shmrank_sink                    = foreignpoint->Nextnode_shmrank;
+          type_sink                            = NODE_TYPE_FETCHED_PARTICLE;
+        }
+      else
+        {
+          /* a pseudo point */
+          next_sink = 0;
+          type_sink = 0;
+          Terminate("pseudo particle - should not happen");
+        }
+
+      int p_source                 = noptr_source->nextnode; /* open cell */
+      unsigned char shmrank_source = noptr_source->nextnode_shmrank;
+
+      while(p_source != noptr_source->sibling ||
+            (shmrank_source != noptr_source->sibling_shmrank && noptr_source->sibling >= MaxPart + D->NTopnodes))
+        {
+          int next_source;
+          unsigned char next_shmrank_source;
+          char type_source;
+
+          if(p_source < MaxPart) /* a local particle */
+            {
+              /* note: here shmrank cannot change */
+              next_source         = get_nextnodep(shmrank_source)[p_source];
+              next_shmrank_source = shmrank_source;
+              type_source         = NODE_TYPE_LOCAL_PARTICLE;
+            }
+          else if(p_source < MaxPart + MaxNodes) /* an internal node  */
+            {
+              gravnode *nop       = get_nodep(p_source, shmrank_source);
+              next_source         = nop->sibling;
+              next_shmrank_source = nop->sibling_shmrank;
+              type_source         = NODE_TYPE_LOCAL_NODE;
+            }
+          else if(p_source >= ImportedNodeOffset && p_source < EndOfTreePoints) /* an imported Treepoint particle */
+            {
+              /* note: here shmrank cannot change */
+              next_source         = get_nextnodep(shmrank_source)[p_source - MaxNodes];
+              next_shmrank_source = shmrank_source;
+              type_source         = NODE_TYPE_TREEPOINT_PARTICLE;
+            }
+          else if(p_source >= EndOfTreePoints && p_source < EndOfForeignNodes) /* an imported tree node */
+            {
+              gravnode *nop       = get_nodep(p_source, shmrank_source);
+              next_source         = nop->sibling;
+              next_shmrank_source = nop->sibling_shmrank;
+              type_source         = NODE_TYPE_FETCHED_NODE;
+            }
+          else if(p_source >= EndOfForeignNodes)
+            {
+              foreign_gravpoint_data *foreignpoint = get_foreignpointsp(p_source - EndOfForeignNodes, shmrank_source);
+              next_source                          = foreignpoint->Nextnode;
+              next_shmrank_source                  = foreignpoint->Nextnode_shmrank;
+              type_source                          = NODE_TYPE_FETCHED_PARTICLE;
+            }
+          else
+            {
+              /* a pseudo point */
+              next_source = 0;
+              type_source = 0;
+              Terminate("pseudo particle - should not happen");
+            }
+
+          if(self_flag == 0 || p_source >= p_sink)
+            {
+              if(type_sink >= NODE_TYPE_LOCAL_NODE && type_source <= NODE_TYPE_FETCHED_PARTICLE)
+                {
+                  /* in this case we have node-particle interaction, which we swap into a particle-node interaction */
+                  fmm_force_interact(p_source, p_sink, type_source, type_sink, shmrank_source, shmrank_sink, mintopleafnode,
+                                     committed);
+                }
+              else
+                {
+                  fmm_force_interact(p_sink, p_source, type_sink, type_source, shmrank_sink, shmrank_source, mintopleafnode,
+                                     committed);
+                }
+            }
+
+          p_source       = next_source;
+          shmrank_source = next_shmrank_source;
+        }
+
+      p_sink       = next_sink;
+      shmrank_sink = next_shmrank_sink;
+    }
+}
+
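+/*! \brief Opens the source node 'nop' and lets the given particle interact in turn with each of
+ *  the node's children (local particles, local nodes, imported tree points, or fetched foreign
+ *  nodes and particles).
+ */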
+inline void fmm::fmm_open_node(int no_particle, gravnode *nop, char type_particle, unsigned char shmrank_particle, int mintopleafnode,
+                               int committed)
+{
+  int p                 = nop->nextnode;
+  unsigned char shmrank = nop->nextnode_shmrank;
+
+  while(p != nop->sibling || (shmrank != nop->sibling_shmrank && nop->sibling >= MaxPart + D->NTopnodes))
+    {
+      if(p < 0)
+        Terminate("p=%d < 0", p);
+
+      int next;
+      unsigned char next_shmrank;
+      char type;
+
+      if(p < MaxPart) /* a local particle */
+        {
+          /* note: here shmrank cannot change */
+          next         = get_nextnodep(shmrank)[p];
+          next_shmrank = shmrank;
+          type         = NODE_TYPE_LOCAL_PARTICLE;
+        }
+      else if(p < MaxPart + MaxNodes) /* an internal node  */
+        {
+          gravnode *nop = get_nodep(p, shmrank);
+          next          = nop->sibling;
+          next_shmrank  = nop->sibling_shmrank;
+          type          = NODE_TYPE_LOCAL_NODE;
+        }
+      else if(p >= ImportedNodeOffset && p < EndOfTreePoints) /* an imported Treepoint particle  */
+        {
+          /* note: here shmrank cannot change */
+          next         = get_nextnodep(shmrank)[p - MaxNodes];
+          next_shmrank = shmrank;
+          type         = NODE_TYPE_TREEPOINT_PARTICLE;
+        }
+      else if(p >= EndOfTreePoints && p < EndOfForeignNodes) /* an imported tree node */
+        {
+          gravnode *nop = get_nodep(p, shmrank);
+          next          = nop->sibling;
+          next_shmrank  = nop->sibling_shmrank;
+          type          = NODE_TYPE_FETCHED_NODE;
+        }
+      else if(p >= EndOfForeignNodes) /* an imported particle below an imported tree node */
+        {
+          foreign_gravpoint_data *foreignpoint = get_foreignpointsp(p - EndOfForeignNodes, shmrank);
+
+          next         = foreignpoint->Nextnode;
+          next_shmrank = foreignpoint->Nextnode_shmrank;
+          type         = NODE_TYPE_FETCHED_PARTICLE;
+        }
+      else
+        {
+          /* a pseudo point */
+          Terminate(
+              "should not happen: p=%d MaxPart=%d MaxNodes=%d  ImportedNodeOffset=%d  EndOfTreePoints=%d  EndOfForeignNodes=%d "
+              "shmrank=%d",
+              p, MaxPart, MaxNodes, ImportedNodeOffset, EndOfTreePoints, EndOfForeignNodes, shmrank);
+        }
+
+      fmm_force_interact(no_particle, p, type_particle, type, shmrank_particle, shmrank, mintopleafnode, committed);
+
+      p       = next;
+      shmrank = next_shmrank;
+    }
+}
+
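+/*! \brief Direct particle-particle (P2P) interaction. Position, mass and softening of sink and
+ *  source are gathered according to their type, the pairwise force factors are computed (with
+ *  optional TreePM short-range truncation and Ewald correction for periodic boundaries), and the
+ *  resulting acceleration (and optionally the potential) is applied symmetrically to whichever of
+ *  the two particles are stored locally.
+ */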
+inline void fmm::fmm_particle_particle_interaction(int no_sink, int no_source, int type_sink, int type_source,
+                                                   unsigned char shmrank_sink, unsigned char shmrank_source)
+{
+#ifdef PRESERVE_SHMEM_BINARY_INVARIANCE
+  if(skip_actual_force_computation)
+    return;
+#endif
+
+  MyIntPosType *intpos_n, *intpos_m;
+  MyReal mass_n, mass_m;
+  MyReal h_n, h_m;
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+  int test_n, test_m;
+#endif
+
+  /* in one of the following three cases we have a single particle on the sink side */
+  if(type_sink == NODE_TYPE_LOCAL_PARTICLE)
+    {
+      particle_data *P = get_Pp(no_sink, shmrank_sink);
+
+      intpos_n = P->IntPos;
+      mass_n   = P->getMass();
+      h_n      = All.ForceSoftening[P->getSofteningClass()];
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+      test_n = P->InsideOutsideFlag;
+#endif
+    }
+  else if(type_sink == NODE_TYPE_TREEPOINT_PARTICLE)
+    {
+      gravpoint_data *Pointp = get_pointsp(no_sink - ImportedNodeOffset, shmrank_sink);
+
+      intpos_n = Pointp->IntPos;
+      mass_n   = Pointp->Mass;
+      h_n      = All.ForceSoftening[Pointp->getSofteningClass()];
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+      test_n = Pointp->InsideOutsideFlag;
+#endif
+    }
+  else /* a point that was fetched */
+    {
+      foreign_gravpoint_data *foreignpoint = get_foreignpointsp(no_sink - EndOfForeignNodes, shmrank_sink);
+
+      intpos_n = foreignpoint->IntPos;
+      mass_n   = foreignpoint->Mass;
+      h_n      = All.ForceSoftening[foreignpoint->getSofteningClass()];
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+      test_n = foreignpoint->InsideOutsideFlag;
+#endif
+    }
+
+  /* in one of the following three cases we have a single particle on the source side */
+  if(type_source == NODE_TYPE_LOCAL_PARTICLE)
+    {
+      particle_data *P = get_Pp(no_source, shmrank_source);
+
+      intpos_m = P->IntPos;
+      mass_m   = P->getMass();
+      h_m      = All.ForceSoftening[P->getSofteningClass()];
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+      test_m = P->InsideOutsideFlag;
+#endif
+    }
+  else if(type_source == NODE_TYPE_TREEPOINT_PARTICLE)
+    {
+      gravpoint_data *Pointp = get_pointsp(no_source - ImportedNodeOffset, shmrank_source);
+
+      intpos_m = Pointp->IntPos;
+      mass_m   = Pointp->Mass;
+      h_m      = All.ForceSoftening[Pointp->getSofteningClass()];
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+      test_m = Pointp->InsideOutsideFlag;
+#endif
+    }
+  else
+    {
+      foreign_gravpoint_data *foreignpoint = get_foreignpointsp(no_source - EndOfForeignNodes, shmrank_source);
+
+      intpos_m = foreignpoint->IntPos;
+      mass_m   = foreignpoint->Mass;
+      h_m      = All.ForceSoftening[foreignpoint->getSofteningClass()];
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+      test_m = foreignpoint->InsideOutsideFlag;
+#endif
+    }
+
+  MyReal h_max = (h_m > h_n) ? h_m : h_n;
+
+  vector<MyReal> dxyz;
+  Tp->nearest_image_intpos_to_pos(intpos_m, intpos_n, dxyz.da); /* converts the integer distance to floating point */
+
+  MyReal r2   = dxyz.r2();
+  MyReal r    = sqrt(r2);
+  MyReal rinv = (r > 0) ? 1 / r : 0;
+
+  gfactors gfac;
+
+#ifdef PMGRID
+  if(DoPM)
+    {
+      mesh_factors *mfp = &mf[LOW_MESH];
+
+#ifdef PLACEHIGHRESREGION
+      if((DoPM & TREE_ACTIVE_CUTTOFF_HIGHRES_PM))
+        {
+          if(test_m == FLAG_INSIDE && test_n == FLAG_INSIDE)
+            mfp = &mf[HIGH_MESH];
+        }
+#endif
+      if(modify_gfactors_pm_monopole(gfac, r, rinv, mfp)) /* if we are outside the cut-off radius, we have no interaction */
+        return;
+    }
+#endif
+
+  get_gfactors_monopole(gfac, r, h_max, rinv);
+
+#ifdef EVALPOTENTIAL
+  MyReal D0 = gfac.fac0;
+#endif
+  vector<MyReal> D1 = gfac.fac1 * rinv * dxyz;
+
+  if(DoEwald)
+    {
+      ewald_data ew;
+      Ewald.ewald_gridlookup(intpos_m, intpos_n, ewald::POINTMASS, ew);
+
+      D1 -= ew.D1phi;
+
+#ifdef EVALPOTENTIAL
+      D0 -= ew.D0phi;
+#endif
+    }
+
+  if(shmrank_sink == Shmem.Island_ThisTask)
+    {
+      if(type_sink == NODE_TYPE_LOCAL_PARTICLE)
+        {
+#ifndef HIERARCHICAL_GRAVITY
+          if(Tp->TimeBinSynchronized[Tp->P[no_sink].TimeBinGrav])
+#endif
+            {
+              Tp->P[no_sink].GravAccel -= mass_m * D1;
+#ifdef EVALPOTENTIAL
+              Tp->P[no_sink].Potential -= mass_m * D0;
+#endif
+              if(MeasureCostFlag)
+                Tp->P[no_sink].GravCost++;
+
+              interactioncountPP += 1;
+            }
+        }
+      else if(type_sink == NODE_TYPE_TREEPOINT_PARTICLE)
+        {
+#ifndef HIERARCHICAL_GRAVITY
+          if(Points[no_sink - ImportedNodeOffset].ActiveFlag)
+#endif
+            {
+              int idx = ResultIndexList[no_sink - ImportedNodeOffset];
+              ResultsActiveImported[idx].GravAccel -= mass_m * D1;
+#ifdef EVALPOTENTIAL
+              ResultsActiveImported[idx].Potential -= mass_m * D0;
+#endif
+              if(MeasureCostFlag)
+                ResultsActiveImported[idx].GravCost++;
+
+              interactioncountPP += 1;
+            }
+        }
+    }
+
+  if(shmrank_source == Shmem.Island_ThisTask)
+    {
+      if(type_source == NODE_TYPE_LOCAL_PARTICLE)
+        {
+#ifndef HIERARCHICAL_GRAVITY
+          if(Tp->TimeBinSynchronized[Tp->P[no_source].TimeBinGrav])
+#endif
+            {
+              Tp->P[no_source].GravAccel += mass_n * D1;
+#ifdef EVALPOTENTIAL
+              Tp->P[no_source].Potential -= mass_n * D0;
+#endif
+
+              if(MeasureCostFlag)
+                Tp->P[no_source].GravCost++;
+
+              interactioncountPP += 1;
+            }
+        }
+      else if(type_source == NODE_TYPE_TREEPOINT_PARTICLE)
+        {
+#ifndef HIERARCHICAL_GRAVITY
+          if(Points[no_source - ImportedNodeOffset].ActiveFlag)
+#endif
+            {
+              int idx = ResultIndexList[no_source - ImportedNodeOffset];
+              ResultsActiveImported[idx].GravAccel += mass_n * D1;
+#ifdef EVALPOTENTIAL
+              ResultsActiveImported[idx].Potential -= mass_n * D0;
+#endif
+              if(MeasureCostFlag)
+                ResultsActiveImported[idx].GravCost++;
+
+              interactioncountPP += 1;
+            }
+        }
+    }
+}
+
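+/*! \brief Mixed particle-node interaction. The multipole moments of the source node are evaluated
+ *  at the sink particle's position to obtain its acceleration and potential (M2P), while the point
+ *  mass of the particle in turn contributes a field expansion that is accumulated into the Taylor
+ *  coefficients of the source node (P2L), provided that node depends on local mass.
+ */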
+inline void fmm::fmm_particle_node_interaction(int no_sink, int no_source, int type_sink, int type_source, unsigned char shmrank_sink,
+                                               unsigned char shmrank_source, gravnode *noptr_source, vector<MyReal> &dxyz, MyReal &r2)
+{
+#ifdef PRESERVE_SHMEM_BINARY_INVARIANCE
+  if(skip_actual_force_computation)
+    return;
+#endif
+
+  /* 'sink' is a particle
+   * 'source' node is a node with multipole moments.
+   * 'dxyz' is the distance vector, pointing from sink to source, i.e. dxyz = pos(source) - pos(sink)
+   */
+
+  MyReal mass_i, h_i;
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+  int test_point;
+#endif
+
+  MyIntPosType *intpos_i;
+
+  if(type_sink == NODE_TYPE_LOCAL_PARTICLE)
+    {
+      particle_data *P = get_Pp(no_sink, shmrank_sink);
+
+      intpos_i = P->IntPos;
+      mass_i   = P->getMass();
+      h_i      = All.ForceSoftening[P->getSofteningClass()];
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+      test_point = P->InsideOutsideFlag;
+#endif
+    }
+  else if(type_sink == NODE_TYPE_TREEPOINT_PARTICLE)
+    {
+      gravpoint_data *Pointp = get_pointsp(no_sink - ImportedNodeOffset, shmrank_sink);
+
+      intpos_i = Pointp->IntPos;
+      mass_i   = Pointp->Mass;
+      h_i      = All.ForceSoftening[Pointp->getSofteningClass()];
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+      test_point = Pointp->InsideOutsideFlag;
+#endif
+    }
+  else /* a point that was fetched */
+    {
+      foreign_gravpoint_data *foreignpoint = get_foreignpointsp(no_sink - EndOfForeignNodes, shmrank_sink);
+
+      intpos_i = foreignpoint->IntPos;
+      mass_i   = foreignpoint->Mass;
+      h_i      = All.ForceSoftening[foreignpoint->getSofteningClass()];
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+      test_point = foreignpoint->InsideOutsideFlag;
+#endif
+    }
+
+  MyReal h_j   = All.ForceSoftening[noptr_source->getSofteningClass()];
+  MyReal h_max = (h_j > h_i) ? h_j : h_i;
+
+  /* do cell-particle interaction, node can be used */
+  MyReal r = sqrt(r2);
+
+  MyReal rinv = (r > 0) ? 1 / r : 0;
+
+  gfactors gfac;
+
+#ifdef PMGRID
+  if(DoPM)
+    {
+      mesh_factors *mfp = &mf[LOW_MESH];
+
+#ifdef PLACEHIGHRESREGION
+      if((DoPM & TREE_ACTIVE_CUTTOFF_HIGHRES_PM))
+        {
+          int test_node = noptr_source->overlap_flag;
+          if(test_node == FLAG_INSIDE && test_point == FLAG_INSIDE)
+            mfp = &mf[HIGH_MESH];
+        }
+#endif
+
+      if(modify_gfactors_pm_multipole(gfac, r, rinv, mfp)) /* if we are outside the cut-off radius, we have no interaction */
+        return;
+    }
+#endif
+
+  get_gfactors_multipole(gfac, r, h_max, rinv);
+
+#ifdef EVALPOTENTIAL
+  MyReal g0 = gfac.fac0;
+  MyReal D0 = g0;
+#endif
+
+  MyReal g1         = gfac.fac1 * rinv;
+  vector<MyReal> D1 = g1 * dxyz;
+
+#if(MULTIPOLE_ORDER >= 2)
+  MyReal g2               = gfac.fac2 * gfac.rinv2;
+  symtensor2<MyReal> aux2 = dxyz % dxyz;  // construct outer product of the two vectors
+  symtensor2<MyReal> D2   = g2 * aux2;
+  D2[qXX] += g1;
+  D2[qYY] += g1;
+  D2[qZZ] += g1;
+#endif
+#if(MULTIPOLE_ORDER >= 3)
+  MyReal g3 = gfac.fac3 * gfac.rinv3;
+  symtensor3<MyReal> D3;
+  symtensor3<MyReal> aux3;
+  setup_D3(INIT, D3, dxyz, aux2, aux3, g2, g3);
+#endif
+
+#if(MULTIPOLE_ORDER >= 4)
+  MyReal g4 = gfac.fac4 * gfac.rinv2 * gfac.rinv2;
+  symtensor4<MyReal> D4;
+  symtensor4<MyReal> aux4;
+  setup_D4(INIT, D4, dxyz, aux2, aux3, aux4, g2, g3, g4);
+#endif
+
+#if(MULTIPOLE_ORDER >= 5)
+  MyReal g5 = gfac.fac5 * gfac.rinv3 * gfac.rinv2;
+  symtensor5<MyReal> D5;
+  symtensor5<MyReal> aux5;
+  setup_D5(INIT, D5, dxyz, aux3, aux4, aux5, g3, g4, g5);
+#endif
+
+  if(DoEwald)
+    {
+      ewald_data ew;
+      Ewald.ewald_gridlookup(noptr_source->s.da, intpos_i, ewald::MULTIPOLES, ew);
+
+#ifdef EVALPOTENTIAL
+      D0 -= ew.D0phi;
+#endif
+      D1 -= ew.D1phi;
+#if(MULTIPOLE_ORDER >= 2)
+      D2 -= ew.D2phi;
+#endif
+#if(MULTIPOLE_ORDER >= 3)
+      D3 -= ew.D3phi;
+#endif
+#if(MULTIPOLE_ORDER >= 4)
+      D4 -= ew.D4phi;
+#endif
+#if(MULTIPOLE_ORDER >= 5)
+      D5 -= ew.D5phi;
+#endif
+    }
+
+  /* finally store the force on the particle */
+  if(shmrank_sink == Shmem.Island_ThisTask)
+    if(type_sink == NODE_TYPE_LOCAL_PARTICLE || type_sink == NODE_TYPE_TREEPOINT_PARTICLE)
+      {
+        MyReal mass_j = noptr_source->mass;
+
+#if(MULTIPOLE_ORDER >= 3) || ((MULTIPOLE_ORDER >= 2) && defined(EXTRAPOTTERM))
+        symtensor2<MyDouble> &Q2_j = noptr_source->Q2Tensor;
+#endif
+#if(MULTIPOLE_ORDER >= 4) || ((MULTIPOLE_ORDER >= 3) && defined(EXTRAPOTTERM))
+        symtensor3<MyDouble> &Q3_j = noptr_source->Q3Tensor;
+#endif
+#if(MULTIPOLE_ORDER >= 5) || ((MULTIPOLE_ORDER >= 4) && defined(EXTRAPOTTERM))
+        symtensor4<MyDouble> &Q4_j = noptr_source->Q4Tensor;
+#endif
+#if(MULTIPOLE_ORDER >= 5) && defined(EXTRAPOTTERM)
+        symtensor5<MyDouble> &Q5_j = noptr_source->Q5Tensor;
+#endif
+
+#ifdef EVALPOTENTIAL
+        MyReal pot = -mass_j * D0;
+#if(MULTIPOLE_ORDER >= 3) || ((MULTIPOLE_ORDER >= 2) && defined(EXTRAPOTTERM))
+        pot -= 0.5f * (D2 * Q2_j);
+#endif
+#if(MULTIPOLE_ORDER >= 4) || ((MULTIPOLE_ORDER >= 3) && defined(EXTRAPOTTERM))
+        pot -= static_cast<MyReal>(1.0 / 6) * (D3 * Q3_j);
+#endif
+#if(MULTIPOLE_ORDER >= 5) || ((MULTIPOLE_ORDER >= 4) && defined(EXTRAPOTTERM))
+        pot -= static_cast<MyReal>(1.0 / 24) * (D4 * Q4_j);
+#endif
+#if(MULTIPOLE_ORDER >= 5) && defined(EXTRAPOTTERM)
+
+        pot -= static_cast<MyReal>(1.0 / 120) * (D5 * Q5_j);
+#endif
+#endif
+
+        vector<MyReal> dphi = mass_j * D1;
+
+#if(MULTIPOLE_ORDER >= 3)
+        dphi += static_cast<MyReal>(0.5) * (D3 * Q2_j);
+#endif
+#if(MULTIPOLE_ORDER >= 4)
+        dphi += static_cast<MyReal>(1.0 / 6) * (D4 * Q3_j);
+#endif
+#if(MULTIPOLE_ORDER >= 5)
+        dphi += static_cast<MyReal>(1.0 / 24) * (D5 * Q4_j);
+#endif
+
+        if(type_sink == NODE_TYPE_LOCAL_PARTICLE)
+          {
+#ifndef HIERARCHICAL_GRAVITY
+            if(Tp->TimeBinSynchronized[Tp->P[no_sink].TimeBinGrav])
+#endif
+              {
+                Tp->P[no_sink].GravAccel -= dphi;
+#ifdef EVALPOTENTIAL
+                Tp->P[no_sink].Potential += pot;
+#endif
+                if(MeasureCostFlag)
+                  Tp->P[no_sink].GravCost++;
+
+                interactioncountPN += 1;
+                interactioncountEffective += 1;
+              }
+          }
+        else
+          {
+#ifndef HIERARCHICAL_GRAVITY
+            if(Points[no_sink - ImportedNodeOffset].ActiveFlag)
+#endif
+              {
+                int idx = ResultIndexList[no_sink - ImportedNodeOffset];
+                ResultsActiveImported[idx].GravAccel -= dphi;
+#ifdef EVALPOTENTIAL
+                ResultsActiveImported[idx].Potential += pot;
+#endif
+                if(MeasureCostFlag)
+                  ResultsActiveImported[idx].GravCost++;
+
+                interactioncountPN += 1;
+                interactioncountEffective += 1;
+              }
+          }
+      }
+
+  if(fmm_depends_on_local_mass(no_source, shmrank_source))
+    if(type_source == NODE_TYPE_LOCAL_NODE)
+      {
+        /* accumulate the field expansion generated by the point particle into the Taylor coefficients of the source node */
+#ifdef EVALPOTENTIAL
+        TaylorCoeff[no_source].coeff.phi += (-mass_i) * D0;
+#endif
+        TaylorCoeff[no_source].coeff.dphi += (-mass_i) * D1;
+#if(MULTIPOLE_ORDER >= 2)
+        TaylorCoeff[no_source].coeff.d2phi += (-mass_i) * D2;
+#endif
+#if(MULTIPOLE_ORDER >= 3)
+        TaylorCoeff[no_source].coeff.d3phi += (-mass_i) * D3;
+#endif
+#if(MULTIPOLE_ORDER >= 4)
+        TaylorCoeff[no_source].coeff.d4phi += (-mass_i) * D4;
+#endif
+#if(MULTIPOLE_ORDER >= 5)
+        TaylorCoeff[no_source].coeff.d5phi += (-mass_i) * D5;
+#endif
+        TaylorCoeff[no_source].coeff.interactions += 1;
+
+        interactioncountPN += 1;
+      }
+}
+
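+/*! \brief Node-node interaction (the M2L-type operation of the FMM). The derivative tensors of the
+ *  (softened, and optionally PM-truncated and Ewald-corrected) interaction kernel are contracted
+ *  with the multipole moments of each node and accumulated into the field expansion (Taylor
+ *  coefficients) of the respective other node, provided the receiving node is a local node that
+ *  depends on local mass.
+ */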
+inline void fmm::fmm_node_node_interaction(int no_sink, int no_source, int type_sink, int type_source, unsigned char shmrank_sink,
+                                           unsigned char shmrank_source, gravnode *noptr_sink, gravnode *noptr_source,
+                                           vector<MyReal> &dxyz, MyReal &r2)
+{
+#ifdef PRESERVE_SHMEM_BINARY_INVARIANCE
+  if(skip_actual_force_computation)
+    return;
+#endif
+
+  /* 'sink' is a node with multipole moments
+   * 'source' node is a node with multipole moments
+   * 'dxyz' is the distance vector, pointing from sink to source, i.e. dxyz = pos(source) - pos(sink)
+   */
+
+  /* now do the node-node interaction */
+  MyReal r = sqrt(r2);
+
+#if(MULTIPOLE_ORDER >= 3) || ((MULTIPOLE_ORDER >= 2) && defined(EXTRAPOTTERM))
+  symtensor2<MyDouble> &Q2_m = noptr_source->Q2Tensor;
+  symtensor2<MyDouble> &Q2_n = noptr_sink->Q2Tensor;
+#endif
+#if(MULTIPOLE_ORDER >= 4) || ((MULTIPOLE_ORDER >= 3) && defined(EXTRAPOTTERM))
+  symtensor3<MyDouble> &Q3_m = noptr_source->Q3Tensor;
+  symtensor3<MyDouble> &Q3_n = noptr_sink->Q3Tensor;
+#endif
+#if(MULTIPOLE_ORDER >= 5) || ((MULTIPOLE_ORDER >= 4) && defined(EXTRAPOTTERM))
+  symtensor4<MyDouble> &Q4_m = noptr_source->Q4Tensor;
+  symtensor4<MyDouble> &Q4_n = noptr_sink->Q4Tensor;
+#endif
+#if(MULTIPOLE_ORDER >= 5) && defined(EXTRAPOTTERM) && defined(EVALPOTENTIAL)
+  symtensor5<MyDouble> &Q5_m = noptr_source->Q5Tensor;
+  symtensor5<MyDouble> &Q5_n = noptr_sink->Q5Tensor;
+#endif
+
+  MyReal mass_m = noptr_source->mass;
+  MyReal mass_n = noptr_sink->mass;
+
+  MyReal rinv = (r > 0) ? 1 / r : 0;
+
+  MyReal h_n   = All.ForceSoftening[noptr_sink->getSofteningClass()];
+  MyReal h_m   = All.ForceSoftening[noptr_source->getSofteningClass()];
+  MyReal h_max = (h_m > h_n) ? h_m : h_n;
+
+  gfactors gfac;
+
+#ifdef PMGRID
+  if(DoPM)
+    {
+      mesh_factors *mfp = &mf[LOW_MESH];
+
+#ifdef PLACEHIGHRESREGION
+      if((DoPM & TREE_ACTIVE_CUTTOFF_HIGHRES_PM))
+        {
+          if(noptr_source->overlap_flag == FLAG_INSIDE && noptr_sink->overlap_flag == FLAG_INSIDE)
+            mfp = &mf[HIGH_MESH];
+        }
+#endif
+
+      if(modify_gfactors_pm_multipole(gfac, r, rinv, mfp)) /* if we are outside the cut-off radius, we have no interaction */
+        return;
+    }
+#endif
+
+  get_gfactors_multipole(gfac, r, h_max, rinv);
+
+#ifdef EVALPOTENTIAL
+  MyReal g0 = gfac.fac0;
+  MyReal D0 = g0;
+#endif
+
+  MyReal g1         = gfac.fac1 * rinv;
+  vector<MyReal> D1 = g1 * dxyz;
+
+#if(MULTIPOLE_ORDER >= 2)
+  MyReal g2               = gfac.fac2 * gfac.rinv2;
+  symtensor2<MyReal> aux2 = dxyz % dxyz;  // construct outer product of the two vectors
+  symtensor2<MyReal> D2   = g2 * aux2;
+  D2[qXX] += g1;
+  D2[qYY] += g1;
+  D2[qZZ] += g1;
+#endif
+
+#if(MULTIPOLE_ORDER >= 3)
+  MyReal g3 = gfac.fac3 * gfac.rinv3;
+  symtensor3<MyReal> D3;
+  symtensor3<MyReal> aux3;
+  setup_D3(INIT, D3, dxyz, aux2, aux3, g2, g3);
+#endif
+
+#if(MULTIPOLE_ORDER >= 4)
+  MyReal g4 = gfac.fac4 * gfac.rinv2 * gfac.rinv2;
+  symtensor4<MyReal> D4;
+  symtensor4<MyReal> aux4;
+  setup_D4(INIT, D4, dxyz, aux2, aux3, aux4, g2, g3, g4);
+#endif
+
+#if(MULTIPOLE_ORDER >= 5)
+  MyReal g5 = gfac.fac5 * gfac.rinv3 * gfac.rinv2;
+  symtensor5<MyReal> D5;
+  symtensor5<MyReal> aux5;
+  setup_D5(INIT, D5, dxyz, aux3, aux4, aux5, g3, g4, g5);
+#endif
+
+  if(DoEwald)
+    {
+      ewald_data ew;
+      Ewald.ewald_gridlookup(noptr_source->s.da, noptr_sink->s.da, ewald::MULTIPOLES, ew);
+
+#ifdef EVALPOTENTIAL
+      D0 -= ew.D0phi;
+#endif
+      D1 -= ew.D1phi;
+#if(MULTIPOLE_ORDER >= 2)
+      D2 -= ew.D2phi;
+#endif
+#if(MULTIPOLE_ORDER >= 3)
+      D3 -= ew.D3phi;
+#endif
+#if(MULTIPOLE_ORDER >= 4)
+      D4 -= ew.D4phi;
+#endif
+#if(MULTIPOLE_ORDER >= 5)
+      D5 -= ew.D5phi;
+#endif
+    }
+
+  if(fmm_depends_on_local_mass(no_sink, shmrank_sink))
+    if(type_sink == NODE_TYPE_LOCAL_NODE)
+      {
+#ifdef EVALPOTENTIAL
+        TaylorCoeff[no_sink].coeff.phi += (-mass_m) * D0;
+#if(MULTIPOLE_ORDER >= 3) || ((MULTIPOLE_ORDER >= 2) && defined(EXTRAPOTTERM))
+        TaylorCoeff[no_sink].coeff.phi += static_cast<MyReal>(-0.5) * (D2 * Q2_m);
+#endif
+#if(MULTIPOLE_ORDER >= 4) || ((MULTIPOLE_ORDER >= 3) && defined(EXTRAPOTTERM))
+        TaylorCoeff[no_sink].coeff.phi += static_cast<MyReal>(-1.0 / 6) * (D3 * Q3_m);
+#endif
+#if(MULTIPOLE_ORDER >= 5) || ((MULTIPOLE_ORDER >= 4) && defined(EXTRAPOTTERM))
+        TaylorCoeff[no_sink].coeff.phi += static_cast<MyReal>(-1.0 / 24) * (D4 * Q4_m);
+#endif
+#if(MULTIPOLE_ORDER >= 5) && defined(EXTRAPOTTERM)
+        TaylorCoeff[no_sink].coeff.phi += static_cast<MyReal>(-1.0 / 120) * (D5 * Q5_m);
+#endif
+#endif
+
+        TaylorCoeff[no_sink].coeff.dphi += mass_m * D1;
+#if(MULTIPOLE_ORDER >= 2)
+        TaylorCoeff[no_sink].coeff.d2phi += (-mass_m) * D2;
+#endif
+#if(MULTIPOLE_ORDER >= 3)
+        TaylorCoeff[no_sink].coeff.dphi += static_cast<MyReal>(0.5) * (D3 * Q2_m);
+        TaylorCoeff[no_sink].coeff.d3phi += mass_m * D3;
+#endif
+#if(MULTIPOLE_ORDER >= 4)
+        TaylorCoeff[no_sink].coeff.dphi += static_cast<MyReal>(1.0 / 6) * (D4 * Q3_m);
+        TaylorCoeff[no_sink].coeff.d2phi += static_cast<MyReal>(-0.5) * (D4 * Q2_m);
+        TaylorCoeff[no_sink].coeff.d4phi += (-mass_m) * D4;
+#endif
+#if(MULTIPOLE_ORDER >= 5)
+        TaylorCoeff[no_sink].coeff.dphi += static_cast<MyReal>(1.0 / 24) * (D5 * Q4_m);
+        TaylorCoeff[no_sink].coeff.d2phi += static_cast<MyReal>(-1.0 / 6) * (D5 * Q3_m);
+        TaylorCoeff[no_sink].coeff.d3phi += static_cast<MyReal>(0.5) * (D5 * Q2_m);
+        TaylorCoeff[no_sink].coeff.d5phi += mass_m * D5;
+#endif
+
+        TaylorCoeff[no_sink].coeff.interactions += 1;
+        interactioncountNN += 1;
+      }
+
+  if(fmm_depends_on_local_mass(no_source, shmrank_source))
+    if(type_source == NODE_TYPE_LOCAL_NODE)
+      {
+#ifdef EVALPOTENTIAL
+        TaylorCoeff[no_source].coeff.phi += (-mass_n) * D0;
+#if(MULTIPOLE_ORDER >= 3) || ((MULTIPOLE_ORDER >= 2) && defined(EXTRAPOTTERM))
+        TaylorCoeff[no_source].coeff.phi += static_cast<MyReal>(-0.5) * (D2 * Q2_n);
+#endif
+#if(MULTIPOLE_ORDER >= 4) || ((MULTIPOLE_ORDER >= 3) && defined(EXTRAPOTTERM))
+        TaylorCoeff[no_source].coeff.phi += static_cast<MyReal>(1.0 / 6) * (D3 * Q3_n);
+#endif
+#if(MULTIPOLE_ORDER >= 5) || ((MULTIPOLE_ORDER >= 4) && defined(EXTRAPOTTERM))
+        TaylorCoeff[no_source].coeff.phi += static_cast<MyReal>(-1.0 / 24) * (D4 * Q4_n);
+#endif
+#if(MULTIPOLE_ORDER >= 5) && defined(EXTRAPOTTERM)
+        TaylorCoeff[no_source].coeff.phi += static_cast<MyReal>(1.0 / 120) * (D5 * Q5_n);
+#endif
+#endif
+
+        TaylorCoeff[no_source].coeff.dphi += (-mass_n) * D1;
+#if(MULTIPOLE_ORDER >= 2)
+        TaylorCoeff[no_source].coeff.d2phi += (-mass_n) * D2;
+#endif
+#if(MULTIPOLE_ORDER >= 3)
+        TaylorCoeff[no_source].coeff.dphi += static_cast<MyReal>(-0.5) * (D3 * Q2_n);
+        TaylorCoeff[no_source].coeff.d3phi += (-mass_n) * D3;
+#endif
+#if(MULTIPOLE_ORDER >= 4)
+        TaylorCoeff[no_source].coeff.dphi += static_cast<MyReal>(1.0 / 6) * (D4 * Q3_n);
+        TaylorCoeff[no_source].coeff.d2phi += static_cast<MyReal>(-0.5) * (D4 * Q2_n);
+        TaylorCoeff[no_source].coeff.d4phi += (-mass_n) * D4;
+#endif
+#if(MULTIPOLE_ORDER >= 5)
+        TaylorCoeff[no_source].coeff.dphi += static_cast<MyReal>(-1.0 / 24) * (D5 * Q4_n);
+        TaylorCoeff[no_source].coeff.d2phi += static_cast<MyReal>(1.0 / 6) * (D5 * Q3_n);
+        TaylorCoeff[no_source].coeff.d3phi += static_cast<MyReal>(-0.5) * (D5 * Q2_n);
+        TaylorCoeff[no_source].coeff.d5phi += (-mass_n) * D5;
+#endif
+
+        TaylorCoeff[no_source].coeff.interactions += 1;
+        interactioncountNN += 1;
+      }
+}
+
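+/*! \brief Decides how a node-node pair is to be treated: returns NODE_OPEN if the pair must be
+ *  opened (very coarse levels, adjacent nodes unless TREE_NO_SAFETY_BOX is set, geometric or
+ *  relative opening criterion violated, or mixed softenings at small separation), NODE_DISCARD if
+ *  the PM short-range cutoff renders the interaction negligible, and NODE_USE if the multipole
+ *  interaction may be carried out. The geometric criterion opens the pair when 2*len/r exceeds the
+ *  tolerance angle. The separation vector 'dxyz' and its square 'r2' are returned as a by-product.
+ */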
+inline int fmm::fmm_evaluate_node_node_opening_criterion(gravnode *noptr_sink, gravnode *noptr_source, vector<MyReal> &dxyz,
+                                                         MyReal &r2)
+{
+  if(noptr_source->level != noptr_sink->level)
+    Terminate("This shouldn't happen: noptr_source->level=%d   noptr_sink->level=%d ", noptr_source->level, noptr_sink->level);
+
+  if(noptr_source->level <=
+     1)  // always open the root node, and the next level (note: full node length does not fit in the integer type)
+    return NODE_OPEN;
+
+  /* Note: we will always have noptr_sink->len == noptr_source->len in our algorithm! */
+
+  MyIntPosType halflen = ((MyIntPosType)1) << ((BITS_FOR_POSITIONS - 1) - noptr_sink->level);
+  MyIntPosType intlen  = halflen << 1;
+
+#ifndef TREE_NO_SAFETY_BOX
+  // We always open adjacent nodes to protect against worst-case force errors
+  MyIntPosType twolens = intlen + (intlen - 1);
+  MyIntPosType dist[3];
+  Tp->nearest_image_intpos_to_absolute_intdist(noptr_source->center.da, noptr_sink->center.da, dist);
+
+  if(dist[0] < twolens && dist[1] < twolens && dist[2] < twolens)
+    return NODE_OPEN;
+#endif
+
+  /* converts the integer distance of the centers of mass to floating point */
+  Tp->nearest_image_intpos_to_pos(noptr_source->s.da, noptr_sink->s.da, dxyz.da);
+
+  r2 = dxyz.r2();
+
+#ifdef PMGRID
+  mesh_factors *mfp = &mf[LOW_MESH];
+
+#ifdef PLACEHIGHRESREGION
+  if((DoPM & TREE_ACTIVE_CUTTOFF_HIGHRES_PM))
+    {
+      int test_source = noptr_source->overlap_flag;
+      int test_sink   = noptr_sink->overlap_flag;
+
+      if((test_source == FLAG_BOUNDARYOVERLAP && test_sink != FLAG_OUTSIDE) ||
+         (test_sink == FLAG_BOUNDARYOVERLAP && test_source != FLAG_OUTSIDE))
+        {
+          Terminate("this shouldn't happen any more");
+          return NODE_OPEN;
+        }
+      else
+        {
+          if(test_source == FLAG_INSIDE && test_sink == FLAG_INSIDE)
+            mfp = &mf[HIGH_MESH];
+        }
+    }
+#endif
+
+  if(DoPM && r2 > mfp->rcut2 && noptr_sink->level > 1)
+    {
+      /* check whether we can ignore any interactions along this branch */
+      MyIntPosType dist_x = noptr_source->center[0] - noptr_sink->center[0];
+      dist_x              = (((MySignedIntPosType)dist_x) >= 0) ? dist_x : -dist_x;
+      if(dist_x > mfp->intrcut[0] + intlen)
+        return NODE_DISCARD;
+
+      MyIntPosType dist_y = noptr_source->center[1] - noptr_sink->center[1];
+      dist_y              = (((MySignedIntPosType)dist_y) >= 0) ? dist_y : -dist_y;
+      if(dist_y > mfp->intrcut[1] + intlen)
+        return NODE_DISCARD;
+
+      MyIntPosType dist_z = noptr_source->center[2] - noptr_sink->center[2];
+      dist_z              = (((MySignedIntPosType)dist_z) >= 0) ? dist_z : -dist_z;
+      if(dist_z > mfp->intrcut[2] + intlen)
+        return NODE_DISCARD;
+    }
+#endif
+
+  /* evaluate generalized opening criterion */
+
+  MyReal len  = intlen * Tp->FacIntToCoord;
+  MyReal len2 = len * len;
+
+  if(All.RelOpeningCriterionInUse == 0) /* check Barnes-Hut opening criterion */
+    {
+      if(4 * len2 > r2 * errTolTheta2)
+        return NODE_OPEN;
+    }
+  else
+    {
+      if(4 * len2 > r2 * errTolThetaMax2)
+        return NODE_OPEN;
+
+      MyReal mmax = (noptr_source->mass < noptr_sink->mass) ? noptr_sink->mass : noptr_source->mass;
+      MyReal amin =
+          errTolForceAcc * ((noptr_sink->MinOldAcc < noptr_source->MinOldAcc) ? noptr_sink->MinOldAcc : noptr_source->MinOldAcc);
+#if(MULTIPOLE_ORDER == 1)
+      if(mmax > r2 * amin)
+        return NODE_OPEN;
+#elif(MULTIPOLE_ORDER == 2)
+      if(square(mmax * len) > r2 * square(r2 * amin))
+        return NODE_OPEN;
+#elif(MULTIPOLE_ORDER == 3)
+      if(mmax * len2 > r2 * r2 * amin)
+        return NODE_OPEN;
+#elif(MULTIPOLE_ORDER == 4)
+      if(square(mmax * len * len2) > r2 * square(r2 * r2 * amin))
+        return NODE_OPEN;
+#elif(MULTIPOLE_ORDER == 5)
+      if(mmax * len2 * len2 > r2 * r2 * r2 * amin)
+        return NODE_OPEN;
+#endif
+    }
+
+#if NSOFTCLASSES > 1
+  MyReal h_m   = All.ForceSoftening[noptr_sink->getSofteningClass()];
+  MyReal h_n   = All.ForceSoftening[noptr_source->getSofteningClass()];
+  MyReal h_max = (h_m > h_n) ? h_m : h_n;
+
+  if(r2 < h_max * h_max)
+    {
+      if(All.ForceSoftening[noptr_source->minsofttype] < All.ForceSoftening[noptr_source->maxsofttype] &&
+         h_max > All.ForceSoftening[noptr_sink->minsofttype])
+        return NODE_OPEN;
+      else if(All.ForceSoftening[noptr_sink->minsofttype] < All.ForceSoftening[noptr_sink->maxsofttype] &&
+              h_max > All.ForceSoftening[noptr_source->minsofttype])
+        return NODE_OPEN;
+    }
+#endif
+
+  return NODE_USE;
+}
+
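+/*! \brief Analogous opening test for a particle-node pair: returns NODE_OPEN if the source node
+ *  must be opened, NODE_DISCARD if it lies entirely beyond the PM short-range cutoff, and NODE_USE
+ *  if its multipole expansion may be applied directly to the particle. The separation vector
+ *  'dxyz' and its square 'r2' are returned as a by-product.
+ */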
+inline int fmm::fmm_evaluate_particle_node_opening_criterion(int no_sink, char type_sink, unsigned char shmrank_sink,
+                                                             gravnode *nop_source, vector<MyReal> &dxyz, MyReal &r2)
+{
+  MyIntPosType *intpos_n;
+  MyReal mass_n;
+  MyReal aold;
+#if NSOFTCLASSES > 1
+  MyReal h_n;
+#endif
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+  int test_point;
+#endif
+
+  if(type_sink == NODE_TYPE_LOCAL_PARTICLE)
+    {
+      particle_data *P = get_Pp(no_sink, shmrank_sink);
+
+      intpos_n = P->IntPos;
+      mass_n   = P->getMass();
+      aold     = P->OldAcc;
+#if NSOFTCLASSES > 1
+      h_n = All.ForceSoftening[P->getSofteningClass()];
+#endif
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+      test_point = P->InsideOutsideFlag;
+#endif
+    }
+  else if(type_sink == NODE_TYPE_TREEPOINT_PARTICLE)
+    {
+      gravpoint_data *Pointp = get_pointsp(no_sink - ImportedNodeOffset, shmrank_sink);
+
+      intpos_n = Pointp->IntPos;
+      mass_n   = Pointp->Mass;
+      aold     = Pointp->OldAcc;
+#if NSOFTCLASSES > 1
+      h_n = All.ForceSoftening[Pointp->getSofteningClass()];
+#endif
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+      test_point = Pointp->InsideOutsideFlag;
+#endif
+    }
+  else /* a point that was fetched */
+    {
+      foreign_gravpoint_data *foreignpoint = get_foreignpointsp(no_sink - EndOfForeignNodes, shmrank_sink);
+
+      intpos_n = foreignpoint->IntPos;
+      mass_n   = foreignpoint->Mass;
+      aold     = foreignpoint->OldAcc;
+#if NSOFTCLASSES > 1
+      h_n = All.ForceSoftening[foreignpoint->getSofteningClass()];
+#endif
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+      test_point = foreignpoint->InsideOutsideFlag;
+#endif
+    }
+
+  if(nop_source->level == 0)  // always open the root node (note: full node length does not fit in the integer type)
+    return NODE_OPEN;
+
+  MyIntPosType halflen = ((MyIntPosType)1) << ((BITS_FOR_POSITIONS - 1) - nop_source->level);
+  MyIntPosType intlen  = halflen << 1;
+
+#ifndef TREE_NO_SAFETY_BOX
+  MyIntPosType dist[3];
+  Tp->nearest_image_intpos_to_absolute_intdist(nop_source->center.da, intpos_n, dist);
+  // if we are close to the node, we open it to protect against worst-case force errors
+  if(dist[0] < intlen && dist[1] < intlen && dist[2] < intlen)
+    return NODE_OPEN;
+#endif
+
+  /* check a variant of the classic opening criterion */
+  Tp->nearest_image_intpos_to_pos(nop_source->s.da, intpos_n, dxyz.da); /* converts the integer distance to floating point */
+
+  r2 = dxyz.r2();
+
+#ifdef PMGRID
+  mesh_factors *mfp = &mf[LOW_MESH];
+
+#ifdef PLACEHIGHRESREGION
+  if((DoPM & TREE_ACTIVE_CUTTOFF_HIGHRES_PM))
+    {
+      int test_node = nop_source->overlap_flag;
+
+      if(test_point == FLAG_INSIDE && test_node == FLAG_BOUNDARYOVERLAP)
+        {
+          Terminate("this shouldn't happen any more");
+          return NODE_OPEN;
+        }
+      else
+        {
+          if(test_node == FLAG_INSIDE && test_point == FLAG_INSIDE)
+            mfp = &mf[HIGH_MESH];
+        }
+    }
+#endif
+
+  if(DoPM && r2 > mfp->rcut2)
+    {
+      /* check whether we can ignore any interactions along this branch */
+      MyIntPosType dist_x = nop_source->center[0] - intpos_n[0];
+      dist_x              = (((MySignedIntPosType)dist_x) >= 0) ? dist_x : -dist_x;
+      if(dist_x > mfp->intrcut[0] + halflen)
+        return NODE_DISCARD;
+
+      MyIntPosType dist_y = nop_source->center[1] - intpos_n[1];
+      dist_y              = (((MySignedIntPosType)dist_y) >= 0) ? dist_y : -dist_y;
+      if(dist_y > mfp->intrcut[1] + halflen)
+        return NODE_DISCARD;
+
+      MyIntPosType dist_z = nop_source->center[2] - intpos_n[2];
+      dist_z              = (((MySignedIntPosType)dist_z) >= 0) ? dist_z : -dist_z;
+      if(dist_z > mfp->intrcut[2] + halflen)
+        return NODE_DISCARD;
+    }
+#endif
+
+  MyReal len  = intlen * Tp->FacIntToCoord;
+  MyReal len2 = len * len;
+
+  if(All.RelOpeningCriterionInUse == 0) /* check Barnes-Hut opening criterion */
+    {
+      if(4 * len2 > r2 * errTolTheta2)
+        return NODE_OPEN;
+    }
+  else /* check relative opening criterion */
+    {
+      if(4 * len2 > r2 * errTolThetaMax2)
+        return NODE_OPEN;
+
+      MyReal mmax = (nop_source->mass < mass_n) ? mass_n : nop_source->mass;
+      MyReal amin = errTolForceAcc * ((aold < nop_source->MinOldAcc) ? aold : nop_source->MinOldAcc);
+
+#if(MULTIPOLE_ORDER == 1)
+      if(mmax > r2 * amin)
+        return NODE_OPEN;
+#elif(MULTIPOLE_ORDER == 2)
+      if(square(mmax * len) > r2 * square(r2 * amin))
+        return NODE_OPEN;
+#elif(MULTIPOLE_ORDER == 3)
+      if(mmax * len2 > r2 * r2 * amin)
+        return NODE_OPEN;
+#elif(MULTIPOLE_ORDER == 4)
+      if(square(mmax * len * len2) > r2 * square(r2 * r2 * amin))
+        return NODE_OPEN;
+#elif(MULTIPOLE_ORDER == 5)
+      if(mmax * len2 * len2 > r2 * r2 * r2 * amin)
+        return NODE_OPEN;
+#endif
+    }
+
+#if NSOFTCLASSES > 1
+  MyReal h_m = All.ForceSoftening[nop_source->getSofteningClass()];
+
+  if(h_m > h_n)
+    {
+      if(r2 < h_m * h_m)
+        if(All.ForceSoftening[nop_source->minsofttype] < All.ForceSoftening[nop_source->maxsofttype])
+          {
+            return NODE_OPEN;
+          }
+    }
+#endif
+
+  return NODE_USE;
+}
+
+/* function to account for interaction of two nodes in the tree */
+void fmm::fmm_force_interact(int no_sink, int no_source, char type_sink, char type_source, unsigned char shmrank_sink,
+                             unsigned char shmrank_source, int mintopleafnode, int committed)
+{
+  if(type_sink <= NODE_TYPE_FETCHED_PARTICLE && type_source <= NODE_TYPE_FETCHED_PARTICLE) /* particle-particle interaction */
+    {
+      /* nothing to be done if we do not deal with at least one local particle */
+      if(type_sink == NODE_TYPE_FETCHED_PARTICLE && type_source == NODE_TYPE_FETCHED_PARTICLE)
+        return;
+
+      if(no_sink != no_source || shmrank_source != shmrank_sink)  // exclude self-interaction
+        fmm_particle_particle_interaction(no_sink, no_source, type_sink, type_source, shmrank_sink, shmrank_source);
+    }
+  else if(!(type_sink > NODE_TYPE_FETCHED_PARTICLE && type_source > NODE_TYPE_FETCHED_PARTICLE)) /* cell-particle interaction */
+    {
+      /* we have arranged it such that the particle is always on the sink side, the node on the source side */
+
+      gravnode *noptr_source = get_nodep(no_source, shmrank_source);
+
+      /* nothing to be done if we do not deal with any local mass */
+      if(fmm_depends_on_local_mass(no_source, shmrank_source) == false &&
+         (type_sink == NODE_TYPE_FETCHED_PARTICLE ||
+          (type_sink < NODE_TYPE_FETCHED_PARTICLE && shmrank_sink != Shmem.Island_ThisTask)))
+        return;
+
+      if(noptr_source->not_empty == 0)
+        return;
+
+      if(no_source < MaxPart + MaxNodes)                  // we have a top-level node
+        if(noptr_source->nextnode >= MaxPart + MaxNodes)  // if the next node is not top-level, we have a leaf node
+          mintopleafnode = no_source;
+
+      MyReal r2;
+      vector<MyReal> dxyz;
+
+      int openflag = fmm_evaluate_particle_node_opening_criterion(no_sink, type_sink, shmrank_sink, noptr_source, dxyz, r2);
+
+      if(openflag == NODE_USE)
+        {
+          fmm_particle_node_interaction(no_sink, no_source, type_sink, type_source, shmrank_sink, shmrank_source, noptr_source, dxyz,
+                                        r2);
+        }
+      else if(openflag == NODE_OPEN) /* open cell in a cell-particle interaction */
+        {
+          if(noptr_source->cannot_be_opened_locally)
+            {
+              // are we in the same shared memory node?
+              if(Shmem.GetNodeIDForSimulCommRank[noptr_source->OriginTask] == Shmem.GetNodeIDForSimulCommRank[D->ThisTask])
+                {
+                  Terminate("this should not happen any more");
+                }
+              else
+                {
+                  tree_add_to_fetch_stack(noptr_source, no_source, shmrank_source);
+
+                  fmm_add_to_work_stack(no_sink, no_source, shmrank_sink, shmrank_source, mintopleafnode);
+                }
+            }
+          else
+            {
+              int min_buffer_space =
+                  std::min<int>(MaxOnWorkStack - (NumOnWorkStack + NewOnWorkStack), MaxOnFetchStack - NumOnFetchStack);
+
+              if(min_buffer_space >= committed + 8 * TREE_NUM_BEFORE_NODESPLIT)
+                fmm_open_node(no_sink, noptr_source, type_sink, shmrank_sink, mintopleafnode,
+                              committed + 8 * TREE_NUM_BEFORE_NODESPLIT);
+              else
+                fmm_add_to_work_stack(no_sink, no_source, shmrank_sink, shmrank_source, mintopleafnode);
+            }
+        }
+    }
+  else /* cell - cell interaction */
+    {
+      gravnode *noptr_sink   = get_nodep(no_sink, shmrank_sink);
+      gravnode *noptr_source = get_nodep(no_source, shmrank_source);
+
+      /* at least one of the cells needs to depend on local particles */
+      if(fmm_depends_on_local_mass(no_sink, shmrank_sink) || fmm_depends_on_local_mass(no_source, shmrank_source))
+        {
+          /* both cells need to be non-empty */
+          if(noptr_sink->not_empty != 0 && noptr_source->not_empty != 0)
+            {
+              if(noptr_sink == noptr_source) /* self-interaction */
+                {
+                  if(no_source < MaxPart + MaxNodes)                  // we have a top-level node
+                    if(noptr_source->nextnode >= MaxPart + MaxNodes)  // if the next node is not top-level, we have a leaf node
+                      mintopleafnode = no_source;
+
+                  if(noptr_sink->cannot_be_opened_locally)
+                    {
+                      Terminate("should not happen because we have a self-interaction of a supposedly local node");
+                    }
+                  else
+                    {
+                      int min_buffer_space =
+                          std::min<int>(MaxOnWorkStack - (NumOnWorkStack + NewOnWorkStack), MaxOnFetchStack - NumOnFetchStack);
+
+                      if(min_buffer_space >= committed + 8 * 8 * TREE_NUM_BEFORE_NODESPLIT * TREE_NUM_BEFORE_NODESPLIT)
+                        fmm_open_both(noptr_sink, noptr_sink, mintopleafnode,
+                                      committed + 8 * 8 * TREE_NUM_BEFORE_NODESPLIT * TREE_NUM_BEFORE_NODESPLIT);
+                      else
+                        fmm_add_to_work_stack(no_source, no_sink, shmrank_source, shmrank_sink, mintopleafnode);
+                    }
+                }
+              else
+                {
+                  MyReal r2;
+                  vector<MyReal> dxyz;
+
+                  int openflag = fmm_evaluate_node_node_opening_criterion(noptr_sink, noptr_source, dxyz, r2);
+
+                  if(openflag == NODE_USE)
+                    {
+                      /* evaluate the interaction */
+                      fmm_node_node_interaction(no_sink, no_source, type_sink, type_source, shmrank_sink, shmrank_source, noptr_sink,
+                                                noptr_source, dxyz, r2);
+                    }
+                  else if(openflag == NODE_OPEN)
+                    {
+                      /* open both */
+
+                      if(no_source < MaxPart + MaxNodes)                  // we have a top-level node
+                        if(noptr_source->nextnode >= MaxPart + MaxNodes)  // if the next node is not top-level, we have a leaf node
+                          mintopleafnode = std::min<int>(mintopleafnode, no_source);
+
+                      if(no_sink < MaxPart + MaxNodes)                  // we have a top-level node
+                        if(noptr_sink->nextnode >= MaxPart + MaxNodes)  // if the next node is not top-level, we have a leaf node
+                          mintopleafnode = std::min<int>(mintopleafnode, no_sink);
+
+                      if(noptr_source->cannot_be_opened_locally || noptr_sink->cannot_be_opened_locally)
+                        {
+                          if(noptr_source->cannot_be_opened_locally && noptr_sink->cannot_be_opened_locally)
+                            Terminate("this should not happen, because then both nodes would be foreign");
+
+                          if(noptr_source->cannot_be_opened_locally)
+                            tree_add_to_fetch_stack(noptr_source, no_source, shmrank_source);
+
+                          if(noptr_sink->cannot_be_opened_locally)
+                            tree_add_to_fetch_stack(noptr_sink, no_sink, shmrank_sink);
+
+                          fmm_add_to_work_stack(no_source, no_sink, shmrank_source, shmrank_sink, mintopleafnode);
+                        }
+                      else
+                        {
+                          int min_buffer_space =
+                              std::min<int>(MaxOnWorkStack - (NumOnWorkStack + NewOnWorkStack), MaxOnFetchStack - NumOnFetchStack);
+
+                          if(min_buffer_space >= committed + 8 * 8 * TREE_NUM_BEFORE_NODESPLIT * TREE_NUM_BEFORE_NODESPLIT)
+                            fmm_open_both(noptr_sink, noptr_source, mintopleafnode,
+                                          committed + 8 * 8 * TREE_NUM_BEFORE_NODESPLIT * TREE_NUM_BEFORE_NODESPLIT);
+                          else
+                            fmm_add_to_work_stack(no_source, no_sink, shmrank_source, shmrank_sink, mintopleafnode);
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
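+/*! \brief Recursively flags the top-level tree nodes that contain mass stored on the local MPI
+ *  rank, by OR-ing the flags of their daughter top-level nodes. Only such nodes need to accumulate
+ *  Taylor coefficients, which allows the FMM walk to skip work for branches that are entirely
+ *  foreign.
+ */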
+void fmm::fmm_determine_nodes_with_local_mass(int no, int sib)
+{
+  gravnode *nop = get_nodep(no, Shmem.Island_ThisTask);
+
+  int p = nop->nextnode;
+
+  /* if the next node is not a top-level node, we have reached a leaf node, and we need to do nothing */
+  if(p < MaxPart || p >= FirstNonTopLevelNode)
+    return;
+
+  unsigned char depends_on_local_mass = 0;
+
+  while(p != nop->sibling)
+    {
+      if(p >= 0)
+        {
+          if(p >= MaxPart && p < MaxPart + MaxNodes) /* we have an internal node */
+            {
+              int nextsib = get_nodep(p, Shmem.Island_ThisTask)->sibling;
+
+              fmm_determine_nodes_with_local_mass(p, nextsib);
+            }
+
+          if(p < MaxPart) /* a particle */
+            {
+              Terminate("stop");
+            }
+          else if(p < MaxPart + MaxNodes) /* an internal node  */
+            {
+              depends_on_local_mass |= Topnode_depends_on_local_mass[p];
+
+              p = get_nodep(p, Shmem.Island_ThisTask)->sibling;
+            }
+          else if(p < MaxPart + MaxNodes + D->NTopleaves) /* a pseudo particle */
+            {
+              /* a pseudo particle should not appear here as a direct daughter of a top-level node,
+               * so this case is treated as an error.
+               */
+              Terminate("stop");
+            }
+          else
+            {
+              Terminate("stop");
+            }
+        }
+    }
+
+  Topnode_depends_on_local_mass[no] = depends_on_local_mass;
+}
+
+void fmm::gravity_fmm(int timebin)
+{
+  interactioncountPP        = 0;
+  interactioncountPN        = 0;
+  interactioncountNN        = 0;
+  interactioncountEffective = 0;
+
+  TIMER_STORE;
+  TIMER_START(CPU_TREE);
+
+  D->mpi_printf("FMM: Begin tree force. timebin=%d  (presently allocated=%g MB)\n", timebin, Mem.getAllocatedBytesInMB());
+
+#ifdef PMGRID
+  set_mesh_factors();
+#endif
+
+  Topnode_depends_on_local_mass = (char *)Mem.mymalloc_clear("Topnode_depends_on_local_mass", D->NTopnodes * sizeof(char));
+  Topnode_depends_on_local_mass -= MaxPart;
+
+  for(int n = 0; n < D->NTopleaves; n++)
+    {
+      if(D->TaskOfLeaf[n] == D->ThisTask)
+        {
+          int no = NodeIndex[n];
+
+          if(TopNodes[no].not_empty)
+            Topnode_depends_on_local_mass[no] = 1;
+        }
+    }
+
+  fmm_determine_nodes_with_local_mass(MaxPart, -1);
+
+  TIMER_START(CPU_TREESTACK);
+
+  NumOnWorkStack         = 0;
+  AllocWorkStackBaseLow  = std::max<int>(1.5 * (Tp->NumPart + NumPartImported), TREE_MIN_WORKSTACK_SIZE);
+  AllocWorkStackBaseHigh = AllocWorkStackBaseLow + TREE_EXPECTED_CYCLES * TREE_MIN_WORKSTACK_SIZE;
+  MaxOnWorkStack         = AllocWorkStackBaseLow;
+
+  FMM_WorkStack   = (fmm_workstack_data *)Mem.mymalloc("FMM_WorkStack", AllocWorkStackBaseHigh * sizeof(fmm_workstack_data));
+  ResultIndexList = (int *)Mem.mymalloc("ResultIndexList", NumPartImported * sizeof(int));
+
+  for(int i = 0; i < Tp->TimeBinsGravity.NActiveParticles; i++)
+    {
+      int target = Tp->TimeBinsGravity.ActiveParticleList[i];
+
+      /* let's do a safety check here to protect against accidental use of zero softening lengths */
+      int softtype = Tp->P[target].getSofteningClass();
+      if(All.ForceSoftening[softtype] == 0)
+        Terminate("Particle with ID=%lld of type=%d and softening type=%d was assigned zero softening\n",
+                  (long long)Tp->P[target].ID.get(), Tp->P[target].getType(), softtype);
+    }
+
+  int ncount = 0;
+
+  for(int i = 0; i < NumPartImported; i++)
+    {
+#ifndef HIERARCHICAL_GRAVITY
+      if(Points[i].ActiveFlag)
+#endif
+        {
+          ResultIndexList[i] = ncount++;
+        }
+    }
+
+  NumOnWorkStack = 0;
+  NewOnWorkStack = 0;
+
+  /* to start, request the self-interaction of the root node with itself */
+  fmm_add_to_work_stack(MaxPart, MaxPart, Shmem.Island_ThisTask, Shmem.Island_ThisTask, MaxPart + D->NTopnodes);
+
+  NumOnWorkStack = NewOnWorkStack;
+
+  ResultsActiveImported =
+      (resultsactiveimported_data *)Mem.mymalloc_clear("ResultsActiveImported", ncount * sizeof(resultsactiveimported_data));
+
+  TaylorCoeff = (taylor_data *)Mem.mymalloc_clear("TaylorCoeff", NumNodes * sizeof(taylor_data));
+  TaylorCoeff -= MaxPart;
+
+  /******************************************/
+  /* now execute the tree walk calculations */
+  /******************************************/
+
+  errTolForceAcc  = All.ErrTolForceAcc;
+  errTolThetaMax2 = All.ErrTolThetaMax * All.ErrTolThetaMax;
+  errTolTheta2    = All.ErrTolTheta * All.ErrTolTheta;
+
+  sum_NumForeignNodes  = 0;
+  sum_NumForeignPoints = 0;
+
+  // set a default size of the fetch stack equal to a tenth of the particle number (this may still be somewhat too large)
+  MaxOnFetchStack = std::max<int>(0.1 * (Tp->NumPart + NumPartImported), TREE_MIN_WORKSTACK_SIZE);
+  StackToFetch    = (fetch_data *)Mem.mymalloc_movable(&StackToFetch, "StackToFetch", MaxOnFetchStack * sizeof(fetch_data));
+
+  // let's grab at most about a third of the still available memory for imported points and nodes
+  int nspace = (0.33 * Mem.FreeBytes) / (sizeof(gravnode) + 8 * sizeof(foreign_gravpoint_data));
+
+  MaxForeignNodes  = nspace;
+  MaxForeignPoints = 8 * nspace;
+  NumForeignNodes  = 0;
+  NumForeignPoints = 0;
+
+  /* the following two arrays hold imported tree nodes and imported points to augment the local tree */
+  Foreign_Nodes  = (gravnode *)Mem.mymalloc_movable(&Foreign_Nodes, "Foreign_Nodes", MaxForeignNodes * sizeof(gravnode));
+  Foreign_Points = (foreign_gravpoint_data *)Mem.mymalloc_movable(&Foreign_Points, "Foreign_Points",
+                                                                  MaxForeignPoints * sizeof(foreign_gravpoint_data));
+
+  tree_initialize_leaf_node_access_info();
+
+  TIMER_STOP(CPU_TREESTACK);
+
+  double t0       = Logs.second();
+  int max_ncycles = 0;
+
+  prepare_shared_memory_access();
+
+#ifdef PRESERVE_SHMEM_BINARY_INVARIANCE
+  for(int rep = 0; rep < 2; rep++)
+    {
+      if(rep == 0)
+        {
+          skip_actual_force_computation = true;
+        }
+      else
+        {
+          skip_actual_force_computation = false;
+
+          NumOnWorkStack = 0;
+          NewOnWorkStack = 0;
+
+          /* to start, request the self-interaction of the root node with itself */
+          fmm_add_to_work_stack(MaxPart, MaxPart, Shmem.Island_ThisTask, Shmem.Island_ThisTask, MaxPart + D->NTopnodes);
+
+          NumOnWorkStack = NewOnWorkStack;
+        }
+#endif
+
+      while(NumOnWorkStack > 0)  // repeat until we are out of work
+        {
+          NewOnWorkStack  = 0;  // gives the new entries
+          NumOnFetchStack = 0;
+          MaxOnWorkStack  = std::min<int>(AllocWorkStackBaseLow + max_ncycles * TREE_MIN_WORKSTACK_SIZE, AllocWorkStackBaseHigh);
+
+          TIMER_START(CPU_TREEWALK);
+
+          int item = 0;
+
+          while(item < NumOnWorkStack)
+            {
+              int min_buffer_space =
+                  std::min<int>(MaxOnWorkStack - (NumOnWorkStack + NewOnWorkStack), MaxOnFetchStack - NumOnFetchStack);
+
+              int committed = 8 * 8 * TREE_NUM_BEFORE_NODESPLIT * TREE_NUM_BEFORE_NODESPLIT;
+
+              if(min_buffer_space >= committed)
+                {
+                  int no1                = FMM_WorkStack[item].Node1;
+                  int no2                = FMM_WorkStack[item].Node2;
+                  unsigned char shmrank1 = FMM_WorkStack[item].ShmRank1;
+                  unsigned char shmrank2 = FMM_WorkStack[item].ShmRank2;
+                  int mintopleaf         = FMM_WorkStack[item].MinTopLeafNode;
+                  item++;
+
+                  char type1 = 0, type2 = 0;
+
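+                  /* classify both entries of the work item by the index range they fall into: indices below MaxPart
+                   * denote local particles, [MaxPart, MaxPart+MaxNodes) local tree nodes, [ImportedNodeOffset,
+                   * EndOfTreePoints) imported tree-point particles, [EndOfTreePoints, EndOfForeignNodes) fetched
+                   * foreign nodes, and indices beyond that fetched foreign particles */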
+                  if(no1 < MaxPart) /* a local particle */
+                    type1 = NODE_TYPE_LOCAL_PARTICLE;
+                  else if(no1 < MaxPart + MaxNodes) /* an internal node  */
+                    type1 = NODE_TYPE_LOCAL_NODE;
+                  else if(no1 >= ImportedNodeOffset && no1 < EndOfTreePoints) /* an imported Treepoint particle  */
+                    type1 = NODE_TYPE_TREEPOINT_PARTICLE;
+                  else if(no1 >= EndOfTreePoints && no1 < EndOfForeignNodes) /* an imported LET node */
+                    type1 = NODE_TYPE_FETCHED_NODE;
+                  else if(no1 >= EndOfForeignNodes) /* an imported LED particle */
+                    type1 = NODE_TYPE_FETCHED_PARTICLE;
+
+                  if(no2 < MaxPart) /* a local particle */
+                    type2 = NODE_TYPE_LOCAL_PARTICLE;
+                  else if(no2 < MaxPart + MaxNodes) /* an internal node  */
+                    type2 = NODE_TYPE_LOCAL_NODE;
+                  else if(no2 >= ImportedNodeOffset && no2 < EndOfTreePoints) /* an imported Treepoint particle  */
+                    type2 = NODE_TYPE_TREEPOINT_PARTICLE;
+                  else if(no2 >= EndOfTreePoints && no2 < EndOfForeignNodes) /* an imported LET node */
+                    type2 = NODE_TYPE_FETCHED_NODE;
+                  else if(no2 >= EndOfForeignNodes) /* an imported LED particle */
+                    type2 = NODE_TYPE_FETCHED_PARTICLE;
+
+                  if(no1 == MaxPart && no2 == MaxPart)
+                    {
+                      // we have the interaction between the two root nodes
+                      fmm_force_interact(no1, no2, type1, type2, shmrank1, shmrank2, mintopleaf, committed);
+                    }
+                  else
+                    {
+                      if(type1 > NODE_TYPE_FETCHED_PARTICLE && type2 > NODE_TYPE_FETCHED_PARTICLE)
+                        {
+                          /* node-node interaction */
+
+                          gravnode *nop1 = get_nodep(no1, shmrank1);
+                          gravnode *nop2 = get_nodep(no2, shmrank2);
+
+                          if(nop1->cannot_be_opened_locally || nop2->cannot_be_opened_locally)
+                            Terminate("how can this be");
+
+                          fmm_open_both(nop1, nop2, mintopleaf, committed);
+                        }
+                      else
+                        {
+                          /* particle-node interaction, the particle should be on the sink side */
+
+                          // we have a node that we previously could not open
+                          gravnode *nop2 = get_nodep(no2, shmrank2);
+
+                          if(nop2->cannot_be_opened_locally)
+                            Terminate("now we should be able to open it!");
+
+                          fmm_open_node(no1, nop2, type1, shmrank1, mintopleaf, committed);
+                        }
+                    }
+                }
+              else
+                break;
+            }
+
+          if(item == 0 && NumOnWorkStack > 0)
+            Terminate("Can't even process a single particle");
+
+          TIMER_STOP(CPU_TREEWALK);
+
+          TIMER_START(CPU_TREEFETCH);
+
+          tree_fetch_foreign_nodes(FETCH_GRAVTREE);
+
+          TIMER_STOP(CPU_TREEFETCH);
+
+          TIMER_START(CPU_TREESTACK);
+
+          /* remove the processed entries from the workstack, keeping the residual items together with the
+           * newly created ones that still need to be processed */
+          NumOnWorkStack = NumOnWorkStack - item + NewOnWorkStack;
+          memmove(FMM_WorkStack, FMM_WorkStack + item, NumOnWorkStack * sizeof(fmm_workstack_data));
+
+          /* now let's sort such that we can go deep on top-level node branches, allowing us to clear them out eventually */
+          mycxxsort(FMM_WorkStack, FMM_WorkStack + NumOnWorkStack, compare_fmm_workstack);
+
+          TIMER_STOP(CPU_TREESTACK);
+
+          max_ncycles++;
+        }
+
+#ifdef PRESERVE_SHMEM_BINARY_INVARIANCE
+    }
+#endif
+
+  TIMER_START(CPU_TREEIMBALANCE);
+
+  MPI_Allreduce(MPI_IN_PLACE, &max_ncycles, 1, MPI_INT, MPI_MAX, D->Communicator);
+
+  TIMER_STOP(CPU_TREEIMBALANCE);
+
+  cleanup_shared_memory_access();
+
+  /* free temporary buffers */
+
+  Mem.myfree(Foreign_Points);
+  Mem.myfree(Foreign_Nodes);
+  Mem.myfree(StackToFetch);
+
+  TIMER_START(CPU_TREEWALK);
+
+  taylor_data taylor_current{};  // note: the curly braces initialize this to zero in this case
+
+  /* propagate node expansions to particles */
+  if(fmm_depends_on_local_mass(MaxPart, Shmem.Island_ThisTask))
+    fmm_force_passdown(MaxPart, Shmem.Island_ThisTask, taylor_current);
+
+  TIMER_STOP(CPU_TREEWALK);
+
+  Mem.myfree(TaylorCoeff + MaxPart);
+
+  double t1 = Logs.second();
+
+  D->mpi_printf("FMM: Forces calculated, with %d cycles took %g sec\n", max_ncycles, Logs.timediff(t0, t1));
+
+  /* now communicate the forces in ResultsActiveImported */
+  gravity_exchange_forces();
+
+  Mem.myfree(ResultsActiveImported);
+  Mem.myfree(ResultIndexList);
+  Mem.myfree(FMM_WorkStack);
+
+  TIMER_STOP(CPU_TREE);
+
+  D->mpi_printf("FMM: tree-force is done.\n");
+
+  /*  gather some diagnostic information */
+
+  TIMER_START(CPU_LOGS);
+
+  struct detailed_timings
+  {
+    double all, tree, wait, fetch, stack, lastpm;
+    double costtotal, numnodes;
+    double interactioncountEffective;
+    double interactioncountPP, interactioncountPN, interactioncountNN;
+    double NumForeignNodes, NumForeignPoints;
+    double fillfacFgnNodes, fillfacFgnPoints;
+    double sumcost;
+  };
+  detailed_timings timer, tisum, timax;
+
+  memset(&timer, 0, sizeof(detailed_timings));
+
+  if(MeasureCostFlag)
+    {
+      double sum = 0;
+      for(int i = 0; i < Tp->NumPart; i++)
+        if(Tp->TimeBinSynchronized[Tp->P[i].TimeBinGrav])
+          sum += Tp->P[i].GravCost;
+
+      timer.sumcost = sum;
+    }
+
+  timer.tree                      = TIMER_DIFF(CPU_TREEWALK);
+  timer.wait                      = TIMER_DIFF(CPU_TREEIMBALANCE);
+  timer.fetch                     = TIMER_DIFF(CPU_TREEFETCH);
+  timer.stack                     = TIMER_DIFF(CPU_TREESTACK);
+  timer.all                       = timer.tree + timer.wait + timer.fetch + timer.stack + TIMER_DIFF(CPU_TREE);
+  tisum.lastpm                    = All.CPUForLastPMExecution;
+  timer.costtotal                 = interactioncountPP + interactioncountEffective;
+  timer.interactioncountPP        = interactioncountPP;
+  timer.interactioncountPN        = interactioncountPN;
+  timer.interactioncountNN        = interactioncountNN;
+  timer.interactioncountEffective = interactioncountEffective;
+  timer.NumForeignNodes           = NumForeignNodes;
+  timer.NumForeignPoints          = NumForeignPoints;
+  timer.fillfacFgnNodes           = NumForeignNodes / ((double)MaxForeignNodes);
+  timer.fillfacFgnPoints          = NumForeignPoints / ((double)MaxForeignPoints);
+  timer.numnodes                  = NumNodes;
+
+  MPI_Reduce((double *)&timer, (double *)&tisum, (int)(sizeof(detailed_timings) / sizeof(double)), MPI_DOUBLE, MPI_SUM, 0,
+             D->Communicator);
+  MPI_Reduce((double *)&timer, (double *)&timax, (int)(sizeof(detailed_timings) / sizeof(double)), MPI_DOUBLE, MPI_MAX, 0,
+             D->Communicator);
+
+  All.TotNumOfForces += Tp->TimeBinsGravity.GlobalNActiveParticles;
+
+  if(D->ThisTask == 0)
+    {
+      fprintf(Logs.FdTimings, "Nf=%9lld FMM  timebin=%d  total-Nf=%lld\n", Tp->TimeBinsGravity.GlobalNActiveParticles, timebin,
+              All.TotNumOfForces);
+      fprintf(Logs.FdTimings, "   work-load balance: %g   part/sec: raw=%g, effective=%g     ia/part: avg=%g   (%g|%g|%g)\n",
+              timax.tree / ((tisum.tree + 1e-20) / D->NTask), Tp->TimeBinsGravity.GlobalNActiveParticles / (tisum.tree + 1.0e-20),
+              Tp->TimeBinsGravity.GlobalNActiveParticles / ((timax.tree + 1.0e-20) * D->NTask),
+              tisum.costtotal / (Tp->TimeBinsGravity.GlobalNActiveParticles + 1.0e-20),
+              tisum.interactioncountPP / (Tp->TimeBinsGravity.GlobalNActiveParticles + 1.0e-20),
+              tisum.interactioncountPN / (Tp->TimeBinsGravity.GlobalNActiveParticles + 1.0e-20),
+              tisum.interactioncountNN / (Tp->TimeBinsGravity.GlobalNActiveParticles + 1.0e-20));
+      fprintf(Logs.FdTimings,
+              "   maximum number of nodes: %g, filled: %g  NumForeignNodes: max=%g avg=%g fill=%g NumForeignPoints: max=%g avg=%g "
+              "fill=%g  cycles=%d\n",
+              timax.numnodes, timax.numnodes / MaxNodes, timax.NumForeignNodes, tisum.NumForeignNodes / D->NTask,
+              timax.fillfacFgnNodes, timax.NumForeignPoints, tisum.NumForeignPoints / D->NTask, timax.fillfacFgnPoints, max_ncycles);
+      fprintf(Logs.FdTimings,
+              "   avg times: <all>=%g  <tree>=%g  <wait>=%g  <fetch>=%g  <stack>=%g  "
+              "(lastpm=%g) sec\n",
+              tisum.all / D->NTask, tisum.tree / D->NTask, tisum.wait / D->NTask, tisum.fetch / D->NTask, tisum.stack / D->NTask,
+              tisum.lastpm / D->NTask);
+      fprintf(Logs.FdTimings, "   total interaction cost: %g  (imbalance=%g)  total cost measure: %g %g\n", tisum.costtotal,
+              timax.costtotal / (tisum.costtotal / D->NTask), tisum.sumcost,
+              tisum.interactioncountPP + tisum.interactioncountEffective);
+      myflush(Logs.FdTimings);
+    }
+
+  Mem.myfree(Topnode_depends_on_local_mass + MaxPart);
+
+  TIMER_STOP(CPU_LOGS);
+}
+
+#endif
diff --git a/src/fmm/fmm.h b/src/fmm/fmm.h
new file mode 100644
index 0000000000000000000000000000000000000000..46db767f4992dce774f9347fea6902b259b82e3d
--- /dev/null
+++ b/src/fmm/fmm.h
@@ -0,0 +1,154 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file fmm.h
+ *
+ *  \brief declares the class used for the fast multipole method (FMM)
+ */
+
+#ifndef FMM_H
+#define FMM_H
+
+#ifdef FMM
+
+#include "../data/symtensors.h"
+#include "../gravtree/gravtree.h"
+
+class fmm : public gravtree<simparticles>
+{
+ public:
+  void gravity_fmm(int timebin);
+
+ private:
+  long long interactioncountPP;
+  long long interactioncountPN;
+  long long interactioncountNN;
+  long long sum_NumForeignNodes;
+  long long sum_NumForeignPoints;
+  long long interactioncountEffective;
+
+  char *Topnode_depends_on_local_mass;
+
+  MyReal errTolTheta2;
+  MyReal errTolThetaMax2;
+  MyReal errTolForceAcc;
+
+#ifdef PRESERVE_SHMEM_BINARY_INVARIANCE
+  bool skip_actual_force_computation;
+#endif
+
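+  /* fieldcoeff collects the coefficients of the field (Taylor) expansion accumulated for a node: the potential, its
+   * first derivative at the node's center of mass and, depending on MULTIPOLE_ORDER, higher derivatives. In
+   * fmm_force_passdown() these coefficients are propagated down the tree and finally applied to the particles. */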
+  struct fieldcoeff
+  {
+    MyReal phi;
+    vector<MyReal> dphi; /* first potential derivative at center of mass (i.e. minus the acceleration) */
+#if(MULTIPOLE_ORDER >= 2)
+    symtensor2<MyReal> d2phi; /* second derivatives  */
+#endif
+#if(MULTIPOLE_ORDER >= 3)
+    symtensor3<MyReal> d3phi; /* third derivatives */
+#endif
+#if(MULTIPOLE_ORDER >= 4)
+    symtensor4<MyReal> d4phi; /* fourth derivatives */
+#endif
+#if(MULTIPOLE_ORDER >= 5)
+    symtensor5<MyReal> d5phi; /* fifth derivatives */
+#endif
+
+    MyReal interactions;
+  };
+
+  struct taylor_data
+  {
+    fieldcoeff coeff;
+  };
+  taylor_data *TaylorCoeff;
+
+  struct fmm_workstack_data
+  {
+    int Node1;
+    int Node2;
+    unsigned char ShmRank1;
+    unsigned char ShmRank2;
+    int MinTopLeafNode;
+  };
+
+  fmm_workstack_data *FMM_WorkStack;
+
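+  /* orders work items primarily by the smallest top-level leaf node they touch (and then by the first node index),
+   * so that interactions belonging to the same top-level branch are processed together and such branches can be
+   * cleared out early */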
+  static bool compare_fmm_workstack(const fmm_workstack_data &a, const fmm_workstack_data &b)
+  {
+    if(a.MinTopLeafNode < b.MinTopLeafNode)
+      return true;
+    if(a.MinTopLeafNode > b.MinTopLeafNode)
+      return false;
+
+    return a.Node1 < b.Node1;
+  }
+
+  inline void fmm_add_to_work_stack(int node1, int node2, unsigned char shmrank1, unsigned char shmrank2, int mintopleafnode)
+  {
+    if(NumOnWorkStack + NewOnWorkStack >= MaxOnWorkStack)
+      {
+        Terminate("we have run out of space:  NumOnWorkStack=%d + NewOnWorkStack=%d >= MaxOnWorkStack=%d", NumOnWorkStack,
+                  NewOnWorkStack, MaxOnWorkStack);
+      }
+
+    FMM_WorkStack[NumOnWorkStack + NewOnWorkStack].Node1          = node1;
+    FMM_WorkStack[NumOnWorkStack + NewOnWorkStack].Node2          = node2;
+    FMM_WorkStack[NumOnWorkStack + NewOnWorkStack].ShmRank1       = shmrank1;
+    FMM_WorkStack[NumOnWorkStack + NewOnWorkStack].ShmRank2       = shmrank2;
+    FMM_WorkStack[NumOnWorkStack + NewOnWorkStack].MinTopLeafNode = mintopleafnode;
+
+    NewOnWorkStack++;
+  }
+
+  inline bool fmm_depends_on_local_mass(int no, unsigned char shmrank)
+  {
+    if(no >= MaxPart && no < FirstNonTopLevelNode)
+      {
+        if(Topnode_depends_on_local_mass[no])
+          return true;
+        else
+          return false;
+      }
+    else if(no >= FirstNonTopLevelNode && no < MaxPart + MaxNodes)
+      {
+        if(shmrank == Shmem.Island_ThisTask)
+          return true;
+        else
+          return false;
+      }
+    else
+      return false;
+  }
+
+  void fmm_force_interact(int no_sink, int no_source, char type_sink, char type_source, unsigned char shmrank_sink,
+                          unsigned char shmrank_source, int mintopleafnode, int committed);
+
+  void fmm_force_passdown(int no, unsigned char shmrank, taylor_data taylor_current);
+  void fmm_open_both(gravnode *noptr_sink, gravnode *noptr_source, int mintopleafnode, int committed);
+
+  void fmm_open_node(int no_particle, gravnode *nop, char type_particle, unsigned char shmrank_particle, int mintopleafnode,
+                     int committed);
+  void fmm_particle_particle_interaction(int no_sink, int no_source, int type_sink, int type_source, unsigned char shmrank_sink,
+                                         unsigned char shmrank_source);
+
+  void fmm_particle_node_interaction(int no_sink, int no_source, int type_sink, int type_source, unsigned char shmrank_sink,
+                                     unsigned char shmrank_source, gravnode *noptr_source, vector<MyReal> &dxyz, MyReal &r2);
+
+  void fmm_node_node_interaction(int no_sink, int no_source, int type_sink, int type_source, unsigned char shmrank_sink,
+                                 unsigned char shmrank_source, gravnode *noptr_sink, gravnode *noptr_source, vector<MyReal> &dxyz,
+                                 MyReal &r2);
+
+  int fmm_evaluate_node_node_opening_criterion(gravnode *noptr_sink, gravnode *noptr_source, vector<MyReal> &dxyz, MyReal &r2);
+
+  int fmm_evaluate_particle_node_opening_criterion(int no_sink, char type_sink, unsigned char shmrank_sink, gravnode *nop_source,
+                                                   vector<MyReal> &dxyz, MyReal &r2);
+
+  void fmm_determine_nodes_with_local_mass(int no, int sib);
+};
+
+#endif
+#endif
diff --git a/src/fof/fof.cc b/src/fof/fof.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eff678c2a4dec811e3d48d03f3e4619141f420f0
--- /dev/null
+++ b/src/fof/fof.cc
@@ -0,0 +1,1200 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file fof.cc
+ *
+ *  \brief main routines of parallel FoF group finder
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef FOF
+
+#include <mpi.h>
+#include <algorithm>
+#include <climits>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../fof/fof.h"
+#include "../fof/fof_io.h"
+#include "../gravtree/gravtree.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../ngbtree/ngbtree.h"
+#include "../sort/cxxsort.h"
+#include "../sort/parallel_sort.h"
+#include "../sort/peano.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+
+using namespace std;
+
+/*! Computation of a FOF group catalogue.
+ *
+ * \param num
+ * If called with -1, only the FOF group finding is carried out and no group catalogues are saved to disk; for
+ * num >= 0, the code stores the group/subhalo catalogues and brings the particles into output order.
+ * In the latter case, the calling routine (which is normally write_snapshot()) needs to free PS[] and bring
+ * the particles back into their original order.
+ */
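+/* A usage sketch (the object name and catalogue basenames are illustrative only, not prescribed by the code):
+ *
+ *   FoF->fof_fof(-1, "fof", "groups", 0);    // on-the-fly group finding, nothing written to disk
+ *   FoF->fof_fof(num, "fof", "groups", 0);   // num >= 0: additionally store the catalogue and reorder the particles
+ */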
+template <typename partset>
+void fof<partset>::fof_fof(int num, const char *grpcat_basename, const char *grpcat_dirbasename, double inner_distance)
+{
+  TIMER_START(CPU_FOF);
+
+  mpi_printf("FOF: Begin to compute FoF group catalogue...  (presently allocated=%g MB)\n", Mem.getAllocatedBytesInMB());
+
+  double ta = Logs.second();
+
+  /* determine linking length */
+  Tp->LinkL = fof_get_comoving_linking_length();
+  mpi_printf("FOF: Comoving linking length: %g\n", Tp->LinkL);
+
+  /* allocate link lists for book-keeping local particle sets */
+#if defined(LIGHTCONE_PARTICLES_GROUPS)
+  Tp->DistanceOrigin = (double *)Mem.mymalloc("DistanceOrigin", Tp->NumPart * sizeof(double));
+#endif
+
+  Tp->MinID     = (MyIDStorage *)Mem.mymalloc("MinID", Tp->NumPart * sizeof(MyIDStorage));  // smallest particle ID within FOF group
+  Tp->MinIDTask = (int *)Mem.mymalloc("MinIDTask", Tp->NumPart * sizeof(int));              // processor on which this ID is stored
+  Tp->Head = (int *)Mem.mymalloc("Head", Tp->NumPart * sizeof(int));  // first particle in chaining list of local FOF group segment
+  Tp->Next = (int *)Mem.mymalloc("Next", Tp->NumPart * sizeof(int));  // next particle in chaining list
+  Tp->Tail = (int *)Mem.mymalloc("Tail", Tp->NumPart * sizeof(int));  // points to last particle in chaining list
+  Tp->Len  = (int *)Mem.mymalloc("Len", Tp->NumPart * sizeof(int));  // length of local FOF group segment (note: 32 bit enough even for
+                                                                     // huge groups because they are split across processors)
+
+  /* initialize link-lists, each particle is in a group of its own initially */
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+      Tp->Head[i] = Tp->Tail[i] = i;
+      Tp->Len[i]                = 1;
+      Tp->Next[i]               = -1;
+      Tp->MinID[i]              = Tp->P[i].ID;
+      Tp->MinIDTask[i]          = ThisTask;
+
+#if defined(LIGHTCONE_PARTICLES_GROUPS)
+      Tp->DistanceOrigin[i] = fof_distance_to_origin(i);
+      Tp->P[i].setFlagSaveDistance();
+#endif
+    }
+
+    /* make a list with the particles that are of the primary link type(s) */
+#ifdef LEAN
+  int npart       = Tp->NumPart;
+  int *targetlist = NULL;
+#else
+  int npart       = 0;
+  int *targetlist = (int *)Mem.mymalloc("targetlist", Tp->NumPart * sizeof(int));
+  for(int i = 0; i < Tp->NumPart; i++)
+    if(is_type_primary_link_type(Tp->P[i].getType()))
+      targetlist[npart++] = i;
+#endif
+
+  /* build neighbour tree with primary link particles only */
+  FoFNgbTree.treeallocate(Tp->NumPart, Tp, FoFDomain);
+  FoFNgbTree.treebuild(npart, targetlist);
+
+  /* call routine to find groups made up of the primary particle types */
+  double cputime = fof_find_groups();
+  mpi_printf("FOF: primary group finding took = %g sec\n", cputime);
+
+  /* call routine to attach secondary particles/cells to primary groups */
+  cputime = fof_find_nearest_dmparticle();
+  mpi_printf("FOF: attaching gas and star particles to nearest dm particles took = %g sec\n", cputime);
+
+  /* free some arrays that are not needed any more */
+  FoFNgbTree.treefree();
+#ifndef LEAN
+  Mem.myfree(targetlist);
+#endif
+  Mem.myfree(Tp->Len);
+  Mem.myfree(Tp->Tail);
+  Mem.myfree(Tp->Next);
+
+#if defined(LIGHTCONE_PARTICLES_GROUPS)
+  double save_distance = 0;
+  if(inner_distance > 0)
+    save_distance = inner_distance + Tp->LinkL;
+#endif
+
+  double t0 = Logs.second();
+
+  /* transfer the still required link list information to a somewhat smaller structure, "FOF_PList"
+   * particles that are in the same group have the same MinID. The MinIDTask variable informs about the processor, on which
+   * the particle which has ID=MinID is stored (and this particle is also member of the group).
+   */
+  FOF_PList = (fof_particle_list *)Mem.mymalloc_movable(&FOF_PList, "FOF_PList", Tp->NumPart * sizeof(fof_particle_list));
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+      FOF_PList[i].MinID     = Tp->MinID[Tp->Head[i]];
+      FOF_PList[i].MinIDTask = Tp->MinIDTask[Tp->Head[i]];
+      FOF_PList[i].Pindex    = i;
+
+#if defined(LIGHTCONE_PARTICLES_GROUPS)
+      FOF_PList[i].DistanceOrigin = Tp->DistanceOrigin[Tp->Head[i]];
+
+      if(Tp->DistanceOrigin[Tp->Head[i]] < save_distance)
+        Tp->P[i].clearFlagSaveDistance();
+#endif
+    }
+
+  /* free the rest of the original link lists */
+  Mem.myfree_movable(Tp->Head);
+  Mem.myfree_movable(Tp->MinIDTask);
+  Mem.myfree_movable(Tp->MinID);
+#if defined(LIGHTCONE_PARTICLES_GROUPS)
+  Mem.myfree_movable(Tp->DistanceOrigin);
+#endif
+
+  /* Allocate a list of group pieces in FOF_GList, one entry for each local group segment.
+   * If a group is split across processor boundaries, it will appear on each of the processors with
+   * a segment.
+   */
+  FOF_GList = (fof_group_list *)Mem.mymalloc_movable(&FOF_GList, "FOF_GList", sizeof(fof_group_list) * Tp->NumPart);
+
+  fof_compile_catalogue(inner_distance);
+
+  /* now determine total group number, largest group size, and output some log messages */
+  double t1 = Logs.second();
+  mpi_printf("FOF: compiling local group data and catalogue took = %g sec\n", Logs.timediff(t0, t1));
+
+  sumup_large_ints(1, &Ngroups, &TotNgroups, Communicator);
+  sumup_longs(1, &Nids, &TotNids, Communicator);
+
+  /* determine the largest group size */
+  long long largestgroup = 0;
+  if(TotNgroups > 0)
+    {
+      long long largestloc = 0;
+
+      for(int i = 0; i < NgroupsExt; i++)
+        if(FOF_GList[i].Count > largestloc)
+          largestloc = FOF_GList[i].Count;
+      MPI_Allreduce(&largestloc, &largestgroup, 1, MPI_LONG_LONG, MPI_MAX, Communicator);
+    }
+
+  mpi_printf("FOF: Total number of FOF groups with at least %d particles: %lld\n", FOF_GROUP_MIN_LEN, TotNgroups);
+  mpi_printf("FOF: Largest FOF group has %lld particles.\n", largestgroup);
+  mpi_printf("FOF: Total number of particles in FOF groups: %lld\n", TotNids);
+
+  t0 = Logs.second();
+
+  /* now allocate some storage for the group catalogue, and begin to fill it in with partial
+   * group properties
+   */
+  MaxNgroups = NgroupsExt;
+  Group      = (group_properties *)Mem.mymalloc_movable(&Group, "Group", sizeof(group_properties) * MaxNgroups);
+
+  mpi_printf("FOF: group properties are now allocated.. (presently allocated=%g MB)\n", Mem.getAllocatedBytesInMB());
+
+  /* Sort FOF_GList according to MinID. We are going to match this with FOF_PList (still ordered according to MinID) to get
+   * access to the particles making up each group piece.
+   */
+  mycxxsort(FOF_GList, FOF_GList + NgroupsExt, fof_compare_FOF_GList_MinID);
+
+  /* compute partial group properties for each local group segment */
+  long long count_nids = 0;
+  for(int i = 0, start = 0; i < NgroupsExt; i++)
+    {
+      while(FOF_PList[start].MinID.get() < FOF_GList[i].MinID.get())
+        {
+          start++;
+          if(start > Tp->NumPart)
+            Terminate("start > Tp->NumPart");
+        }
+
+      if(FOF_PList[start].MinID.get() != FOF_GList[i].MinID.get())
+        Terminate("ID mismatch");
+
+      int lenloc = 0;
+      for(lenloc = 0; start + lenloc < Tp->NumPart;)
+        if(FOF_PList[start + lenloc].MinID.get() == FOF_GList[i].MinID.get())
+          lenloc++;
+        else
+          break;
+
+      Group[i].MinID     = FOF_GList[i].MinID.get();
+      Group[i].MinIDTask = FOF_GList[i].MinIDTask;
+      Group[i].Len       = FOF_GList[i].Count;
+
+      /* calculate velocity dispersion etc for a local group segment */
+      fof_compute_group_properties(i, start, lenloc);
+
+      start += lenloc;
+      count_nids += lenloc;
+    }
+
+  Mem.myfree_movable(FOF_GList);
+  FOF_GList = NULL;
+
+  /* do a sanity check */
+  long long totNids;
+  sumup_longs(1, &count_nids, &totNids, Communicator);
+  if(totNids != TotNids)
+    Terminate("Task=%d Nids=%lld count_nids=%lld totNids=%lld TotNids=%lld\n", ThisTask, Nids, count_nids, totNids, TotNids);
+
+  /* add in group properties for each external group segment */
+  fof_add_in_properties_of_group_segments();
+
+  t1 = Logs.second();
+  mpi_printf("FOF: computation of group properties took = %g sec\n", Logs.timediff(t0, t1));
+
+  /* now we assign group numbers */
+  fof_assign_group_numbers();
+
+  Mem.myfree_movable(FOF_PList);
+  FOF_PList = NULL;
+
+  /* finalize the computation of the properties of the groups, and prune the list to just contain the main (local) segment */
+  fof_finish_group_properties();
+
+  /* Sort the groups in parallel according to group-number, which will be our output order. */
+  mycxxsort_parallel(Group, Group + Ngroups, fof_compare_Group_GroupNr, Communicator);
+
+  fof_assign_group_offset();
+
+  double tb = Logs.second();
+
+  mpi_printf("FOF: Finished computing FoF groups.  Complete work took %g sec  (presently allocated=%g MB)\n", Logs.timediff(ta, tb),
+             Mem.getAllocatedBytesInMB());
+
+#ifdef SUBFIND
+  if(num >= 0)
+    {
+      TIMER_STOP(CPU_FOF);
+
+      char catname[1000];
+      sprintf(catname, "%s_subhalo_tab", grpcat_basename);
+
+      subfind_find_subhalos(num, catname, grpcat_dirbasename);
+
+      TIMER_START(CPU_FOF);
+    }
+#else
+  Nsubhalos    = 0;
+  TotNsubhalos = 0;
+  if(num >= 0)
+    {
+      TIMER_STOP(CPU_FOF);
+      TIMER_START(CPU_SNAPSHOT);
+
+      fof_io<partset> FoF_IO{this, this->Communicator, All.SnapFormat};
+
+      char catname[1000];
+      sprintf(catname, "%s_tab", grpcat_basename);
+
+      FoF_IO.fof_subfind_save_groups(num, catname, grpcat_dirbasename);
+
+      TIMER_STOP(CPU_SNAPSHOT);
+      TIMER_START(CPU_FOF);
+    }
+#endif
+
+  Mem.myfree_movable(Group);
+
+  if(num >= 0)
+    {
+      TIMER_STOP(CPU_FOF);
+      TIMER_START(CPU_SNAPSHOT);
+
+      /* now distribute the particles into output order */
+      t0 = Logs.second();
+      /* distribute particles such that FOF groups (and SUBFIND halos) will appear consecutively in the snapshot files */
+      fof_prepare_output_order();
+      t1 = Logs.second();
+      mpi_printf("FOF: preparing output order of particles took %g sec\n", Logs.timediff(t0, t1));
+
+      TIMER_STOP(CPU_SNAPSHOT);
+      TIMER_START(CPU_FOF);
+    }
+
+  TIMER_STOP(CPU_FOF);
+}
+
+template <typename partset>
+void fof<partset>::fof_prepare_output_order(void)
+{
+  int ntype[NTYPES];
+  for(int i = 0; i < NTYPES; i++)
+    ntype[i] = 0;
+
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+#ifndef LEAN
+      Tp->PS[i].Type = Tp->P[i].getType();
+#endif
+      ntype[Tp->P[i].getType()]++;
+    }
+
+#if defined(MERGERTREE) && defined(SUBFIND)
+
+  /* we determine a continuously increasing subhalo number (not starting again at zero in every Group), and also assign the ranking
+   * within a subhalo
+   */
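+  /* In outline: after the parallel sort by (GroupNr, SubRankInGr, binding energy), every place where the pair
+   * (GroupNr, SubRankInGr) changes marks the start of a new subhalo. The last element of the previous non-empty
+   * task is gathered so that subhalo boundaries straddling task borders are detected as well, and the per-task
+   * subhalo counts are accumulated at the end to turn the local numbering into a globally increasing one. */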
+  mycxxsort_parallel(Tp->PS, Tp->PS + Tp->NumPart, fof_compare_subfind_data_GroupNr_SubRankInNr_BndEgy, Communicator);
+
+  int *num_list = (int *)Mem.mymalloc("num_list", NTask * sizeof(int));
+  MPI_Allgather(&Tp->NumPart, 1, MPI_INT, num_list, 1, MPI_INT, Communicator);
+
+  int prev_non_empty_task = ThisTask - 1;
+
+  while(prev_non_empty_task >= 0)
+    if(num_list[prev_non_empty_task] == 0)
+      prev_non_empty_task--;
+    else
+      break;
+
+  /* obtain last element of each processor */
+  subfind_data *aux_last_element = (subfind_data *)Mem.mymalloc("aux_last_element", NTask * sizeof(subfind_data));
+  MPI_Allgather(&Tp->PS[Tp->NumPart > 0 ? Tp->NumPart - 1 : 0], sizeof(subfind_data), MPI_BYTE, aux_last_element, sizeof(subfind_data),
+                MPI_BYTE, Communicator);
+
+  int first_index = INT_MAX;
+  int first_task  = NTask;
+  /* find out the very first particle in a subhalo (which is not necessarily in the first FOF group) */
+  for(int i = 0; i < Tp->NumPart; i++)
+    if(Tp->PS[i].GroupNr.get() < HALONR_MAX && Tp->PS[i].SubRankInGr < INT_MAX) /* particle is in a subhalo */
+      {
+        first_index = i;
+        break;
+      }
+
+  int *first_list = (int *)Mem.mymalloc("first_list", NTask * sizeof(int));
+  MPI_Allgather(&first_index, 1, MPI_INT, first_list, 1, MPI_INT, Communicator);
+  for(int n = 0; n < NTask; n++)
+    {
+      if(first_list[n] != INT_MAX)
+        {
+          first_index = first_list[n];
+          first_task  = n;
+          break;
+        }
+    }
+  Mem.myfree(first_list);
+
+  long long subnr = 0;
+  long long rank  = 0;
+
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+      if(Tp->PS[i].GroupNr.get() < HALONR_MAX && Tp->PS[i].SubRankInGr < INT_MAX) /* particle is in a subhalo */
+        {
+          if(i == 0)
+            {
+              if(prev_non_empty_task >= 0)
+                {
+                  if(Tp->PS[i].GroupNr.get() != aux_last_element[prev_non_empty_task].GroupNr.get() ||
+                     Tp->PS[i].SubRankInGr !=
+                         aux_last_element[prev_non_empty_task].SubRankInGr) /* we are the first particle of a new subhalo */
+                    {
+                      if(ThisTask != first_task || i != first_index)  // avoid starting a new subhalo for the very first one
+                        {
+                          subnr++;
+                          rank = 0;
+                        }
+                    }
+                }
+            }
+          else if(Tp->PS[i].GroupNr.get() != Tp->PS[i - 1].GroupNr.get() ||
+                  Tp->PS[i].SubRankInGr != Tp->PS[i - 1].SubRankInGr) /* we are the first particle of a new subhalo */
+            {
+                if(ThisTask != first_task || i != first_index)  // avoid starting a new subhalo for the very first one
+                {
+                  subnr++;
+                  rank = 0;
+                }
+            }
+
+          Tp->PS[i].SubhaloNr.set(subnr);
+          Tp->PS[i].RankInSubhalo.set(rank++);
+
+          if(subnr < 0 || subnr >= TotNsubhalos)
+            Terminate("i=%d  NumPart=%d  subnr=%lld  PS[i].SubhaloNr.get()=%lld >= TotNsubhalos=%lld", i, Tp->NumPart, subnr,
+                      (long long)Tp->PS[i].SubhaloNr.get(), TotNsubhalos);
+        }
+      else
+        {
+          Tp->PS[i].SubhaloNr.set(HALONR_MAX);
+          Tp->PS[i].RankInSubhalo.set(INT_MAX);
+        }
+    }
+
+  long long *subnr_list = (long long *)Mem.mymalloc("subnr_list", NTask * sizeof(long long));
+  MPI_Allgather(&subnr, 1, MPI_LONG_LONG, subnr_list, 1, MPI_LONG_LONG, Communicator);
+
+  long long subnr_prev = 0;
+  for(int i = 0; i < ThisTask; i++)
+    subnr_prev += subnr_list[i];
+
+  for(int i = 0; i < Tp->NumPart; i++)
+    if(Tp->PS[i].GroupNr.get() < HALONR_MAX && Tp->PS[i].SubRankInGr < INT_MAX) /* particle is in a subhalo */
+      Tp->PS[i].SubhaloNr.set(Tp->PS[i].SubhaloNr.get() + subnr_prev);
+
+  /* obtain the final rank counter of each processor */
+  long long *rank_list = (long long *)Mem.mymalloc("rank_list", NTask * sizeof(long long));
+  MPI_Allgather(&rank, 1, MPI_LONG_LONG, rank_list, 1, MPI_LONG_LONG, Communicator);
+
+  if(prev_non_empty_task >= 0)
+    {
+      long long rank = rank_list[prev_non_empty_task];
+
+      for(int i = 0; i < Tp->NumPart; i++)
+        {
+          if(Tp->PS[i].GroupNr.get() < HALONR_MAX && Tp->PS[i].SubRankInGr < INT_MAX)
+            {
+              if(i == 0)
+                {
+                  if(prev_non_empty_task >= 0)
+                    if(Tp->PS[i].GroupNr.get() != aux_last_element[prev_non_empty_task].GroupNr.get() ||
+                       Tp->PS[i].SubRankInGr !=
+                           aux_last_element[prev_non_empty_task].SubRankInGr) /* we are the first particle of a new subhalo */
+                      break;
+                }
+              else if(Tp->PS[i].GroupNr.get() != Tp->PS[i - 1].GroupNr.get() ||
+                      Tp->PS[i].SubRankInGr != Tp->PS[i - 1].SubRankInGr) /* we are the first particle of a new subhalo */
+                break;
+
+              Tp->PS[i].RankInSubhalo.set(rank++);
+            }
+        }
+    }
+
+  Mem.myfree(rank_list);
+  Mem.myfree(subnr_list);
+  Mem.myfree(aux_last_element);
+  Mem.myfree(num_list);
+
+  /* now bring back into starting order */
+  mycxxsort_parallel(Tp->PS, Tp->PS + Tp->NumPart, fof_compare_subfind_data_OriginTask_OriginIndex, Communicator);
+#endif
+
+  /* note: the following will destroy the value of the Potential, which is not needed any more at this point */
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+#if defined(RANDOMIZE_DOMAINCENTER) && defined(PERIODIC)
+      Tp->PS[i].u.Key =
+          peano_hilbert_key(Tp->P[i].IntPos[0] - Tp->CurrentShiftVector[0], Tp->P[i].IntPos[1] - Tp->CurrentShiftVector[1],
+                            Tp->P[i].IntPos[2] - Tp->CurrentShiftVector[2], BITS_FOR_POSITIONS);
+#else
+      Tp->PS[i].u.Key = peano_hilbert_key(Tp->P[i].IntPos[0], Tp->P[i].IntPos[1], Tp->P[i].IntPos[2], BITS_FOR_POSITIONS);
+#endif
+#ifdef SUBFIND
+      /* make sure that particles not in any group have zero binding energy and zero SubRankInGr, so that they are
+       * ordered only by the key
+       */
+      if(Tp->PS[i].GroupNr.get() == HALONR_MAX)
+        {
+          Tp->PS[i].SubRankInGr        = 0;
+          Tp->PS[i].v.DM_BindingEnergy = 0;
+        }
+#endif
+    }
+
+#ifndef LEAN
+  mycxxsort(Tp->PS, Tp->PS + Tp->NumPart, fof_compare_subfind_data_Type);
+#endif
+
+  for(int i = 0, off = 0; i < NTYPES; i++)
+    {
+      mycxxsort_parallel(Tp->PS + off, Tp->PS + off + ntype[i], fof_compare_subfind_data_GroupNr_SubNr_Egy_Key, Communicator);
+
+      off += ntype[i];
+    }
+
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+      Tp->PS[i].TargetTask  = ThisTask;
+      Tp->PS[i].TargetIndex = i;
+    }
+
+  /* now bring back into starting order */
+  mycxxsort_parallel(Tp->PS, Tp->PS + Tp->NumPart, fof_compare_subfind_data_OriginTask_OriginIndex, Communicator);
+
+  /* finally, reorder both P[] and PS[] */
+  FoFDomain->particle_exchange_based_on_PS(Communicator);
+}
+
+/* calculate linking length based on mean particle separation */
+template <typename partset>
+double fof<partset>::fof_get_comoving_linking_length(void)
+{
+  int ndm = 0;
+  long long ndmtot;
+  double mass = 0, masstot;
+
+  for(int i = 0; i < Tp->NumPart; i++)
+    if(is_type_primary_link_type(Tp->P[i].getType()))
+      {
+        ndm++;
+        mass += Tp->P[i].getMass();
+      }
+  sumup_large_ints(1, &ndm, &ndmtot, Communicator);
+  MPI_Allreduce(&mass, &masstot, 1, MPI_DOUBLE, MPI_SUM, Communicator);
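+  /* rhodm below is the comoving mean dark-matter density, so masstot / ndmtot / rhodm is the mean comoving volume
+   * per dark-matter particle and its cube root the mean interparticle separation; the linking length is this
+   * separation multiplied by the dimensionless factor FOF_LINKLENGTH (conventionally around 0.2) */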
+  double rhodm = (All.Omega0 - All.OmegaBaryon) * 3 * All.Hubble * All.Hubble / (8 * M_PI * All.G);
+
+  return FOF_LINKLENGTH * pow(masstot / ndmtot / rhodm, 1.0 / 3);
+}
+
+template <typename partset>
+void fof<partset>::fof_compile_catalogue(double inner_distance)
+{
+  /* sort according to MinID, this brings particles belonging to the same group together */
+  mycxxsort(FOF_PList, FOF_PList + Tp->NumPart, fof_compare_FOF_PList_MinID);
+
+  /* we now use the auxiliary FOF_GList structure to determine the group lengths.
+   * Count initially holds the length of the group piece that is stored on the local processor.
+   */
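+  /* In outline: local entries with the same MinID are first squashed into a single piece, the pieces are then sent
+   * to the processor that owns their MinID, where the counts of all pieces of a group are summed up, and the totals
+   * are finally sent back so that every piece ends up knowing the full group length. */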
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+      FOF_GList[i].MinID     = FOF_PList[i].MinID;
+      FOF_GList[i].MinIDTask = FOF_PList[i].MinIDTask;
+      FOF_GList[i].Count     = 1;
+#if defined(LIGHTCONE_PARTICLES_GROUPS)
+      FOF_GList[i].DistanceOrigin = FOF_PList[i].DistanceOrigin;
+#endif
+    }
+
+  /* Now we are going to eliminate duplicates in FOF_GList with respect to MinID, i.e. each group
+   * piece on the local processor will be squashed to one entry. Some groups may be present as
+   * pieces on several different processors.
+   */
+  if(Tp->NumPart)
+    NgroupsExt = 1;
+  else
+    NgroupsExt = 0;
+
+  for(int i = 1, start = 0; i < Tp->NumPart; i++)
+    {
+      if(FOF_GList[i].MinID.get() == FOF_GList[start].MinID.get())
+        {
+          if(FOF_GList[i].MinIDTask != FOF_GList[start].MinIDTask)
+            Terminate("FOF_GList[i].MinIDTask=%d != FOF_GList[start].MinIDTask=%d", FOF_GList[i].MinIDTask,
+                      FOF_GList[start].MinIDTask);
+
+          FOF_GList[start].Count += FOF_GList[i].Count;
+
+#if defined(LIGHTCONE_PARTICLES_GROUPS)
+          if(FOF_GList[start].DistanceOrigin != FOF_GList[i].DistanceOrigin)
+            Terminate(
+                "start=%d i=%d  Tp->NumPart=%d  FOF_GList[start].DistanceOrigin=%g != FOF_GList[i].DistanceOrigin=%g  MinID=%lld  "
+                "MinIDTask=%d\n",
+                start, i, Tp->NumPart, FOF_GList[start].DistanceOrigin, FOF_GList[i].DistanceOrigin,
+                (long long)FOF_GList[i].MinID.get(), FOF_GList[i].MinIDTask);
+#endif
+        }
+      else
+        {
+          start            = NgroupsExt;
+          FOF_GList[start] = FOF_GList[i];
+          NgroupsExt++;
+        }
+    }
+
+  /* we resize FOF_GList, which has shrunk */
+  FOF_GList = (fof_group_list *)Mem.myrealloc_movable(FOF_GList, sizeof(fof_group_list) * NgroupsExt);
+
+  /* sort the group pieces according to task */
+  mycxxsort(FOF_GList, FOF_GList + NgroupsExt, fof_compare_FOF_GList_MinIDTask);
+
+  int *Send_count  = (int *)Mem.mymalloc("Send_count", sizeof(int) * NTask);
+  int *Send_offset = (int *)Mem.mymalloc("Send_offset", sizeof(int) * NTask);
+  int *Recv_count  = (int *)Mem.mymalloc("Recv_count", sizeof(int) * NTask);
+  int *Recv_offset = (int *)Mem.mymalloc("Recv_offset", sizeof(int) * NTask);
+
+  /* count how many group pieces we have for each task */
+  for(int i = 0; i < NTask; i++)
+    Send_count[i] = 0;
+  for(int i = 0; i < NgroupsExt; i++)
+    Send_count[FOF_GList[i].MinIDTask]++;
+
+  /* inform everybody about how much they have to receive */
+  MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
+
+  /* count how many we get and prepare offset tables */
+  int nimport    = 0;
+  Recv_offset[0] = 0, Send_offset[0] = 0;
+  for(int j = 0; j < NTask; j++)
+    {
+      if(j == ThisTask) /* we will not exchange the ones that are local */
+        Recv_count[j] = 0;
+      nimport += Recv_count[j];
+
+      if(j > 0)
+        {
+          Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
+          Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
+        }
+    }
+
+  /* allocate some temporary storage for foreign group pieces */
+  fof_group_list *get_FOF_GList = (fof_group_list *)Mem.mymalloc("get_FOF_GList", nimport * sizeof(fof_group_list));
+
+  /* get them */
+  for(int ngrp = 1; ngrp < (1 << PTask); ngrp++)
+    {
+      int recvTask = ThisTask ^ ngrp;
+
+      if(recvTask < NTask)
+        {
+          if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+            {
+              /* get the group info */
+              MPI_Sendrecv(&FOF_GList[Send_offset[recvTask]], Send_count[recvTask] * sizeof(fof_group_list), MPI_BYTE, recvTask,
+                           TAG_DENS_A, &get_FOF_GList[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(fof_group_list), MPI_BYTE,
+                           recvTask, TAG_DENS_A, Communicator, MPI_STATUS_IGNORE);
+            }
+        }
+    }
+
+  /* for the incoming pieces, we temporarily re-purpose MinIDTask as a running index, which we use later to
+   * reestablish the order in which the pieces arrived */
+  for(int i = 0; i < nimport; i++)
+    get_FOF_GList[i].MinIDTask = i;
+
+  /* sort both the local group pieces and the incoming group pieces so that we can match them efficiently */
+  mycxxsort(FOF_GList, FOF_GList + NgroupsExt, fof_compare_FOF_GList_MinID);
+  mycxxsort(get_FOF_GList, get_FOF_GList + nimport, fof_compare_FOF_GList_MinID);
+
+  /* Merge the imported group pieces with the local group pieces.
+   * For all group pieces in FOF_GList whose principal processor is the local one, the particle counts contributed
+   * by pieces stored on other processors are added to Count, so that Count holds the total group length.
+   */
+  int start = 0;
+  for(int i = 0; i < nimport; i++)
+    {
+      while(FOF_GList[start].MinID.get() < get_FOF_GList[i].MinID.get())
+        {
+          start++;
+          if(start >= NgroupsExt)
+            Terminate("start=%d >= NgroupsExt=%d", start, NgroupsExt);
+        }
+
+      if(FOF_GList[start].MinIDTask != ThisTask)
+        Terminate("FOF_GList[start].MinIDTask=%d != ThisTask=%d", FOF_GList[start].MinIDTask, ThisTask);
+
+      if(FOF_GList[start].MinID.get() != get_FOF_GList[i].MinID.get())
+        Terminate(
+            "FOF_GList[start].MinID != get_FOF_GList[i].MinID start=%d i=%d FOF_GList[start].MinID=%llu get_FOF_GList[i].MinID=%llu\n",
+            start, i, (long long)FOF_GList[start].MinID.get(), (long long)get_FOF_GList[i].MinID.get());
+
+      FOF_GList[start].Count += get_FOF_GList[i].Count;
+    }
+
+  /* copy the size information back into the received list, to inform the group pieces on originating processors */
+  start = 0;
+  for(int i = 0; i < nimport; i++)
+    {
+      while(FOF_GList[start].MinID.get() < get_FOF_GList[i].MinID.get())
+        {
+          start++;
+          if(start >= NgroupsExt)
+            Terminate("start >= NgroupsExt");
+        }
+
+      get_FOF_GList[i].Count = FOF_GList[start].Count;
+    }
+
+  /* Sort the imported list according to MinIDTask (which currently holds the running arrival index), thereby
+   * reestablishing the order in which the pieces were received.
+   * We also sort the local group pieces according to MinIDTask, so that we can fill in the exported info
+   * at the right place again.
+   */
+  mycxxsort(get_FOF_GList, get_FOF_GList + nimport, fof_compare_FOF_GList_MinIDTask);
+  mycxxsort(FOF_GList, FOF_GList + NgroupsExt, fof_compare_FOF_GList_MinIDTask);
+
+  /* fix the value of MinIDTask again that we had temporarily overwritten */
+  for(int i = 0; i < nimport; i++)
+    get_FOF_GList[i].MinIDTask = ThisTask;
+
+  /* bring the data back to the originating processors */
+  for(int ngrp = 1; ngrp < (1 << PTask); ngrp++)
+    {
+      int recvTask = ThisTask ^ ngrp;
+
+      if(recvTask < NTask)
+        {
+          if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+            {
+              /* get the group info */
+              MPI_Sendrecv(&get_FOF_GList[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(fof_group_list), MPI_BYTE, recvTask,
+                           TAG_DENS_A, &FOF_GList[Send_offset[recvTask]], Send_count[recvTask] * sizeof(fof_group_list), MPI_BYTE,
+                           recvTask, TAG_DENS_A, Communicator, MPI_STATUS_IGNORE);
+            }
+        }
+    }
+
+  /* free our temporary list */
+  Mem.myfree(get_FOF_GList);
+
+  /* Now we determine how many groups we have above the group size limit.
+   * Groups that are too small, or that are not guaranteed to be in the current lightcone segment,
+   * are eliminated. For the remaining groups stored locally, we count their number and sum their
+   * lengths to obtain the total number of particles in stored groups.
+   */
+#if defined(LIGHTCONE_PARTICLES_GROUPS)
+  double save_distance = 0;
+  if(inner_distance > 0)
+    save_distance = inner_distance + Tp->LinkL;
+#endif
+
+  Ngroups = 0, Nids = 0;
+  for(int i = 0; i < NgroupsExt; i++)
+    {
+      if(FOF_GList[i].Count < FOF_GROUP_MIN_LEN)
+        {
+          FOF_GList[i] = FOF_GList[NgroupsExt - 1];
+          NgroupsExt--;
+          i--;
+        }
+#if defined(LIGHTCONE_PARTICLES_GROUPS)
+      else if(FOF_GList[i].DistanceOrigin < save_distance)
+        {
+          FOF_GList[i] = FOF_GList[NgroupsExt - 1];
+          NgroupsExt--;
+          i--;
+        }
+#endif
+      else
+        {
+          if(FOF_GList[i].MinIDTask == ThisTask)
+            {
+              Ngroups++;
+              Nids += FOF_GList[i].Count;
+            }
+        }
+    }
+
+  Mem.myfree(Recv_offset);
+  Mem.myfree(Recv_count);
+  Mem.myfree(Send_offset);
+  Mem.myfree(Send_count);
+
+  /* we resize FOF_GList again, which has shrunk */
+  FOF_GList = (fof_group_list *)Mem.myrealloc_movable(FOF_GList, sizeof(fof_group_list) * NgroupsExt);
+}
+
+template <typename partset>
+void fof<partset>::fof_assign_group_numbers(void)
+{
+  mpi_printf("FOF: start assigning group numbers\n");
+
+  double t0 = Logs.second();
+
+  for(int i = 0; i < NgroupsExt; i++)
+    Group[i].OriginTask = ThisTask;
+
+  // carry out a parallel sort that brings the group segments into order of the total group length, with the primary segment coming
+  // first
+  mycxxsort_parallel(Group, Group + NgroupsExt, fof_compare_Group_Len_MinID_DiffOriginTaskMinIDTask, Communicator);
+
+  /* assign group numbers to all local groups */
+  int ngr = 0;
+  for(int i = 0; i < NgroupsExt; i++)
+    {
+      if(Group[i].OriginTask == Group[i].MinIDTask)
+        ngr++;
+
+      Group[i].GroupNr = ngr - 1;
+    }
+
+  /* now we need to count how many groups there are on earlier CPUs, so that we can
+   * increase the assigned group numbers accordingly
+   */
+  int *ngr_list = (int *)Mem.mymalloc("ngr_list", sizeof(int) * NTask);
+  MPI_Allgather(&ngr, 1, MPI_INT, ngr_list, 1, MPI_INT, Communicator);
+
+  long long ngr_sum = 0;
+  for(int j = 0; j < ThisTask; j++)
+    ngr_sum += ngr_list[j];
+
+  Mem.myfree(ngr_list);
+
+  /* increase the group numbers with the cumulative count on earlier processors */
+  for(int i = 0; i < NgroupsExt; i++)
+    Group[i].GroupNr += ngr_sum;
+
+  /* check that we have consistent group numbers */
+  sumup_large_ints(1, &ngr, &ngr_sum, Communicator);
+  if(ngr_sum != TotNgroups)
+    Terminate("inconsistency  ngr_sum=%lld\n", ngr_sum);
+
+  /* bring the group list back into the original order */
+  mycxxsort_parallel(Group, Group + NgroupsExt, fof_compare_Group_OriginTask_MinID, Communicator);
+
+  /* Let's now mark all particles that are not in any group by assigning them a fiducial maximum (unused)
+   * group number
+   */
+  for(int i = 0; i < Tp->NumPart; i++)
+    Tp->PS[i].GroupNr.set(HALONR_MAX);
+
+  long long Nids_old = Nids;
+
+  /* we now aim to assign the group number also to the particles that make up groups,
+   * in the auxiliary PS array
+   */
+  Nids = 0;
+  for(int i = 0, start = 0; i < NgroupsExt; i++)
+    {
+      while(FOF_PList[start].MinID.get() < Group[i].MinID)
+        {
+          start++;
+          if(start > Tp->NumPart)
+            Terminate("start > Tp->NumPart");
+        }
+
+      if(FOF_PList[start].MinID.get() != Group[i].MinID)
+        Terminate("FOF_PList[start=%d].MinID=%lld != Group[i=%d].MinID=%lld", start, (long long)FOF_PList[start].MinID.get(), i,
+                  (long long)Group[i].MinID);
+
+      int lenloc;
+      for(lenloc = 0; start + lenloc < Tp->NumPart;)
+        if(FOF_PList[start + lenloc].MinID.get() == Group[i].MinID)
+          {
+            Tp->PS[FOF_PList[start + lenloc].Pindex].GroupNr.set(Group[i].GroupNr);
+            Nids++;
+            lenloc++;
+          }
+        else
+          break;
+
+      start += lenloc;
+    }
+
+  long long totNids;
+  sumup_longs(1, &Nids, &totNids, Communicator);
+
+  if(totNids != TotNids)
+    Terminate("Task=%d Nids=%lld  Nids_old=%lld  totNids=%lld TotNids=%lld\n", ThisTask, Nids, Nids_old, totNids, TotNids);
+
+  double t1 = Logs.second();
+
+  mpi_printf("FOF: Assigning of group numbers took = %g sec\n", Logs.timediff(t0, t1));
+}
+
+/* This function computes the Group[].OffsetType variable, which gives for each group and particle type how many
+ * particles of that type are contained in earlier groups of the array.
+ */
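+/* In effect, OffsetType[j] is an exclusive prefix sum of LenType[j]: first over the groups held by all previous
+ * tasks (obtained via the Allgather below) and then over the preceding groups stored locally. */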
+template <typename partset>
+void fof<partset>::fof_assign_group_offset(void)
+{
+  /* Tell everybody how many particles the groups stored by each processor contain */
+
+  int gtype_loc[NTYPES]; /* particles of each type associated with locally stored groups */
+  long long gtype_previous[NTYPES];
+
+  for(int i = 0; i < NTYPES; i++)
+    gtype_loc[i] = 0;
+
+  for(int i = 0; i < Ngroups; i++)
+    for(int j = 0; j < NTYPES; j++)
+      gtype_loc[j] += Group[i].LenType[j];
+
+  int *gtype_all = (int *)Mem.mymalloc("gtype_all", NTYPES * NTask * sizeof(int));
+  MPI_Allgather(gtype_loc, NTYPES, MPI_INT, gtype_all, NTYPES, MPI_INT, Communicator);
+
+  for(int i = 0; i < NTYPES; i++)
+    gtype_previous[i] = 0;
+
+  for(int i = 0; i < ThisTask; i++)
+    for(int j = 0; j < NTYPES; j++)
+      gtype_previous[j] += gtype_all[i * NTYPES + j];
+
+  if(Ngroups > 0)
+    for(int j = 0; j < NTYPES; j++)
+      Group[0].OffsetType[j] = gtype_previous[j];
+
+  for(int i = 1; i < Ngroups; i++)
+    for(int j = 0; j < NTYPES; j++)
+      Group[i].OffsetType[j] = Group[i - 1].OffsetType[j] + Group[i - 1].LenType[j];
+
+  Mem.myfree(gtype_all);
+}
+
+template <typename partset>
+void fof<partset>::fof_compute_group_properties(int gr, int start, int len)
+{
+  int start_index = FOF_PList[start].Pindex;
+
+  Group[gr].Mass   = 0;
+  Group[gr].Ascale = 0;
+#ifdef STARFORMATION
+  Group[gr].Sfr = 0;
+#endif
+#if defined(SUBFIND_ORPHAN_TREATMENT)
+  Group[gr].LenPrevMostBnd = 0;
+#endif
+
+  for(int k = 0; k < 3; k++)
+    {
+      Group[gr].CM[k]          = 0;
+      Group[gr].Vel[k]         = 0;
+      Group[gr].FirstIntPos[k] = Tp->P[start_index].IntPos[k];
+    }
+
+  for(int k = 0; k < NTYPES; k++)
+    {
+      Group[gr].LenType[k]  = 0;
+      Group[gr].MassType[k] = 0;
+    }
+
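+  /* accumulate the group properties over all member particles; positions enter as
+   * offsets relative to the first particle (FirstIntPos), which keeps the
+   * centre-of-mass sums well defined under periodic wrapping */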
+  for(int k = 0; k < len; k++)
+    {
+      int index = FOF_PList[start + k].Pindex;
+
+      Group[gr].Mass += Tp->P[index].getMass();
+      int type = Tp->P[index].getType();
+
+      Group[gr].Ascale += Tp->P[index].getMass() * Tp->P[index].getAscale();
+
+      Group[gr].LenType[type]++;
+      Group[gr].MassType[type] += Tp->P[index].getMass();
+
+#if defined(SUBFIND_ORPHAN_TREATMENT)
+      if(Tp->P[index].ID.is_previously_most_bound())
+        Group[gr].LenPrevMostBnd++;
+#endif
+
+#ifdef STARFORMATION
+      if(Tp->P[index].getType() == 0)
+        Group[gr].Sfr += Tp->SphP[index].Sfr;
+#endif
+
+      double xyz[3];
+      Tp->nearest_image_intpos_to_pos(Tp->P[index].IntPos, Tp->P[start_index].IntPos,
+                                      xyz); /* converts the integer distance to floating point */
+
+      for(int j = 0; j < 3; j++)
+        {
+          Group[gr].CM[j] += Tp->P[index].getMass() * xyz[j];
+          Group[gr].Vel[j] += Tp->P[index].getMass() * Tp->P[index].Vel[j];
+        }
+    }
+}
+
+template <typename partset>
+void fof<partset>::fof_add_in_properties_of_group_segments(void)
+{
+  int *Send_count  = (int *)Mem.mymalloc("Send_count", sizeof(int) * NTask);
+  int *Send_offset = (int *)Mem.mymalloc("Send_offset", sizeof(int) * NTask);
+  int *Recv_count  = (int *)Mem.mymalloc("Recv_count", sizeof(int) * NTask);
+  int *Recv_offset = (int *)Mem.mymalloc("Recv_offset", sizeof(int) * NTask);
+
+  /* sort the groups according to task */
+  mycxxsort(Group, Group + NgroupsExt, fof_compare_Group_MinIDTask);
+
+  /* count how many we have of each task */
+  for(int i = 0; i < NTask; i++)
+    Send_count[i] = 0;
+  for(int i = 0; i < NgroupsExt; i++)
+    Send_count[Group[i].MinIDTask]++;
+
+  MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
+
+  int nimport    = 0;
+  Recv_offset[0] = 0, Send_offset[0] = 0;
+
+  for(int j = 0; j < NTask; j++)
+    {
+      if(j == ThisTask) /* we will not exchange the ones that are local */
+        Recv_count[j] = 0;
+      nimport += Recv_count[j];
+
+      if(j > 0)
+        {
+          Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
+          Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
+        }
+    }
+
+  group_properties *get_Group = (group_properties *)Mem.mymalloc("get_Group", sizeof(group_properties) * nimport);
+
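+  /* exchange the non-local group segments with a pairwise hypercube communication
+   * pattern, pairing task ThisTask with task ThisTask ^ ngrp in every round */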
+  for(int ngrp = 1; ngrp < (1 << PTask); ngrp++)
+    {
+      int recvTask = ThisTask ^ ngrp;
+
+      if(recvTask < NTask)
+        {
+          if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+            {
+              /* get the group data */
+              MPI_Sendrecv(&Group[Send_offset[recvTask]], Send_count[recvTask] * sizeof(group_properties), MPI_BYTE, recvTask,
+                           TAG_DENS_A, &get_Group[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(group_properties), MPI_BYTE,
+                           recvTask, TAG_DENS_A, Communicator, MPI_STATUS_IGNORE);
+            }
+        }
+    }
+
+  /* sort the groups again according to MinID */
+  mycxxsort(Group, Group + NgroupsExt, fof_compare_Group_MinID);
+  mycxxsort(get_Group, get_Group + nimport, fof_compare_Group_MinID);
+
+  int start = 0;
+  /* now add in the partial imported group data to the main ones */
+  for(int i = 0; i < nimport; i++)
+    {
+      while(Group[start].MinID < get_Group[i].MinID)
+        {
+          start++;
+          if(start >= NgroupsExt)
+            Terminate("start >= NgroupsExt");
+        }
+
+      Group[start].Mass += get_Group[i].Mass;
+      Group[start].Ascale += get_Group[i].Ascale;
+
+      for(int j = 0; j < NTYPES; j++)
+        {
+          Group[start].LenType[j] += get_Group[i].LenType[j];
+          Group[start].MassType[j] += get_Group[i].MassType[j];
+        }
+#if defined(SUBFIND_ORPHAN_TREATMENT)
+      Group[start].LenPrevMostBnd += get_Group[i].LenPrevMostBnd;
+#endif
+#ifdef STARFORMATION
+      Group[start].Sfr += get_Group[i].Sfr;
+#endif
+
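+      /* the imported partial centre of mass is stored relative to the segment's own
+       * FirstIntPos; convert it into an offset relative to the local group's FirstIntPos
+       * before adding it to the running sums */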
+      double tmpxyz[3] = {get_Group[i].CM[0] / get_Group[i].Mass, get_Group[i].CM[1] / get_Group[i].Mass,
+                          get_Group[i].CM[2] / get_Group[i].Mass};
+
+      MyIntPosType delta[3];
+      Tp->pos_to_signedintpos(tmpxyz, (MySignedIntPosType *)delta);
+
+      delta[0] += get_Group[i].FirstIntPos[0];
+      delta[1] += get_Group[i].FirstIntPos[1];
+      delta[2] += get_Group[i].FirstIntPos[2];
+
+      Tp->constrain_intpos(delta); /* will only do something if we have a stretched box */
+
+      double xyz[3];
+      Tp->nearest_image_intpos_to_pos(delta, Group[start].FirstIntPos, xyz); /* converts the integer distance to floating point */
+
+      for(int j = 0; j < 3; j++)
+        {
+          Group[start].CM[j] += get_Group[i].Mass * xyz[j];
+          Group[start].Vel[j] += get_Group[i].Vel[j];
+        }
+    }
+
+  Mem.myfree(get_Group);
+
+  Mem.myfree(Recv_offset);
+  Mem.myfree(Recv_count);
+  Mem.myfree(Send_offset);
+  Mem.myfree(Send_count);
+}
+
+template <typename partset>
+void fof<partset>::fof_finish_group_properties(void)
+{
+  for(int i = 0; i < NgroupsExt; i++)
+    {
+      if(Group[i].MinIDTask == ThisTask)
+        {
+          double cm[3];
+          for(int j = 0; j < 3; j++)
+            {
+              Group[i].Vel[j] /= Group[i].Mass;
+              cm[j] = Group[i].CM[j] / Group[i].Mass;
+            }
+
+          Group[i].Ascale /= Group[i].Mass;
+
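+          /* convert the mass-weighted mean offset (accumulated relative to FirstIntPos)
+           * back into an absolute position, taking periodic wrapping into account */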
+          MyIntPosType delta[3];
+          Tp->pos_to_signedintpos(cm, (MySignedIntPosType *)delta);
+
+          delta[0] += Group[i].FirstIntPos[0];
+          delta[1] += Group[i].FirstIntPos[1];
+          delta[2] += Group[i].FirstIntPos[2];
+
+          Tp->constrain_intpos(delta); /* will only do something if we have a stretched box */
+
+          fof_get_halo_position(delta, cm);
+
+          Group[i].CM[0] = cm[0];
+          Group[i].CM[1] = cm[1];
+          Group[i].CM[2] = cm[2];
+
+          /* define the group position as the CM. If Subfind is used, this will later be
+           * overwritten with the position of the potential minimum.
+           */
+          for(int j = 0; j < 3; j++)
+            Group[i].Pos[j] = Group[i].CM[j];
+
+          for(int j = 0; j < 3; j++)
+            Group[i].IntPos[j] = delta[j];
+        }
+    }
+
+  int ngr = NgroupsExt;
+
+  /* eliminate the non-local groups */
+  for(int i = 0; i < ngr; i++)
+    {
+      if(Group[i].MinIDTask != ThisTask)
+        {
+          Group[i] = Group[ngr - 1];
+          i--;
+          ngr--;
+        }
+    }
+
+  if(ngr != Ngroups)
+    Terminate("ngr != Ngroups");
+
+  mycxxsort(Group, Group + Ngroups, fof_compare_Group_MinID);
+}
+
+#if defined(LIGHTCONE_PARTICLES_GROUPS)
+
+template <>
+double fof<simparticles>::fof_distance_to_origin(int i)
+{
+  return 0;  // dummy information for ordinary timeslices
+}
+
+template <>
+double fof<lcparticles>::fof_distance_to_origin(int i)
+{
+  return Tp->signedintpos_to_distanceorigin((MySignedIntPosType *)Tp->P[i].IntPos);
+}
+
+#endif
+
+template <>
+void fof<simparticles>::fof_get_halo_position(MyIntPosType *intpos, double *pos)
+{
+  Tp->intpos_to_pos(intpos, pos);
+}
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES_GROUPS)
+template <>
+void fof<lcparticles>::fof_get_halo_position(MyIntPosType *intpos, double *pos)
+{
+  MyIntPosType origin[3] = {0, 0, 0};
+
+  Tp->nearest_image_intpos_to_pos(intpos, origin, pos);
+}
+#endif
+
+/* now make sure that the following classes are really instantiated, otherwise we may get a linking problem */
+#include "../data/simparticles.h"
+template class fof<simparticles>;
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES_GROUPS)
+#include "../data/lcparticles.h"
+template class fof<lcparticles>;
+#endif
+
+#endif /* of FOF */
diff --git a/src/fof/fof.h b/src/fof/fof.h
new file mode 100644
index 0000000000000000000000000000000000000000..3975bc12361869ece9c9ac023a9e019354d6b0b6
--- /dev/null
+++ b/src/fof/fof.h
@@ -0,0 +1,682 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file fof.h
+ *
+ *  \brief declaration of class for the FOF functionality
+ */
+
+#ifndef FOF_H
+#define FOF_H
+
+#include "gadgetconfig.h"
+
+#ifdef FOF
+
+#include "../data/simparticles.h"
+#include "../domain/domain.h"
+#include "../fof/foftree.h"
+#include "../gravtree/gravtree.h"
+
+/*! \brief FOF routines
+ *
+ * The template type 'partset' will be given either by 'simparticles' (normal particle set) or by 'lcparticles' (for group
+ * finding on the lightcone).
+ */
+template <typename partset>
+class fof : public setcomm
+{
+ public:
+  fof(MPI_Comm comm, partset *Tp_ptr, domain<partset> *D_ptr) : setcomm(comm)
+  {
+    Tp        = Tp_ptr;
+    FoFDomain = D_ptr;
+  }
+
+  partset *Tp;
+
+  int Ngroups;
+  int MaxNgroups;
+  long long TotNgroups;
+
+  int Nsubhalos;
+  int MaxNsubhalos = 0;
+  long long TotNsubhalos;
+
+  long long Nids;
+  long long TotNids;
+
+  double Time;
+  double Redshift;
+
+  struct group_properties
+  {
+    long long GroupNr;
+    long long OffsetType[NTYPES];
+    MyLenType Len;
+    MyLenType LenType[NTYPES];
+    MyIDType MinID;
+    int MinIDTask;
+    int OriginTask;
+
+    MyDouble MassType[NTYPES];
+    MyDouble Mass;
+    MyDouble Ascale;
+    MyDouble CM[3];
+    MyDouble Pos[3];
+    MyIntPosType IntPos[3];
+    MyIntPosType FirstIntPos[3];
+
+    MyFloat Vel[3];
+#ifdef STARFORMATION
+    MyFloat Sfr;
+#endif
+
+#ifdef SUBFIND
+    int TargetTask; /* primary CPU responsible for running subfind on this group */
+    int Nsubs;
+    long long FirstSub;
+    MyFloat M_Mean200, R_Mean200;
+    MyFloat M_Crit200, R_Crit200;
+    MyFloat M_Crit500, R_Crit500;
+    MyFloat M_TopHat200, R_TopHat200;
+#endif
+#ifdef MERGERTREE
+    long long FileOffset;
+#endif
+#if defined(SUBFIND_ORPHAN_TREATMENT)
+    int LenPrevMostBnd;
+#endif
+  };
+  group_properties *Group;
+
+#ifdef SUBFIND
+  struct subhalo_properties
+  {
+    MyLenType Len;
+    MyLenType LenType[NTYPES];
+    long long OffsetType[NTYPES]; /*!< gives first particle index of subhalo relative to beginning of file (simplifies work with the
+                                       catalogs, as one doesn't need group lengths to know about fuzz particles) */
+    long long GroupNr;            /*!< global parent FOF group number */
+    int SubRankInGr;              /* local subhalo index within a given FOF group */
+    int SubParentRank;
+    MyIDType SubMostBoundID;
+    MyFloat Mass;
+    MyFloat MassType[NTYPES];
+    MyFloat SubVelDisp;
+    MyFloat SubVmax;
+    MyFloat SubVmaxRad;
+    MyFloat SubHalfMassRad;
+    MyFloat SubHalfMassRadType[NTYPES];
+    MyFloat Pos[3];
+    MyIntPosType IntPos[3];
+    MyFloat CM[3];
+    MyFloat Vel[3];
+    MyFloat Spin[3];
+
+#ifdef STARFORMATION
+    MyFloat Sfr;
+    MyFloat GasMassSfr;
+#endif
+#ifdef MERGERTREE
+    long long SubhaloNr;     /* global subhalo number within a given snapshot */
+    long long UniqueGroupNr; /* global group number within whole merger tree  */
+    long long FileOffset;
+    long long TreeID;
+    int TreeTask;
+    int TreeIndex;
+    MyFloat M_Crit200; /* will only be set for main subhalos in halos */
+#endif
+#if defined(SUBFIND_ORPHAN_TREATMENT)
+    int SubhaloLenPrevMostBnd;
+#endif
+  };
+  subhalo_properties *Subhalo;
+
+#if defined(MERGERTREE) || defined(LGALAXIES)
+  /* Subhalos as they appear in the merger tree */
+  struct treehalo_t
+  {
+    /* the following pointers are always meant to be relative to the halos in the same tree */
+    int TreeDescendant;
+    int TreeFirstProgenitor;
+    int TreeNextProgenitor;
+    int TreeFirstHaloInFOFgroup;
+    int TreeNextHaloInFOFgroup;
+    int TreeProgenitor;
+    int TreeFirstDescendant;
+    int TreeNextDescendant;
+    int TreeMainProgenitor;
+
+    long long TreeID;
+    int TreeIndex;
+
+    /* original position of the subhalo in the subhalo group catalogue output */
+    int SnapNum;
+    long long SubhaloNr; /* number of subhalo in the full group catalogue of this snapshot */
+    long long GroupNr;
+    long long UniqueGroupNr;
+
+    /* properties of subhalo */
+    subhalo_properties SubProp;
+  };
+#endif
+
+#endif  // SUBFIND
+
+  void fof_fof(int num, const char *grpcat_basename, const char *grpcat_dirbasename, double inner_distance);
+  double fof_get_comoving_linking_length(void);
+  void fof_subfind_exchange(MPI_Comm Communicator);
+  void fof_subfind_write_file(char *fname, int writeTask, int lastTask, void *CommBuffer);
+  double fof_find_nearest_dmparticle(void);
+  double fof_find_groups(void);
+  void fof_subfind_save_groups(int num, const char *basename, const char *grpcat_dirbasename);
+  void fof_subfind_init_io_fields(void);
+  void fof_subfind_load_groups(int num);
+  void fof_assign_group_offset(void);
+  void fof_reorder_PS(int *Id, int Nstart, int N);
+
+ private:
+  gravtree<partset> FoFGravTree; /*!< an instance of a gravitational tree */
+  foftree<partset> FoFNgbTree;   /*!< an instance of a neighbour search tree */
+  domain<partset> *FoFDomain;    /*!< a pointer to the parent domain decomposition */
+
+  int NgroupsExt;
+
+  void fof_compile_catalogue(double inner_distance);
+  void fof_compute_group_properties(int gr, int start, int len);
+  void fof_prepare_output_order(void);
+  void fof_add_in_properties_of_group_segments(void);
+  void fof_finish_group_properties(void);
+  void fof_assign_group_numbers(void);
+  void fof_get_halo_position(MyIntPosType *intpos, double *pos);
+
+#if defined(LIGHTCONE_PARTICLES_GROUPS)
+  double fof_distance_to_origin(int i);
+#endif
+
+ public:
+  struct fof_particle_list
+  {
+    MyIDStorage MinID;
+    int MinIDTask;
+    int Pindex;
+#if defined(LIGHTCONE_PARTICLES_GROUPS)
+    double DistanceOrigin;
+#endif
+  };
+  fof_particle_list *FOF_PList;
+
+  struct fof_group_list
+  {
+    MyIDStorage MinID;
+    int MinIDTask;
+    MyLenType Count;
+#if defined(LIGHTCONE_PARTICLES_GROUPS)
+    double DistanceOrigin;
+#endif
+  };
+  fof_group_list *FOF_GList;
+
+ public:
+#ifndef LEAN
+  static bool fof_compare_subfind_data_Type(const subfind_data &a, const subfind_data &b) { return a.Type < b.Type; }
+#endif
+
+  static bool fof_compare_subfind_data_GroupNr_SubNr_Egy_Key(const subfind_data &a, const subfind_data &b)
+  {
+    if(a.GroupNr < b.GroupNr)
+      return true;
+    if(a.GroupNr > b.GroupNr)
+      return false;
+
+#ifdef SUBFIND
+    if(a.SubRankInGr < b.SubRankInGr)
+      return true;
+    if(a.SubRankInGr > b.SubRankInGr)
+      return false;
+
+    if(a.v.DM_BindingEnergy < b.v.DM_BindingEnergy)
+      return true;
+    if(a.v.DM_BindingEnergy > b.v.DM_BindingEnergy)
+      return false;
+#endif
+
+    return a.u.Key < b.u.Key;
+  }
+
+#if defined(MERGERTREE) && defined(SUBFIND)
+
+  static bool fof_compare_subfind_data_GroupNr_SubRankInNr_BndEgy(const subfind_data &a, const subfind_data &b)
+  {
+    if(a.GroupNr < b.GroupNr)
+      return true;
+    if(a.GroupNr > b.GroupNr)
+      return false;
+
+    if(a.SubRankInGr < b.SubRankInGr)
+      return true;
+    if(a.SubRankInGr > b.SubRankInGr)
+      return false;
+
+    return a.v.DM_BindingEnergy < b.v.DM_BindingEnergy;
+  }
+
+#endif
+
+  static bool fof_compare_subfind_data_OriginTask_OriginIndex(const subfind_data &a, const subfind_data &b)
+  {
+    if(a.OriginTask < b.OriginTask)
+      return true;
+    if(a.OriginTask > b.OriginTask)
+      return false;
+
+    return a.OriginIndex < b.OriginIndex;
+  }
+
+  static bool fof_compare_FOF_PList_MinID(const fof_particle_list &a, const fof_particle_list &b)
+  {
+    return a.MinID.get() < b.MinID.get();
+  }
+
+  static bool fof_compare_FOF_GList_MinID(const fof_group_list &a, const fof_group_list &b) { return a.MinID.get() < b.MinID.get(); }
+
+  static bool fof_compare_FOF_GList_MinIDTask(const fof_group_list &a, const fof_group_list &b) { return a.MinIDTask < b.MinIDTask; }
+
+  static bool fof_compare_Group_Len_MinID_DiffOriginTaskMinIDTask(const group_properties &a, const group_properties &b)
+  {
+    if(a.Len > b.Len)
+      return true;
+    if(a.Len < b.Len)
+      return false;
+
+    if(a.MinID < b.MinID)
+      return true;
+    if(a.MinID > b.MinID)
+      return false;
+
+    return labs(a.OriginTask - a.MinIDTask) < labs(b.OriginTask - b.MinIDTask);
+  }
+
+  static bool fof_compare_Group_OriginTask_MinID(const group_properties &a, const group_properties &b)
+  {
+    if(a.OriginTask < b.OriginTask)
+      return true;
+    if(a.OriginTask > b.OriginTask)
+      return false;
+
+    return a.MinID < b.MinID;
+  }
+
+  /* now comes the group catalogue created with execution of FOF */
+
+ public:
+  static inline bool fof_compare_Group_GroupNr(const group_properties &a, const group_properties &b) { return a.GroupNr < b.GroupNr; }
+
+  static bool fof_compare_Group_MinID(const group_properties &a, const group_properties &b) { return a.MinID < b.MinID; }
+
+  static bool fof_compare_Group_MinIDTask(const group_properties &a, const group_properties &b) { return a.MinIDTask < b.MinIDTask; }
+
+  /********************************************  SubFind part ****************************************/
+#ifdef SUBFIND
+ public:
+  unsigned char *ProcessedFlag;
+
+  unsigned long long GroupNr;
+  double Ascale;
+
+  MPI_Comm SubComm;
+  int CommSplitColor;
+  int SubNTask, SubThisTask;
+
+  int Ncollective;
+  int NprocsCollective;
+  int MaxSerialGroupLen;
+
+  void subfind_density_hsml_guess(void);
+  void subfind_find_subhalos(int num, const char *basename, const char *grpcat_dirbasename);
+
+ public:
+  struct sort_r2list
+  {
+    double r;
+    double mass;
+  };
+
+  static bool subfind_compare_dist_rotcurve(const sort_r2list &a, const sort_r2list &b) { return a.r < b.r; }
+
+  double subfind_density(void);
+  double subfind_overdensity(void);
+  double subfind_get_overdensity_value(int type, double ascale);
+  void subfind_save_final(int num, const char *basename, const char *grpcat_dirbasename);
+
+  void subfind_processing(domain<partset> *SubDomain, domain_options mode);
+  void subfind_potential_compute(domain<partset> *SubDomain, int num, int *d);
+  void subfind_find_linkngb(domain<partset> *SubDomain, int num, int *list);
+  void subfind_find_nearesttwo(domain<partset> *SubDomain, int num, int *list);
+  void subfind_redetermine_groupnr(void);
+
+  void subfind_process_groups_serially(void);
+  void subfind_distribute_particles(MPI_Comm Communicator);
+  void subfind_distribute_groups(void);
+  double subfind_get_particle_balance(void);
+  void subfind_assign_subhalo_offsettype(void);
+  void subfind_match_ids_of_previously_most_bound_ids(partset *Tp);
+
+  double subfind_locngb_treefind(MyDouble xyz[3], int desngb, double hguess);
+  int subfind_unbind(domain<partset> *D, MPI_Comm Communicator, int *unbind_list, int len);
+
+  int subfind_determine_sub_halo_properties(int *d, int num, subhalo_properties *subhalo, MPI_Comm Communicator);
+
+  void subfind_hbt_single_group(domain<partset> *SubDomain, domain<partset> *SingleDomain, domain_options mode, int gr);
+
+  struct proc_assign_data
+  {
+    long long GroupNr;
+    MyLenType Len;
+    int FirstTask;
+    int NTask;
+  };
+  proc_assign_data *ProcAssign;
+
+  struct submp_data
+  {
+    long long GroupNr;
+    int index;
+
+#ifndef SUBFIND_HBT
+    MyFloat DM_Density;
+#endif
+  };
+  submp_data *submp;
+
+  struct cand_dat
+  {
+    location head;
+    MyLenType len;
+    MyLenType bound_length;
+
+    int nsub;
+    int rank, subnr, parent;
+  };
+
+  struct coll_cand_dat
+  {
+    location head;
+    MyLenType rank;
+    MyLenType len;
+    MyLenType bound_length;
+
+    int nsub;
+    int subnr, parent;
+  };
+  coll_cand_dat *coll_candidates;
+
+  struct SubDMData
+  {
+    double rho;
+    double vx, vy, vz;
+    double v2;
+  };
+
+  static inline bool subfind_compare_submp_GroupNr_DM_Density(const submp_data &a, const submp_data &b)
+  {
+#ifndef SUBFIND_HBT
+    if(a.GroupNr < b.GroupNr)
+      return true;
+    if(a.GroupNr > b.GroupNr)
+      return false;
+
+    return (a.DM_Density > b.DM_Density);
+#else
+    return a.GroupNr < b.GroupNr;
+#endif
+  }
+
+  static inline bool subfind_compare_binding_energy(const double &a, const double &b) { return a > b; }
+
+  static inline bool subfind_compare_potential(const double &a, const double &b) { return a < b; }
+
+  static inline bool subfind_compare_Subhalo_GroupNr_SubRankInGr(const subhalo_properties &a, const subhalo_properties &b)
+  {
+    if(a.GroupNr < b.GroupNr)
+      return true;
+    if(a.GroupNr > b.GroupNr)
+      return false;
+
+    return a.SubRankInGr < b.SubRankInGr;
+  }
+
+  static inline bool subfind_compare_procassign_GroupNr(const proc_assign_data &a, const proc_assign_data &b)
+  {
+    return a.GroupNr < b.GroupNr;
+  }
+
+ private:
+  long long count_decisions;
+  long long count_different_decisions;
+
+  struct sort_density_data
+  {
+    MyFloat density;
+    int ngbcount;
+    location index; /* this will store the task in the upper word */
+    location ngb_index1;
+    location ngb_index2;
+    approxlen PrevSizeOfSubhalo;
+  };
+  sort_density_data *sd;
+
+  struct PPS_data
+  {
+    int index;
+    int submark;
+  };
+  PPS_data *PPS;
+
+  void subfind_col_find_coll_candidates(long long totgrouplen);
+
+  void subfind_poll_for_requests(void);
+
+  int subfind_distlinklist_get_tail_set_tail_increaselen(location index, location &tail, location newtail, approxlen prevlen);
+
+  void subfind_distlinklist_get_two_heads(location ngb_index1, location ngb_index2, location &head, location &head_attach);
+  void subfind_distlinklist_set_next(location index, location next);
+  void subfind_distlinklist_add_particle(location index);
+  void subfind_distlinklist_add_bound_particles(location index, int nsub);
+  void subfind_distlinklist_mark_particle(location index, int target, int submark);
+  void subfind_distlinklist_set_headandnext(location index, location head, location next);
+  void subfind_distlinklist_set_tailandlen(location index, location tail, MyLenType len, double prevlen);
+  void subfind_distlinklist_get_tailandlen(location index, location &tail, MyLenType &len, double &prevlen);
+  void subfind_distlinklist_set_all(location index, location head, location tail, MyLenType len, location next, approxlen prevlen);
+  location subfind_distlinklist_get_next(location index);
+  location subfind_distlinklist_get_head(location index);
+  location subfind_distlinklist_setrank_and_get_next(location index, MyLenType &rank);
+  location subfind_distlinklist_set_head_get_next(location index, location head);
+  MyLenType subfind_distlinklist_get_rank(location index);
+
+  void subfind_get_factors(double &fac_vel_to_phys, double &fac_hubbleflow, double &fac_comov_to_phys);
+
+  void subfind_process_single_group(domain<partset> *SubDomain, domain<partset> *SingleDomain, domain_options mode, int gr);
+  void subfind_unbind_independent_ones(domain<partset> *SingleDomain, int count);
+  double subfind_vel_to_phys_factor(void);
+
+  void subfind_collective_printf(const char *fmt, ...)
+  {
+    if(SubNTask > 1 && SubThisTask == 0)
+      {
+        va_list l;
+        va_start(l, fmt);
+        vprintf(fmt, l);
+        va_end(l);
+      }
+  }
+
+  static bool subfind_compare_densities(const sort_density_data &a, const sort_density_data &b) /* largest density first */
+  {
+    return a.density > b.density;
+  }
+
+  static bool subfind_PS_compare_DM_density(const subfind_data &a, const subfind_data &b) /* largest density first */
+  {
+    return a.u.s.u.DM_Density > b.u.s.u.DM_Density;
+  }
+
+  static bool subfind_PS_compare_origintask_originindex(const subfind_data &a, const subfind_data &b)
+  {
+    if(a.u.s.origintask < b.u.s.origintask)
+      return true;
+    if(a.u.s.origintask > b.u.s.origintask)
+      return false;
+
+    return a.u.s.originindex < b.u.s.originindex;
+  }
+
+  static bool subfind_compare_coll_candidates_subnr(const coll_cand_dat &a, const coll_cand_dat &b) { return a.subnr < b.subnr; }
+
+  static bool subfind_compare_coll_candidates_nsubs(const coll_cand_dat &a, const coll_cand_dat &b) { return a.nsub < b.nsub; }
+
+  static bool subfind_compare_coll_candidates_boundlength(const coll_cand_dat &a, const coll_cand_dat &b)
+  {
+    if(a.bound_length > b.bound_length)
+      return true;
+    if(a.bound_length < b.bound_length)
+      return false;
+
+    return a.rank < b.rank;
+  }
+
+  static bool subfind_compare_coll_candidates_rank(const coll_cand_dat &a, const coll_cand_dat &b)
+  {
+    if(a.rank < b.rank)
+      return true;
+    if(a.rank > b.rank)
+      return false;
+    return a.len > b.len;
+  }
+
+  static bool subfind_compare_PPS(const PPS_data &a, const PPS_data &b) { return a.submark < b.submark; }
+
+  MyLenType *SFLen;
+  location *SFHead;
+  location *SFNext;
+  location *SFTail;
+  double *SFPrevLen;
+
+  int count_cand, max_coll_candidates;
+  int *unbind_list;
+
+  int NumPartGroup;
+  int *IndexList;
+  int LocalLen;
+
+  struct sort_as_data
+  {
+    double density;
+    int targettask;
+    long long origin;
+  };
+
+  static bool subfind_compare_as_density(const sort_as_data &a, const sort_as_data &b) /* largest density first */
+  {
+    return a.density > b.density;
+  }
+
+  static bool subfind_compare_as_origin(const sort_as_data &a, const sort_as_data &b) /* smallest origin first */
+  {
+    return a.origin < b.origin;
+  }
+
+  struct hbt_pcand_t
+  {
+    MyHaloNrType SubhaloNr;
+    approxlen PrevSizeOfSubhalo;
+    int index;
+  };
+
+  static bool subfind_hbt_compare_pcand_subhalonr(const hbt_pcand_t &a, const hbt_pcand_t &b) { return a.SubhaloNr < b.SubhaloNr; }
+
+  struct hbt_subcand_t
+  {
+    MyHaloNrType SubhaloNr;
+    MyLenType len;
+    bool DoIt;
+
+    int TargetTask;
+    int TargetIndex;
+    long long summedprevlen;
+
+    /*
+    location head;
+    MyLenType rank;
+
+    MyLenType bound_length;
+
+    int nsub;
+    int subnr, parent;
+    */
+  };
+
+  static bool subfind_hbt_compare_subcand_subhalonr(const hbt_subcand_t &a, const hbt_subcand_t &b)
+  {
+    return a.SubhaloNr < b.SubhaloNr;
+  }
+
+  static bool subfind_hbt_compare_subcand_len(const hbt_subcand_t &a, const hbt_subcand_t &b) { return a.len < b.len; }
+
+  static bool subfind_hbt_compare_subcand_summedprevlen(const hbt_subcand_t &a, const hbt_subcand_t &b)
+  {
+    return a.summedprevlen < b.summedprevlen;
+  }
+
+  struct hbt_subhalo_t
+  {
+    MyLenType Len;
+    int SubRankInGr;
+    int ThisTask;
+    int ThisIndex;
+    long long SubhaloNr;
+  };
+
+  static bool subfind_hbt_compare_subhalolist_len(const hbt_subhalo_t &a, const hbt_subhalo_t &b) { return a.Len > b.Len; }
+
+  static bool subfind_hbt_compare_subhalolist_thistask_thisindex(const hbt_subhalo_t &a, const hbt_subhalo_t &b)
+  {
+    if(a.ThisTask < b.ThisTask)
+      return true;
+    if(a.ThisTask > b.ThisTask)
+      return false;
+
+    return a.ThisIndex < b.ThisIndex;
+  }
+
+  static bool subfind_hbt_compare_subhalolist_prevsubhalonr(const hbt_subhalo_t &a, const hbt_subhalo_t &b)
+  {
+    return a.SubhaloNr < b.SubhaloNr;
+  }
+
+#endif
+};
+
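+/* helper functions to test whether a given particle type takes part in the FOF linking,
+ * based on the compile-time bitmasks FOF_PRIMARY_LINK_TYPES and FOF_SECONDARY_LINK_TYPES */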
+inline bool is_type_primary_link_type(int type)
+{
+  if((1 << type) & (FOF_PRIMARY_LINK_TYPES))
+    return true;
+  else
+    return false;
+}
+
+inline bool is_type_secondary_link_type(int type)
+{
+  if((1 << type) & (FOF_SECONDARY_LINK_TYPES))
+    return true;
+  else
+    return false;
+}
+#endif  // end of FOF
+
+#endif
diff --git a/src/fof/fof_findgroups.cc b/src/fof/fof_findgroups.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8faa002274b5ef16bd25aa1c6cd8d2816fce0321
--- /dev/null
+++ b/src/fof/fof_findgroups.cc
@@ -0,0 +1,750 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file fof_findgroups.cc
+ *
+ *  \brief routines for identifying particle groups via friends-of-friends (FOF) linking
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef FOF
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../fof/fof.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/generic_comm.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../ngbtree/ngbtree.h"
+#include "../sort/cxxsort.h"
+#include "../system/system.h"
+
+/* local data structure for collecting particle/cell data that is sent to other processors if needed */
+struct foffind_in : data_in_generic
+{
+#if defined(LIGHTCONE_PARTICLES_GROUPS)
+  double DistanceOrigin;
+#endif
+  MyIntPosType IntPos[3];
+  MyIDStorage MinID;
+  int MinIDTask;
+};
+
+/* local data structure that holds results acquired on remote processors */
+struct foffind_out
+{
+  char link_count_flag;
+};
+
+/* routine that fills the relevant particle/cell data into the input structure defined above */
+
+template <typename T_tree, typename T_domain, typename T_partset>
+class foffind_comm : public generic_comm<foffind_in, foffind_out, T_tree, T_domain, T_partset>
+{
+ public:
+  typedef generic_comm<foffind_in, foffind_out, T_tree, T_domain, T_partset> gcomm;
+  using gcomm::D;
+  using gcomm::Thread;
+  using gcomm::Tp;  // This makes sure that we can access Tp from the base class without having to use "this->Tp"
+  using gcomm::Tree;
+
+  /* need to call the base class constructor explicitly */
+  foffind_comm(T_domain *dptr, T_tree *tptr, T_partset *pptr) : gcomm(dptr, tptr, pptr) {}
+
+  void particle2in(foffind_in *in, int i) override
+  {
+    for(int k = 0; k < 3; k++)
+      in->IntPos[k] = Tp->P[i].IntPos[k];
+
+    in->MinID     = Tp->MinID[Tp->Head[i]];
+    in->MinIDTask = Tp->MinIDTask[Tp->Head[i]];
+
+#if defined(LIGHTCONE_PARTICLES_GROUPS)
+    in->DistanceOrigin = Tp->DistanceOrigin[Tp->Head[i]];
+#endif
+  }
+
+  void out2particle(foffind_out *out, int i, int mode) override
+  {
+    if(mode == MODE_LOCAL_PARTICLES) /* initial store */
+      {
+        /* nothing to be done here */
+      }
+    else /* combine */
+      {
+        if(out->link_count_flag)
+          Tp->Flags[i].Marked = 1;
+      }
+  }
+
+  int evaluate(int target, int mode, int thread_id, int action, foffind_in *in, int numnodes, node_info *firstnode,
+               foffind_out &out) override
+  {
+    memset(&out, 0, sizeof(out));
+
+#if defined(LIGHTCONE_PARTICLES_GROUPS)
+    double target_DistanceOrigin = in->DistanceOrigin;
+#else
+    double target_DistanceOrigin = 0;
+#endif
+
+    int numngb = Tree->treefind_fof_primary(in->IntPos, Tp->LinkL, target, mode, &Thread, numnodes, firstnode, Thread.Ngblist,
+                                            in->MinID, in->MinIDTask, target_DistanceOrigin);
+
+    if(mode == MODE_IMPORTED_PARTICLES)
+      {
+        if(numngb > 0)
+          out.link_count_flag = 1;
+        else
+          out.link_count_flag = 0;
+      }
+
+    return numngb;
+  }
+};
+
+template <typename partset>
+double fof<partset>::fof_find_groups(void)
+{
+  double tstart = Logs.second();
+
+  mpi_printf("FOF: Start linking particles (presently allocated=%g MB)\n", Mem.getAllocatedBytesInMB());
+
+  Tp->Flags = (typename partset::bit_flags *)Mem.mymalloc_clear("Flags", Tp->NumPart * sizeof(typename partset::bit_flags));
+
+  FoFNgbTree.FullyLinkedNodePIndex = (int *)Mem.mymalloc("FullyLinkedNodePIndex", FoFNgbTree.NumNodes * sizeof(int));
+  FoFNgbTree.FullyLinkedNodePIndex -= FoFNgbTree.MaxPart;
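+  /* FullyLinkedNodePIndex[] is indexed by tree-node number (hence the pointer shift by
+   * MaxPart); an entry >= 0 marks a node whose particles are all already linked into a
+   * single group, and stores the index of one of these particles */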
+
+  for(int i = 0; i < FoFNgbTree.NumNodes; i++)
+    {
+      int no                               = i + FoFNgbTree.MaxPart;
+      FoFNgbTree.FullyLinkedNodePIndex[no] = -1;
+    }
+
+  /* let's link all those primary particles which sit in small enough nodes; only local non-top-level nodes are processed */
+  double taa = Logs.second();
+  for(int i = 0; i < FoFNgbTree.NumNodes; i++)
+    {
+      int no = i + FoFNgbTree.MaxPart;
+
+      if(FoFNgbTree.FullyLinkedNodePIndex[no] < 0)
+        {
+          if(FoFNgbTree.get_nodep(no)->level > 0)
+            {
+              double len = (((MyIntPosType)1) << (BITS_FOR_POSITIONS - FoFNgbTree.get_nodep(no)->level)) * Tp->FacIntToCoord;
+              if(FACTSQRT3 * len < Tp->LinkL)  // all particles in this node can be linked
+                {
+                  int q = FoFNgbTree.treefind_fof_return_a_particle_in_cell_recursive(no);
+
+                  if(q >= 0)
+                    FoFNgbTree.fof_link_particles_in_cell_recursive(no, q);
+                }
+            }
+        }
+    }
+  double tbb = Logs.second();
+  mpi_printf("FOF: linking of small cells took %g sec\n", Logs.timediff(taa, tbb));
+
+  /* first, link only among local particles */
+  int *targetlist = (int *)Mem.mymalloc("targetlist", Tp->NumPart * sizeof(int));
+
+  int npart = 0;
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+      if(is_type_primary_link_type(Tp->P[i].getType()))
+        targetlist[npart++] = i;
+    }
+
+  /* create an object for handling the communication */
+  foffind_comm<foftree<partset>, domain<partset>, partset> commpattern(FoFDomain, &FoFNgbTree, Tp);
+
+  TIMER_STORE; /* active timer should be CPU_FOF */
+
+  double t0 = Logs.second();
+
+  commpattern.execute(npart, targetlist, MODE_LOCAL_NO_EXPORT, logs::CPU_FOFWALK, logs::CPU_FOFWALK, logs::CPU_FOFIMBAL);
+
+  double t1 = Logs.second();
+
+  int marked = 0;
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+      if(is_type_primary_link_type(Tp->P[i].getType()))
+        {
+          if(Tp->Flags[i].Nonlocal)
+            targetlist[marked++] = i;
+        }
+    }
+
+  double dt = TIMER_DIFF(CPU_FOFWALK), dtmax, dtsum;
+  MPI_Allreduce(&dt, &dtmax, 1, MPI_DOUBLE, MPI_MAX, Communicator);
+  MPI_Allreduce(&dt, &dtsum, 1, MPI_DOUBLE, MPI_SUM, Communicator);
+
+  long long totmarked, totnpart;
+  sumup_large_ints(1, &marked, &totmarked, Communicator);
+  sumup_large_ints(1, &npart, &totnpart, Communicator);
+  mpi_printf(
+      "FOF: local links done (took %g sec, avg-work=%g, imbalance=%g).\n"
+      "FOF: Marked=%lld out of the %lld primaries which are linked\n",
+      Logs.timediff(t0, t1), dtsum / NTask, dtmax / (dtsum / NTask), totmarked, totnpart);
+
+  npart = marked;
+
+  mpi_printf("FOF: begin linking across processors (presently allocated=%g MB) \n", Mem.getAllocatedBytesInMB());
+
+  for(int i = 0; i < Tp->NumPart; i++)
+    Tp->Flags[i].Marked = 1;
+
+  long long link_across_tot;
+
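+  /* iterate the cross-processor linking until no MinID is changed anymore; in each
+   * round only those particles are re-processed whose group head was updated in the
+   * previous round */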
+  do
+    {
+      double t0 = Logs.second();
+
+      npart = 0;
+      for(int i = 0; i < Tp->NumPart; i++)
+        {
+          if(is_type_primary_link_type(Tp->P[i].getType()))
+            {
+              if(Tp->Flags[i].Nonlocal && Tp->Flags[i].Marked)
+                targetlist[npart++] = i;
+            }
+
+          Tp->Flags[i].MinIDChanged = 0;
+          Tp->Flags[i].Marked       = 0;
+        }
+
+      commpattern.execute(npart, targetlist, MODE_DEFAULT, logs::CPU_FOFWALK, logs::CPU_FOFWALK, logs::CPU_FOFIMBAL);
+
+      int link_across = commpattern.Thread.Interactions;
+
+      sumup_large_ints(1, &link_across, &link_across_tot, Communicator);
+
+      long long ntot;
+      sumup_large_ints(1, &npart, &ntot, Communicator);
+
+      double t1 = Logs.second();
+
+      mpi_printf("FOF: have done %15lld cross links (processed %14lld, took %g sec)\n", link_across_tot, ntot, Logs.timediff(t0, t1));
+
+      /* let's check out which particles have changed their MinID */
+
+      for(int i = 0; i < Tp->NumPart; i++)
+        {
+          if(Tp->Flags[i].Nonlocal)
+            {
+              if(Tp->Flags[Tp->Head[i]].MinIDChanged)
+                Tp->Flags[i].Marked = 1;
+            }
+        }
+    }
+  while(link_across_tot > 0);
+
+  Mem.myfree(targetlist);
+  Mem.myfree(FoFNgbTree.FullyLinkedNodePIndex + FoFNgbTree.MaxPart);
+  Mem.myfree(Tp->Flags);
+
+  mpi_printf("FOF: Local groups found.\n");
+
+  double tend = Logs.second();
+  return Logs.timediff(tstart, tend);
+}
+
+/*! This function walks the tree and processes all primary-link particles within the
+ *  linking length hsml around the given search centre. Depending on the mode, found
+ *  neighbours are linked directly to the target's group (MODE_LOCAL_NO_EXPORT), the
+ *  target is scheduled for export to other tasks (MODE_LOCAL_PARTICLES), or the
+ *  neighbours' MinID/MinIDTask values are updated (MODE_IMPORTED_PARTICLES). The
+ *  return value counts the updates made for imported targets.
+ */
+template <typename partset>
+int foftree<partset>::treefind_fof_primary(MyIntPosType *searchcenter, MyNgbTreeFloat hsml, int target, int mode, thread_data *thread,
+                                           int numnodes, node_info *firstnode, int *ngblist, MyIDStorage target_MinID,
+                                           int target_MinIDTask, double target_DistanceOrigin)
+{
+  if(mode == MODE_LOCAL_NO_EXPORT)
+    Tp->Flags[target].Nonlocal = 0;
+
+  MyNgbTreeFloat hsml2 = hsml * hsml;
+
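+  /* set up the integer-coordinate search box around the search centre; the node overlap
+   * tests below work with unsigned offsets relative to search_min, which makes them
+   * robust against periodic wrap-around */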
+  MyIntPosType search_min[3], search_range[3];
+  MyIntPosType inthsml = hsml * Tp->FacCoordToInt;
+
+  for(int i = 0; i < 3; i++)
+    {
+      search_min[i]   = searchcenter[i] - inthsml;
+      search_range[i] = inthsml + inthsml;
+    }
+
+  int numngb = 0;
+
+  for(int k = 0; k < numnodes; k++)
+    {
+      int no;
+
+      if(mode == MODE_LOCAL_PARTICLES || mode == MODE_LOCAL_NO_EXPORT)
+        {
+          no = MaxPart; /* root node */
+        }
+      else
+        {
+          no = firstnode[k].Node;
+          no = get_nodep(no)->nextnode; /* open it */
+        }
+
+      int shmrank = TreeSharedMem_ThisTask;
+
+      while(no >= 0)
+        {
+          if(no < MaxPart) /* single particle */
+            {
+              if(shmrank != TreeSharedMem_ThisTask)
+                Terminate("unexpected because in the present algorithm we are only allowed walk local branches");
+
+              int p  = no;
+              auto P = get_Pp(no, shmrank);
+
+              no = get_nextnodep(shmrank)[no]; /* note: here shmrank cannot change */
+
+              if(mode == MODE_LOCAL_PARTICLES) /* because we have already linked those in previous phase with MODE_LOCAL_NO_EXPORT */
+                continue;
+
+              double dx = ((MySignedIntPosType)(P->IntPos[0] - searchcenter[0])) * Tp->FacIntToCoord;
+              double dd = dx * dx;
+              if(dd > hsml2)
+                continue;
+
+              double dy = ((MySignedIntPosType)(P->IntPos[1] - searchcenter[1])) * Tp->FacIntToCoord;
+              dd += dy * dy;
+              if(dd > hsml2)
+                continue;
+
+              double dz = ((MySignedIntPosType)(P->IntPos[2] - searchcenter[2])) * Tp->FacIntToCoord;
+              dd += dz * dz;
+              if(dd > hsml2)
+                continue;
+
+              if(mode == MODE_IMPORTED_PARTICLES)
+                {
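+                  /* the imported target reaches a local particle: adopt the smaller MinID
+                   * (and, on the lightcone, the smaller distance to the origin) for the
+                   * local group head and flag it so the change propagates in the next round */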
+#if defined(LIGHTCONE_PARTICLES_GROUPS)
+                  if(Tp->DistanceOrigin[Tp->Head[p]] > target_DistanceOrigin)
+                    {
+                      Tp->DistanceOrigin[Tp->Head[p]]     = target_DistanceOrigin;
+                      Tp->Flags[Tp->Head[p]].MinIDChanged = 1;
+                      numngb++;
+                    }
+#endif
+                  if(Tp->MinID[Tp->Head[p]].get() > target_MinID.get())
+                    {
+                      Tp->MinID[Tp->Head[p]]              = target_MinID;
+                      Tp->MinIDTask[Tp->Head[p]]          = target_MinIDTask;
+                      Tp->Flags[Tp->Head[p]].MinIDChanged = 1;
+                      numngb++;
+                    }
+                }
+              else if(mode == MODE_LOCAL_NO_EXPORT)
+                {
+                  if(Tp->Head[target] != Tp->Head[p])
+                    {
+                      int no_target = Father[target];
+                      int no_p      = Father[p];
+
+                      Tp->link_two_particles(target, p);
+
+                      if(no_target >= 0)
+                        {
+                          if(FullyLinkedNodePIndex[no_target] >= 0)
+                            {
+                              if(Tp->Head[FullyLinkedNodePIndex[no_target]] != Tp->Head[target])
+                                Terminate("how come that the node is fully linked, but not to us");
+                            }
+                          else
+                            {
+                              // this parent node was not yet fully linked to the final group: Is this the case now?
+                              // If needed, check also all parent nodes
+                              while(treefind_fof_check_single_node_for_full_linking(no_target))
+                                no_target = get_nodep(no_target)->father;
+                            }
+                        }
+
+                      if(no_p >= 0)
+                        {
+                          if(FullyLinkedNodePIndex[no_p] >= 0)
+                            {
+                              if(Tp->Head[FullyLinkedNodePIndex[no_p]] != Tp->Head[p])
+                                Terminate("how come that the node is fully linked, but not to us");
+                            }
+                          else
+                            {
+                              // this parent node was not yet fully linked to the final group: Is this the case now?
+                              // If needed, check also all parent nodes
+                              while(treefind_fof_check_single_node_for_full_linking(no_p))
+                                no_p = get_nodep(no_p)->father;
+                            }
+                        }
+                    }
+                }
+              else
+                Terminate("strange mode");
+            }
+          else if(no < MaxPart + MaxNodes) /* internal node */
+            {
+              fofnode *current = get_nodep(no, shmrank);
+
+              if(current->level == 0)
+                {
+                  /* we always open the root node (its full node length couldn't be stored in the integer type) */
+                  no      = current->nextnode; /* no change in shmrank expected here */
+                  shmrank = current->nextnode_shmrank;
+                  continue;
+                }
+              /* check whether the node lies outside our search range */
+
+              if(mode == MODE_IMPORTED_PARTICLES)
+                {
+                  if(no < FirstNonTopLevelNode) /* we reached a top-level node again, which means that we are done with the branch */
+                    break;
+
+                  if(FullyLinkedNodePIndex[no] >= 0)
+                    {
+                      int head = Tp->Head[FullyLinkedNodePIndex[no]];
+
+                      if(head >= 0)
+                        if(Tp->MinID[head].get() <= target_MinID.get())
+                          {
+#if defined(LIGHTCONE_PARTICLES_GROUPS)
+                            if(Tp->DistanceOrigin[FullyLinkedNodePIndex[no]] <= target_DistanceOrigin)
+#endif
+                              {
+                                no      = current->sibling; /* the node can be discarded */
+                                shmrank = current->sibling_shmrank;
+                                continue;
+                              }
+                          }
+                    }
+                }
+              else if(mode == MODE_LOCAL_PARTICLES)
+                {
+                  int p = current->nextnode;
+
+                  /* in case the next node after opening is not a top-level node, we have either reached a leaf node or are in a local
+                   * branch. We need to do nothing if we would end up on a different shared memory thread */
+                  if(p < MaxPart || (p >= FirstNonTopLevelNode && p < MaxPart + MaxNodes))
+                    {
+                      if(current->nextnode_shmrank != TreeSharedMem_ThisTask)
+                        {
+                          int task = D->ThisTask + current->nextnode_shmrank - TreeSharedMem_ThisTask;
+
+                          if(target >= 0) /* export */
+                            tree_export_node_threads_by_task_and_node(task, no, target, thread);
+
+                          no      = current->sibling; /* in case the node can be discarded */
+                          shmrank = current->sibling_shmrank;
+                          continue;
+                        }
+                    }
+
+                  if(no >= FirstNonTopLevelNode)
+                    {
+                      /* we have a node with only local particles, hence we can skip it in this mode */
+                      no      = current->sibling; /* in case the node can be discarded */
+                      shmrank = current->sibling_shmrank;
+                      continue;
+                    }
+                }
+              else if(mode == MODE_LOCAL_NO_EXPORT)
+                {
+                  int p = current->nextnode;
+
+                  /* in case the next node after opening is not a top-level node, we have either reached a leaf node or are in a local
+                   * branch. We need to do nothing if we would end up on a different shared memory thread */
+                  if(p < MaxPart || (p >= FirstNonTopLevelNode && p < MaxPart + MaxNodes))
+                    {
+                      if(current->nextnode_shmrank != TreeSharedMem_ThisTask)
+                        {
+                          no      = current->sibling; /* in case the node can be discarded */
+                          shmrank = current->sibling_shmrank;
+
+                          MyIntPosType left[3], right[3];
+
+                          left[0]  = current->range_min[0] - search_min[0];
+                          right[0] = current->range_max[0] - search_min[0];
+
+                          /* check whether we can stop walking along this branch */
+                          if(left[0] > search_range[0] && right[0] > left[0])
+                            continue;
+
+                          left[1]  = current->range_min[1] - search_min[1];
+                          right[1] = current->range_max[1] - search_min[1];
+
+                          /* check whether we can stop walking along this branch */
+                          if(left[1] > search_range[1] && right[1] > left[1])
+                            continue;
+
+                          left[2]  = current->range_min[2] - search_min[2];
+                          right[2] = current->range_max[2] - search_min[2];
+
+                          /* check whether we can stop walking along this branch */
+                          if(left[2] > search_range[2] && right[2] > left[2])
+                            continue;
+
+                          Tp->Flags[target].Nonlocal = 1;
+                          continue;
+                        }
+                    }
+
+                  if(FullyLinkedNodePIndex[no] >= 0)
+                    if(Tp->Head[target] == Tp->Head[FullyLinkedNodePIndex[no]])  // all particles in the node are linked to us anyhow
+                      {
+                        no      = current->sibling; /* in case the node can be discarded */
+                        shmrank = current->sibling_shmrank;
+                        continue;
+                      }
+                }
+
+              MyIntPosType left[3], right[3];
+
+              left[0]  = current->range_min[0] - search_min[0];
+              right[0] = current->range_max[0] - search_min[0];
+
+              /* check whether we can stop walking along this branch */
+              if(left[0] > search_range[0] && right[0] > left[0])
+                {
+                  no      = current->sibling; /* in case the node can be discarded */
+                  shmrank = current->sibling_shmrank;
+                  continue;
+                }
+
+              left[1]  = current->range_min[1] - search_min[1];
+              right[1] = current->range_max[1] - search_min[1];
+
+              /* check whether we can stop walking along this branch */
+              if(left[1] > search_range[1] && right[1] > left[1])
+                {
+                  no      = current->sibling; /* in case the node can be discarded */
+                  shmrank = current->sibling_shmrank;
+                  continue;
+                }
+
+              left[2]  = current->range_min[2] - search_min[2];
+              right[2] = current->range_max[2] - search_min[2];
+
+              /* check whether we can stop walking along this branch */
+              if(left[2] > search_range[2] && right[2] > left[2])
+                {
+                  no      = current->sibling; /* in case the node can be discarded */
+                  shmrank = current->sibling_shmrank;
+                  continue;
+                }
+
+              no      = current->nextnode;         /* ok, we need to open the node */
+              shmrank = current->nextnode_shmrank; /* ok, we need to open the node */
+            }
+          else /* pseudo particle */
+            {
+              if(mode == MODE_LOCAL_PARTICLES)
+                {
+                  if(target >= 0) /* if no target is given, export will not occur */
+                    tree_export_node_threads(no, target, thread);
+                }
+              else if(mode == MODE_LOCAL_NO_EXPORT)
+                {
+                  Tp->Flags[target].Nonlocal = 1;
+                }
+
+              no = get_nextnodep(shmrank)[no - MaxNodes];
+              /* note: here shmrank does not need to change */
+
+              continue;
+            }
+        }
+    }
+
+  return numngb;
+}
+
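+/* recursively link all particles contained in tree cell 'no' to particle 'q', and store
+ * 'q' in FullyLinkedNodePIndex so that the cell can be skipped in subsequent tree walks */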
+template <typename partset>
+void foftree<partset>::fof_link_particles_in_cell_recursive(int no, int q)
+{
+  if(no >= MaxPart && no < MaxPart + MaxNodes) /* internal node */
+    {
+      FullyLinkedNodePIndex[no] = q;
+
+      int p = get_nodep(no)->nextnode;
+
+      /* in case the next node after opening is not a top-level node, we have either reached a leaf node or are in a local
+       * branch. We need to do nothing if we would end up on different shared memory thread */
+      if(p < MaxPart || (p >= FirstNonTopLevelNode && p < MaxPart + MaxNodes))
+        {
+          if(get_nodep(no)->nextnode_shmrank != TreeSharedMem_ThisTask)
+            return;
+        }
+
+      while(p != get_nodep(no)->sibling)
+        {
+          if(p < MaxPart) /* a particle */
+            {
+              if(p != q)
+                {
+                  Tp->link_two_particles(p, q);  // link them if not already linked
+                }
+
+              p = Nextnode[p];
+            }
+          else if(p < MaxPart + MaxNodes) /* an internal node  */
+            {
+              fof_link_particles_in_cell_recursive(p, q);
+
+              p = get_nodep(p)->sibling;
+            }
+          else /* a pseudo particle */
+            p = Nextnode[p - MaxNodes];
+        }
+    }
+}
+
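+/* return the index of some particle stored in tree cell 'no', or -1 if no locally
+ * accessible particle is found */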
+template <typename partset>
+int foftree<partset>::treefind_fof_return_a_particle_in_cell_recursive(int no)
+{
+  if(no >= MaxPart && no < MaxPart + MaxNodes) /* internal node */
+    {
+      int p = get_nodep(no)->nextnode;
+
+      /* in case the next node after opening is not a top-level node, we have either reached a leaf node or are in a local
+       * branch. We need to do nothing if we would end up on different shared memory thread */
+      if(p < MaxPart || (p >= FirstNonTopLevelNode && p < MaxPart + MaxNodes))
+        {
+          if(get_nodep(no)->nextnode_shmrank != TreeSharedMem_ThisTask)
+            return -1;
+        }
+
+      while(p != get_nodep(no)->sibling)
+        {
+          if(p < MaxPart) /* a particle */
+            {
+              return p;
+
+              p = Nextnode[p];
+            }
+          else if(p < MaxPart + MaxNodes) /* an internal node  */
+            {
+              int ret = treefind_fof_return_a_particle_in_cell_recursive(p);
+
+              if(ret >= 0)
+                return ret;
+
+              p = get_nodep(p)->sibling;
+            }
+          else /* a pseudo particle */
+            p = Nextnode[p - MaxNodes];
+        }
+    }
+
+  return -1;
+}
+
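+/* check whether all particles in tree cell 'no' belong to the same group by now; if so,
+ * mark the cell as fully linked and return 1 so that the caller can ascend to the parent
+ * node, otherwise return 0 */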
+template <typename partset>
+int foftree<partset>::treefind_fof_check_single_node_for_full_linking(int no)
+{
+  if(no >= MaxPart && no < MaxPart + MaxNodes) /* internal node */
+    {
+      if(FullyLinkedNodePIndex[no] >= 0)  // already linked
+        return 0;
+
+      int head = -1; /* no particle yet */
+
+      int p = get_nodep(no)->nextnode;
+
+      /* in case the next node after opening is not a top-level node, we have either reached a leaf node or are in a local
+       * branch. We need to do nothing if we would end up on different shared memory thread */
+      if(p < MaxPart || (p >= FirstNonTopLevelNode && p < MaxPart + MaxNodes))
+        {
+          if(get_nodep(no)->nextnode_shmrank != TreeSharedMem_ThisTask)
+            return 0;
+        }
+
+      while(p != get_nodep(no)->sibling)
+        {
+          if(p < MaxPart) /* a particle */
+            {
+              if(head == -1)
+                head = Tp->Head[p];
+              else if(head >= 0)
+                {
+                  if(head != Tp->Head[p])
+                    {
+                      head = -2;
+                      break;
+                    }
+                }
+
+              p = Nextnode[p];
+            }
+          else if(p < MaxPart + MaxNodes) /* an internal node  */
+            {
+              if(FullyLinkedNodePIndex[p] >= 0)
+                {
+                  if(head == -1)
+                    head = Tp->Head[FullyLinkedNodePIndex[p]];
+                  else if(head >= 0)
+                    {
+                      if(head != Tp->Head[FullyLinkedNodePIndex[p]])
+                        {
+                          head = -2;
+                          break;
+                        }
+                    }
+                }
+              else
+                {
+                  if(treefind_fof_return_a_particle_in_cell_recursive(no) >= 0)
+                    {
+                      head = -2;
+                      break;
+                    }
+                }
+
+              p = get_nodep(p)->sibling;
+            }
+          else /* a pseudo particle */
+            p = Nextnode[p - MaxNodes];
+        }
+
+      if(head >= 0)
+        {
+          FullyLinkedNodePIndex[no] = treefind_fof_return_a_particle_in_cell_recursive(no);
+
+          if(Tp->Head[FullyLinkedNodePIndex[no]] != head)
+            Terminate("no=%d  FullyLinkedNodePIndex[no]=%d   Tp->Head[FullyLinkedNodePIndex[no]]=%d   head=%d \n", no,
+                      FullyLinkedNodePIndex[no], Tp->Head[FullyLinkedNodePIndex[no]], head);
+
+          return 1;
+        }
+    }
+
+  return 0;
+}
+
+/* now make sure that the following classes are really instantiated, otherwise we may get a linking problem */
+#include "../data/simparticles.h"
+template class fof<simparticles>;
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES_GROUPS)
+#include "../data/lcparticles.h"
+template class fof<lcparticles>;
+#endif
+
+#endif
diff --git a/src/fof/fof_io.cc b/src/fof/fof_io.cc
new file mode 100644
index 0000000000000000000000000000000000000000..93cde6f12185c6e38694b086976ce8c70aade526
--- /dev/null
+++ b/src/fof/fof_io.cc
@@ -0,0 +1,597 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file fof_io.cc
+ *
+ *  \brief routines for implementing the I/O routines concerned with the group catalogues
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef FOF
+
+#include <hdf5.h>
+#include <mpi.h>
+#include <sys/stat.h>
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../fof/fof.h"
+#include "../fof/fof_io.h"
+#include "../gitversion/version.h"
+#include "../io/hdf5_util.h"
+#include "../io/io.h"
+#include "../logs/timer.h"
+#include "../main/main.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/parallel_sort.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+
+template <typename partset>
+fof_io<partset>::fof_io(fof<partset> *FoF_ptr, MPI_Comm comm, int format) : IO_Def(comm, format)
+{
+  FoF = FoF_ptr;
+
+  this->N_IO_Fields  = 0;
+  this->N_DataGroups = 3;
+  this->header_size  = sizeof(catalogue_header);
+  this->header_buf   = &catalogue_header;
+  this->type_of_file = FILE_IS_GROUPCAT;
+  sprintf(this->info, "FOF/SUBFIND: writing group catalogue");
+
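+  /* Each init_field() call below registers one block of the group catalogue. Roughly, the arguments are: a four-character
+   * tag (used by the legacy block-based formats), the HDF5 dataset name, the in-memory and on-file data types, a read
+   * policy, the number of values per entry, the target array (A_G = groups, A_S = subhalos), and a pointer to the
+   * corresponding field of the first array element; the remaining arguments appear to carry unit metadata and flags. */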
+  init_field("FLEN", "GroupLen", mem_len_type, file_len_type, READ_IF_PRESENT, 1, A_G, &FoF->Group[0].Len, NULL, GROUPS, 0, 0, 0, 0, 0,
+             0, 0, true);
+
+  init_field("FMAS", "GroupMass", MEM_MY_DOUBLE, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_G, &FoF->Group[0].Mass, NULL, GROUPS, 0, 0, 0,
+             0, 0, 0, 0);
+
+  init_field("FPOS", "GroupPos", MEM_MY_DOUBLE, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 3, A_G, &FoF->Group[0].Pos[0], NULL, GROUPS, 0, 0,
+             0, 0, 0, 0, 0);
+
+  init_field("FVEL", "GroupVel", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 3, A_G, &FoF->Group[0].Vel[0], NULL, GROUPS, 0, 0, 0,
+             0, 0, 0, 0);
+
+  init_field("FLTY", "GroupLenType", mem_len_type, file_len_type, READ_IF_PRESENT, NTYPES, A_G, &FoF->Group[0].LenType[0], NULL,
+             GROUPS, 0, 0, 0, 0, 0, 0, 0, true);
+
+  init_field("FLOT", "GroupOffsetType", MEM_INT64, FILE_INT64, READ_IF_PRESENT, NTYPES, A_G, &FoF->Group[0].OffsetType[0], NULL,
+             GROUPS, 0, 0, 0, 0, 0, 0, 0, true);
+
+  init_field("FMTY", "GroupMassType", MEM_MY_DOUBLE, FILE_MY_IO_FLOAT, READ_IF_PRESENT, NTYPES, A_G, &FoF->Group[0].MassType[0], NULL,
+             GROUPS, 0, 0, 0, 0, 0, 0, 0);
+
+  init_field("FMAS", "GroupAscale", MEM_MY_DOUBLE, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_G, &FoF->Group[0].Ascale, NULL, GROUPS, 0,
+             0, 0, 0, 0, 0, 0);
+
+#if defined(SUBFIND_ORPHAN_TREATMENT)
+  init_field("FPEN", "GroupLenPrevMostBnd", MEM_INT, FILE_INT, READ_IF_PRESENT, 1, A_G, &FoF->Group[0].LenPrevMostBnd, NULL, GROUPS, 0,
+             0, 0, 0, 0, 0, 0, true);
+#endif
+
+#ifdef STARFORMATION
+  init_field("FSFR", "GroupSFR", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, SKIP_ON_READ, 1, A_G, &FoF->Group[0].Sfr, NULL, GROUPS, 0, 0, 0, 0, 0,
+             0, 0);
+#endif
+
+#ifdef MERGERTREE
+  init_field("GFLO", "GroupFileOffset", MEM_MY_FILEOFFSET, FILE_NONE, SKIP_ON_READ, 1, A_G, &FoF->Group[0].FileOffset, NULL, GROUPS, 0,
+             0, 0, 0, 0, 0, 0, true);
+#endif
+
+#ifdef SUBFIND
+  // ------------------ additional Group fields -------------------------------------
+
+  init_field("FMM2", "Group_M_Mean200", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_G, &FoF->Group[0].M_Mean200, NULL,
+             GROUPS, 0, 0, 0, 0, 0, 0, 0);
+  init_field("FRM2", "Group_R_Mean200", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_G, &FoF->Group[0].R_Mean200, NULL,
+             GROUPS, 0, 0, 0, 0, 0, 0, 0);
+
+  init_field("FMC2", "Group_M_Crit200", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_G, &FoF->Group[0].M_Crit200, NULL,
+             GROUPS, 0, 0, 0, 0, 0, 0, 0);
+  init_field("FRC2", "Group_R_Crit200", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_G, &FoF->Group[0].R_Crit200, NULL,
+             GROUPS, 0, 0, 0, 0, 0, 0, 0);
+
+  init_field("FMC5", "Group_M_Crit500", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_G, &FoF->Group[0].M_Crit500, NULL,
+             GROUPS, 0, 0, 0, 0, 0, 0, 0);
+  init_field("FRC5", "Group_R_Crit500", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_G, &FoF->Group[0].R_Crit500, NULL,
+             GROUPS, 0, 0, 0, 0, 0, 0, 0);
+
+  init_field("FMT2", "Group_M_TopHat200", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_G, &FoF->Group[0].M_TopHat200, NULL,
+             GROUPS, 0, 0, 0, 0, 0, 0, 0);
+  init_field("FMR2", "Group_R_TopHat200", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_G, &FoF->Group[0].R_TopHat200, NULL,
+             GROUPS, 0, 0, 0, 0, 0, 0, 0);
+
+  init_field("FNSH", "GroupNsubs", MEM_INT, FILE_INT, READ_IF_PRESENT, 1, A_G, &FoF->Group[0].Nsubs, NULL, GROUPS, 0, 0, 0, 0, 0, 0, 0,
+             true);
+
+  init_field("FFSH", "GroupFirstSub", MEM_INT64, FILE_INT64, READ_IF_PRESENT, 1, A_G, &FoF->Group[0].FirstSub, NULL, GROUPS, 0, 0, 0,
+             0, 0, 0, 0, true);
+
+  // ------------------- genuine Subhalo fields ------------------------------------
+
+  init_field("SGNR", "SubhaloGroupNr", MEM_INT64, FILE_INT64, READ_IF_PRESENT, 1, A_S, &FoF->Subhalo[0].GroupNr, NULL, SUBGROUPS, 0, 0,
+             0, 0, 0, 0, 0, true);
+
+  init_field("SGRG", "SubhaloRankInGr", MEM_INT, FILE_INT, READ_IF_PRESENT, 1, A_S, &FoF->Subhalo[0].SubRankInGr, NULL, SUBGROUPS, 0,
+             0, 0, 0, 0, 0, 0, true);
+
+  init_field("SLEN", "SubhaloLen", mem_len_type, file_len_type, READ_IF_PRESENT, 1, A_S, &FoF->Subhalo[0].Len, NULL, SUBGROUPS, 0, 0,
+             0, 0, 0, 0, 0, true);
+
+  init_field("SMAS", "SubhaloMass", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_S, &FoF->Subhalo[0].Mass, NULL, SUBGROUPS, 0,
+             0, 0, 0, 0, 0, 0);
+
+  init_field("SPOS", "SubhaloPos", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 3, A_S, &FoF->Subhalo[0].Pos[0], NULL, SUBGROUPS,
+             0, 0, 0, 0, 0, 0, 0);
+
+  init_field("SVEL", "SubhaloVel", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 3, A_S, &FoF->Subhalo[0].Vel[0], NULL, SUBGROUPS,
+             0, 0, 0, 0, 0, 0, 0);
+
+  init_field("SLTY", "SubhaloLenType", mem_len_type, file_len_type, READ_IF_PRESENT, NTYPES, A_S, &FoF->Subhalo[0].LenType[0], NULL,
+             SUBGROUPS, 0, 0, 0, 0, 0, 0, 0, true);
+
+  init_field("SLOT", "SubhaloOffsetType", MEM_INT64, FILE_INT64, READ_IF_PRESENT, NTYPES, A_S, &FoF->Subhalo[0].OffsetType[0], NULL,
+             SUBGROUPS, 0, 0, 0, 0, 0, 0, 0, true);
+
+  init_field("SMTY", "SubhaloMassType", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, NTYPES, A_S, &FoF->Subhalo[0].MassType[0],
+             NULL, SUBGROUPS, 0, 0, 0, 0, 0, 0, 0);
+
+  init_field("SCMP", "SubhaloCM", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 3, A_S, &FoF->Subhalo[0].CM[0], NULL, SUBGROUPS, 0,
+             0, 0, 0, 0, 0, 0);
+
+  init_field("SSPI", "SubhaloSpin", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 3, A_S, &FoF->Subhalo[0].Spin[0], NULL, SUBGROUPS,
+             0, 0, 0, 0, 0, 0, 0);
+
+  init_field("SSPI", "SubhaloVelDisp", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_S, &FoF->Subhalo[0].SubVelDisp, NULL,
+             SUBGROUPS, 0, 0, 0, 0, 0, 0, 0);
+
+  init_field("SVMX", "SubhaloVmax", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_S, &FoF->Subhalo[0].SubVmax, NULL, SUBGROUPS,
+             0, 0, 0, 0, 0, 0, 0);
+
+  init_field("SVRX", "SubhaloVmaxRad", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_S, &FoF->Subhalo[0].SubVmaxRad, NULL,
+             SUBGROUPS, 0, 0, 0, 0, 0, 0, 0);
+
+  init_field("SHMR", "SubhaloHalfmassRad", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_S, &FoF->Subhalo[0].SubHalfMassRad,
+             NULL, SUBGROUPS, 0, 0, 0, 0, 0, 0, 0);
+  init_field("SHMT", "SubhaloHalfmassRadType", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, NTYPES, A_S,
+             &FoF->Subhalo[0].SubHalfMassRadType[0], NULL, SUBGROUPS, 0, 0, 0, 0, 0, 0, 0);
+
+  init_field("SIDM", "SubhaloIDMostbound", MEM_MY_ID_TYPE, FILE_MY_ID_TYPE, READ_IF_PRESENT, 1, A_S, &FoF->Subhalo[0].SubMostBoundID,
+             NULL, SUBGROUPS, 0, 0, 0, 0, 0, 0, 0, true);
+
+  init_field("SPRT", "SubhaloParentRank", MEM_INT, FILE_INT, READ_IF_PRESENT, 1, A_S, &FoF->Subhalo[0].SubParentRank, NULL, SUBGROUPS,
+             0, 0, 0, 0, 0, 0, 0, true);
+
+#ifdef MERGERTREE
+  init_field("SFLO", "SubhaloFileOffset", MEM_MY_FILEOFFSET, FILE_NONE, SKIP_ON_READ, 1, A_S, &FoF->Subhalo[0].FileOffset, NULL,
+             SUBGROUPS, 0, 0, 0, 0, 0, 0, 0, true);
+#endif
+
+#if defined(SUBFIND_ORPHAN_TREATMENT)
+  init_field("LPMO", "SubhaloLenPrevMostBnd", MEM_INT, FILE_INT, READ_IF_PRESENT, 1, A_S, &FoF->Subhalo[0].SubhaloLenPrevMostBnd, NULL,
+             SUBGROUPS, 0, 0, 0, 0, 0, 0, 0, true);
+#endif
+
+#ifdef STARFORMATION
+  init_field("SSFR", "SubhaloSFR", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_S, &FoF->Subhalo[0].Sfr, NULL, SUBGROUPS, 0,
+             0, 0, 0, 0, 0, 0);
+  init_field("SSFG", "SubhaloGasMassSFR", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_S, &FoF->Subhalo[0].GasMassSfr, NULL,
+             SUBGROUPS, 0, 0, 0, 0, 0, 0, 0);
+#endif
+
+#endif
+}
+
+template <typename partset>
+int fof_io<partset>::get_type_of_element(int index)
+{
+  return 0; /* not needed here */
+}
+
+template <typename partset>
+void fof_io<partset>::set_type_of_element(int index, int type)
+{
+  /* empty */
+}
+
+/* driver routine for writing the group catalogue files to disk */
+
+template <typename partset>
+void fof_io<partset>::fof_subfind_save_groups(int num, const char *basename, const char *grpcat_dirbasename)
+{
+#ifdef DO_NOT_PRODUCE_BIG_OUTPUT
+  mpi_printf("FOF/SUBFIND: We skip saving group catalogues.\n");
+  return;
+#endif
+
+  char buf[MAXLEN_PATH_EXTRA];
+
+  double t0 = Logs.second();
+  reset_io_byte_count();
+
+  if(All.NumFilesPerSnapshot > 1)
+    {
+      if(ThisTask == 0)
+        {
+          sprintf(buf, "%s/%s_%03d", All.OutputDir, grpcat_dirbasename, num);
+          mkdir(buf, 02755);
+        }
+      MPI_Barrier(Communicator);
+    }
+
+  if(All.NumFilesPerSnapshot > 1)
+    sprintf(buf, "%s/%s_%03d/%s_%03d", All.OutputDir, grpcat_dirbasename, num, basename, num);
+  else
+    sprintf(buf, "%s%s_%03d", All.OutputDir, basename, num);
+
+  write_multiple_files(buf, All.NumFilesPerSnapshot);
+
+  long long byte_count = get_io_byte_count(), byte_count_all;
+  sumup_longs(1, &byte_count, &byte_count_all, Communicator);
+
+  double t1 = Logs.second();
+
+  mpi_printf("FOF/SUBFIND: Group catalogues saved. took = %g sec, total size %g MB, corresponds to effective I/O rate of %g MB/sec\n",
+             Logs.timediff(t0, t1), byte_count_all / (1024.0 * 1024.0), byte_count_all / (1024.0 * 1024.0) / Logs.timediff(t0, t1));
+}
+
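+/* read a group catalogue written earlier (the fof_subhalo_tab files), distributing the groups and
+ * subhalos over the processors of the communicator */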
+template <typename partset>
+void fof_io<partset>::fof_subfind_load_groups(int num)
+{
+  FoF->TotNgroups = 0;
+
+  char fname[MAXLEN_PATH_EXTRA], fname_multiple[MAXLEN_PATH_EXTRA];
+
+  sprintf(fname_multiple, "%s/groups_%03d/%s_%03d", All.OutputDir, num, "fof_subhalo_tab", num);
+  sprintf(fname, "%s%s_%03d", All.OutputDir, "fof_subhalo_tab", num);
+
+  int num_files = find_files(fname, fname_multiple);
+
+  if(num_files > 1)
+    strcpy(fname, fname_multiple);
+
+  /* we go through the files twice. In the first iteration, only the group/subhalo
+   * counts ending up on each processor are assembled, followed by memory allocation.
+   * In the second iteration, the data is actually read in.
+   */
+
+  for(int rep = 0; rep < 2; rep++)
+    {
+      FoF->Ngroups   = 0;
+      FoF->Nsubhalos = 0;
+
+      read_files_driver(fname, rep, num_files);
+
+      /* now do the memory allocation */
+      if(rep == 0)
+        {
+          FoF->Group = (typename fof<partset>::group_properties *)Mem.mymalloc_movable(
+              &FoF->Group, "Group", FoF->Ngroups * sizeof(typename fof<partset>::group_properties));
+#ifdef SUBFIND
+          FoF->Subhalo = (typename fof<partset>::subhalo_properties *)Mem.mymalloc_movable(
+              &FoF->Subhalo, "Subhalo", FoF->Nsubhalos * sizeof(typename fof<partset>::subhalo_properties));
+#endif
+        }
+    }
+
+  MPI_Barrier(Communicator);
+
+  mpi_printf("\nFOF/SUBFIND: reading done.\n");
+  mpi_printf("FOF/SUBFIND: Total number of groups read:    %lld\n", (long long int)FoF->TotNgroups);
+  mpi_printf("FOF/SUBFIND: Total number of subhalos read:  %lld\n\n", (long long int)FoF->TotNsubhalos);
+
+  MPI_Allreduce(MPI_IN_PLACE, &LegacyFormat, 1, MPI_INT, MPI_MAX, Communicator);
+}
+
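+/* determine how many groups, subhalos and IDs end up in the file written by 'writeTask' (summed over
+ * the tasks writeTask...lastTask) and fill in the corresponding entries of the catalogue file header */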
+template <typename partset>
+void fof_io<partset>::fill_file_header(int writeTask, int lastTask, long long *n_type, long long *ntot_type)
+{
+  /* determine group/id numbers of each type in file */
+
+  n_type[0] = FoF->Ngroups;
+  n_type[1] = FoF->Nsubhalos;
+  n_type[2] = FoF->Nids;
+
+  if(ThisTask == writeTask)
+    {
+      for(int n = 0; n < 3; n++)
+        ntot_type[n] = n_type[n];
+
+      for(int task = writeTask + 1; task <= lastTask; task++)
+        {
+          long long nn[3];
+          MPI_Recv(&nn[0], 3, MPI_LONG_LONG, task, TAG_LOCALN, Communicator, MPI_STATUS_IGNORE);
+          for(int n = 0; n < 3; n++)
+            ntot_type[n] += nn[n];
+        }
+
+      for(int task = writeTask + 1; task <= lastTask; task++)
+        MPI_Send(&ntot_type[0], 3, MPI_LONG_LONG, task, TAG_N, Communicator);
+    }
+  else
+    {
+      MPI_Send(&n_type[0], 3, MPI_LONG_LONG, writeTask, TAG_LOCALN, Communicator);
+      MPI_Recv(&ntot_type[0], 3, MPI_LONG_LONG, writeTask, TAG_N, Communicator, MPI_STATUS_IGNORE);
+    }
+
+  /* fill file header */
+
+  catalogue_header.Ngroups   = ntot_type[0];
+  catalogue_header.Nsubhalos = ntot_type[1];
+  catalogue_header.Nids      = ntot_type[2];
+
+  catalogue_header.TotNgroups   = FoF->TotNgroups;
+  catalogue_header.TotNsubhalos = FoF->TotNsubhalos;
+  catalogue_header.TotNids      = FoF->TotNids;
+
+  catalogue_header.num_files = All.NumFilesPerSnapshot;
+
+  catalogue_header.time = All.Time;
+  if(All.ComovingIntegrationOn)
+    catalogue_header.redshift = 1.0 / All.Time - 1;
+  else
+    catalogue_header.redshift = 0;
+  catalogue_header.BoxSize = All.BoxSize;
+}
+
+template <typename partset>
+void fof_io<partset>::read_file_header(const char *fname, int filenr, int readTask, int lastTask, long long *n_type,
+                                       long long *ntot_type, int *nstart)
+{
+  if(ThisTask == readTask)
+    {
+      if(filenr == 0 && nstart == NULL)
+        {
+          mpi_printf(
+              "\nREAD-FOF: filenr=%d, '%s' contains:\n"
+              "READ-FOF: Type 0 (groups):    %8lld\n"
+              "READ-FOF: Type 1 (subhalos):  %8lld\n"
+              "READ-FOF: Type 2 (ids):       %8lld\n",
+              filenr, fname, catalogue_header.Ngroups, catalogue_header.Nsubhalos, catalogue_header.Nids);
+        }
+    }
+
+  if(FoF->TotNgroups == 0)
+    {
+      FoF->TotNgroups   = catalogue_header.TotNgroups;
+      FoF->TotNsubhalos = catalogue_header.TotNsubhalos;
+      FoF->TotNids      = catalogue_header.TotNids;
+    }
+
+  FoF->Redshift = catalogue_header.redshift;
+  FoF->Time     = catalogue_header.time;
+
+  for(int k = 0; k < 3; k++)
+    n_type[k] = ntot_type[k] = 0;
+
+  /* to collect the group/subhalo data of all files read on this rank contiguously, we move the
+     entries read so far upwards in the arrays such that a gap of the right size is created at
+     the beginning for the data of the current file */
+
+  {
+    ntot_type[0] = catalogue_header.Ngroups;
+
+    long long n_in_file = catalogue_header.Ngroups;
+    int ntask           = lastTask - readTask + 1;
+    int n_for_this_task = n_in_file / ntask;
+    if((ThisTask - readTask) < (n_in_file % ntask))
+      n_for_this_task++;
+
+    n_type[0] = n_for_this_task;
+
+    if(nstart)
+      memmove(&FoF->Group[n_for_this_task], &FoF->Group[0], FoF->Ngroups * sizeof(typename fof<partset>::group_properties));
+  }
+
+#ifdef SUBFIND
+  {
+    ntot_type[1] = catalogue_header.Nsubhalos;
+
+    long long n_in_file = catalogue_header.Nsubhalos;
+    int ntask           = lastTask - readTask + 1;
+    int n_for_this_task = n_in_file / ntask;
+    if((ThisTask - readTask) < (n_in_file % ntask))
+      n_for_this_task++;
+
+    n_type[1] = n_for_this_task;
+
+    if(nstart)
+      memmove(&FoF->Subhalo[n_for_this_task], &FoF->Subhalo[0], FoF->Nsubhalos * sizeof(typename fof<partset>::subhalo_properties));
+  }
+#endif
+
+  if(nstart)
+    *nstart = 0;
+}
+
+template <typename partset>
+void fof_io<partset>::write_header_fields(hid_t handle)
+{
+  write_scalar_attribute(handle, "Ngroups_ThisFile", &catalogue_header.Ngroups, H5T_NATIVE_UINT64);
+  write_scalar_attribute(handle, "Nsubhalos_ThisFile", &catalogue_header.Nsubhalos, H5T_NATIVE_UINT64);
+  write_scalar_attribute(handle, "Nids_ThisFile", &catalogue_header.Nids, H5T_NATIVE_UINT64);
+
+  write_scalar_attribute(handle, "Ngroups_Total", &catalogue_header.TotNgroups, H5T_NATIVE_UINT64);
+  write_scalar_attribute(handle, "Nsubhalos_Total", &catalogue_header.TotNsubhalos, H5T_NATIVE_UINT64);
+  write_scalar_attribute(handle, "Nids_Total", &catalogue_header.TotNids, H5T_NATIVE_UINT64);
+
+  write_scalar_attribute(handle, "NumFiles", &catalogue_header.num_files, H5T_NATIVE_INT);
+
+  write_scalar_attribute(handle, "Time", &catalogue_header.time, H5T_NATIVE_DOUBLE);
+  write_scalar_attribute(handle, "Redshift", &catalogue_header.redshift, H5T_NATIVE_DOUBLE);
+  write_scalar_attribute(handle, "BoxSize", &catalogue_header.BoxSize, H5T_NATIVE_DOUBLE);
+
+  write_string_attribute(handle, "Git_commit", GIT_COMMIT);
+  write_string_attribute(handle, "Git_date", GIT_DATE);
+}
+
+/*! \brief This function reads the snapshot header in case of hdf5 files (i.e. format 3)
+ *
+ * \param fname file name of the snapshot as given in the parameter file
+ */
+template <typename partset>
+void fof_io<partset>::read_header_fields(const char *fname)
+{
+  memset(&catalogue_header, 0, sizeof(fof_subfind_header));
+
+  hid_t hdf5_file = my_H5Fopen(fname, H5F_ACC_RDONLY, H5P_DEFAULT);
+  hid_t handle    = my_H5Gopen(hdf5_file, "/Header");
+
+  /* now read the header fields */
+  read_scalar_attribute(handle, "Ngroups_ThisFile", &catalogue_header.Ngroups, H5T_NATIVE_UINT64);
+
+  if(read_scalar_attribute(handle, "Nsubhalos_ThisFile", "Nsubgroups_ThisFile", &catalogue_header.Nsubhalos, H5T_NATIVE_UINT64))
+    LegacyFormat = 1;
+
+  read_scalar_attribute(handle, "Nids_ThisFile", &catalogue_header.Nids, H5T_NATIVE_UINT64);
+
+  read_scalar_attribute(handle, "Ngroups_Total", &catalogue_header.TotNgroups, H5T_NATIVE_UINT64);
+  read_scalar_attribute(handle, "Nsubhalos_Total", "Nsubgroups_Total", &catalogue_header.TotNsubhalos, H5T_NATIVE_UINT64);
+  read_scalar_attribute(handle, "Nids_Total", &catalogue_header.TotNids, H5T_NATIVE_UINT64);
+
+  read_scalar_attribute(handle, "NumFiles", &catalogue_header.num_files, H5T_NATIVE_INT);
+
+  read_scalar_attribute(handle, "Time", &catalogue_header.time, H5T_NATIVE_DOUBLE);
+  read_scalar_attribute(handle, "Redshift", &catalogue_header.redshift, H5T_NATIVE_DOUBLE);
+
+  read_scalar_attribute(handle, "BoxSize", &catalogue_header.BoxSize, H5T_NATIVE_DOUBLE);
+
+  my_H5Gclose(handle, "/Header");
+  my_H5Fclose(hdf5_file, fname);
+}
+
+template <typename partset>
+int fof_io<partset>::get_filenr_from_header(void)
+{
+  return catalogue_header.num_files;
+}
+
+template <typename partset>
+void fof_io<partset>::set_filenr_in_header(int numfiles)
+{
+  catalogue_header.num_files = numfiles;
+}
+
+template <typename partset>
+void fof_io<partset>::read_increase_numbers(int type, int n_for_this_task)
+{
+  switch(type)
+    {
+      case 0:
+        FoF->Ngroups += n_for_this_task;
+        break;
+      case 1:
+        FoF->Nsubhalos += n_for_this_task;
+        break;
+      case 2:
+        FoF->Nids += n_for_this_task;
+        break;
+      default:
+        Terminate("wrong group");
+        break;
+    }
+}
+
+template <typename partset>
+void fof_io<partset>::get_datagroup_name(int type, char *buf)
+{
+  switch(type)
+    {
+      case 0:
+        sprintf(buf, "/Group");
+        break;
+      case 1:
+        sprintf(buf, "/Subhalo");
+        break;
+      case 2:
+        sprintf(buf, "/IDs");
+        break;
+      default:
+        Terminate("wrong group: type=%d", type);
+        break;
+    }
+}
+
+template <typename partset>
+void *fof_io<partset>::get_base_address_of_structure(enum arrays array, int index)
+{
+  switch(array)
+    {
+      case A_G:
+        return (void *)(FoF->Group + index);
+
+#ifdef SUBFIND
+      case A_S:
+        return (void *)(FoF->Subhalo + index);
+#endif
+
+      default:
+        Terminate("we don't expect to get here");
+    }
+
+  return NULL;
+}
+
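+/* prepare the list of particle IDs with assigned group numbers that goes into the ID block of the
+ * catalogue, globally sorted by group number (and, with SUBFIND, by subhalo rank and binding energy) */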
+template <typename partset>
+void fof_io<partset>::fof_subfind_prepare_ID_list(void)
+{
+  double t0 = Logs.second();
+
+  ID_list = (id_list *)Mem.mymalloc("ID_list", sizeof(id_list) * FoF->Nids);
+
+  long long nids = 0;
+  for(int i = 0; i < FoF->Tp->NumPart; i++)
+    {
+      if(FoF->Tp->PS[i].GroupNr.get() < HALONR_MAX)
+        {
+          if(nids >= FoF->Nids)
+            Terminate("nids >= Nids");
+
+          ID_list[nids].GroupNr = FoF->Tp->PS[i].GroupNr;
+          ID_list[nids].Type    = FoF->Tp->P[i].getType();
+          ID_list[nids].ID      = FoF->Tp->P[i].ID.get();
+#ifdef SUBFIND
+          ID_list[nids].SubRankInGr = FoF->Tp->PS[i].SubRankInGr;
+          ID_list[nids].BindingEgy  = FoF->Tp->PS[i].v.DM_BindingEnergy;
+#endif
+          nids++;
+        }
+    }
+
+  long long totNids;
+  sumup_longs(1, &nids, &totNids, Communicator);
+  if(totNids != FoF->TotNids)
+    Terminate("Task=%d Nids=%lld totNids=%lld TotNids=%lld\n", ThisTask, FoF->Nids, totNids, FoF->TotNids);
+
+  /* sort the particle IDs according to group-number, and optionally subhalo number and binding energy  */
+  mycxxsort_parallel(ID_list, ID_list + FoF->Nids, fof_subfind_compare_ID_list, Communicator);
+
+  double t1 = Logs.second();
+  mpi_printf("FOF/SUBFIND: Particle/cell IDs in groups globally sorted. took = %g sec\n", Logs.timediff(t0, t1));
+}
+
+/* now make sure that the following classes are really instantiated, otherwise we may get a linking problem */
+#include "../data/simparticles.h"
+template class fof_io<simparticles>;
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES_GROUPS)
+#include "../data/lcparticles.h"
+template class fof_io<lcparticles>;
+#endif
+
+#endif
diff --git a/src/fof/fof_io.h b/src/fof/fof_io.h
new file mode 100644
index 0000000000000000000000000000000000000000..4d4c92604ad04607c07337af1b8cbb3787125825
--- /dev/null
+++ b/src/fof/fof_io.h
@@ -0,0 +1,103 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file fof_io.h
+ *
+ *  \brief declares class needed for I/O of group catalogues
+ */
+
+#ifndef FOF_IO_H
+#define FOF_IO_H
+
+#include "../fof/fof.h"
+#include "../io/io.h"
+
+template <typename partset>
+class fof_io : public IO_Def
+{
+ public:
+  fof<partset> *FoF;
+
+  fof_io(fof<partset> *FoF_ptr, MPI_Comm comm, int format); /* constructor */
+
+  void fof_subfind_save_groups(int num, const char *basename, const char *grpcat_dirbasename);
+  void fof_subfind_load_groups(int num);
+
+  /* supplied virtual functions */
+  void fill_file_header(int writeTask, int lastTask, long long *nloc_part, long long *npart);
+  void read_file_header(const char *fname, int filenr, int readTask, int lastTask, long long *nloc_part, long long *npart,
+                        int *nstart);
+  void get_datagroup_name(int grnr, char *gname);
+  void write_header_fields(hid_t);
+  void read_header_fields(const char *fname);
+  void read_increase_numbers(int type, int n_for_this_task);
+  int get_filenr_from_header(void);
+  void set_filenr_in_header(int);
+  void *get_base_address_of_structure(enum arrays array, int index);
+  int get_type_of_element(int index);
+  void set_type_of_element(int index, int type);
+
+ public:
+  struct fof_subfind_header
+  {
+    long long Ngroups;      /* number of groups stored in this file */
+    long long Nsubhalos;    /* number of subhalos stored in this file */
+    long long Nids;         /* number of particle IDs stored in this file */
+    long long TotNgroups;   /* total number of groups in the catalogue */
+    long long TotNsubhalos; /* total number of subhalos in the catalogue */
+    long long TotNids;      /* total number of particle IDs in the catalogue */
+    int num_files;          /* number of files the catalogue is split into */
+    double time;            /* expansion factor (or simulation time for non-comoving runs) */
+    double redshift;        /* redshift corresponding to 'time' (zero for non-comoving runs) */
+    double BoxSize;         /* simulation box size */
+  };
+  fof_subfind_header catalogue_header;
+
+  int LegacyFormat = 0;
+
+ private:
+  struct id_list
+  {
+    MyIDType ID;
+    MyHaloNrType GroupNr;
+    int Type;
+#ifdef SUBFIND
+    int SubRankInGr;
+    MyFloat BindingEgy;
+#endif
+  };
+  id_list *ID_list;
+
+  void fof_subfind_prepare_ID_list(void);
+
+  static bool fof_subfind_compare_ID_list(const id_list &a, const id_list &b)
+  {
+    if(a.GroupNr < b.GroupNr)
+      return true;
+    if(a.GroupNr > b.GroupNr)
+      return false;
+
+#ifdef SUBFIND
+    if(a.SubRankInGr < b.SubRankInGr)
+      return true;
+    if(a.SubRankInGr > b.SubRankInGr)
+      return false;
+#endif
+
+    if(a.Type < b.Type)
+      return true;
+    if(a.Type > b.Type)
+      return false;
+
+#ifdef SUBFIND
+    return a.BindingEgy < b.BindingEgy;
+#else
+    return a.ID < b.ID;
+#endif
+  }
+};
+
+#endif
diff --git a/src/fof/fof_nearest.cc b/src/fof/fof_nearest.cc
new file mode 100644
index 0000000000000000000000000000000000000000..baa4ab7c925821d5e1eace4def82c354a1240cfd
--- /dev/null
+++ b/src/fof/fof_nearest.cc
@@ -0,0 +1,357 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file fof_nearest.cc
+ *
+ *  \brief routines to find nearest neighbors
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef FOF
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../fof/fof.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/generic_comm.h"
+#include "../ngbtree/ngbtree.h"
+#include "../system/system.h"
+
+/* local data structure for collecting particle/cell data that is sent to other processors if needed */
+
+struct fofdata_in : data_in_generic
+{
+  MyIntPosType IntPos[3];
+  MyFloat Hsml;
+};
+
+/* local data structure that holds results acquired on remote processors */
+struct fofdata_out
+{
+  MyFloat Distance;  /* distance to the nearest primary-link particle found */
+  MyIDStorage MinID; /* group MinID of that particle */
+  int MinIDTask;     /* task that holds this MinID */
+#if defined(SUBFIND)
+  MyFloat DM_Hsml;   /* DM smoothing length copied over from the nearest particle */
+#endif
+};
+
+template <typename T_tree, typename T_domain, typename T_partset>
+class fofdata_comm : public generic_comm<fofdata_in, fofdata_out, T_tree, T_domain, T_partset>
+{
+ public:
+  typedef generic_comm<fofdata_in, fofdata_out, T_tree, T_domain, T_partset> gcomm;
+  using gcomm::D;
+  using gcomm::Thread;
+  using gcomm::Tp;  // This makes sure that we can access Tp from the base class without having to use "this->Tp"
+  using gcomm::Tree;
+
+  /* need to call the base class constructor explicitly */
+  fofdata_comm(T_domain *dptr, T_tree *tptr, T_partset *pptr)
+      : generic_comm<fofdata_in, fofdata_out, T_tree, T_domain, T_partset>(dptr, tptr, pptr)
+  {
+  }
+
+  /* routine that fills the relevant particle/cell data into the input structure defined above */
+  void particle2in(fofdata_in *in, int i)
+  {
+    for(int k = 0; k < 3; k++)
+      in->IntPos[k] = Tp->P[i].IntPos[k];
+
+    in->Hsml = Tp->fof_nearest_hsml[i];
+  }
+
+  /* routine to store or combine result data */
+  void out2particle(fofdata_out *out, int i, int mode)
+  {
+    if(out->Distance < Tp->fof_nearest_distance[i])
+      {
+        Tp->fof_nearest_distance[i] = out->Distance;
+        Tp->MinID[i]                = out->MinID;
+        Tp->MinIDTask[i]            = out->MinIDTask;
+#if defined(SUBFIND)
+        Tp->PS[i].v.DM_Hsml = out->DM_Hsml;
+#endif
+      }
+  }
+
+  int evaluate(int target, int mode, int thread_id, int action, fofdata_in *in, int numnodes, node_info *firstnode, fofdata_out &out)
+  {
+    MyIntPosType *intpos = in->IntPos;
+    double h             = in->Hsml;
+    int index            = -1;
+    double r2max         = MAX_REAL_NUMBER;
+
+    /* Now start the actual tree-walk computation for this particle */
+
+    for(int k = 0; k < numnodes; k++)
+      {
+        int no;
+
+        if(mode == MODE_LOCAL_PARTICLES)
+          {
+            no = Tree->MaxPart; /* root node */
+          }
+        else
+          {
+            no = firstnode[k].Node;
+            no = Tree->get_nodep(no)->nextnode; /* open it */
+          }
+
+        int shmrank = Tree->TreeSharedMem_ThisTask;
+
+        while(no >= 0)
+          {
+            if(no < Tree->MaxPart) /* single particle */
+              {
+                int p  = no;
+                auto P = Tree->get_Pp(no, shmrank);
+
+                no = Tree->get_nextnodep(shmrank)[no]; /* note: here shmrank cannot change */
+
+                if(shmrank != Tree->TreeSharedMem_ThisTask)
+                  Terminate("this routine may not consider shared memory particles");
+
+                double dxyz[3];
+                Tp->nearest_image_intpos_to_pos(P->IntPos, intpos, dxyz); /* converts the integer distance to floating point */
+
+                double dist = h;
+
+                if(fabs(dxyz[0]) > dist)
+                  continue;
+                if(fabs(dxyz[1]) > dist)
+                  continue;
+                if(fabs(dxyz[2]) > dist)
+                  continue;
+
+                double r2 = dxyz[0] * dxyz[0] + dxyz[1] * dxyz[1] + dxyz[2] * dxyz[2];
+
+                if(r2 < r2max && r2 < h * h)
+                  {
+                    index = p;
+                    r2max = r2;
+                  }
+              }
+            else if(no < Tree->MaxPart + Tree->MaxNodes) /* internal node */
+              {
+                if(mode == MODE_IMPORTED_PARTICLES)
+                  {
+                    if(no < Tree->FirstNonTopLevelNode) /* we reached a top-level node again, which means that we are done with the
+                                                           branch */
+                      break;
+                  }
+
+                fofnode *current = Tree->get_nodep(no, shmrank);
+
+                int nosaved = no;
+
+                no      = current->sibling; /* in case the node can be discarded */
+                shmrank = current->sibling_shmrank;
+
+                double dxyz[3];
+                Tp->nearest_image_intpos_to_pos(current->center.da, intpos,
+                                                dxyz); /* converts the integer distance to floating point */
+
+                double len = (((MyIntPosType)1) << (BITS_FOR_POSITIONS - current->level)) * Tp->FacIntToCoord;
+
+                double dist = h + 0.5 * len;
+
+                if(fabs(dxyz[0]) > dist)
+                  continue;
+                if(fabs(dxyz[1]) > dist)
+                  continue;
+                if(fabs(dxyz[2]) > dist)
+                  continue;
+
+                /* now test against the minimal sphere enclosing everything */
+                dist += FACT1 * len;
+                if(dxyz[0] * dxyz[0] + dxyz[1] * dxyz[1] + dxyz[2] * dxyz[2] > dist * dist)
+                  continue;
+
+                int p = current->nextnode;
+
+                /* in case the next node after opening is not a top-level node, we have either reached a leaf node or are in a local
+                 * branch. We need to do nothing if we would end up on a different shared-memory rank */
+                if(p < Tree->MaxPart || (p >= Tree->FirstNonTopLevelNode && p < Tree->MaxPart + Tree->MaxNodes))
+                  {
+                    if(current->nextnode_shmrank != Tree->TreeSharedMem_ThisTask)
+                      {
+                        int task = D->ThisTask + current->nextnode_shmrank - Tree->TreeSharedMem_ThisTask;
+
+                        if(target >= 0) /* export */
+                          Tree->tree_export_node_threads_by_task_and_node(task, nosaved, target, &Thread);
+
+                        no      = current->sibling; /* in case the node can be discarded */
+                        shmrank = current->sibling_shmrank;
+                        continue;
+                      }
+                  }
+
+                no      = current->nextnode; /* ok, we need to open the node */
+                shmrank = current->nextnode_shmrank;
+              }
+            else if(no >= Tree->ImportedNodeOffset) /* point from imported nodelist */
+              {
+                Terminate("do not expect imported points here");
+              }
+            else /* pseudo particle */
+              {
+                if(mode == MODE_LOCAL_PARTICLES)
+                  if(target >= 0)
+                    Tree->tree_export_node_threads(no, target, &Thread);
+
+                no = Tree->get_nextnodep(shmrank)[no - Tree->MaxNodes];
+                /* note: here shmrank does not need to change */
+              }
+          }
+      }
+
+    if(index >= 0)
+      {
+        out.Distance  = sqrt(r2max);
+        out.MinID     = Tp->MinID[Tp->Head[index]];
+        out.MinIDTask = Tp->MinIDTask[Tp->Head[index]];
+#if defined(SUBFIND)
+        out.DM_Hsml = Tp->PS[index].v.DM_Hsml;
+#endif
+      }
+    else
+      {
+        out.Distance = MAX_REAL_NUMBER;
+        out.MinID.set(0);
+        out.MinIDTask = 0;
+#if defined(SUBFIND)
+        out.DM_Hsml = 0;
+#endif
+      }
+
+    return 0;
+  }
+};
+
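+/* For each particle of a secondary link type, find the nearest particle of a primary link type (typically dark matter)
+ * and adopt the MinID/MinIDTask of that particle's FOF group, so that the secondary particle types are attached to the
+ * groups identified for the primary types. The search radius is enlarged iteratively until a neighbour is found or a
+ * maximum radius is reached. Returns the elapsed wall-clock time. */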
+template <typename partset>
+double fof<partset>::fof_find_nearest_dmparticle(void)
+{
+#ifdef LEAN
+  return 0;
+#endif
+
+  double tstart = Logs.second();
+
+  mpi_printf("FOF: Start finding nearest dm-particle (presently allocated=%g MB)\n", Mem.getAllocatedBytesInMB());
+
+  Tp->fof_nearest_distance = (MyFloat *)Mem.mymalloc("fof_nearest_distance", sizeof(MyFloat) * Tp->NumPart);
+  Tp->fof_nearest_hsml     = (MyFloat *)Mem.mymalloc("fof_nearest_hsml", sizeof(MyFloat) * Tp->NumPart);
+
+  int *TargetList = (int *)Mem.mymalloc("TargetList", Tp->NumPart * sizeof(int));
+
+  int Nsearch = 0;
+
+  for(int n = 0; n < Tp->NumPart; n++)
+    {
+      if(is_type_secondary_link_type(Tp->P[n].getType()))
+        {
+          Tp->fof_nearest_distance[n] = MAX_REAL_NUMBER;
+          if(Tp->P[n].getType() == 0 && Tp->SphP[n].Hsml > 0)
+            Tp->fof_nearest_hsml[n] = Tp->SphP[n].Hsml;
+          else
+            Tp->fof_nearest_hsml[n] = 0.1 * Tp->LinkL;
+
+          TargetList[Nsearch++] = n;
+        }
+    }
+
+  fofdata_comm<foftree<partset>, domain<partset>, partset> commpattern{FoFDomain, &FoFNgbTree, Tp};
+
+  int iter = 0;
+  long long ntot;
+  /* we will repeat the search for those particles for which no neighbouring primary-link particle has been found yet */
+  do
+    {
+      double t0 = Logs.second();
+
+      commpattern.execute(Nsearch, TargetList, MODE_DEFAULT);
+
+      /* do final operations on results */
+      int npleft = 0;
+      for(int n = 0; n < Nsearch; n++)
+        {
+          int i = TargetList[n];
+
+          if(Tp->fof_nearest_distance[i] > 1.0e29)
+            {
+              if(Tp->fof_nearest_hsml[i] < 4 * Tp->LinkL) /* we only search out to a maximum distance */
+                {
+                  /* need to redo this particle */
+                  TargetList[npleft++] = i;
+                  Tp->fof_nearest_hsml[i] *= 2.0;
+                  if(iter >= MAXITER - 10)
+                    {
+                      double pos[3];
+                      Tp->intpos_to_pos(Tp->P[i].IntPos, pos);
+
+                      printf("FOF: i=%d task=%d ID=%d P[i].Type=%d Hsml=%g LinkL=%g nearest=%g pos=(%g|%g|%g)\n", i, ThisTask,
+                             (int)Tp->P[i].ID.get(), Tp->P[i].getType(), Tp->fof_nearest_hsml[i], Tp->LinkL,
+                             Tp->fof_nearest_distance[i], pos[0], pos[1], pos[2]);
+                      myflush(stdout);
+                    }
+                }
+              else
+                {
+                  Tp->fof_nearest_distance[i] = 0; /* we do not continue to search for this particle */
+                }
+            }
+        }
+
+      sumup_large_ints(1, &npleft, &ntot, Communicator);
+
+      Nsearch = npleft;
+
+      double t1 = Logs.second();
+      if(ntot > 0)
+        {
+          iter++;
+          if(iter > 0)
+            mpi_printf("FOF: fof-nearest iteration %d: need to repeat for %lld particles. (took = %g sec)\n", iter, ntot,
+                       Logs.timediff(t0, t1));
+
+          if(iter > MAXITER)
+            Terminate("FOF: failed to converge in fof-nearest\n");
+        }
+    }
+  while(ntot > 0);
+
+  Mem.myfree(TargetList);
+  Mem.myfree(Tp->fof_nearest_hsml);
+  Mem.myfree(Tp->fof_nearest_distance);
+
+  mpi_printf("FOF: done finding nearest dm-particle\n");
+
+  double tend = Logs.second();
+  return Logs.timediff(tstart, tend);
+}
+
+/* now make sure that the following classes are really instantiated, otherwise we may get a linking problem */
+#include "../data/simparticles.h"
+template class fof<simparticles>;
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES_GROUPS)
+#include "../data/lcparticles.h"
+template class fof<lcparticles>;
+#endif
+
+#endif /* of FOF */
diff --git a/src/fof/foftree.h b/src/fof/foftree.h
new file mode 100644
index 0000000000000000000000000000000000000000..8bb51237d67f5b0bcc1efbcbbc1947fa7880bf55
--- /dev/null
+++ b/src/fof/foftree.h
@@ -0,0 +1,87 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file foftree.h
+ *
+ *  \brief declares structures and classes needed for the FOF neighbor tree
+ */
+
+#ifndef FOFTREE_H
+#define FOFTREE_H
+
+#include "../data/simparticles.h"
+#include "../tree/tree.h"
+
+struct fofpoint_data
+{
+  MyIntPosType IntPos[3];
+  int no;
+};
+
+struct foreign_fofpoint_data
+{
+  int Nextnode;
+  unsigned char Nextnode_shmrank;
+};
+
+struct fofnode : public basenode
+{
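+  /* integer-coordinate bounding box enclosing all particles below this node, filled in update_node_recursive() */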
+  MyIntPosType range_min[3];
+  MyIntPosType range_max[3];
+};
+
+template <typename partset>
+class foftree : public tree<fofnode, partset, fofpoint_data, foreign_fofpoint_data>
+{
+ public:
+  typedef tree<fofnode, partset, fofpoint_data, foreign_fofpoint_data> basetree;
+  using basetree::Buildtime;
+  using basetree::D;
+  using basetree::Father;
+  using basetree::FirstNonTopLevelNode;
+  using basetree::get_nextnodep;
+  using basetree::get_nodep;
+  using basetree::get_Pp;
+  using basetree::get_PSp;
+  using basetree::ImportedNodeOffset;
+  using basetree::MaxNodes;
+  using basetree::MaxPart;
+  using basetree::Nextnode;
+  using basetree::NodeIndex;
+  using basetree::Nodes;
+  using basetree::NumNodes;
+  using basetree::Recv_count;
+  using basetree::Recv_offset;
+  using basetree::Send_count;
+  using basetree::Send_offset;
+  using basetree::TopNodes;
+  using basetree::Tp;  // This makes sure that we can access Tp from the base class without having to use "this"
+  using basetree::tree_export_node_threads;
+  using basetree::tree_export_node_threads_by_task_and_node;
+  using basetree::TreeP_offsets;
+  using basetree::TreePS_offsets;
+  using basetree::TreeSharedMem_ThisTask;
+  using basetree::TreeSharedMemBaseAddr;
+  using basetree::TreeSphP_offsets;
+
+  int *FullyLinkedNodePIndex;  // If a tree node is fully linked, this gives one particle of the encompassing group
+
+  void update_node_recursive(int no, int sib, int mode) override;
+  void exchange_topleafdata(void) override;
+  void fill_in_export_points(fofpoint_data *exp_point, int i, int no) override;
+  void report_log_message(void) override;
+
+  int treefind_fof_primary(MyIntPosType *searchcenter, MyNgbTreeFloat hsml, int target, int mode, thread_data *thread, int numnodes,
+                           node_info *firstnode, int *ngblist, MyIDStorage target_MinID, int target_MinIDTask,
+                           double target_DistanceOrigin);
+  int treefind_fof_return_a_particle_in_cell_recursive(int no);
+
+  void fof_link_particles_in_cell_recursive(int no, int q);
+
+  int treefind_fof_check_single_node_for_full_linking(int no);
+};
+
+#endif
diff --git a/src/fof/foftree_build.cc b/src/fof/foftree_build.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7b1789873bf865659a03304ad75efcecb6b7f8f6
--- /dev/null
+++ b/src/fof/foftree_build.cc
@@ -0,0 +1,247 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file foftree_build.cc
+ *
+ *  \brief routines needed for FOF neighbor tree construction
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../fof/foftree.h"
+#include "../gravtree/gravtree.h"
+#include "../io/io.h"
+#include "../logs/logs.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../sort/peano.h"
+#include "../system/system.h"
+#include "../time_integration/driftfac.h"
+#include "../time_integration/timestep.h"
+
+template <typename partset>
+void foftree<partset>::report_log_message(void)
+{
+  double numnodes = NumNodes, tot_numnodes;
+  MPI_Reduce(&numnodes, &tot_numnodes, 1, MPI_DOUBLE, MPI_SUM, 0, D->Communicator);
+
+  D->mpi_printf("FOFTREE: Ngb-tree construction done. took %g sec  <numnodes>=%g  NTopnodes=%d NTopleaves=%d\n", Buildtime,
+                tot_numnodes / D->NTask, D->NTopnodes, D->NTopleaves);
+}
+
+template <typename partset>
+void foftree<partset>::fill_in_export_points(fofpoint_data *exp_point, int i, int no)
+{
+  Terminate("we don't expect to get here");
+}
+
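+/* distribute the range_min/range_max data of the locally owned top-level leaf nodes to all other tasks,
+ * so that every task ends up with valid ranges for the complete set of top-level tree nodes */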
+template <typename partset>
+void foftree<partset>::exchange_topleafdata(void)
+{
+  struct leafnode_data
+  {
+    MyIntPosType range_min[3];
+    MyIntPosType range_max[3];
+  };
+  leafnode_data *glob_leaf_node_data, *loc_leaf_node_data;
+
+  glob_leaf_node_data = (leafnode_data *)Mem.mymalloc("glob_leaf_node_data", D->NTopleaves * sizeof(leafnode_data));
+
+  /* share the pseudo-particle data across CPUs */
+  int *recvcounts = (int *)Mem.mymalloc("recvcounts", sizeof(int) * D->NTask);
+  int *recvoffset = (int *)Mem.mymalloc("recvoffset", sizeof(int) * D->NTask);
+  int *bytecounts = (int *)Mem.mymalloc("bytecounts", sizeof(int) * D->NTask);
+  int *byteoffset = (int *)Mem.mymalloc("byteoffset", sizeof(int) * D->NTask);
+
+  for(int task = 0; task < D->NTask; task++)
+    recvcounts[task] = 0;
+
+  for(int n = 0; n < D->NTopleaves; n++)
+    recvcounts[D->TaskOfLeaf[n]]++;
+
+  for(int task = 0; task < D->NTask; task++)
+    bytecounts[task] = recvcounts[task] * sizeof(leafnode_data);
+
+  recvoffset[0] = 0;
+  byteoffset[0] = 0;
+
+  for(int task = 1; task < D->NTask; task++)
+    {
+      recvoffset[task] = recvoffset[task - 1] + recvcounts[task - 1];
+      byteoffset[task] = byteoffset[task - 1] + bytecounts[task - 1];
+    }
+
+  loc_leaf_node_data = (leafnode_data *)Mem.mymalloc("loc_leaf_node_data", recvcounts[D->ThisTask] * sizeof(leafnode_data));
+
+  int idx = 0;
+
+  for(int n = 0; n < D->NTopleaves; n++)
+    {
+      if(D->TaskOfLeaf[n] == D->ThisTask)
+        {
+          int no       = NodeIndex[n];
+          fofnode *nop = &TopNodes[no];
+
+          leafnode_data *locp = &loc_leaf_node_data[idx];
+
+          for(int k = 0; k < 3; k++)
+            {
+              locp->range_min[k] = nop->range_min[k];
+              locp->range_max[k] = nop->range_max[k];
+            }
+
+          idx++;
+        }
+    }
+
+  MPI_Allgatherv(loc_leaf_node_data, bytecounts[D->ThisTask], MPI_BYTE, glob_leaf_node_data, bytecounts, byteoffset, MPI_BYTE,
+                 D->Communicator);
+
+  for(int task = 0; task < D->NTask; task++)
+    recvcounts[task] = 0;
+
+  for(int n = 0; n < D->NTopleaves; n++)
+    {
+      int task = D->TaskOfLeaf[n];
+      if(task != D->ThisTask)
+        {
+          int no       = NodeIndex[n];
+          fofnode *nop = &TopNodes[no];
+
+          int idx              = recvoffset[task] + recvcounts[task]++;
+          leafnode_data *globp = &glob_leaf_node_data[idx];
+
+          for(int k = 0; k < 3; k++)
+            {
+              nop->range_min[k] = globp->range_min[k];
+              nop->range_max[k] = globp->range_max[k];
+            }
+        }
+    }
+
+  Mem.myfree(loc_leaf_node_data);
+  Mem.myfree(byteoffset);
+  Mem.myfree(bytecounts);
+  Mem.myfree(recvoffset);
+  Mem.myfree(recvcounts);
+  Mem.myfree(glob_leaf_node_data);
+}
+
+/*! This routine determines the coordinate ranges of a given internal node
+ *  and all its subnodes using a recursive computation. The result is
+ *  stored in the Nodes[] structure in the sequence of this tree-walk.
+ *  mode = 0: process a leaf branch, mode = 1: process top-level nodes
+ */
+template <typename partset>
+void foftree<partset>::update_node_recursive(int no, int sib, int mode)
+{
+  MyIntPosType range_min[3];
+  MyIntPosType range_max[3];
+
+  if(!(no >= MaxPart && no < MaxPart + MaxNodes)) /* are we an internal node? */
+    Terminate("no internal node\n");
+
+  fofnode *nop = get_nodep(no);
+
+  if(mode == TREE_MODE_TOPLEVEL)
+    {
+      int p = nop->nextnode;
+
+      /* if the next node is not a top-level node, we have reached a leaf node, and we need to do nothing */
+      if(p < MaxPart || p >= FirstNonTopLevelNode)
+        return;
+    }
+
+  for(int k = 0; k < 3; k++)
+    {
+      range_min[k] = ~((MyIntPosType)0);
+      range_max[k] = 0;
+    }
+
+  int p = nop->nextnode;
+
+  while(p != nop->sibling)
+    {
+      if(p >= 0)
+        {
+          if(p >= MaxPart && p < MaxPart + MaxNodes) /* we have an internal node */
+            {
+              int nextsib = get_nodep(p)->sibling;
+
+              update_node_recursive(p, nextsib, mode);
+            }
+
+          if(p < MaxPart) /* a particle */
+            {
+              for(int k = 0; k < 3; k++)
+                {
+                  if(range_min[k] > Tp->P[p].IntPos[k])
+                    range_min[k] = Tp->P[p].IntPos[k];
+
+                  if(range_max[k] < Tp->P[p].IntPos[k])
+                    range_max[k] = Tp->P[p].IntPos[k];
+                }
+
+              p = Nextnode[p];
+            }
+          else if(p < MaxPart + MaxNodes) /* an internal node  */
+            {
+              fofnode *nop = get_nodep(p);
+
+              for(int k = 0; k < 3; k++)
+                {
+                  if(range_min[k] > nop->range_min[k])
+                    range_min[k] = nop->range_min[k];
+
+                  if(range_max[k] < nop->range_max[k])
+                    range_max[k] = nop->range_max[k];
+                }
+
+              p = nop->sibling;
+            }
+          else if(p < MaxPart + MaxNodes + D->NTopleaves) /* a pseudo particle */
+            {
+              /* we are processing a local leaf-node which does not have any particles;
+               * we can continue to the next element, which should end the work
+               */
+              p = Nextnode[p - MaxNodes];
+            }
+          else
+            {
+              /* an imported point */
+
+              Terminate("Oops, unexpected imported point!");
+              p = Nextnode[p - MaxNodes];
+            }
+        }
+    }
+
+  for(int k = 0; k < 3; k++)
+    {
+      nop->range_min[k] = range_min[k];
+      nop->range_max[k] = range_max[k];
+    }
+}
+
+/* make sure that we instantiate the template */
+#include "../data/simparticles.h"
+template class foftree<simparticles>;
+
+#if defined(LIGHTCONE) && (defined(LIGHTCONE_PARTICLES_GROUPS) || defined(LIGHTCONE_IMAGE_COMP_HSML_VELDISP))
+/* make sure that we instantiate the template */
+#include "../data/lcparticles.h"
+template class foftree<lcparticles>;
+#endif
diff --git a/src/gitversion/version b/src/gitversion/version
new file mode 100644
index 0000000000000000000000000000000000000000..9cd3dc25eb0c5ea45390589b37735ff55e9ed901
--- /dev/null
+++ b/src/gitversion/version
@@ -0,0 +1,7 @@
+#ifndef VERSION_H
+#define VERSION_H
+
+const char* GIT_DATE   = "_DATE_";
+const char* GIT_COMMIT = "_COMMIT_";
+
+#endif
diff --git a/src/gitversion/version.h b/src/gitversion/version.h
new file mode 100644
index 0000000000000000000000000000000000000000..8341b944e2d82dad2511b01c8451d03cbc6c4dba
--- /dev/null
+++ b/src/gitversion/version.h
@@ -0,0 +1,7 @@
+#ifndef VERSION_H
+#define VERSION_H
+
+extern const char *GIT_DATE;
+extern const char *GIT_COMMIT;
+
+#endif
diff --git a/src/gravity/ewald.cc b/src/gravity/ewald.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e31df783dfa61d0bb4313f3eacbf66e0d8e12970
--- /dev/null
+++ b/src/gravity/ewald.cc
@@ -0,0 +1,1875 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file ewald.cc
+ *
+ *  \brief Code for Ewald correction computations.
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../gravity/ewald.h"
+#include "../gravity/ewaldtensors.h"
+#include "../gravtree/gravtree.h"
+#include "../io/io.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/shared_mem_handler.h"
+#include "../sort/cxxsort.h"
+#include "../system/system.h"
+
+/*!
+ *  This file contains the computation of the Ewald correction table, and the corresponding lookup functions.
+ *
+ *   in D0phi we store the correction potential:
+ *
+ *      phi = 1/x + pi/alpha^2 - sum_q (erfc(alpha |x-q|)/|x-q|)  - 4pi/V sum_k exp(-k^2/(4alpha^2))/k^2 cos(k*x)
+ *
+ *   in D1phi we store the first derivative of correction potential
+ *
+ *      dphi/dx_i
+ *
+ *   in D2phi we store the correction tensor (second derivatives of correction potential)
+ *
+ *      d2phi/(dx_i dx_j)
+ *
+ *   in D3phi we store the third order correction tensor (third derivatives of correction potential)
+ *
+ *      d3phi/(dx_i dx_j dx_k)
+ *
+ *   and so on also for D4phi and D5phi
+ */
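+
+/*
+ *   Illustration (not part of the original sources; this assumes the usual Ewald lookup scheme): the table is evaluated
+ *   at the grid point x0 closest to the requested separation x, and the stored higher derivatives are then used for a
+ *   Taylor expansion around that grid point, schematically
+ *
+ *      D1phi(x) ~= D1phi(x0) + D2phi(x0) (x - x0) + 1/2 D3phi(x0) (x - x0)^2 + ...
+ *
+ *   which is why derivatives up to order HIGHEST_NEEDEDORDER_EWALD_DPHI + EWALD_TAYLOR_ORDER are tabulated.
+ */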
+
+/*! \brief This function initializes tables with the correction force and the
+ *  correction potential due to the periodic images of a point mass located
+ *  at the origin.
+ *
+ *  These corrections are obtained by Ewald summation. (See for example
+ *  Hernquist, Bouchet, Suto, ApJS, 1991, 75, 231) The correction fields
+ *  are used to obtain the full periodic force if periodic boundaries
+ *  combined with the pure tree algorithm are used. For the TreePM/FMM-PM
+ *  algorithms, the Ewald correction is not used.
+ *
+ *  The correction fields are stored on disk once they are computed. If a
+ *  corresponding file is found, they are loaded from disk to speed up the
+ *  initialization. The Ewald summation is done in parallel, i.e. the
+ *  processors share the work to compute the tables if needed.
+ */
+void ewald::ewald_init(void)
+{
+  mpi_printf("EWALD: initialize Ewald correction...\n");
+
+  RegionLen     = All.BoxSize;
+  FacCoordToInt = pow(2.0, BITS_FOR_POSITIONS) / RegionLen;
+  FacIntToCoord = RegionLen / pow(2.0, BITS_FOR_POSITIONS);
+
+  Ewd = (ewald_data *)Mem.mymalloc("Ewd", sizeof(ewald_data) * (ENX + 1) * (ENY + 1) * (ENZ + 1));
+
+  char buf[200];
+  sprintf(buf, "ewald_table_%d-%d-%d_%d-%d-%d_precision%d-order%d.dat", LONG_X, LONG_Y, LONG_Z, ENX, ENY, ENZ, (int)sizeof(MyReal),
+          HIGHEST_NEEDEDORDER_EWALD_DPHI + EWALD_TAYLOR_ORDER);
+
+  int recomputeflag = 0;
+
+  if(ThisTask == 0)
+    {
+      FILE *fd;
+      if((fd = fopen(buf, "r")))
+        {
+          mpi_printf("\nEWALD: reading Ewald tables from file `%s'\n", buf);
+
+          ewald_header tabh;
+          my_fread(&tabh, sizeof(ewald_header), 1, fd);
+
+#ifndef GRAVITY_TALLBOX
+          int ewaldtype = -1;
+#else
+          int ewaldtype = GRAVITY_TALLBOX + 1;
+#endif
+          if(tabh.resx != ENX || tabh.resy != ENY || tabh.resz != ENZ || tabh.varsize != sizeof(MyFloat) ||
+             tabh.ewaldtype != ewaldtype)
+            {
+              mpi_printf("\nEWALD: something's wrong with this table file. Discarding it.\n");
+              recomputeflag = 1;
+            }
+          else
+            {
+              my_fread(Ewd, sizeof(ewald_data), (ENX + 1) * (ENY + 1) * (ENZ + 1), fd);
+
+              recomputeflag = 0;
+            }
+          fclose(fd);
+        }
+      else
+        recomputeflag = 1;
+    }
+
+  MPI_Bcast(&recomputeflag, 1, MPI_INT, 0, Communicator);
+
+  if(recomputeflag)
+    {
+      mpi_printf("\nEWALD: No usable Ewald tables in file `%s' found. Recomputing them...\n", buf);
+
+      /* ok, let's recompute things. Actually, we do that in parallel. */
+
+      int size = (ENX + 1) * (ENY + 1) * (ENZ + 1);
+      int first, count;
+
+      subdivide_evenly(size, NTask, ThisTask, &first, &count);
+
+      for(int n = first; n < first + count; n++)
+        {
+          int i = n / ((ENY + 1) * (ENZ + 1));
+          int j = (n - i * (ENY + 1) * (ENZ + 1)) / (ENZ + 1);
+          int k = (n - i * (ENY + 1) * (ENZ + 1) - j * (ENZ + 1));
+
+          if(ThisTask == 0)
+            {
+              if(((n - first) % (count / 20)) == 0)
+                {
+                  printf("%4.1f percent done\n", (n - first) / (count / 100.0));
+                  myflush(stdout);
+                }
+            }
+
+          double xx = 0.5 * DBX * (1.0 / LONG_X) * ((double)i) / ENX;
+          double yy = 0.5 * DBY * (1.0 / LONG_Y) * ((double)j) / ENY;
+          double zz = 0.5 * DBZ * (1.0 / LONG_Z) * ((double)k) / ENZ;
+
+          ewald_data *ewdp = Ewd + ewd_offset(i, j, k);
+
+#ifndef GRAVITY_TALLBOX
+          ewdp->D0phi = ewald_D0(xx, yy, zz);
+          ewdp->D1phi = ewald_D1(xx, yy, zz);
+          ewdp->D2phi = ewald_D2(xx, yy, zz);
+          ewdp->D3phi = ewald_D3(xx, yy, zz);
+#if(HIGHEST_NEEDEDORDER_EWALD_DPHI + EWALD_TAYLOR_ORDER) >= 4
+          ewdp->D4phi = ewald_D4(xx, yy, zz);
+#endif
+#if(HIGHEST_NEEDEDORDER_EWALD_DPHI + EWALD_TAYLOR_ORDER) >= 5
+          ewdp->D5phi = ewald_D5(xx, yy, zz);
+#endif
+#if(HIGHEST_NEEDEDORDER_EWALD_DPHI + EWALD_TAYLOR_ORDER) >= 6
+          ewdp->D6phi = ewald_D6(xx, yy, zz);
+#endif
+#if(HIGHEST_NEEDEDORDER_EWALD_DPHI + EWALD_TAYLOR_ORDER) >= 7
+          ewdp->D7phi = ewald_D7(xx, yy, zz);
+#endif
+#else
+          ewdp->D0phi   = ewald_D0(xx, zz, yy);
+
+          vector<double> force = ewald_D1(yy, zz, xx);
+
+          switch(GRAVITY_TALLBOX)
+            {
+              case 0:
+                ewdp->D1phi[0] = force[2];
+                ewdp->D1phi[1] = force[0];
+                ewdp->D1phi[2] = force[1];
+                break;
+
+              case 1:
+                ewdp->D1phi[0] = force[0];
+                ewdp->D1phi[1] = force[2];
+                ewdp->D1phi[2] = force[1];
+                break;
+
+              case 2:
+                ewdp->D1phi[0] = force[0];
+                ewdp->D1phi[1] = force[1];
+                ewdp->D1phi[2] = force[2];
+                break;
+            }
+#endif
+        }
+
+      int *recvcnts = (int *)Mem.mymalloc("recvcnts", NTask * sizeof(int));
+      int *recvoffs = (int *)Mem.mymalloc("recvoffs", NTask * sizeof(int));
+
+      for(int i = 0; i < NTask; i++)
+        {
+          int off, cnt;
+          subdivide_evenly(size, NTask, i, &off, &cnt);
+          recvcnts[i] = cnt * sizeof(ewald_data);
+          recvoffs[i] = off * sizeof(ewald_data);
+        }
+
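+      /* with MPI_IN_PLACE every rank contributes the slice of Ewd it has just computed;
+       * counts and offsets are specified in bytes since the transfer uses MPI_BYTE */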
+      MPI_Allgatherv(MPI_IN_PLACE, size * sizeof(ewald_data), MPI_BYTE, Ewd, recvcnts, recvoffs, MPI_BYTE, Communicator);
+
+      Mem.myfree(recvoffs);
+      Mem.myfree(recvcnts);
+
+      mpi_printf("\nEWALD: writing Ewald tables to file `%s'\n", buf);
+      if(ThisTask == 0)
+        {
+          FILE *fd;
+          if((fd = fopen(buf, "w")))
+            {
+              ewald_header tabh;
+              tabh.resx    = ENX;
+              tabh.resy    = ENY;
+              tabh.resz    = ENZ;
+              tabh.varsize = sizeof(MyFloat);
+#ifndef GRAVITY_TALLBOX
+              tabh.ewaldtype = -1;
+#else
+              tabh.ewaldtype = GRAVITY_TALLBOX + 1;
+#endif
+              my_fwrite(&tabh, sizeof(ewald_header), 1, fd);
+
+              my_fwrite(Ewd, sizeof(ewald_data), (ENX + 1) * (ENY + 1) * (ENZ + 1), fd);
+              fclose(fd);
+            }
+          else
+            Terminate("can't write to file '%s'\n", buf);
+        }
+    }
+  else
+    {
+      /* the table was read from disk by the root rank; broadcast it to all other ranks */
+      int len = (ENX + 1) * (ENY + 1) * (ENZ + 1) * sizeof(ewald_data);
+      MPI_Bcast(Ewd, len, MPI_BYTE, 0, Communicator);
+    }
+
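+  /* factors that convert a coordinate offset into Ewald-table grid units
+   * (the table resolves 2 * EN cells per box length in each dimension) */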
+  Ewd_fac_intp[0] = 2.0 * EN * LONG_X / All.BoxSize;
+  Ewd_fac_intp[1] = 2.0 * EN * LONG_Y / All.BoxSize;
+  Ewd_fac_intp[2] = 2.0 * EN * LONG_Z / All.BoxSize;
+
+  /* now scale things to the boxsize that is actually used: the table was computed for a
+   * unit box, and the n-th derivative of the potential scales as 1/BoxSize^(n+1) */
+  for(int i = 0; i <= ENX; i++)
+    for(int j = 0; j <= ENY; j++)
+      for(int k = 0; k <= ENZ; k++)
+        {
+          ewald_data *ewdp = Ewd + ewd_offset(i, j, k);
+
+          ewdp->D0phi *= 1 / All.BoxSize; /* potential */
+          ewdp->D1phi *= 1 / pow(All.BoxSize, 2);
+          ewdp->D2phi *= 1 / pow(All.BoxSize, 3);
+          ewdp->D3phi *= 1 / pow(All.BoxSize, 4);
+#if(HIGHEST_NEEDEDORDER_EWALD_DPHI + EWALD_TAYLOR_ORDER) >= 4
+          ewdp->D4phi *= 1 / pow(All.BoxSize, 5);
+#endif
+#if(HIGHEST_NEEDEDORDER_EWALD_DPHI + EWALD_TAYLOR_ORDER) >= 5
+          ewdp->D5phi *= 1 / pow(All.BoxSize, 6);
+#endif
+#if(HIGHEST_NEEDEDORDER_EWALD_DPHI + EWALD_TAYLOR_ORDER) >= 6
+          ewdp->D6phi *= 1 / pow(All.BoxSize, 7);
+#endif
+#if(HIGHEST_NEEDEDORDER_EWALD_DPHI + EWALD_TAYLOR_ORDER) >= 7
+          ewdp->D7phi *= 1 / pow(All.BoxSize, 8);
+#endif
+        }
+
+  mpi_printf("EWALD: Initialization of periodic boundaries finished.\n");
+
+  ewald_is_initialized = 1;
+
+  if(Shmem.Island_NTask != Shmem.World_NTask)
+    {
+      // We have multiple shared-memory nodes, each with one MPI rank set aside for shared-memory communication.
+      // In this case, move the Ewald table to that communication rank so that the memory is consumed only once per node.
+
+      if(Shmem.Island_ThisTask == 0)
+        {
+          size_t tab_len = sizeof(ewald_data) * (ENX + 1) * (ENY + 1) * (ENZ + 1);
+
+          MPI_Send(&tab_len, sizeof(tab_len), MPI_BYTE, Shmem.MyShmRankInGlobal, TAG_EWALD_ALLOC, MPI_COMM_WORLD);
+          MPI_Send(Ewd, tab_len, MPI_BYTE, Shmem.MyShmRankInGlobal, TAG_DMOM, MPI_COMM_WORLD);
+        }
+
+      Mem.myfree(Ewd);
+
+      ptrdiff_t off;
+      MPI_Bcast(&off, sizeof(ptrdiff_t), MPI_BYTE, Shmem.Island_NTask - 1, Shmem.SharedMemComm);
+
+      Ewd = (ewald_data *)((char *)Shmem.SharedMemBaseAddr[Shmem.Island_NTask - 1] + off);
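+      /* Ewd now points into the shared-memory segment of the node's communication rank,
+       * so the full table is kept only once per shared-memory node */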
+    }
+
+#ifdef EWALD_TEST
+  test_interpolation_accuracy();
+#endif
+}
+
+void ewald::ewald_gridlookup(const MyIntPosType *p_intpos, const MyIntPosType *target_intpos, enum interpolate_options flag,
+                             ewald_data &fper)
+{
+  // we determine the closest available point in our Ewald look-up table
+
+  static MyIntPosType const halflen   = ((MyIntPosType)1) << ((BITS_FOR_POSITIONS - 1) - (EWLEVEL + 1));
+  static MyIntPosType const intlen    = halflen << 1;
+  static MyIntPosType const ewaldmask = ~(intlen - 1);
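+  /* intlen is the spacing of the Ewald grid in integer coordinates; adding halflen and
+   * applying ewaldmask rounds an integer offset to the nearest grid point */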
+
+  MyIntPosType temppos[3] = {p_intpos[0] - target_intpos[0], p_intpos[1] - target_intpos[1], p_intpos[2] - target_intpos[2]};
+
+  MyIntPosType gridpos[3];
+  gridpos[0] = (temppos[0] + halflen) & ewaldmask;
+  gridpos[1] = (temppos[1] + halflen) & ewaldmask;
+  gridpos[2] = (temppos[2] + halflen) & ewaldmask;
+
+  vector<double> off;
+  nearest_image_intpos_to_pos(temppos, gridpos, off.da);
+
+  int i = (gridpos[0] >> (BITS_FOR_POSITIONS - (EWLEVEL + 1)));
+  int j = (gridpos[1] >> (BITS_FOR_POSITIONS - (EWLEVEL + 1)));
+  int k = (gridpos[2] >> (BITS_FOR_POSITIONS - (EWLEVEL + 1)));
+
+  int signx = 1, signy = 1, signz = 1;
+
+  if(i > EN)
+    {
+      i     = 2 * EN - i;
+      signx = -1;
+    }
+  else if(i == EN && gridpos[0] < temppos[0])
+    signx = -1;
+
+  if(j > EN)
+    {
+      j     = 2 * EN - j;
+      signy = -1;
+    }
+  else if(j == EN && gridpos[1] < temppos[1])
+    signy = -1;
+
+  if(k > EN)
+    {
+      k     = 2 * EN - k;
+      signz = -1;
+    }
+  else if(k == EN && gridpos[2] < temppos[2])
+    signz = -1;
+
+  fper = Ewd[ewd_offset(i, j, k)];
+
+  /* change signs as needed: the table covers only non-negative offsets, so each tensor
+   * component picks up a factor sign_a for every coordinate direction a that appears an
+   * odd number of times in its index */
+
+  fper.D1phi[0] *= signx;
+  fper.D1phi[1] *= signy;
+  fper.D1phi[2] *= signz;
+
+  fper.D2phi[qXY] *= signx * signy;
+  fper.D2phi[qXZ] *= signx * signz;
+  fper.D2phi[qYZ] *= signy * signz;
+
+  fper.D3phi[dXXX] *= signx;
+  fper.D3phi[dXXY] *= signy;
+  fper.D3phi[dXXZ] *= signz;
+  fper.D3phi[dXYY] *= signx;
+  fper.D3phi[dXYZ] *= signx * signy * signz;
+  fper.D3phi[dXZZ] *= signx;
+  fper.D3phi[dYYY] *= signy;
+  fper.D3phi[dYYZ] *= signz;
+  fper.D3phi[dYZZ] *= signy;
+  fper.D3phi[dZZZ] *= signz;
+
+#if EWALD_TAYLOR_ORDER == 3
+
+  fper.D4phi[sXXXY] *= signx * signy;
+  fper.D4phi[sXYYY] *= signx * signy;
+  fper.D4phi[sXXXZ] *= signx * signz;
+  fper.D4phi[sXZZZ] *= signx * signz;
+  fper.D4phi[sYYYZ] *= signy * signz;
+  fper.D4phi[sYZZZ] *= signy * signz;
+  fper.D4phi[sXXYZ] *= signy * signz;
+  fper.D4phi[sXYYZ] *= signx * signz;
+  fper.D4phi[sXYZZ] *= signx * signy;
+
+  // now Taylor corrections: expand potential and force to third order in the offset `off'
+  // between the true relative position and the tabulated grid point
+
+  fper.D0phi += fper.D1phi * off + 0.5 * ((fper.D2phi * off) * off) + (1.0 / 6) * (((fper.D3phi * off) * off) * off);
+  fper.D1phi += fper.D2phi * off + 0.5 * ((fper.D3phi * off) * off) + (1.0 / 6) * (((fper.D4phi * off) * off) * off);
+
+  if(flag == POINTMASS)
+    return;
+
+#if(HIGHEST_NEEDEDORDER_EWALD_DPHI + EWALD_TAYLOR_ORDER) >= 5
+  fper.D5phi[rXXXXX] *= signx;
+  fper.D5phi[rYYYYY] *= signy;
+  fper.D5phi[rZZZZZ] *= signz;
+
+  fper.D5phi[rXXXXY] *= signy;
+  fper.D5phi[rXXXXZ] *= signz;
+  fper.D5phi[rXYYYY] *= signx;
+  fper.D5phi[rXZZZZ] *= signx;
+  fper.D5phi[rYYYYZ] *= signz;
+  fper.D5phi[rYZZZZ] *= signy;
+
+  fper.D5phi[rXXXYY] *= signx;
+  fper.D5phi[rXXXZZ] *= signx;
+  fper.D5phi[rXXYYY] *= signy;
+  fper.D5phi[rXXZZZ] *= signz;
+  fper.D5phi[rYYYZZ] *= signy;
+  fper.D5phi[rYYZZZ] *= signz;
+
+  fper.D5phi[rXXYZZ] *= signy;
+  fper.D5phi[rXXYYZ] *= signz;
+  fper.D5phi[rXYYZZ] *= signx;
+
+  fper.D5phi[rXXXYZ] *= signx * signy * signz;
+  fper.D5phi[rXYYYZ] *= signx * signy * signz;
+  fper.D5phi[rXYZZZ] *= signx * signy * signz;
+
+  fper.D2phi += fper.D3phi * off + 0.5 * ((fper.D4phi * off) * off) + (1.0 / 6) * (((fper.D5phi * off) * off) * off);
+#endif
+
+#if(HIGHEST_NEEDEDORDER_EWALD_DPHI + EWALD_TAYLOR_ORDER) >= 6
+  fper.D6phi[pXXXXXY] *= signx * signy;
+  fper.D6phi[pXXXXXZ] *= signx * signz;
+  fper.D6phi[pXXXXYZ] *= signy * signz;
+  fper.D6phi[pXXXYYY] *= signx * signy;
+  fper.D6phi[pXXXYYZ] *= signx * signz;
+  fper.D6phi[pXXXYZZ] *= signx * signy;
+  fper.D6phi[pXXXZZZ] *= signx * signz;
+  fper.D6phi[pXXYYYZ] *= signy * signz;
+  fper.D6phi[pXXYZZZ] *= signy * signz;
+  fper.D6phi[pXYYYYY] *= signx * signy;
+  fper.D6phi[pXYYYYZ] *= signx * signz;
+  fper.D6phi[pXYYYZZ] *= signx * signy;
+  fper.D6phi[pXYYZZZ] *= signx * signz;
+  fper.D6phi[pXYZZZZ] *= signx * signy;
+  fper.D6phi[pXZZZZZ] *= signx * signz;
+  fper.D6phi[pYYYYYZ] *= signy * signz;
+  fper.D6phi[pYYYZZZ] *= signy * signz;
+  fper.D6phi[pYZZZZZ] *= signy * signz;
+
+  fper.D3phi += fper.D4phi * off + 0.5 * ((fper.D5phi * off) * off) + (1.0 / 6) * (((fper.D6phi * off) * off) * off);
+#endif
+
+#if(HIGHEST_NEEDEDORDER_EWALD_DPHI + EWALD_TAYLOR_ORDER) >= 7
+  fper.D7phi[tXXXXXXX] *= signx;
+  fper.D7phi[tXXXXXXY] *= signy;
+  fper.D7phi[tXXXXXXZ] *= signz;
+  fper.D7phi[tXXXXXYY] *= signx;
+  fper.D7phi[tXXXXXYZ] *= signx * signy * signz;
+  fper.D7phi[tXXXXXZZ] *= signx;
+  fper.D7phi[tXXXXYYY] *= signy;
+  fper.D7phi[tXXXXYYZ] *= signz;
+  fper.D7phi[tXXXXYZZ] *= signy;
+  fper.D7phi[tXXXXZZZ] *= signz;
+  fper.D7phi[tXXXYYYY] *= signx;
+  fper.D7phi[tXXXYYYZ] *= signx * signy * signz;
+  fper.D7phi[tXXXYYZZ] *= signx;
+  fper.D7phi[tXXXYZZZ] *= signx * signy * signz;
+  fper.D7phi[tXXXZZZZ] *= signx;
+  fper.D7phi[tXXYYYYY] *= signy;
+  fper.D7phi[tXXYYYYZ] *= signz;
+  fper.D7phi[tXXYYYZZ] *= signy;
+  fper.D7phi[tXXYYZZZ] *= signz;
+  fper.D7phi[tXXYZZZZ] *= signy;
+  fper.D7phi[tXXZZZZZ] *= signz;
+  fper.D7phi[tXYYYYYY] *= signx;
+  fper.D7phi[tXYYYYYZ] *= signx * signy * signz;
+  fper.D7phi[tXYYYYZZ] *= signx;
+  fper.D7phi[tXYYYZZZ] *= signx * signy * signz;
+  fper.D7phi[tXYYZZZZ] *= signx;
+  fper.D7phi[tXYZZZZZ] *= signx * signy * signz;
+  fper.D7phi[tXZZZZZZ] *= signx;
+  fper.D7phi[tYYYYYYY] *= signy;
+  fper.D7phi[tYYYYYYZ] *= signz;
+  fper.D7phi[tYYYYYZZ] *= signy;
+  fper.D7phi[tYYYYZZZ] *= signz;
+  fper.D7phi[tYYYZZZZ] *= signy;
+  fper.D7phi[tYYZZZZZ] *= signz;
+  fper.D7phi[tYZZZZZZ] *= signy;
+  fper.D7phi[tZZZZZZZ] *= signz;
+
+  fper.D4phi += fper.D5phi * off + 0.5 * ((fper.D6phi * off) * off) + (1.0 / 6) * (((fper.D7phi * off) * off) * off);
+  fper.D5phi += fper.D6phi * off + 0.5 * ((fper.D7phi * off) * off);
+#endif
+
+#else
+
+  // only second order Taylor expansion, i.e. EWALD_TAYLOR_ORDER==2
+
+  // now Taylor corrections
+
+#ifndef GRAVITY_TALLBOX
+  fper.D0phi += fper.D1phi * off + 0.5 * ((fper.D2phi * off) * off);
+  fper.D1phi += fper.D2phi * off + 0.5 * ((fper.D3phi * off) * off);
+#endif
+
+  if(flag == POINTMASS)
+    return;
+
+#if(HIGHEST_NEEDEDORDER_EWALD_DPHI + EWALD_TAYLOR_ORDER) >= 4
+  fper.D4phi[sXXXY] *= signx * signy;
+  fper.D4phi[sXYYY] *= signx * signy;
+  fper.D4phi[sXXXZ] *= signx * signz;
+  fper.D4phi[sXZZZ] *= signx * signz;
+  fper.D4phi[sYYYZ] *= signy * signz;
+  fper.D4phi[sYZZZ] *= signy * signz;
+  fper.D4phi[sXXYZ] *= signy * signz;
+  fper.D4phi[sXYYZ] *= signx * signz;
+  fper.D4phi[sXYZZ] *= signx * signy;
+
+  fper.D2phi += fper.D3phi * off + 0.5 * ((fper.D4phi * off) * off);
+#endif
+
+#if(HIGHEST_NEEDEDORDER_EWALD_DPHI + EWALD_TAYLOR_ORDER) >= 5
+  fper.D5phi[rXXXXX] *= signx;
+  fper.D5phi[rYYYYY] *= signy;
+  fper.D5phi[rZZZZZ] *= signz;
+
+  fper.D5phi[rXXXXY] *= signy;
+  fper.D5phi[rXXXXZ] *= signz;
+  fper.D5phi[rXYYYY] *= signx;
+  fper.D5phi[rXZZZZ] *= signx;
+  fper.D5phi[rYYYYZ] *= signz;
+  fper.D5phi[rYZZZZ] *= signy;
+
+  fper.D5phi[rXXXYY] *= signx;
+  fper.D5phi[rXXXZZ] *= signx;
+  fper.D5phi[rXXYYY] *= signy;
+  fper.D5phi[rXXZZZ] *= signz;
+  fper.D5phi[rYYYZZ] *= signy;
+  fper.D5phi[rYYZZZ] *= signz;
+
+  fper.D5phi[rXXYZZ] *= signy;
+  fper.D5phi[rXXYYZ] *= signz;
+  fper.D5phi[rXYYZZ] *= signx;
+
+  fper.D5phi[rXXXYZ] *= signx * signy * signz;
+  fper.D5phi[rXYYYZ] *= signx * signy * signz;
+  fper.D5phi[rXYZZZ] *= signx * signy * signz;
+
+  fper.D3phi += fper.D4phi * off + 0.5 * ((fper.D5phi * off) * off);
+#endif
+
+#if(HIGHEST_NEEDEDORDER_EWALD_DPHI + EWALD_TAYLOR_ORDER) >= 6
+  fper.D6phi[pXXXXXY] *= signx * signy;
+  fper.D6phi[pXXXXXZ] *= signx * signz;
+  fper.D6phi[pXXXXYZ] *= signy * signz;
+  fper.D6phi[pXXXYYY] *= signx * signy;
+  fper.D6phi[pXXXYYZ] *= signx * signz;
+  fper.D6phi[pXXXYZZ] *= signx * signy;
+  fper.D6phi[pXXXZZZ] *= signx * signz;
+  fper.D6phi[pXXYYYZ] *= signy * signz;
+  fper.D6phi[pXXYZZZ] *= signy * signz;
+  fper.D6phi[pXYYYYY] *= signx * signy;
+  fper.D6phi[pXYYYYZ] *= signx * signz;
+  fper.D6phi[pXYYYZZ] *= signx * signy;
+  fper.D6phi[pXYYZZZ] *= signx * signz;
+  fper.D6phi[pXYZZZZ] *= signx * signy;
+  fper.D6phi[pXZZZZZ] *= signx * signz;
+  fper.D6phi[pYYYYYZ] *= signy * signz;
+  fper.D6phi[pYYYZZZ] *= signy * signz;
+  fper.D6phi[pYZZZZZ] *= signy * signz;
+
+  fper.D4phi += fper.D5phi * off + 0.5 * ((fper.D6phi * off) * off);
+#endif
+
+#if(HIGHEST_NEEDEDORDER_EWALD_DPHI + EWALD_TAYLOR_ORDER) >= 7
+  fper.D7phi[tXXXXXXX] *= signx;
+  fper.D7phi[tXXXXXXY] *= signy;
+  fper.D7phi[tXXXXXXZ] *= signz;
+  fper.D7phi[tXXXXXYY] *= signx;
+  fper.D7phi[tXXXXXYZ] *= signx * signy * signz;
+  fper.D7phi[tXXXXXZZ] *= signx;
+  fper.D7phi[tXXXXYYY] *= signy;
+  fper.D7phi[tXXXXYYZ] *= signz;
+  fper.D7phi[tXXXXYZZ] *= signy;
+  fper.D7phi[tXXXXZZZ] *= signz;
+  fper.D7phi[tXXXYYYY] *= signx;
+  fper.D7phi[tXXXYYYZ] *= signx * signy * signz;
+  fper.D7phi[tXXXYYZZ] *= signx;
+  fper.D7phi[tXXXYZZZ] *= signx * signy * signz;
+  fper.D7phi[tXXXZZZZ] *= signx;
+  fper.D7phi[tXXYYYYY] *= signy;
+  fper.D7phi[tXXYYYYZ] *= signz;
+  fper.D7phi[tXXYYYZZ] *= signy;
+  fper.D7phi[tXXYYZZZ] *= signz;
+  fper.D7phi[tXXYZZZZ] *= signy;
+  fper.D7phi[tXXZZZZZ] *= signz;
+  fper.D7phi[tXYYYYYY] *= signx;
+  fper.D7phi[tXYYYYYZ] *= signx * signy * signz;
+  fper.D7phi[tXYYYYZZ] *= signx;
+  fper.D7phi[tXYYYZZZ] *= signx * signy * signz;
+  fper.D7phi[tXYYZZZZ] *= signx;
+  fper.D7phi[tXYZZZZZ] *= signx * signy * signz;
+  fper.D7phi[tXZZZZZZ] *= signx;
+  fper.D7phi[tYYYYYYY] *= signy;
+  fper.D7phi[tYYYYYYZ] *= signz;
+  fper.D7phi[tYYYYYZZ] *= signy;
+  fper.D7phi[tYYYYZZZ] *= signz;
+  fper.D7phi[tYYYZZZZ] *= signy;
+  fper.D7phi[tYYZZZZZ] *= signz;
+  fper.D7phi[tYZZZZZZ] *= signy;
+  fper.D7phi[tZZZZZZZ] *= signz;
+
+  fper.D5phi += fper.D6phi * off + 0.5 * ((fper.D7phi * off) * off);
+#endif
+
+#endif
+}
+
+/*! \brief This function computes the potential correction term by means of Ewald
+ *  summation.
+ *
+ *  \param x, y, z components of the distance vector for which the correction
+ *  term should be computed
+ *  \return the correction term
+ */
+double ewald::ewald_D0(double x, double y, double z)
+{
+  static int printed = 0;
+
+  double D0 = 0.0;
+
+#ifndef GRAVITY_TALLBOX
+
+  double leff   = pow((1.0 / LONG_X) * (1.0 / LONG_Y) * (1.0 / LONG_Z), 1.0 / 3);
+  double alpha  = 2.0 / leff;
+  double alpha2 = alpha * alpha;
+
+  int qxmax = (int)(8.0 * LONG_X / alpha + 0.5);
+  int qymax = (int)(8.0 * LONG_Y / alpha + 0.5);
+  int qzmax = (int)(8.0 * LONG_Z / alpha + 0.5);
+
+  int nxmax = (int)(2.0 * alpha / LONG_X + 0.5);
+  int nymax = (int)(2.0 * alpha / LONG_Y + 0.5);
+  int nzmax = (int)(2.0 * alpha / LONG_Z + 0.5);
+
+  if(printed == 0)
+    {
+      mpi_printf("EWALD: D0 table: qxmax=%d qymax=%d qzmax=%d   nxmax=%d nymax=%d nzmax=%d\n", qxmax, qymax, qzmax, nxmax, nymax,
+                 nzmax);
+      printed = 1;
+    }
+
+  for(int nx = -qxmax; nx <= qxmax; nx++)
+    for(int ny = -qymax; ny <= qymax; ny++)
+      for(int nz = -qzmax; nz <= qzmax; nz++)
+        {
+          double dx = x - nx * (1.0 / LONG_X);
+          double dy = y - ny * (1.0 / LONG_Y);
+          double dz = z - nz * (1.0 / LONG_Z);
+
+          double r2 = dx * dx + dy * dy + dz * dz;
+          double r  = sqrt(r2);
+
+          double rinv = (r > 0) ? 1.0 / r : 0.0;
+
+          double g0;
+
+          if(nx != 0 || ny != 0 || nz != 0)
+            {
+              g0 = -erfc(alpha * r) * rinv;
+            }
+          else
+            {
+              /* we add the 1/r term here to the (0|0|0) entry, followed by differentiation, and the limit r->0 to obtain accurate
+               * results at the origin
+               */
+
+              /* for small r:
+               *
+               *   [1- erfc(a r)]/r  =  2 a/sqrt(pi) * [ 1 - (a r)^2/3 + (a r)^4 / 10 - (a r)^6 / 42 + (a r)^8 / 216 - ...]
+               */
+
+              if((alpha * r) < 0.5)
+                {
+                  g0 = 2.0 * pow(alpha, 1) / sqrt(M_PI) *
+                       (1.0 - pow(alpha * r, 2) / 3.0 + pow(alpha * r, 4) / 10.0 - pow(alpha * r, 6) / 42.0 +
+                        pow(alpha * r, 8) / 216.0 - pow(alpha * r, 10) / 1320.0);
+                }
+              else
+                {
+                  g0 = erf(alpha * r) * rinv;
+                }
+            }
+
+          D0 += g0;
+        }
+
+  for(int nx = -nxmax; nx <= nxmax; nx++)
+    for(int ny = -nymax; ny <= nymax; ny++)
+      for(int nz = -nzmax; nz <= nzmax; nz++)
+        {
+          if(nx != 0 || ny != 0 || nz != 0)
+            {
+              double kx    = (2.0 * M_PI * LONG_X) * nx;
+              double ky    = (2.0 * M_PI * LONG_Y) * ny;
+              double kz    = (2.0 * M_PI * LONG_Z) * nz;
+              double k2    = kx * kx + ky * ky + kz * kz;
+              double kdotx = (x * kx + y * ky + z * kz);
+
+              D0 += -4.0 * M_PI * (LONG_X * LONG_Y * LONG_Z) / k2 * exp(-k2 / (4.0 * alpha2)) * cos(kdotx);
+            }
+        }
+
+  D0 += M_PI * (LONG_X * LONG_Y * LONG_Z) / (alpha * alpha);
+
+#else
+  /* in the tallbox case, the third dimension, z, is assumed to be the non-periodic one */
+
+  double leff = sqrt(BOXX * BOXY);
+  double alpha = 2.0 / leff;
+
+  int qxmax = (int)(8.0 / (BOXX * alpha) + 0.5);
+  int qymax = (int)(8.0 / (BOXY * alpha) + 0.5);
+
+  int nxmax = (int)(2.0 * alpha * BOXX + 0.5);
+  int nymax = (int)(2.0 * alpha * BOXY + 0.5);
+
+  if(printed == 0)
+    {
+      mpi_printf("EWALD: D0 table: qxmax=%d qymax=%d   nxmax=%d nymax=%d\n", qxmax, qymax, nxmax, nymax);
+      printed = 1;
+    }
+
+  for(int nx = -qxmax; nx <= qxmax; nx++)
+    for(int ny = -qymax; ny <= qymax; ny++)
+      {
+        double dx = x - nx * BOXX;
+        double dy = y - ny * BOXY;
+        double r = sqrt(dx * dx + dy * dy + z * z);
+
+        double rinv = (r > 0) ? 1.0 / r : 0.0;
+
+        double g0;
+
+        if(nx != 0 || ny != 0)
+          {
+            g0 = -erfc(alpha * r) * rinv;
+          }
+        else
+          {
+            /* we add the 1/r term here to the (0|0) entry */
+
+            if((alpha * r) < 0.5)
+              {
+                g0 = 2.0 * pow(alpha, 1) / sqrt(M_PI) *
+                     (1.0 - pow(alpha * r, 2) / 3.0 + pow(alpha * r, 4) / 10.0 - pow(alpha * r, 6) / 42.0 + pow(alpha * r, 8) / 216.0 -
+                      pow(alpha * r, 10) / 1320.0);
+              }
+            else
+              {
+                g0 = erf(alpha * r) * rinv;
+              }
+          }
+
+        D0 += g0;
+      }
+
+  double alpha2 = alpha * alpha;
+
+  for(int nx = -nxmax; nx <= nxmax; nx++)
+    for(int ny = -nymax; ny <= nymax; ny++)
+      {
+        if(nx != 0 || ny != 0)
+          {
+            double kx = (2.0 * M_PI / BOXX) * nx;
+            double ky = (2.0 * M_PI / BOXY) * ny;
+            double k2 = kx * kx + ky * ky;
+            double k = sqrt(k2);
+
+            /* use the representation in which the decaying exponential appears explicitly;
+             * if it underflows to zero, the contribution is negligible and is skipped */
+            if(k * z > 0)
+              {
+                double ex = exp(-k * z);
+                if(ex > 0)
+                  D0 += -M_PI / (BOXX * BOXY) * cos(kx * x + ky * y) *
+                        (erfc(k / (2 * alpha) + alpha * z) / ex + ex * erfc(k / (2 * alpha) - alpha * z)) / k;
+              }
+            else
+              {
+                double ex = exp(k * z);
+                if(ex > 0)
+                  D0 += -M_PI / (BOXX * BOXY) * cos(kx * x + ky * y) *
+                        (ex * erfc(k / (2 * alpha) + alpha * z) + erfc(k / (2 * alpha) - alpha * z) / ex) / k;
+              }
+          }
+      }
+
+  D0 += 2.0 * alpha / sqrt(M_PI) + 2 * sqrt(M_PI) / (BOXX * BOXY) * (exp(-alpha2 * z * z) / alpha + sqrt(M_PI) * z * erf(alpha * z));
+
+#endif
+
+  return D0;
+}
+
+/*! \brief This function computes the force correction term (difference between the full
+ *  force of the infinite lattice and the nearest-image force) by Ewald summation.
+ *
+ *  \param x, y, z components of the distance vector for which the correction
+ *  force should be computed
+ *  \return the correction force
+ */
+vector<double> ewald::ewald_D1(double x, double y, double z)
+{
+  static int printed = 0;
+
+  vector<double> D1 = 0.0;
+
+#ifndef GRAVITY_TALLBOX
+
+  double leff   = pow((1.0 / LONG_X) * (1.0 / LONG_Y) * (1.0 / LONG_Z), 1.0 / 3);
+  double alpha  = 2.0 / leff;
+  double alpha2 = alpha * alpha;
+
+  int qxmax = (int)(8.0 * LONG_X / alpha + 0.5);
+  int qymax = (int)(8.0 * LONG_Y / alpha + 0.5);
+  int qzmax = (int)(8.0 * LONG_Z / alpha + 0.5);
+
+  int nxmax = (int)(2.0 * alpha / LONG_X + 0.5);
+  int nymax = (int)(2.0 * alpha / LONG_Y + 0.5);
+  int nzmax = (int)(2.0 * alpha / LONG_Z + 0.5);
+
+  if(printed == 0)
+    {
+      mpi_printf("EWALD: D1 table: qxmax=%d qymax=%d qzmax=%d   nxmax=%d nymax=%d nzmax=%d\n", qxmax, qymax, qzmax, nxmax, nymax,
+                 nzmax);
+      printed = 1;
+    }
+
+  for(int nx = -qxmax; nx <= qxmax; nx++)
+    for(int ny = -qymax; ny <= qymax; ny++)
+      for(int nz = -qzmax; nz <= qzmax; nz++)
+        {
+          double dx = x - nx * (1.0 / LONG_X);
+          double dy = y - ny * (1.0 / LONG_Y);
+          double dz = z - nz * (1.0 / LONG_Z);
+
+          vector<double> dxyz(dx, dy, dz);
+
+          double r2 = dx * dx + dy * dy + dz * dz;
+          double r  = sqrt(r2);
+
+          double rinv  = (r > 0) ? 1.0 / r : 0.0;
+          double r2inv = rinv * rinv;
+          double r3inv = r2inv * rinv;
+
+          double g1;
+
+          if(nx != 0 || ny != 0 || nz != 0)
+            {
+              g1 = (erfc(alpha * r) + 2.0 * alpha * r / sqrt(M_PI) * exp(-alpha2 * r2)) * r3inv;
+            }
+          else
+            {
+              /* we add the 1/r term here to the (0|0|0) entry, followed by differentiation, and the limit r->0 to obtain accurate
+               * results at the origin
+               */
+
+              /* Note, for small r:
+               *
+               *   [1 - erfc(a r)]/r  =  2 a/sqrt(pi) * [ 1 - (a r)^2/3 + (a r)^4 / 10 - (a r)^6 / 42 + (a r)^8 / 216 - ...]
+               *
+               *   Hence for r = 0:
+               *
+               *   g0 =  2     * alpha   / sqrt(pi)
+               *   g1 = -4/3   * alpha^3 / sqrt(pi)
+               *   g2 =  8/5   * alpha^5 / sqrt(pi)
+               *   g3 = -16/7  * alpha^7 / sqrt(pi)
+               *   g4 =  32/9  * alpha^9 / sqrt(pi)
+               *   g5 = -64/11 * alpha^11/ sqrt(pi)
+               */
+
+              if((alpha * r) < 0.5)
+                {
+                  g1 = 4.0 * pow(alpha, 3) / sqrt(M_PI) *
+                       (-1.0 / 3.0 + pow(alpha * r, 2) / 5.0 - pow(alpha * r, 4) / 14.0 + pow(alpha * r, 6) / 54.0 -
+                        pow(alpha * r, 8) / 264.0 + pow(alpha * r, 10) / 1560.0);
+                }
+              else
+                {
+                  g1 = (-erf(alpha * r) + 2.0 * alpha * r / sqrt(M_PI) * exp(-alpha2 * r2)) * r3inv;
+                }
+            }
+
+          D1 += g1 * dxyz;
+        }
+
+  for(int nx = -nxmax; nx <= nxmax; nx++)
+    for(int ny = -nymax; ny <= nymax; ny++)
+      for(int nz = -nzmax; nz <= nzmax; nz++)
+        {
+          double kx = (2.0 * M_PI * LONG_X) * nx;
+          double ky = (2.0 * M_PI * LONG_Y) * ny;
+          double kz = (2.0 * M_PI * LONG_Z) * nz;
+          double k2 = kx * kx + ky * ky + kz * kz;
+
+          if(k2 > 0)
+            {
+              double kdotx = (x * kx + y * ky + z * kz);
+              double val   = 4.0 * M_PI * (LONG_X * LONG_Y * LONG_Z) / k2 * exp(-k2 / (4.0 * alpha2)) * sin(kdotx);
+              D1[0] += kx * val;
+              D1[1] += ky * val;
+              D1[2] += kz * val;
+            }
+        }
+#else
+  /* this is the case with periodicity only in two dimensions */
+
+  double leff = sqrt(BOXX * BOXY);
+  double alpha = 2.0 / leff;
+  double alpha2 = alpha * alpha;
+
+  int qxmax = (int)(8.0 / (BOXX * alpha) + 0.5);
+  int qymax = (int)(8.0 / (BOXY * alpha) + 0.5);
+
+  int nxmax = (int)(2.0 * alpha * BOXX + 0.5);
+  int nymax = (int)(2.0 * alpha * BOXY + 0.5);
+
+  if(printed == 0)
+    {
+      mpi_printf("EWALD: D1 table: qxmax=%d qymax=%d    nxmax=%d nymax=%d\n", qxmax, qymax, nxmax, nymax);
+      printed = 1;
+    }
+
+  for(int nx = -qxmax; nx <= qxmax; nx++)
+    for(int ny = -qymax; ny <= qymax; ny++)
+      {
+        double dx = x - nx * BOXX;
+        double dy = y - ny * BOXY;
+        double dz = z;
+
+        vector<double> dxyz(dx, dy, dz);
+
+        double r2 = dx * dx + dy * dy + dz * dz;
+        double r = sqrt(r2);
+
+        double rinv = (r > 0) ? 1.0 / r : 0.0;
+        double r2inv = rinv * rinv;
+        double r3inv = r2inv * rinv;
+
+        double g1;
+
+        if(nx != 0 || ny != 0)
+          {
+            g1 = (erfc(alpha * r) + 2.0 * alpha * r / sqrt(M_PI) * exp(-alpha2 * r2)) * r3inv;
+          }
+        else
+          {
+            /* we add the 1/r term here to the (0|0) entry */
+
+            if((alpha * r) < 0.5)
+              {
+                g1 = 4.0 * pow(alpha, 3) / sqrt(M_PI) *
+                     (-1.0 / 3.0 + pow(alpha * r, 2) / 5.0 - pow(alpha * r, 4) / 14.0 + pow(alpha * r, 6) / 54.0 -
+                      pow(alpha * r, 8) / 264.0 + pow(alpha * r, 10) / 1560.0);
+              }
+            else
+              {
+                g1 = (-erf(alpha * r) + 2.0 * alpha * r / sqrt(M_PI) * exp(-alpha2 * r2)) * r3inv;
+              }
+          }
+
+        D1 += g1 * dxyz;
+      }
+
+  for(int nx = -nxmax; nx <= nxmax; nx++)
+    for(int ny = -nymax; ny <= nymax; ny++)
+      {
+        if(nx != 0 || ny != 0)
+          {
+            double kx = (2.0 * M_PI / BOXX) * nx;
+            double ky = (2.0 * M_PI / BOXY) * ny;
+            double k2 = kx * kx + ky * ky;
+            double k = sqrt(k2);
+
+            double val = M_PI / (BOXX * BOXY) * sin(kx * x + ky * y) *
+                         (exp(k * z) * erfc(k / (2 * alpha) + alpha * z) + exp(-k * z) * erfc(k / (2 * alpha) - alpha * z)) / k;
+
+            D1[0] -= -kx * val;
+            D1[1] -= -ky * val;
+            D1[2] -= M_PI / (BOXX * BOXY) * cos(kx * x + ky * y) *
+                     (exp(k * z) * erfc(k / (2 * alpha) + alpha * z) - exp(-k * z) * erfc(k / (2 * alpha) - alpha * z));
+          }
+      }
+
+  D1[2] += 2.0 * M_PI / (BOXX * BOXY) * erf(alpha * z);
+#endif
+
+  return D1;  // now in dimensionless form;
+}
+
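+/*! \brief This function computes the second derivative tensor of the Ewald potential
+ *  correction by Ewald summation (the analogous higher derivatives follow in
+ *  ewald_D3() ... ewald_D7()).
+ *
+ *  The radial factors g1, g2, ... used below are successive derivatives of the real-space
+ *  kernel g0(r) = -erfc(alpha r)/r, related by g_{n+1}(r) = (1/r) dg_n/dr.
+ */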
+symtensor2<double> ewald::ewald_D2(double x, double y, double z)
+{
+  static int printed = 0;
+
+  symtensor2<double> D2 = 0.0;
+
+  double leff   = pow((1.0 / LONG_X) * (1.0 / LONG_Y) * (1.0 / LONG_Z), 1.0 / 3);
+  double alpha  = 2.0 / leff;
+  double alpha2 = alpha * alpha;
+
+  int qxmax = (int)(8.0 * LONG_X / alpha + 0.5);
+  int qymax = (int)(8.0 * LONG_Y / alpha + 0.5);
+  int qzmax = (int)(8.0 * LONG_Z / alpha + 0.5);
+
+  int nxmax = (int)(2.0 * alpha / LONG_X + 0.5);
+  int nymax = (int)(2.0 * alpha / LONG_Y + 0.5);
+  int nzmax = (int)(2.0 * alpha / LONG_Z + 0.5);
+
+  if(printed == 0)
+    {
+      mpi_printf("EWALD: D2 table: qxmax=%d qymax=%d qzmax=%d   nxmax=%d nymax=%d nzmax=%d\n", qxmax, qymax, qzmax, nxmax, nymax,
+                 nzmax);
+      printed = 1;
+    }
+
+  for(int nx = -qxmax; nx <= qxmax; nx++)
+    for(int ny = -qymax; ny <= qymax; ny++)
+      for(int nz = -qzmax; nz <= qzmax; nz++)
+        {
+          double dx = x - nx * (1.0 / LONG_X);
+          double dy = y - ny * (1.0 / LONG_Y);
+          double dz = z - nz * (1.0 / LONG_Z);
+
+          vector<double> dxyz(dx, dy, dz);
+
+          double r2 = dx * dx + dy * dy + dz * dz;
+          double r  = sqrt(r2);
+
+          double rinv  = (r > 0) ? 1.0 / r : 0.0;
+          double r2inv = rinv * rinv;
+          double r3inv = r2inv * rinv;
+          double r5inv = r3inv * r2inv;
+
+          double g1, g2;
+
+          if(nx != 0 || ny != 0 || nz != 0)
+            {
+              g1 = (erfc(alpha * r) + 2.0 * alpha * r / sqrt(M_PI) * exp(-alpha2 * r2)) * r3inv;
+
+              g2 = -(3.0 * erfc(alpha * r) + (6.0 * alpha * r + 4.0 * pow(alpha * r, 3)) / sqrt(M_PI) * exp(-alpha2 * r2)) * r5inv;
+            }
+          else
+            {
+              /* we add the 1/r term here to the (0|0|0) entry, followed by differentiation, and the limit r->0 to obtain accurate
+               * results at the origin
+               */
+
+              /* Note, for small r:
+               *
+               *   [1 - erfc(a r)]/r  =  2 a/sqrt(pi) * [ 1 - (a r)^2/3 + (a r)^4 / 10 - (a r)^6 / 42 + (a r)^8 / 216 - ...]
+               *
+               *   Hence for r = 0:
+               *
+               *   g0 =  2     * alpha   / sqrt(pi)
+               *   g1 = -4/3   * alpha^3 / sqrt(pi)
+               *   g2 =  8/5   * alpha^5 / sqrt(pi)
+               *   g3 = -16/7  * alpha^7 / sqrt(pi)
+               *   g4 =  32/9  * alpha^9 / sqrt(pi)
+               *   g5 = -64/11 * alpha^11/ sqrt(pi)
+               */
+
+              if((alpha * r) < 0.5)
+                {
+                  g1 = 4.0 * pow(alpha, 3) / sqrt(M_PI) *
+                       (-1.0 / 3.0 + pow(alpha * r, 2) / 5.0 - pow(alpha * r, 4) / 14.0 + pow(alpha * r, 6) / 54.0 -
+                        pow(alpha * r, 8) / 264.0 + pow(alpha * r, 10) / 1560.0);
+
+                  g2 = 8.0 * pow(alpha, 5) / sqrt(M_PI) *
+                       (1.0 / 5.0 - pow(alpha * r, 2) / 7.0 + pow(alpha * r, 4) / 18.0 - pow(alpha * r, 6) / 66.0 +
+                        pow(alpha * r, 8) / 312.0 - pow(alpha * r, 10) / 1800.0);
+                }
+              else
+                {
+                  g1 = (-erf(alpha * r) + 2.0 * alpha * r / sqrt(M_PI) * exp(-alpha2 * r2)) * r3inv;
+
+                  g2 = (3.0 * erf(alpha * r) - (6.0 * alpha * r + 4.0 * pow(alpha * r, 3)) / sqrt(M_PI) * exp(-alpha2 * r2)) * r5inv;
+                }
+            }
+
+          D2 += g2 * (dxyz % dxyz);
+          D2[qXX] += g1;
+          D2[qYY] += g1;
+          D2[qZZ] += g1;
+        }
+
+  for(int nx = -nxmax; nx <= nxmax; nx++)
+    for(int ny = -nymax; ny <= nymax; ny++)
+      for(int nz = -nzmax; nz <= nzmax; nz++)
+        {
+          if(nx != 0 || ny != 0 || nz != 0)
+            {
+              double kx = (2.0 * M_PI * LONG_X) * nx;
+              double ky = (2.0 * M_PI * LONG_Y) * ny;
+              double kz = (2.0 * M_PI * LONG_Z) * nz;
+              double k2 = kx * kx + ky * ky + kz * kz;
+
+              double kdotx = (x * kx + y * ky + z * kz);
+              double val   = 4.0 * M_PI * (LONG_X * LONG_Y * LONG_Z) / k2 * exp(-k2 / (4.0 * alpha2)) * cos(kdotx);
+
+              vector<double> kxyz(kx, ky, kz);
+
+              D2 += (val * kxyz) % kxyz;
+            }
+        }
+
+  return D2;
+}
+
+symtensor3<double> ewald::ewald_D3(double x, double y, double z)
+{
+  static int printed = 0;
+
+#ifdef GRAVITY_TALLBOX
+  Terminate("GRAVITY_TALLBOX is not implemented for MULTIPOLE_ORDER >= 3");
+#endif
+
+  symtensor3<double> D3 = 0.0;
+
+  double leff   = pow((1.0 / LONG_X) * (1.0 / LONG_Y) * (1.0 / LONG_Z), 1.0 / 3);
+  double alpha  = 2.0 / leff;
+  double alpha2 = alpha * alpha;
+
+  int qxmax = (int)(8.0 * LONG_X / alpha + 0.5);
+  int qymax = (int)(8.0 * LONG_Y / alpha + 0.5);
+  int qzmax = (int)(8.0 * LONG_Z / alpha + 0.5);
+
+  int nxmax = (int)(2.0 * alpha / LONG_X + 0.5);
+  int nymax = (int)(2.0 * alpha / LONG_Y + 0.5);
+  int nzmax = (int)(2.0 * alpha / LONG_Z + 0.5);
+
+  if(printed == 0)
+    {
+      mpi_printf("EWALD: D3 table: qxmax=%d qymax=%d qzmax=%d   nxmax=%d nymax=%d nzmax=%d\n", qxmax, qymax, qzmax, nxmax, nymax,
+                 nzmax);
+      printed = 1;
+    }
+
+  for(int nx = -qxmax; nx <= qxmax; nx++)
+    for(int ny = -qymax; ny <= qymax; ny++)
+      for(int nz = -qzmax; nz <= qzmax; nz++)
+        {
+          double dx = x - nx * (1.0 / LONG_X);
+          double dy = y - ny * (1.0 / LONG_Y);
+          double dz = z - nz * (1.0 / LONG_Z);
+
+          vector<double> dxyz(dx, dy, dz);
+
+          double r2 = dx * dx + dy * dy + dz * dz;
+          double r  = sqrt(r2);
+
+          double rinv  = (r > 0) ? 1.0 / r : 0.0;
+          double r2inv = rinv * rinv;
+          double r3inv = r2inv * rinv;
+          double r4inv = r2inv * r2inv;
+          double r5inv = r2inv * r3inv;
+          double r7inv = r3inv * r4inv;
+
+          double g2, g3;
+
+          if(nx != 0 || ny != 0 || nz != 0)
+            {
+              g2 = -(3.0 * erfc(alpha * r) + (6.0 * alpha * r + 4.0 * pow(alpha * r, 3)) / sqrt(M_PI) * exp(-alpha2 * r2)) * r5inv;
+
+              g3 = (15.0 * erfc(alpha * r) +
+                    (30.0 * alpha * r + 20.0 * pow(alpha * r, 3) + 8.0 * pow(alpha * r, 5)) / sqrt(M_PI) * exp(-alpha2 * r2)) *
+                   r7inv;
+            }
+          else
+            {
+              /* we add the 1/r term here to the (0|0|0) entry, followed by differentiation, and the limit r->0 to obtain accurate
+               * results at the origin
+               */
+
+              /* Note, for small r:
+               *
+               *   [1 - erfc(a r)]/r  =  2 a/sqrt(pi) * [ 1 - (a r)^2/3 + (a r)^4 / 10 - (a r)^6 / 42 + (a r)^8 / 216 - ...]
+               *
+               *   Hence for r = 0:
+               *
+               *   g0 =  2     * alpha   / sqrt(pi)
+               *   g1 = -4/3   * alpha^3 / sqrt(pi)
+               *   g2 =  8/5   * alpha^5 / sqrt(pi)
+               *   g3 = -16/7  * alpha^7 / sqrt(pi)
+               *   g4 =  32/9  * alpha^9 / sqrt(pi)
+               *   g5 = -64/11 * alpha^11/ sqrt(pi)
+               */
+              if((alpha * r) < 0.5)
+                {
+                  g2 = 8.0 * pow(alpha, 5) / sqrt(M_PI) *
+                       (1.0 / 5.0 - pow(alpha * r, 2) / 7.0 + pow(alpha * r, 4) / 18.0 - pow(alpha * r, 6) / 66.0 +
+                        pow(alpha * r, 8) / 312.0 - pow(alpha * r, 10) / 1800.0);
+
+                  g3 = 16.0 * pow(alpha, 7) / sqrt(M_PI) *
+                       (-1.0 / 7.0 + pow(alpha * r, 2) / 9.0 - pow(alpha * r, 4) / 22.0 + pow(alpha * r, 6) / 78.0 -
+                        pow(alpha * r, 8) / 360.0 + pow(alpha * r, 10) / 2040.0);
+                }
+              else
+                {
+                  g2 = (3.0 * erf(alpha * r) - (6.0 * alpha * r + 4.0 * pow(alpha * r, 3)) / sqrt(M_PI) * exp(-alpha2 * r2)) * r5inv;
+
+                  g3 = (-15.0 * erf(alpha * r) +
+                        (30.0 * alpha * r + 20.0 * pow(alpha * r, 3) + 8.0 * pow(alpha * r, 5)) / sqrt(M_PI) * exp(-alpha2 * r2)) *
+                       r7inv;
+                }
+            }
+
+          symtensor2<double> aux2 = dxyz % dxyz;
+          symtensor3<double> aux3;
+
+          setup_D3(ADD, D3, dxyz, aux2, aux3, g2, g3);
+        }
+
+  for(int nx = -nxmax; nx <= nxmax; nx++)
+    for(int ny = -nymax; ny <= nymax; ny++)
+      for(int nz = -nzmax; nz <= nzmax; nz++)
+        {
+          if(nx != 0 || ny != 0 || nz != 0)
+            {
+              double kx = (2.0 * M_PI * LONG_X) * nx;
+              double ky = (2.0 * M_PI * LONG_Y) * ny;
+              double kz = (2.0 * M_PI * LONG_Z) * nz;
+              double k2 = kx * kx + ky * ky + kz * kz;
+
+              double kdotx = (x * kx + y * ky + z * kz);
+              double val   = -4.0 * M_PI * (LONG_X * LONG_Y * LONG_Z) / k2 * exp(-k2 / (4.0 * alpha2)) * sin(kdotx);
+
+              vector<double> kxyz(kx, ky, kz);
+
+              D3 += (val * kxyz) % (kxyz % kxyz);
+            }
+        }
+
+  return D3;
+}
+
+symtensor4<double> ewald::ewald_D4(double x, double y, double z)
+{
+  static int printed = 0;
+
+#ifdef GRAVITY_TALLBOX
+  Terminate("GRAVITY_TALLBOX is not implemented for MULTIPOLE_ORDER >= 4");
+#endif
+
+  symtensor4<double> D4 = 0.0;
+
+  double leff   = pow((1.0 / LONG_X) * (1.0 / LONG_Y) * (1.0 / LONG_Z), 1.0 / 3);
+  double alpha  = 2.0 / leff;
+  double alpha2 = alpha * alpha;
+
+  int qxmax = (int)(8.0 * LONG_X / alpha + 0.5);
+  int qymax = (int)(8.0 * LONG_Y / alpha + 0.5);
+  int qzmax = (int)(8.0 * LONG_Z / alpha + 0.5);
+
+  int nxmax = (int)(2.0 * alpha / LONG_X + 0.5);
+  int nymax = (int)(2.0 * alpha / LONG_Y + 0.5);
+  int nzmax = (int)(2.0 * alpha / LONG_Z + 0.5);
+
+  if(printed == 0)
+    {
+      mpi_printf("EWALD: D4 table: qxmax=%d qymax=%d qzmax=%d   nxmax=%d nymax=%d nzmax=%d\n", qxmax, qymax, qzmax, nxmax, nymax,
+                 nzmax);
+      printed = 1;
+    }
+
+  for(int nx = -qxmax; nx <= qxmax; nx++)
+    for(int ny = -qymax; ny <= qymax; ny++)
+      for(int nz = -qzmax; nz <= qzmax; nz++)
+        {
+          double dx = x - nx * (1.0 / LONG_X);
+          double dy = y - ny * (1.0 / LONG_Y);
+          double dz = z - nz * (1.0 / LONG_Z);
+
+          vector<double> dxyz(dx, dy, dz);
+
+          double r2 = dx * dx + dy * dy + dz * dz;
+          double r  = sqrt(r2);
+
+          double rinv  = (r > 0) ? 1.0 / r : 0.0;
+          double r2inv = rinv * rinv;
+          double r3inv = r2inv * rinv;
+          double r4inv = r2inv * r2inv;
+          double r5inv = r2inv * r3inv;
+          double r7inv = r3inv * r4inv;
+          double r9inv = r4inv * r5inv;
+
+          double g2, g3, g4;
+
+          if(nx != 0 || ny != 0 || nz != 0)
+            {
+              g2 = -(3.0 * erfc(alpha * r) + (6.0 * alpha * r + 4.0 * pow(alpha * r, 3)) / sqrt(M_PI) * exp(-alpha2 * r2)) * r5inv;
+
+              g3 = (15.0 * erfc(alpha * r) +
+                    (30.0 * alpha * r + 20.0 * pow(alpha * r, 3) + 8.0 * pow(alpha * r, 5)) / sqrt(M_PI) * exp(-alpha2 * r2)) *
+                   r7inv;
+
+              g4 = -(105.0 * erfc(alpha * r) +
+                     (210.0 * alpha * r + 140.0 * pow(alpha * r, 3) + 56.0 * pow(alpha * r, 5) + 16.0 * pow(alpha * r, 7)) /
+                         sqrt(M_PI) * exp(-alpha2 * r2)) *
+                   r9inv;
+            }
+          else
+            {
+              /* we add the 1/r term here to the (0|0|0) entry, followed by differentiation, and the limit r->0 to obtain accurate
+               * results at the origin
+               */
+
+              /* Note, for small r:
+               *
+               *   [1 - erfc(a r)]/r  =  2 a/sqrt(pi) * [ 1 - (a r)^2/3 + (a r)^4 / 10 - (a r)^6 / 42 + (a r)^8 / 216 - ...]
+               *
+               *   Hence for r = 0:
+               *
+               *   g0 =  2     * alpha   / sqrt(pi)
+               *   g1 = -4/3   * alpha^3 / sqrt(pi)
+               *   g2 =  8/5   * alpha^5 / sqrt(pi)
+               *   g3 = -16/7  * alpha^7 / sqrt(pi)
+               *   g4 =  32/9  * alpha^9 / sqrt(pi)
+               *   g5 = -64/11 * alpha^11/ sqrt(pi)
+               */
+
+              if((alpha * r) < 0.5)
+                {
+                  g2 = 8.0 * pow(alpha, 5) / sqrt(M_PI) *
+                       (1.0 / 5.0 - pow(alpha * r, 2) / 7.0 + pow(alpha * r, 4) / 18.0 - pow(alpha * r, 6) / 66.0 +
+                        pow(alpha * r, 8) / 312.0 - pow(alpha * r, 10) / 1800.0);
+
+                  g3 = 16.0 * pow(alpha, 7) / sqrt(M_PI) *
+                       (-1.0 / 7.0 + pow(alpha * r, 2) / 9.0 - pow(alpha * r, 4) / 22.0 + pow(alpha * r, 6) / 78.0 -
+                        pow(alpha * r, 8) / 360.0 + pow(alpha * r, 10) / 2040.0);
+
+                  g4 = 32.0 * pow(alpha, 9) / sqrt(M_PI) *
+                       (1.0 / 9.0 - pow(alpha * r, 2) / 11.0 + pow(alpha * r, 4) / 26.0 - pow(alpha * r, 6) / 90.0 +
+                        pow(alpha * r, 8) / 408.0 - pow(alpha * r, 10) / 2280.0);
+                }
+              else
+                {
+                  g2 = (3.0 * erf(alpha * r) - (6.0 * alpha * r + 4.0 * pow(alpha * r, 3)) / sqrt(M_PI) * exp(-alpha2 * r2)) * r5inv;
+
+                  g3 = (-15.0 * erf(alpha * r) +
+                        (30.0 * alpha * r + 20.0 * pow(alpha * r, 3) + 8.0 * pow(alpha * r, 5)) / sqrt(M_PI) * exp(-alpha2 * r2)) *
+                       r7inv;
+
+                  g4 = (105.0 * erf(alpha * r) -
+                        (210.0 * alpha * r + 140.0 * pow(alpha * r, 3) + 56.0 * pow(alpha * r, 5) + 16.0 * pow(alpha * r, 7)) /
+                            sqrt(M_PI) * exp(-alpha2 * r2)) *
+                       r9inv;
+                }
+            }
+
+          symtensor2<double> aux2 = dxyz % dxyz;
+          symtensor3<double> aux3 = dxyz % aux2;
+          symtensor4<double> aux4;
+
+          setup_D4(ADD, D4, dxyz, aux2, aux3, aux4, g2, g3, g4);
+        }
+
+  for(int nx = -nxmax; nx <= nxmax; nx++)
+    for(int ny = -nymax; ny <= nymax; ny++)
+      for(int nz = -nzmax; nz <= nzmax; nz++)
+        {
+          if(nx != 0 || ny != 0 || nz != 0)
+            {
+              double kx = (2.0 * M_PI * LONG_X) * nx;
+              double ky = (2.0 * M_PI * LONG_Y) * ny;
+              double kz = (2.0 * M_PI * LONG_Z) * nz;
+              double k2 = kx * kx + ky * ky + kz * kz;
+
+              double kdotx = (x * kx + y * ky + z * kz);
+              double val   = -4.0 * M_PI * (LONG_X * LONG_Y * LONG_Z) / k2 * exp(-k2 / (4.0 * alpha2)) * cos(kdotx);
+
+              vector<double> kxyz(kx, ky, kz);
+
+              D4 += (val * kxyz) % ((kxyz % (kxyz % kxyz)));
+            }
+        }
+
+  return D4;
+}
+
+symtensor5<double> ewald::ewald_D5(double x, double y, double z)
+{
+  static int printed = 0;
+
+#ifdef GRAVITY_TALLBOX
+  Terminate("GRAVITY_TALLBOX is not implemented for MULTIPOLE_ORDER >= 4");
+#endif
+
+  symtensor5<double> D5 = 0.0;
+
+  double leff   = pow((1.0 / LONG_X) * (1.0 / LONG_Y) * (1.0 / LONG_Z), 1.0 / 3);
+  double alpha  = 2.0 / leff;
+  double alpha2 = alpha * alpha;
+
+  int qxmax = (int)(8.0 * LONG_X / alpha + 0.5);
+  int qymax = (int)(8.0 * LONG_Y / alpha + 0.5);
+  int qzmax = (int)(8.0 * LONG_Z / alpha + 0.5);
+
+  int nxmax = (int)(2.0 * alpha / LONG_X + 0.5);
+  int nymax = (int)(2.0 * alpha / LONG_Y + 0.5);
+  int nzmax = (int)(2.0 * alpha / LONG_Z + 0.5);
+
+  if(printed == 0)
+    {
+      mpi_printf("EWALD: D5 table: qxmax=%d qymax=%d qzmax=%d   nxmax=%d nymax=%d nzmax=%d\n", qxmax, qymax, qzmax, nxmax, nymax,
+                 nzmax);
+      printed = 1;
+    }
+
+  for(int nx = -qxmax; nx <= qxmax; nx++)
+    for(int ny = -qymax; ny <= qymax; ny++)
+      for(int nz = -qzmax; nz <= qzmax; nz++)
+        {
+          double dx = x - nx * (1.0 / LONG_X);
+          double dy = y - ny * (1.0 / LONG_Y);
+          double dz = z - nz * (1.0 / LONG_Z);
+
+          vector<double> dxyz(dx, dy, dz);
+
+          double r2 = dx * dx + dy * dy + dz * dz;
+          double r  = sqrt(r2);
+
+          double rinv   = (r > 0) ? 1.0 / r : 0.0;
+          double r2inv  = rinv * rinv;
+          double r3inv  = r2inv * rinv;
+          double r4inv  = r2inv * r2inv;
+          double r5inv  = r2inv * r3inv;
+          double r7inv  = r3inv * r4inv;
+          double r9inv  = r4inv * r5inv;
+          double r11inv = r4inv * r7inv;
+
+          double g3, g4, g5;
+
+          if(nx != 0 || ny != 0 || nz != 0)
+            {
+              g3 = (15.0 * erfc(alpha * r) +
+                    (30.0 * alpha * r + 20.0 * pow(alpha * r, 3) + 8.0 * pow(alpha * r, 5)) / sqrt(M_PI) * exp(-alpha2 * r2)) *
+                   r7inv;
+
+              g4 = -(105.0 * erfc(alpha * r) +
+                     (210.0 * alpha * r + 140.0 * pow(alpha * r, 3) + 56.0 * pow(alpha * r, 5) + 16.0 * pow(alpha * r, 7)) /
+                         sqrt(M_PI) * exp(-alpha2 * r2)) *
+                   r9inv;
+
+              g5 = (945.0 * erfc(alpha * r) + (1890.0 * alpha * r + 1260.0 * pow(alpha * r, 3) + 504.0 * pow(alpha * r, 5) +
+                                               144.0 * pow(alpha * r, 7) + 32.0 * pow(alpha * r, 9)) /
+                                                  sqrt(M_PI) * exp(-alpha2 * r2)) *
+                   r11inv;
+            }
+          else
+            {
+              /* we add the 1/r term here to the (0|0|0) entry, followed by differentiation, and the limit r->0 to obtain accurate
+               * results at the origin
+               */
+
+              /* Note, for small r:
+               *
+               *   [1 - erfc(a r)]/r  =  2 a/sqrt(pi) * [ 1 - (a r)^2/3 + (a r)^4 / 10 - (a r)^6 / 42 + (a r)^8 / 216 - ...]
+               *
+               *   Hence for r = 0:
+               *
+               *   g0 =  2     * alpha   / sqrt(pi)
+               *   g1 = -4/3   * alpha^3 / sqrt(pi)
+               *   g2 =  8/5   * alpha^5 / sqrt(pi)
+               *   g3 = -16/7  * alpha^7 / sqrt(pi)
+               *   g4 =  32/9  * alpha^9 / sqrt(pi)
+               *   g5 = -64/11 * alpha^11/ sqrt(pi)
+               */
+
+              if((alpha * r) < 0.5)
+                {
+                  g3 = 16.0 * pow(alpha, 7) / sqrt(M_PI) *
+                       (-1.0 / 7.0 + pow(alpha * r, 2) / 9.0 - pow(alpha * r, 4) / 22.0 + pow(alpha * r, 6) / 78.0 -
+                        pow(alpha * r, 8) / 360.0 + pow(alpha * r, 10) / 2040.0);
+
+                  g4 = 32.0 * pow(alpha, 9) / sqrt(M_PI) *
+                       (1.0 / 9.0 - pow(alpha * r, 2) / 11.0 + pow(alpha * r, 4) / 26.0 - pow(alpha * r, 6) / 90.0 +
+                        pow(alpha * r, 8) / 408.0 - pow(alpha * r, 10) / 2280.0);
+
+                  g5 = 64.0 * pow(alpha, 11) / sqrt(M_PI) *
+                       (-1.0 / 11.0 + pow(alpha * r, 2) / 13.0 - pow(alpha * r, 4) / 30.0 + pow(alpha * r, 6) / 102.0 -
+                        pow(alpha * r, 8) / 456.0 + pow(alpha * r, 10) / 2520.0);
+                }
+              else
+                {
+                  g3 = (-15.0 * erf(alpha * r) +
+                        (30.0 * alpha * r + 20.0 * pow(alpha * r, 3) + 8.0 * pow(alpha * r, 5)) / sqrt(M_PI) * exp(-alpha2 * r2)) *
+                       r7inv;
+
+                  g4 = (105.0 * erf(alpha * r) -
+                        (210.0 * alpha * r + 140.0 * pow(alpha * r, 3) + 56.0 * pow(alpha * r, 5) + 16.0 * pow(alpha * r, 7)) /
+                            sqrt(M_PI) * exp(-alpha2 * r2)) *
+                       r9inv;
+
+                  g5 = (-945.0 * erf(alpha * r) + (1890.0 * alpha * r + 1260.0 * pow(alpha * r, 3) + 504.0 * pow(alpha * r, 5) +
+                                                   144.0 * pow(alpha * r, 7) + 32.0 * pow(alpha * r, 9)) /
+                                                      sqrt(M_PI) * exp(-alpha2 * r2)) *
+                       r11inv;
+                }
+            }
+
+          symtensor3<double> aux3 = dxyz % (dxyz % dxyz);
+          symtensor4<double> aux4 = dxyz % aux3;
+          symtensor5<double> aux5;
+
+          setup_D5(ADD, D5, dxyz, aux3, aux4, aux5, g3, g4, g5);
+        }
+
+  for(int nx = -nxmax; nx <= nxmax; nx++)
+    for(int ny = -nymax; ny <= nymax; ny++)
+      for(int nz = -nzmax; nz <= nzmax; nz++)
+        {
+          double kx = (2.0 * M_PI * LONG_X) * nx;
+          double ky = (2.0 * M_PI * LONG_Y) * ny;
+          double kz = (2.0 * M_PI * LONG_Z) * nz;
+          double k2 = kx * kx + ky * ky + kz * kz;
+
+          if(k2 > 0)
+            {
+              double kdotx = (x * kx + y * ky + z * kz);
+              double val   = 4.0 * M_PI * (LONG_X * LONG_Y * LONG_Z) / k2 * exp(-k2 / (4.0 * alpha2)) * sin(kdotx);
+
+              vector<double> kxyz(kx, ky, kz);
+
+              D5 += (val * kxyz) % (kxyz % ((kxyz % (kxyz % kxyz))));
+            }
+        }
+
+  return D5;
+}
+
+symtensor6<double> ewald::ewald_D6(double x, double y, double z)
+{
+  static int printed = 0;
+
+#ifdef GRAVITY_TALLBOX
+  Terminate("GRAVITY_TALLBOX is not implemented for MULTIPOLE_ORDER >= 4");
+#endif
+
+  symtensor6<double> D6 = 0.0;
+
+  double leff   = pow((1.0 / LONG_X) * (1.0 / LONG_Y) * (1.0 / LONG_Z), 1.0 / 3);
+  double alpha  = 2.0 / leff;
+  double alpha2 = alpha * alpha;
+
+  int qxmax = (int)(8.0 * LONG_X / alpha + 0.5);
+  int qymax = (int)(8.0 * LONG_Y / alpha + 0.5);
+  int qzmax = (int)(8.0 * LONG_Z / alpha + 0.5);
+
+  int nxmax = (int)(2.0 * alpha / LONG_X + 0.5);
+  int nymax = (int)(2.0 * alpha / LONG_Y + 0.5);
+  int nzmax = (int)(2.0 * alpha / LONG_Z + 0.5);
+
+  if(printed == 0)
+    {
+      mpi_printf("EWALD: D6 table: qxmax=%d qymax=%d qzmax=%d   nxmax=%d nymax=%d nzmax=%d\n", qxmax, qymax, qzmax, nxmax, nymax,
+                 nzmax);
+      printed = 1;
+    }
+
+  for(int nx = -qxmax; nx <= qxmax; nx++)
+    for(int ny = -qymax; ny <= qymax; ny++)
+      for(int nz = -qzmax; nz <= qzmax; nz++)
+        {
+          double dx = x - nx * (1.0 / LONG_X);
+          double dy = y - ny * (1.0 / LONG_Y);
+          double dz = z - nz * (1.0 / LONG_Z);
+
+          vector<double> dxyz(dx, dy, dz);
+
+          double r2 = dx * dx + dy * dy + dz * dz;
+          double r  = sqrt(r2);
+
+          double rinv   = (r > 0) ? 1.0 / r : 0.0;
+          double r2inv  = rinv * rinv;
+          double r3inv  = r2inv * rinv;
+          double r4inv  = r2inv * r2inv;
+          double r5inv  = r2inv * r3inv;
+          double r7inv  = r3inv * r4inv;
+          double r9inv  = r4inv * r5inv;
+          double r11inv = r4inv * r7inv;
+          double r13inv = r4inv * r9inv;
+
+          double g3, g4, g5, g6;
+
+          if(nx != 0 || ny != 0 || nz != 0)
+            {
+              g3 = (15.0 * erfc(alpha * r) +
+                    (30.0 * alpha * r + 20.0 * pow(alpha * r, 3) + 8.0 * pow(alpha * r, 5)) / sqrt(M_PI) * exp(-alpha2 * r2)) *
+                   r7inv;
+
+              g4 = -(105.0 * erfc(alpha * r) +
+                     (210.0 * alpha * r + 140.0 * pow(alpha * r, 3) + 56.0 * pow(alpha * r, 5) + 16.0 * pow(alpha * r, 7)) /
+                         sqrt(M_PI) * exp(-alpha2 * r2)) *
+                   r9inv;
+
+              g5 = (945.0 * erfc(alpha * r) + (1890.0 * alpha * r + 1260.0 * pow(alpha * r, 3) + 504.0 * pow(alpha * r, 5) +
+                                               144.0 * pow(alpha * r, 7) + 32.0 * pow(alpha * r, 9)) /
+                                                  sqrt(M_PI) * exp(-alpha2 * r2)) *
+                   r11inv;
+
+              g6 = (-10395.0 * erfc(alpha * r) -
+                    2.0 *
+                        (10395.0 * alpha * r + 6930.0 * pow(alpha * r, 3) + 2772.0 * pow(alpha * r, 5) + 792.0 * pow(alpha * r, 7) +
+                         176.0 * pow(alpha * r, 9) + 32.0 * pow(alpha * r, 11)) /
+                        sqrt(M_PI) * exp(-alpha2 * r2)) *
+                   r13inv;
+            }
+          else
+            {
+              /* we add the 1/r term here to the (0|0|0) entry, followed by differentiation, and the limit r->0 to obtain accurate
+               * results at the origin
+               */
+
+              /* Note, for small r:
+               *
+               *   [1 - erfc(a r)]/r  =  2 a/sqrt(pi) * [ 1 - (a r)^2/3 + (a r)^4 / 10 - (a r)^6 / 42 + (a r)^8 / 216 - ...]
+               *
+               *   Hence for r = 0:
+               *
+               *   g0 =  2     * alpha   / sqrt(pi)
+               *   g1 = -4/3   * alpha^3 / sqrt(pi)
+               *   g2 =  8/5   * alpha^5 / sqrt(pi)
+               *   g3 = -16/7  * alpha^7 / sqrt(pi)
+               *   g4 =  32/9  * alpha^9 / sqrt(pi)
+               *   g5 = -64/11 * alpha^11/ sqrt(pi)
+               */
+
+              if((alpha * r) < 0.5)
+                {
+                  g3 = 16.0 * pow(alpha, 7) / sqrt(M_PI) *
+                       (-1.0 / 7.0 + pow(alpha * r, 2) / 9.0 - pow(alpha * r, 4) / 22.0 + pow(alpha * r, 6) / 78.0 -
+                        pow(alpha * r, 8) / 360.0 + pow(alpha * r, 10) / 2040.0);
+
+                  g4 = 32.0 * pow(alpha, 9) / sqrt(M_PI) *
+                       (1.0 / 9.0 - pow(alpha * r, 2) / 11.0 + pow(alpha * r, 4) / 26.0 - pow(alpha * r, 6) / 90.0 +
+                        pow(alpha * r, 8) / 408.0 - pow(alpha * r, 10) / 2280.0);
+
+                  g5 = 64.0 * pow(alpha, 11) / sqrt(M_PI) *
+                       (-1.0 / 11.0 + pow(alpha * r, 2) / 13.0 - pow(alpha * r, 4) / 30.0 + pow(alpha * r, 6) / 102.0 -
+                        pow(alpha * r, 8) / 456.0 + pow(alpha * r, 10) / 2520.0);
+
+                  g6 = 128.0 * pow(alpha, 13) / sqrt(M_PI) *
+                       (1.0 / 13.0 - pow(alpha * r, 2) / 15.0 + pow(alpha * r, 4) / 34.0 - pow(alpha * r, 6) / 114.0 +
+                        pow(alpha * r, 8) / 504.0 - pow(alpha * r, 10) / 2760.0);
+                }
+              else
+                {
+                  g3 = (-15.0 * erf(alpha * r) +
+                        (30.0 * alpha * r + 20.0 * pow(alpha * r, 3) + 8.0 * pow(alpha * r, 5)) / sqrt(M_PI) * exp(-alpha2 * r2)) *
+                       r7inv;
+
+                  g4 = (105.0 * erf(alpha * r) -
+                        (210.0 * alpha * r + 140.0 * pow(alpha * r, 3) + 56.0 * pow(alpha * r, 5) + 16.0 * pow(alpha * r, 7)) /
+                            sqrt(M_PI) * exp(-alpha2 * r2)) *
+                       r9inv;
+
+                  g5 = (-945.0 * erf(alpha * r) + (1890.0 * alpha * r + 1260.0 * pow(alpha * r, 3) + 504.0 * pow(alpha * r, 5) +
+                                                   144.0 * pow(alpha * r, 7) + 32.0 * pow(alpha * r, 9)) /
+                                                      sqrt(M_PI) * exp(-alpha2 * r2)) *
+                       r11inv;
+
+                  g6 = (10395.0 * erf(alpha * r) -
+                        2.0 *
+                            (10395.0 * alpha * r + 6930.0 * pow(alpha * r, 3) + 2772.0 * pow(alpha * r, 5) +
+                             792.0 * pow(alpha * r, 7) + 176.0 * pow(alpha * r, 9) + 32.0 * pow(alpha * r, 11)) /
+                            sqrt(M_PI) * exp(-alpha2 * r2)) *
+                       r13inv;
+                }
+            }
+
+          setup_D6(ADD, D6, dxyz, g3, g4, g5, g6);
+        }
+
+  for(int nx = -nxmax; nx <= nxmax; nx++)
+    for(int ny = -nymax; ny <= nymax; ny++)
+      for(int nz = -nzmax; nz <= nzmax; nz++)
+        {
+          double kx = (2.0 * M_PI * LONG_X) * nx;
+          double ky = (2.0 * M_PI * LONG_Y) * ny;
+          double kz = (2.0 * M_PI * LONG_Z) * nz;
+          double k2 = kx * kx + ky * ky + kz * kz;
+
+          if(k2 > 0)
+            {
+              double kdotx = (x * kx + y * ky + z * kz);
+              double val   = 4.0 * M_PI * (LONG_X * LONG_Y * LONG_Z) / k2 * exp(-k2 / (4.0 * alpha2)) * cos(kdotx);
+
+              vector<double> kxyz(kx, ky, kz);
+
+              D6 += (val * kxyz) % (kxyz % (kxyz % ((kxyz % (kxyz % kxyz)))));
+            }
+        }
+
+  return D6;
+}
+
+symtensor7<double> ewald::ewald_D7(double x, double y, double z)
+{
+  static int printed = 0;
+
+#ifdef GRAVITY_TALLBOX
+  Terminate("GRAVITY_TALLBOX is not implemented for MULTIPOLE_ORDER >= 4");
+#endif
+
+  symtensor7<double> D7 = 0.0;
+
+  double leff   = pow((1.0 / LONG_X) * (1.0 / LONG_Y) * (1.0 / LONG_Z), 1.0 / 3);
+  double alpha  = 2.0 / leff;
+  double alpha2 = alpha * alpha;
+
+  int qxmax = (int)(8.0 * LONG_X / alpha + 0.5);
+  int qymax = (int)(8.0 * LONG_Y / alpha + 0.5);
+  int qzmax = (int)(8.0 * LONG_Z / alpha + 0.5);
+
+  int nxmax = (int)(2.0 * alpha / LONG_X + 0.5);
+  int nymax = (int)(2.0 * alpha / LONG_Y + 0.5);
+  int nzmax = (int)(2.0 * alpha / LONG_Z + 0.5);
+
+  if(printed == 0)
+    {
+      mpi_printf("EWALD: D7 table: qxmax=%d qymax=%d qzmax=%d   nxmax=%d nymax=%d nzmax=%d\n", qxmax, qymax, qzmax, nxmax, nymax,
+                 nzmax);
+      printed = 1;
+    }
+
+  for(int nx = -qxmax; nx <= qxmax; nx++)
+    for(int ny = -qymax; ny <= qymax; ny++)
+      for(int nz = -qzmax; nz <= qzmax; nz++)
+        {
+          double dx = x - nx * (1.0 / LONG_X);
+          double dy = y - ny * (1.0 / LONG_Y);
+          double dz = z - nz * (1.0 / LONG_Z);
+
+          vector<double> dxyz(dx, dy, dz);
+
+          double r2 = dx * dx + dy * dy + dz * dz;
+          double r  = sqrt(r2);
+
+          double rinv   = (r > 0) ? 1.0 / r : 0.0;
+          double r2inv  = rinv * rinv;
+          double r3inv  = r2inv * rinv;
+          double r4inv  = r2inv * r2inv;
+          double r5inv  = r2inv * r3inv;
+          double r7inv  = r3inv * r4inv;
+          double r9inv  = r4inv * r5inv;
+          double r11inv = r4inv * r7inv;
+          double r13inv = r4inv * r9inv;
+          double r15inv = r4inv * r11inv;
+
+          double g4, g5, g6, g7;
+
+          if(nx != 0 || ny != 0 || nz != 0)
+            {
+              g4 = -(105.0 * erfc(alpha * r) +
+                     (210.0 * alpha * r + 140.0 * pow(alpha * r, 3) + 56.0 * pow(alpha * r, 5) + 16.0 * pow(alpha * r, 7)) /
+                         sqrt(M_PI) * exp(-alpha2 * r2)) *
+                   r9inv;
+
+              g5 = (945.0 * erfc(alpha * r) + (1890.0 * alpha * r + 1260.0 * pow(alpha * r, 3) + 504.0 * pow(alpha * r, 5) +
+                                               144.0 * pow(alpha * r, 7) + 32.0 * pow(alpha * r, 9)) /
+                                                  sqrt(M_PI) * exp(-alpha2 * r2)) *
+                   r11inv;
+
+              g6 = (-10395.0 * erfc(alpha * r) -
+                    2.0 *
+                        (10395.0 * alpha * r + 6930.0 * pow(alpha * r, 3) + 2772.0 * pow(alpha * r, 5) + 792.0 * pow(alpha * r, 7) +
+                         176.0 * pow(alpha * r, 9) + 32.0 * pow(alpha * r, 11)) /
+                        sqrt(M_PI) * exp(-alpha2 * r2)) *
+                   r13inv;
+
+              g7 =
+                  (135135.0 * erfc(alpha * r) + 2.0 *
+                                                    (135135.0 * alpha * r + 90090.0 * pow(alpha * r, 3) + 36036.0 * pow(alpha * r, 5) +
+                                                     10296.0 * pow(alpha * r, 7) + 2288.0 * pow(alpha * r, 9) +
+                                                     416.0 * pow(alpha * r, 11) + 64.0 * pow(alpha * r, 13)) /
+                                                    sqrt(M_PI) * exp(-alpha2 * r2)) *
+                  r15inv;
+            }
+          else
+            {
+              /* we add the 1/r term here to the (0|0|0) entry, followed by differentiation, and the limit r->0 to obtain accurate
+               * results at the origin
+               */
+
+              /* Note, for small r:
+               *
+               *   [1 - erfc(a r)]/r  =  erf(a r)/r  =  2 a/sqrt(pi) * [ 1 - (a r)^2/3 + (a r)^4 / 10 - (a r)^6 / 42 + (a r)^8 / 216 - ...]
+               *
+               *   Hence for r = 0:
+               *
+               *   g0 =  2      * alpha    / sqrt(pi)
+               *   g1 = -4/3    * alpha^3  / sqrt(pi)
+               *   g2 =  8/5    * alpha^5  / sqrt(pi)
+               *   g3 = -16/7   * alpha^7  / sqrt(pi)
+               *   g4 =  32/9   * alpha^9  / sqrt(pi)
+               *   g5 = -64/11  * alpha^11 / sqrt(pi)
+               *   g6 =  128/13 * alpha^13 / sqrt(pi)
+               *   g7 = -256/15 * alpha^15 / sqrt(pi)
+               */
+
+              if((alpha * r) < 0.5)
+                {
+                  g4 = 32.0 * pow(alpha, 9) / sqrt(M_PI) *
+                       (1.0 / 9.0 - pow(alpha * r, 2) / 11.0 + pow(alpha * r, 4) / 26.0 - pow(alpha * r, 6) / 90.0 +
+                        pow(alpha * r, 8) / 408.0 - pow(alpha * r, 10) / 2280.0);
+
+                  g5 = 64.0 * pow(alpha, 11) / sqrt(M_PI) *
+                       (-1.0 / 11.0 + pow(alpha * r, 2) / 13.0 - pow(alpha * r, 4) / 30.0 + pow(alpha * r, 6) / 102.0 -
+                        pow(alpha * r, 8) / 456.0 + pow(alpha * r, 10) / 2520.0);
+
+                  g6 = 128.0 * pow(alpha, 13) / sqrt(M_PI) *
+                       (1.0 / 13.0 - pow(alpha * r, 2) / 15.0 + pow(alpha * r, 4) / 34.0 - pow(alpha * r, 6) / 114.0 +
+                        pow(alpha * r, 8) / 504.0 - pow(alpha * r, 10) / 2760.0);
+
+                  g7 = 256.0 * pow(alpha, 15) / sqrt(M_PI) *
+                       (-1.0 / 15.0 + pow(alpha * r, 2) / 17.0 - pow(alpha * r, 4) / 38.0 + pow(alpha * r, 6) / 126.0 -
+                        pow(alpha * r, 8) / 552.0 + pow(alpha * r, 10) / 3000.0);
+                }
+              else
+                {
+                  g4 = (105.0 * erf(alpha * r) -
+                        (210.0 * alpha * r + 140.0 * pow(alpha * r, 3) + 56.0 * pow(alpha * r, 5) + 16.0 * pow(alpha * r, 7)) /
+                            sqrt(M_PI) * exp(-alpha2 * r2)) *
+                       r9inv;
+
+                  g5 = (-945.0 * erf(alpha * r) + (1890.0 * alpha * r + 1260.0 * pow(alpha * r, 3) + 504.0 * pow(alpha * r, 5) +
+                                                   144.0 * pow(alpha * r, 7) + 32.0 * pow(alpha * r, 9)) /
+                                                      sqrt(M_PI) * exp(-alpha2 * r2)) *
+                       r11inv;
+
+                  g6 = (10395.0 * erf(alpha * r) -
+                        2.0 *
+                            (10395.0 * alpha * r + 6930.0 * pow(alpha * r, 3) + 2772.0 * pow(alpha * r, 5) +
+                             792.0 * pow(alpha * r, 7) + 176.0 * pow(alpha * r, 9) + 32.0 * pow(alpha * r, 11)) /
+                            sqrt(M_PI) * exp(-alpha2 * r2)) *
+                       r13inv;
+
+                  g7 = (-135135.0 * erf(alpha * r) +
+                        2.0 *
+                            (135135.0 * alpha * r + 90090.0 * pow(alpha * r, 3) + 36036.0 * pow(alpha * r, 5) +
+                             10296.0 * pow(alpha * r, 7) + 2288.0 * pow(alpha * r, 9) + 416.0 * pow(alpha * r, 11) +
+                             64.0 * pow(alpha * r, 13)) /
+                            sqrt(M_PI) * exp(-alpha2 * r2)) *
+                       r15inv;
+                }
+            }
+
+          setup_D7(ADD, D7, dxyz, g4, g5, g6, g7);
+        }
+
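+  /* reciprocal-space contribution: as in ewald_D6(), but with one more derivative, i.e. -sin(k.x) instead of cos(k.x),
+   * and a 7-fold outer product of k */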
+  for(int nx = -nxmax; nx <= nxmax; nx++)
+    for(int ny = -nymax; ny <= nymax; ny++)
+      for(int nz = -nzmax; nz <= nzmax; nz++)
+        {
+          double kx = (2.0 * M_PI * LONG_X) * nx;
+          double ky = (2.0 * M_PI * LONG_Y) * ny;
+          double kz = (2.0 * M_PI * LONG_Z) * nz;
+          double k2 = kx * kx + ky * ky + kz * kz;
+
+          if(k2 > 0)
+            {
+              double kdotx = (x * kx + y * ky + z * kz);
+              double val   = -4.0 * M_PI * (LONG_X * LONG_Y * LONG_Z) / k2 * exp(-k2 / (4.0 * alpha2)) * sin(kdotx);
+
+              vector<double> kxyz(kx, ky, kz);
+
+              D7 += (val * kxyz) % (kxyz % (kxyz % (kxyz % (kxyz % (kxyz % kxyz)))));
+            }
+        }
+
+  return D7;
+}
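+
+/* A minimal consistency sketch (illustrative only, therefore not compiled in): the truncated Taylor series quoted
+ * in the small-r comments above should closely match the closed-form expression erf(a r)/r for alpha*r below the
+ * switch-over value of 0.5.  The function name and the printf diagnostics are hypothetical.
+ */
+#if 0
+static void sketch_check_small_r_series(double alpha, double r)
+{
+  double ar = alpha * r;
+
+  /* truncated Taylor expansion of erf(a r)/r, as used for the g_n limits near the origin */
+  double series = 2.0 * alpha / sqrt(M_PI) * (1.0 - pow(ar, 2) / 3.0 + pow(ar, 4) / 10.0 - pow(ar, 6) / 42.0 + pow(ar, 8) / 216.0);
+
+  double exact = erf(ar) / r;
+
+  printf("alpha*r=%g   series=%g   exact=%g   rel.diff=%g\n", ar, series, exact, fabs(series / exact - 1.0));
+}
+#endif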
diff --git a/src/gravity/ewald.h b/src/gravity/ewald.h
new file mode 100644
index 0000000000000000000000000000000000000000..77e4317f76c518fc6e4446ead59ffe8a0fe67b8f
--- /dev/null
+++ b/src/gravity/ewald.h
@@ -0,0 +1,129 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file ewald.h
+ *
+ *  \brief definition of class that implements the functionality needed for Ewald corrections
+ */
+
+#ifndef EWALD_H
+#define EWALD_H
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../data/symtensors.h"
+#include "../gravity/ewaldtensors.h"
+#include "../gravtree/gravtree.h"
+#include "../io/io.h"
+#include "../system/system.h"
+
+/*!
+ *
+ *  This file contains the definitions of the Ewald correction lookup table code.
+ */
+
+#if defined(GRAVITY_TALLBOX) && defined(FMM)
+#error "FMM and GRAVITY_TALLBOX cannot be used together"
+#endif
+
+#define EWLEVEL 6
+
+#define EN (1 << EWLEVEL)  //!<  Base dimension of cubical Ewald lookup table, for one octant
+
+#define ENX (DBX * EN)
+#define ENY (DBY * EN)
+#define ENZ (DBZ * EN)
+
+/*!  \brief Ewald correction functionality.
+ *
+ * This class collects all the functionality provided for Ewald table lookups.
+ */
+class ewald : public intposconvert, public io_streamcount, public setcomm
+{
+ public:
+  ewald(void) : setcomm("delayed init") {}
+
+  void ewald_init(void);
+
+  struct ewald_header
+  {
+    int resx, resy, resz, varsize, ewaldtype;
+  };
+
+  enum interpolate_options
+  {
+    POINTMASS,
+    MULTIPOLES,
+    LONGRANGE_FORCETEST
+  };
+
+  void ewald_corr(double dx, double dy, double dz, enum interpolate_options, ewald_data &fper);
+  void ewald_corr_exact(double dx, double dy, double dz, enum interpolate_options flag, ewald_data &fper);
+  void ewald_gridlookup(const MyIntPosType *p_intpos, const MyIntPosType *target_intpos, enum interpolate_options flag,
+                        ewald_data &fper);
+
+  double ewald_D0(double x, double y, double z);
+  vector<double> ewald_D1(double x, double y, double z);
+  symtensor2<double> ewald_D2(double x, double y, double z);
+  symtensor3<double> ewald_D3(double x, double y, double z);
+  symtensor4<double> ewald_D4(double x, double y, double z);
+  symtensor5<double> ewald_D5(double x, double y, double z);
+  symtensor6<double> ewald_D6(double x, double y, double z);
+  symtensor7<double> ewald_D7(double x, double y, double z);
+
+  ewaldtensor6<double> ewald_P6(void);
+  ewaldtensor8<double> ewald_P8(void);
+  ewaldtensor10<double> ewald_P10(void);
+
+ private:
+  ewald_data *Ewd;  // points to an [ENX + 1][ENY + 1][ENZ + 1] array
+
+  inline int ewd_offset(int i, int j, int k) { return (i * (ENY + 1) + j) * (ENZ + 1) + k; }
+
+  double Ewd_fac_intp[3];
+
+  /*
+   *   in D0phi we store the correction potential:
+   *
+   *      phi = 1/|x| + pi/alpha^2 - sum_q (erfc(alpha |x-q|)/|x-q|)  - 4pi/V sum_k exp(-k^2/(4alpha^2))/k^2 cos(k*x)
+   *
+   *   in D1phi we store the correction force (first derivative of correction potential)
+   *
+   *      dphi/dx_i
+   *
+   *   in D2phi we store the correction tensor (second derivatives of correction potential)
+   *
+   *      d2phi/(dx_i dx_j)
+   *
+   *   in D3phi we store the third order correction tensor (third derivatives of correction potential)
+   *
+   *      d3phi/(dx_i dx_j dx_k)
+   */
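+
+  /*
+   *   D4phi and D5phi analogously hold the fourth- and fifth-order derivative tensors of the correction potential,
+   *   as used by the interpolation routines and by the unit tests in ewald_test.cc
+   */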
+
+  int ewald_is_initialized = 0;
+
+  void test_interpolation_accuracy(void);
+
+  void ewald_interpolate(double dx, double dy, double dz, enum interpolate_options, ewald_data &fper);
+
+ public:
+#if defined(EVALPOTENTIAL) && defined(FMM) && defined(PERIODIC) && !defined(PMGRID)
+  double ewald_gridlookup_origin_D0(void) { return Ewd->D0phi; }
+#endif
+};
+
+extern ewald Ewald;
+
+#endif
diff --git a/src/gravity/ewald_test.cc b/src/gravity/ewald_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1a97647882f9eb434bfa27af3d1d22b727ba7b41
--- /dev/null
+++ b/src/gravity/ewald_test.cc
@@ -0,0 +1,1158 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file ewald_test.cc
+ *
+ *  \brief some unit test routines for the table look-up in the ewald correction
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../gravity/ewald.h"
+#include "../gravity/ewaldtensors.h"
+#include "../gravtree/gravtree.h"
+#include "../io/io.h"
+#include "../main/simulation.h"
+#include "../sort/cxxsort.h"
+#include "../system/system.h"
+
+#ifdef EWALD_TEST
+
+/*!
+ *  We here use trilinear interpolation to obtain the Ewald correction force
+ *  and potential from the precomputed tables, which contain one octant
+ *  around the target particle at the origin. The other octants are
+ *  obtained from it by exploiting the symmetry properties.
+ *
+ *  \param dx x component of the distance between the two particles
+ *  \param dy y component of the distance between the two particles
+ *  \param dz z component of the distance between the two particles
+ *  \param flag selects whether only the force and potential (POINTMASS) or also the higher derivative tensors are needed
+ *  \param fper structure that receives the interpolated correction potential and its derivative tensors
+ */
+void ewald::ewald_corr(double dx, double dy, double dz, enum interpolate_options flag, ewald_data &fper)
+{
+  if(!ewald_is_initialized)
+    Terminate("How come that Ewald tables are not initialized?");
+
+  int signx, signy, signz;
+
+  if(dx < 0)
+    {
+      dx    = -dx;
+      signx = -1;
+    }
+  else
+    signx = +1;
+
+  if(dy < 0)
+    {
+      dy    = -dy;
+      signy = -1;
+    }
+  else
+    signy = +1;
+
+  if(dz < 0)
+    {
+      dz    = -dz;
+      signz = -1;
+    }
+  else
+    signz = +1;
+
+  ewald_interpolate(dx, dy, dz, flag, fper);
+
+  /* change signs as needed */
+
+  fper.D1phi[0] *= signx;
+  fper.D1phi[1] *= signy;
+  fper.D1phi[2] *= signz;
+
+  if(flag == POINTMASS)
+    return;
+
+  fper.D2phi[qXY] *= signx * signy;
+  fper.D2phi[qXZ] *= signx * signz;
+  fper.D2phi[qYZ] *= signy * signz;
+
+  fper.D3phi[dXXX] *= signx;
+  fper.D3phi[dXXY] *= signy;
+  fper.D3phi[dXXZ] *= signz;
+  fper.D3phi[dXYY] *= signx;
+  fper.D3phi[dXYZ] *= signx * signy * signz;
+  fper.D3phi[dXZZ] *= signx;
+  fper.D3phi[dYYY] *= signy;
+  fper.D3phi[dYYZ] *= signz;
+  fper.D3phi[dYZZ] *= signy;
+  fper.D3phi[dZZZ] *= signz;
+
+  fper.D4phi[sXXXY] *= signx * signy;
+  fper.D4phi[sXYYY] *= signx * signy;
+  fper.D4phi[sXXXZ] *= signx * signz;
+  fper.D4phi[sXZZZ] *= signx * signz;
+  fper.D4phi[sYYYZ] *= signy * signz;
+  fper.D4phi[sYZZZ] *= signy * signz;
+  fper.D4phi[sXXYZ] *= signy * signz;
+  fper.D4phi[sXYYZ] *= signx * signz;
+  fper.D4phi[sXYZZ] *= signx * signy;
+
+  fper.D5phi[rXXXXX] *= signx;
+  fper.D5phi[rYYYYY] *= signy;
+  fper.D5phi[rZZZZZ] *= signz;
+
+  fper.D5phi[rXXXXY] *= signy;
+  fper.D5phi[rXXXXZ] *= signz;
+  fper.D5phi[rXYYYY] *= signx;
+  fper.D5phi[rXZZZZ] *= signx;
+  fper.D5phi[rYYYYZ] *= signz;
+  fper.D5phi[rYZZZZ] *= signy;
+
+  fper.D5phi[rXXXYY] *= signx;
+  fper.D5phi[rXXXZZ] *= signx;
+  fper.D5phi[rXXYYY] *= signy;
+  fper.D5phi[rXXZZZ] *= signz;
+  fper.D5phi[rYYYZZ] *= signy;
+  fper.D5phi[rYYZZZ] *= signz;
+
+  fper.D5phi[rXXYZZ] *= signy;
+  fper.D5phi[rXXYYZ] *= signz;
+  fper.D5phi[rXYYZZ] *= signx;
+
+  fper.D5phi[rXXXYZ] *= signx * signy * signz;
+  fper.D5phi[rXYYYZ] *= signx * signy * signz;
+  fper.D5phi[rXYZZZ] *= signx * signy * signz;
+}
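+
+/* A minimal usage sketch (illustrative only, therefore not compiled in): a caller fills an ewald_data structure
+ * through the table look-up for a given separation and then reads off the correction potential D0phi and the
+ * correction force D1phi.  The helper name and the printf output are hypothetical.
+ */
+#if 0
+static void sketch_lookup_correction(double dx, double dy, double dz)
+{
+  ewald_data fper;
+
+  /* interpolate the correction terms for the separation (dx, dy, dz); POINTMASS skips the higher tensors */
+  Ewald.ewald_corr(dx, dy, dz, ewald::POINTMASS, fper);
+
+  /* note: D0phi is only interpolated when EVALPOTENTIAL is enabled */
+  printf("correction: potential=%g   force=(%g|%g|%g)\n", fper.D0phi, fper.D1phi[0], fper.D1phi[1], fper.D1phi[2]);
+}
+#endif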
+
+void ewald::ewald_corr_exact(double dx, double dy, double dz, enum interpolate_options flag, ewald_data &fper)
+{
+  double fac = 1.0 / All.BoxSize;
+  double x   = dx * fac;
+  double y   = dy * fac;
+  double z   = dz * fac;
+
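+  /* the exact Ewald sums are evaluated for a unit box; the n-th derivative of the correction potential therefore scales with fac^(n+1) */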
+  fper.D0phi = pow(fac, 1) * ewald_D0(x, y, z);
+  fper.D1phi = pow(fac, 2) * ewald_D1(x, y, z);
+
+  if(flag == POINTMASS)
+    return;
+
+  fper.D2phi = pow(fac, 3) * ewald_D2(x, y, z);
+  fper.D3phi = pow(fac, 4) * ewald_D3(x, y, z);
+  fper.D4phi = pow(fac, 5) * ewald_D4(x, y, z);
+  fper.D5phi = pow(fac, 6) * ewald_D5(x, y, z);
+}
+
+void ewald::ewald_interpolate(double dx, double dy, double dz, enum interpolate_options flag, ewald_data &fper)
+{
+  double u = dx * Ewd_fac_intp[0];
+  int i    = (int)u;
+  if(i >= ENX)
+    i = ENX - 1;
+  u -= i;
+
+  double v = dy * Ewd_fac_intp[1];
+  int j    = (int)v;
+  if(j >= ENY)
+    j = ENY - 1;
+  v -= j;
+
+  double w = dz * Ewd_fac_intp[2];
+  int k    = (int)w;
+  if(k >= ENZ)
+    k = ENZ - 1;
+  w -= k;
+
+  double f1 = (1 - u) * (1 - v) * (1 - w);
+  double f2 = (1 - u) * (1 - v) * (w);
+  double f3 = (1 - u) * (v) * (1 - w);
+  double f4 = (1 - u) * (v) * (w);
+  double f5 = (u) * (1 - v) * (1 - w);
+  double f6 = (u) * (1 - v) * (w);
+  double f7 = (u) * (v) * (1 - w);
+  double f8 = (u) * (v) * (w);
+
+  ewald_data &C1 = Ewd[ewd_offset(i, j, k)];
+  ewald_data &C2 = Ewd[ewd_offset(i, j, k + 1)];
+  ewald_data &C3 = Ewd[ewd_offset(i, j + 1, k)];
+  ewald_data &C4 = Ewd[ewd_offset(i, j + 1, k + 1)];
+  ewald_data &C5 = Ewd[ewd_offset(i + 1, j, k)];
+  ewald_data &C6 = Ewd[ewd_offset(i + 1, j, k + 1)];
+  ewald_data &C7 = Ewd[ewd_offset(i + 1, j + 1, k)];
+  ewald_data &C8 = Ewd[ewd_offset(i + 1, j + 1, k + 1)];
+
+#ifdef EVALPOTENTIAL
+  fper.D0phi =
+      f1 * C1.D0phi + f2 * C2.D0phi + f3 * C3.D0phi + f4 * C4.D0phi + f5 * C5.D0phi + f6 * C6.D0phi + f7 * C7.D0phi + f8 * C8.D0phi;
+#endif
+  fper.D1phi =
+      f1 * C1.D1phi + f2 * C2.D1phi + f3 * C3.D1phi + f4 * C4.D1phi + f5 * C5.D1phi + f6 * C6.D1phi + f7 * C7.D1phi + f8 * C8.D1phi;
+
+  if(flag == POINTMASS)
+    return;
+
+  fper.D2phi =
+      f1 * C1.D2phi + f2 * C2.D2phi + f3 * C3.D2phi + f4 * C4.D2phi + f5 * C5.D2phi + f6 * C6.D2phi + f7 * C7.D2phi + f8 * C8.D2phi;
+
+  fper.D3phi =
+      f1 * C1.D3phi + f2 * C2.D3phi + f3 * C3.D3phi + f4 * C4.D3phi + f5 * C5.D3phi + f6 * C6.D3phi + f7 * C7.D3phi + f8 * C8.D3phi;
+
+  fper.D4phi =
+      f1 * C1.D4phi + f2 * C2.D4phi + f3 * C3.D4phi + f4 * C4.D4phi + f5 * C5.D4phi + f6 * C6.D4phi + f7 * C7.D4phi + f8 * C8.D4phi;
+
+  fper.D5phi =
+      f1 * C1.D5phi + f2 * C2.D5phi + f3 * C3.D5phi + f4 * C4.D5phi + f5 * C5.D5phi + f6 * C6.D5phi + f7 * C7.D5phi + f8 * C8.D5phi;
+}
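+
+/* The weights f1..f8 above are the standard trilinear factors (1-u or u)(1-v or v)(1-w or w); they sum to unity for
+ * any (u,v,w) in [0,1]^3.  A stand-alone sketch of the same scheme for a plain scalar table (hypothetical helper,
+ * therefore not compiled in):
+ */
+#if 0
+static double sketch_trilinear(const double tab[2][2][2], double u, double v, double w)
+{
+  return (1 - u) * (1 - v) * (1 - w) * tab[0][0][0] + (1 - u) * (1 - v) * w * tab[0][0][1] + (1 - u) * v * (1 - w) * tab[0][1][0] +
+         (1 - u) * v * w * tab[0][1][1] + u * (1 - v) * (1 - w) * tab[1][0][0] + u * (1 - v) * w * tab[1][0][1] +
+         u * v * (1 - w) * tab[1][1][0] + u * v * w * tab[1][1][1];
+}
+#endif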
+
+void ewald::test_interpolation_accuracy(void)
+{
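+  /* compare the interpolated correction tensors against the exact brute-force Ewald sums for randomly drawn
+   * separations, first through ewald_corr() and then through the integer-coordinate grid look-up, and report
+   * the maximum and mean relative errors for each derivative order */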
+  init_rng(42);
+
+  printf("\n\n");
+
+  double errsum0 = 0;
+  double errmax0 = 0;
+  double count0  = 0;
+  for(int i = 0; i < 1000; i++)
+    {
+      double x = get_random_number() - 0.5;
+      double y = get_random_number() - 0.5;
+      double z = get_random_number() - 0.5;
+
+      double r = sqrt(x * x + y * y + z * z);
+
+      double D0phi_exact = (1.0 / All.BoxSize) * ewald_D0(x, y, z);
+
+      ewald_data ew;
+      ewald_corr(x * All.BoxSize, y * All.BoxSize, z * All.BoxSize, ewald::MULTIPOLES, ew);
+
+      double err = fabs((D0phi_exact - ew.D0phi) / D0phi_exact);
+
+      errsum0 += err;
+      if(err > errmax0)
+        errmax0 = err;
+      count0++;
+
+      if(err > 0.1)
+        printf("%4d   r=%g D0_exact=%g  D0_interpol=%g  rel_error=%g\n", i, r, D0phi_exact, ew.D0phi, err);
+    }
+
+  double errsum1 = 0;
+  double errmax1 = 0;
+  double count1  = 0;
+  for(int i = 0; i < 1000; i++)
+    {
+      double x = get_random_number() - 0.5;
+      double y = get_random_number() - 0.5;
+      double z = get_random_number() - 0.5;
+
+      double r = sqrt(x * x + y * y + z * z);
+
+      vector<double> D1phi_exact = pow(All.BoxSize, -2) * ewald_D1(x, y, z);
+
+      ewald_data ew;
+      ewald_corr(x * All.BoxSize, y * All.BoxSize, z * All.BoxSize, ewald::MULTIPOLES, ew);
+
+      double norm = D1phi_exact.norm();
+
+      for(int j = 0; j < 3; j++)
+        {
+          double err = fabs(D1phi_exact[j] - ew.D1phi[j]) / norm;
+          errsum1 += err;
+          if(err > errmax1)
+            errmax1 = err;
+          count1++;
+
+          if(err > 0.1)
+            {
+              printf("%4d  r=%g bin=%d  D1_exact[%d]=%g  D1_interpol[%d]=%g  rel_error=%g\n", i, r,
+                     (int)(r * All.BoxSize * Ewd_fac_intp[0]), j, D1phi_exact[j], j, ew.D1phi[j], err);
+            }
+        }
+    }
+
+  double errsum2 = 0;
+  double errmax2 = 0;
+  double count2  = 0;
+  for(int i = 0; i < 1000; i++)
+    {
+      double x = get_random_number() - 0.5;
+      double y = get_random_number() - 0.5;
+      double z = get_random_number() - 0.5;
+
+      double r = sqrt(x * x + y * y + z * z);
+
+      symtensor2<double> D2phi_exact = pow(All.BoxSize, -3) * ewald_D2(x, y, z);
+
+      ewald_data ew;
+      ewald_corr(x * All.BoxSize, y * All.BoxSize, z * All.BoxSize, ewald::MULTIPOLES, ew);
+
+      double norm = D2phi_exact.norm();
+
+      for(int j = 0; j < 6; j++)
+        {
+          double err = fabs((D2phi_exact[j] - ew.D2phi[j]) / norm);
+          errsum2 += err;
+          if(err > errmax2)
+            errmax2 = err;
+          count2++;
+
+          if(err > 0.1)
+            printf("%4d  r=%g  D2_exact[%d]=%g  D2_interpol[%d]=%g  rel_error=%g\n", i, r, j, D2phi_exact[j], j, ew.D2phi[j], err);
+        }
+    }
+
+  double errsum3 = 0;
+  double errmax3 = 0;
+  double count3  = 0;
+  for(int i = 0; i < 1000; i++)
+    {
+      double x = get_random_number() - 0.5;
+      double y = get_random_number() - 0.5;
+      double z = get_random_number() - 0.5;
+
+      double r = sqrt(x * x + y * y + z * z);
+
+      symtensor3<double> D3phi_exact = pow(All.BoxSize, -4) * ewald_D3(x, y, z);
+
+      ewald_data ew;
+      ewald_corr(x * All.BoxSize, y * All.BoxSize, z * All.BoxSize, ewald::MULTIPOLES, ew);
+
+      double norm = D3phi_exact.norm();
+
+      for(int j = 0; j < 10; j++)
+        {
+          double err = fabs((D3phi_exact[j] - ew.D3phi[j]) / norm);
+          errsum3 += err;
+          if(err > errmax3)
+            errmax3 = err;
+          count3++;
+
+          if(err > 0.1)
+            printf("%4d   r=%g  D3_exact[%d]=%g  D3_interpol[%d]=%g  rel_error=%g\n", i, r, j, D3phi_exact[j], j, ew.D3phi[j], err);
+        }
+    }
+
+  double errsum4 = 0;
+  double errmax4 = 0;
+  double count4  = 0;
+  for(int i = 0; i < 1000; i++)
+    {
+      double x = get_random_number() - 0.5;
+      double y = get_random_number() - 0.5;
+      double z = get_random_number() - 0.5;
+
+      double r = sqrt(x * x + y * y + z * z);
+
+      symtensor4<double> D4phi_exact = pow(All.BoxSize, -5) * ewald_D4(x, y, z);
+
+      ewald_data ew;
+      ewald_corr(x * All.BoxSize, y * All.BoxSize, z * All.BoxSize, ewald::MULTIPOLES, ew);
+
+      double norm = D4phi_exact.norm();
+
+      for(int j = 0; j < 15; j++)
+        {
+          double err = fabs((D4phi_exact[j] - ew.D4phi[j]) / norm);
+          errsum4 += err;
+          if(err > errmax4)
+            errmax4 = err;
+          count4++;
+
+          if(err > 0.1)
+            printf("%4d  r=%g  D4_exact[%d]=%g  D4_interpol[%d]=%g  rel_error=%g\n", i, r, j, D4phi_exact[j], j, ew.D4phi[j], err);
+        }
+    }
+
+  double errsum5 = 0;
+  double errmax5 = 0;
+  double count5  = 0;
+  for(int i = 0; i < 1000; i++)
+    {
+      double x = get_random_number() - 0.5;
+      double y = get_random_number() - 0.5;
+      double z = get_random_number() - 0.5;
+
+      symtensor5<double> D5phi_exact = pow(All.BoxSize, -6) * ewald_D5(x, y, z);
+
+      ewald_data ew;
+      ewald_corr(x * All.BoxSize, y * All.BoxSize, z * All.BoxSize, ewald::MULTIPOLES, ew);
+
+      double norm = D5phi_exact.norm();
+
+      for(int j = 0; j < 21; j++)
+        {
+          double err = fabs((D5phi_exact[j] - ew.D5phi[j]) / norm);
+          errsum5 += err;
+          if(err > errmax5)
+            errmax5 = err;
+          count5++;
+        }
+    }
+
+  printf("\n\n");
+
+  printf("D0:   max error = %g   mean error=%g\n", errmax0, errsum0 / count0);
+  printf("D1:   max error = %g   mean error=%g\n", errmax1, errsum1 / count1);
+  printf("D2:   max error = %g   mean error=%g\n", errmax2, errsum2 / count2);
+  printf("D3:   max error = %g   mean error=%g\n", errmax3, errsum3 / count3);
+  printf("D4:   max error = %g   mean error=%g\n", errmax4, errsum4 / count4);
+  printf("D5:   max error = %g   mean error=%g\n", errmax5, errsum5 / count5);
+
+  printf("\n\n");
+
+  {
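+    /* repeat the same accuracy comparison, but now going through ewald_gridlookup() with integer coordinates */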
+    double errsum0 = 0;
+    double errmax0 = 0;
+    double count0  = 0;
+    for(int i = 0; i < 1000; i++)
+      {
+        double x = get_random_number() - 0.5;
+        double y = get_random_number() - 0.5;
+        double z = get_random_number() - 0.5;
+
+        double D0phi_exact = (1.0 / All.BoxSize) * ewald_D0(x, y, z);
+
+        ewald_data ew;
+        double posd[3] = {x * All.BoxSize, y * All.BoxSize, z * All.BoxSize};
+        MyIntPosType pos[3];
+        pos_to_signedintpos(posd, (MySignedIntPosType *)pos);
+        MyIntPosType ref[3] = {0, 0, 0};
+        ewald_gridlookup(pos, ref, ewald::MULTIPOLES, ew);
+
+        double err = fabs((D0phi_exact - ew.D0phi) / D0phi_exact);
+
+        errsum0 += err;
+        if(err > errmax0)
+          errmax0 = err;
+        count0++;
+      }
+
+    double errsum1 = 0;
+    double errmax1 = 0;
+    double count1  = 0;
+    for(int i = 0; i < 1000; i++)
+      {
+        double x = get_random_number() - 0.5;
+        double y = get_random_number() - 0.5;
+        double z = get_random_number() - 0.5;
+
+        vector<double> D1phi_exact = pow(All.BoxSize, -2) * ewald_D1(x, y, z);
+
+        ewald_data ew;
+        double posd[3] = {x * All.BoxSize, y * All.BoxSize, z * All.BoxSize};
+        MyIntPosType pos[3];
+        pos_to_signedintpos(posd, (MySignedIntPosType *)pos);
+        MyIntPosType ref[3] = {0, 0, 0};
+        ewald_gridlookup(pos, ref, ewald::MULTIPOLES, ew);
+
+        double norm = D1phi_exact.norm();
+
+        for(int j = 0; j < 3; j++)
+          {
+            double err = fabs(D1phi_exact[j] - ew.D1phi[j]) / norm;
+
+            errsum1 += err;
+            if(err > errmax1)
+              errmax1 = err;
+            count1++;
+          }
+      }
+
+    double errsum2 = 0;
+    double errmax2 = 0;
+    double count2  = 0;
+    for(int i = 0; i < 1000; i++)
+      {
+        double x = get_random_number() - 0.5;
+        double y = get_random_number() - 0.5;
+        double z = get_random_number() - 0.5;
+
+        symtensor2<double> D2phi_exact = pow(All.BoxSize, -3) * ewald_D2(x, y, z);
+
+        ewald_data ew;
+        double posd[3] = {x * All.BoxSize, y * All.BoxSize, z * All.BoxSize};
+        MyIntPosType pos[3];
+        pos_to_signedintpos(posd, (MySignedIntPosType *)pos);
+        MyIntPosType ref[3] = {0, 0, 0};
+        ewald_gridlookup(pos, ref, ewald::MULTIPOLES, ew);
+
+        double norm = D2phi_exact.norm();
+
+        for(int j = 0; j < 6; j++)
+          {
+            double err = fabs((D2phi_exact[j] - ew.D2phi[j]) / norm);
+            errsum2 += err;
+            if(err > errmax2)
+              errmax2 = err;
+            count2++;
+          }
+      }
+
+    double errsum3 = 0;
+    double errmax3 = 0;
+    double count3  = 0;
+    for(int i = 0; i < 1000; i++)
+      {
+        double x = get_random_number() - 0.5;
+        double y = get_random_number() - 0.5;
+        double z = get_random_number() - 0.5;
+
+        symtensor3<double> D3phi_exact = pow(All.BoxSize, -4) * ewald_D3(x, y, z);
+
+        ewald_data ew;
+        double posd[3] = {x * All.BoxSize, y * All.BoxSize, z * All.BoxSize};
+        MyIntPosType pos[3];
+        pos_to_signedintpos(posd, (MySignedIntPosType *)pos);
+        MyIntPosType ref[3] = {0, 0, 0};
+        ewald_gridlookup(pos, ref, ewald::MULTIPOLES, ew);
+
+        double norm = D3phi_exact.norm();
+
+        for(int j = 0; j < 10; j++)
+          {
+            double err = fabs((D3phi_exact[j] - ew.D3phi[j]) / norm);
+            errsum3 += err;
+            if(err > errmax3)
+              errmax3 = err;
+            count3++;
+          }
+      }
+
+    double errsum4 = 0;
+    double errmax4 = 0;
+    double count4  = 0;
+    for(int i = 0; i < 1000; i++)
+      {
+        double x = get_random_number() - 0.5;
+        double y = get_random_number() - 0.5;
+        double z = get_random_number() - 0.5;
+
+        symtensor4<double> D4phi_exact = pow(All.BoxSize, -5) * ewald_D4(x, y, z);
+
+        ewald_data ew;
+        double posd[3] = {x * All.BoxSize, y * All.BoxSize, z * All.BoxSize};
+        MyIntPosType pos[3];
+        pos_to_signedintpos(posd, (MySignedIntPosType *)pos);
+        MyIntPosType ref[3] = {0, 0, 0};
+        ewald_gridlookup(pos, ref, ewald::MULTIPOLES, ew);
+
+        double norm = D4phi_exact.norm();
+
+        for(int j = 0; j < 15; j++)
+          {
+            double err = fabs((D4phi_exact[j] - ew.D4phi[j]) / norm);
+            errsum4 += err;
+            if(err > errmax4)
+              errmax4 = err;
+            count4++;
+          }
+      }
+
+    double errsum5 = 0;
+    double errmax5 = 0;
+    double count5  = 0;
+    for(int i = 0; i < 1000; i++)
+      {
+        double x = get_random_number() - 0.5;
+        double y = get_random_number() - 0.5;
+        double z = get_random_number() - 0.5;
+
+        symtensor5<double> D5phi_exact = pow(All.BoxSize, -6) * ewald_D5(x, y, z);
+
+        ewald_data ew;
+        double posd[3] = {x * All.BoxSize, y * All.BoxSize, z * All.BoxSize};
+        MyIntPosType pos[3];
+        pos_to_signedintpos(posd, (MySignedIntPosType *)pos);
+        MyIntPosType ref[3] = {0, 0, 0};
+        ewald_gridlookup(pos, ref, ewald::MULTIPOLES, ew);
+
+        double norm = D5phi_exact.norm();
+
+        for(int j = 0; j < 21; j++)
+          {
+            double err = fabs((D5phi_exact[j] - ew.D5phi[j]) / norm);
+            errsum5 += err;
+            if(err > errmax5)
+              errmax5 = err;
+            count5++;
+          }
+      }
+
+    printf("Grid look-up: \n\n");
+
+    printf("D0:   max error = %g   mean error=%g\n", errmax0, errsum0 / count0);
+    printf("D1:   max error = %g   mean error=%g\n", errmax1, errsum1 / count1);
+    printf("D2:   max error = %g   mean error=%g\n", errmax2, errsum2 / count2);
+    printf("D3:   max error = %g   mean error=%g\n", errmax3, errsum3 / count3);
+    printf("D4:   max error = %g   mean error=%g\n", errmax4, errsum4 / count4);
+    printf("D5:   max error = %g   mean error=%g\n", errmax5, errsum5 / count5);
+  }
+
+  Terminate("stop");
+}
+
+ewaldtensor6<double> ewald::ewald_P6(void)
+{
+#ifdef GRAVITY_TALLBOX
+  Terminate("GRAVITY_TALLBOX is not implemented");
+#endif
+
+  ewaldtensor6<double> P6 = 0.0;
+
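+  /* accumulate the independent components XXXXXX, XXXXYY and XXYYZZ of the rank-6 derivative tensor at the origin
+   * (the remaining entries follow from cubic symmetry, cf. ewaldtensors.h): a real-space sum over lattice images
+   * followed by a reciprocal-space sum over k-vectors */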
+  double leff   = pow((1.0 / LONG_X) * (1.0 / LONG_Y) * (1.0 / LONG_Z), 1.0 / 3);
+  double alpha  = 2.0 / leff;
+  double alpha2 = alpha * alpha;
+
+  int qxmax = (int)(8.0 * LONG_X / alpha + 0.5);
+  int qymax = (int)(8.0 * LONG_Y / alpha + 0.5);
+  int qzmax = (int)(8.0 * LONG_Z / alpha + 0.5);
+
+  int nxmax = (int)(2.0 * alpha / LONG_X + 0.5);
+  int nymax = (int)(2.0 * alpha / LONG_Y + 0.5);
+  int nzmax = (int)(2.0 * alpha / LONG_Z + 0.5);
+
+  for(int nx = -qxmax; nx <= qxmax; nx++)
+    for(int ny = -qymax; ny <= qymax; ny++)
+      for(int nz = -qzmax; nz <= qzmax; nz++)
+        {
+          double dx = -nx * (1.0 / LONG_X);
+          double dy = -ny * (1.0 / LONG_Y);
+          double dz = -nz * (1.0 / LONG_Z);
+
+          vector<double> dxyz(dx, dy, dz);
+
+          double r2 = dx * dx + dy * dy + dz * dz;
+          double r  = sqrt(r2);
+
+          double rinv  = (r > 0) ? 1.0 / r : 0.0;
+          double r2inv = rinv * rinv;
+          double r3inv = r2inv * rinv;
+          double r4inv = r2inv * r2inv;
+          double r7inv = r3inv * r4inv;
+
+          if(nx != 0 || ny != 0 || nz != 0)
+            {
+              // derivatives of f(r)  = -Erfc[alpha * r] /r
+
+              double ar   = alpha * r;
+              double ar2  = ar * ar;
+              double ar3  = ar * ar * ar;
+              double ar5  = ar3 * ar * ar;
+              double ar7  = ar5 * ar * ar;
+              double ar9  = ar7 * ar * ar;
+              double ar11 = ar9 * ar * ar;
+              double xir2 = pow(dx * rinv, 2);
+              double xir4 = pow(dx * rinv, 4);
+              double xir6 = pow(dx * rinv, 6);
+
+              double yir2 = pow(dy * rinv, 2);
+              double zir2 = pow(dz * rinv, 2);
+
+              P6.XXXXXX += (450.0 * ar + 300.0 * ar3 + 120.0 * ar5 - xir2 * (9450.0 * ar + 6300.0 * ar3 + 2520.0 * ar5 + 720.0 * ar7) +
+                            xir4 * (28350.0 * ar + 18900.0 * ar3 + 7560.0 * ar5 + 2160.0 * ar7 + 480.0 * ar9) -
+                            xir6 * (20790.0 * ar + 13860.0 * ar3 + 5544.0 * ar5 + 1584.0 * ar7 + 352.0 * ar9 + 64.0 * ar11)) /
+                               sqrt(M_PI) * exp(-ar2) * r7inv +
+                           erfc(ar) * (225.0 - 4725.0 * xir2 + 14175.0 * xir4 - 10395.0 * xir6) * r7inv;
+
+              P6.XXXXYY += (90.0 * ar + 60.0 * ar3 + 24.0 * ar5 - xir2 * (1260.0 * ar + 840.0 * ar3 + 336.0 * ar5 + 96.0 * ar7) +
+                            xir4 * (1890.0 * ar + 1260.0 * ar3 + 504.0 * ar5 + 144.0 * ar7 + 32.0 * ar9) -
+                            yir2 * (630.0 * ar + 420.0 * ar3 + 168.0 * ar5 + 48.0 * ar7) +
+                            xir2 * yir2 * (11340.0 * ar + 7560.0 * ar3 + 3024.0 * ar5 + 864.0 * ar7 + 192.0 * ar9) -
+                            xir4 * yir2 * (20790.0 * ar + 13860.0 * ar3 + 5544.0 * ar5 + 1584.0 * ar7 + 352.0 * ar9 + 64.0 * ar11)) /
+                               sqrt(M_PI) * exp(-ar2) * r7inv +
+                           erfc(ar) *
+                               (45.0 - 630.0 * xir2 + 945.0 * xir4 - 315.0 * yir2 + 5670.0 * xir2 * yir2 - 10395.0 * xir4 * yir2) *
+                               r7inv;
+
+              P6.XXYYZZ +=
+                  (30.0 * ar + 20.0 * ar3 + 8.0 * ar5 - (xir2 + yir2 + zir2) * (210.0 * ar + 140.0 * ar3 + 56.0 * ar5 + 16.0 * ar7) +
+                   +(xir2 * yir2 + xir2 * zir2 + yir2 * zir2) * (1890.0 * ar + 1260.0 * ar3 + 504.0 * ar5 + 144.0 * ar7 + 32.0 * ar9) -
+                   xir2 * yir2 * zir2 * (20790.0 * ar + 13860.0 * ar3 + 5544.0 * ar5 + 1584.0 * ar7 + 352.0 * ar9 + 64.0 * ar11)) /
+                      sqrt(M_PI) * exp(-ar2) * r7inv +
+                  erfc(ar) *
+                      (15.0 - 105.0 * (xir2 + yir2 + zir2) + 945.0 * (xir2 * yir2 + xir2 * zir2 + yir2 * zir2) -
+                       10395.0 * xir2 * yir2 * zir2) *
+                      r7inv;
+            }
+          else
+            {
+              /* we add the 1/r term here to the (0|0|0) entry, followed by differentiation, and the limit r->0 to obtain accurate
+               * results at the origin
+               */
+
+              /* Note, for small r:
+               *
+               *   [1 - erfc(a r)]/r  =  erf(a r)/r  =  2 a/sqrt(pi) * [ 1 - (a r)^2/3 + (a r)^4 / 10 - (a r)^6 / 42 + (a r)^8 / 216 - ...]
+               *
+               */
+
+              P6.XXXXXX += (-240.0) * pow(alpha, 7) / (7.0 * sqrt(M_PI));
+              P6.XXXXYY += (-48.0) * pow(alpha, 7) / (7.0 * sqrt(M_PI));
+              P6.XXYYZZ += 0;
+            }
+        }
+
+  for(int nx = -nxmax; nx <= nxmax; nx++)
+    for(int ny = -nymax; ny <= nymax; ny++)
+      for(int nz = -nzmax; nz <= nzmax; nz++)
+        {
+          if(nx != 0 || ny != 0 || nz != 0)
+            {
+              double kx = (2.0 * M_PI * LONG_X) * nx;
+              double ky = (2.0 * M_PI * LONG_Y) * ny;
+              double kz = (2.0 * M_PI * LONG_Z) * nz;
+              double k2 = kx * kx + ky * ky + kz * kz;
+
+              double val = 4.0 * M_PI * (LONG_X * LONG_Y * LONG_Z) / k2 * exp(-k2 / (4.0 * alpha2));
+
+              P6.XXXXXX += val * pow(kx, 6);
+              P6.XXXXYY += val * pow(kx, 4) * pow(ky, 2);
+              P6.XXYYZZ += val * pow(kx, 2) * pow(ky, 2) * pow(kz, 2);
+            }
+        }
+
+  return P6;
+}
+
+ewaldtensor8<double> ewald::ewald_P8(void)
+{
+#ifdef GRAVITY_TALLBOX
+  Terminate("GRAVITY_TALLBOX is not implemented");
+#endif
+
+  ewaldtensor8<double> P8 = 0.0;
+
+  double leff   = pow((1.0 / LONG_X) * (1.0 / LONG_Y) * (1.0 / LONG_Z), 1.0 / 3);
+  double alpha  = 2.0 / leff;
+  double alpha2 = alpha * alpha;
+
+  int qxmax = (int)(8.0 * LONG_X / alpha + 0.5);
+  int qymax = (int)(8.0 * LONG_Y / alpha + 0.5);
+  int qzmax = (int)(8.0 * LONG_Z / alpha + 0.5);
+
+  int nxmax = (int)(2.0 * alpha / LONG_X + 0.5);
+  int nymax = (int)(2.0 * alpha / LONG_Y + 0.5);
+  int nzmax = (int)(2.0 * alpha / LONG_Z + 0.5);
+
+  for(int nx = -qxmax; nx <= qxmax; nx++)
+    for(int ny = -qymax; ny <= qymax; ny++)
+      for(int nz = -qzmax; nz <= qzmax; nz++)
+        {
+          double dx = -nx * (1.0 / LONG_X);
+          double dy = -ny * (1.0 / LONG_Y);
+          double dz = -nz * (1.0 / LONG_Z);
+
+          vector<double> dxyz(dx, dy, dz);
+
+          double r2 = dx * dx + dy * dy + dz * dz;
+          double r  = sqrt(r2);
+
+          double rinv  = (r > 0) ? 1.0 / r : 0.0;
+          double r2inv = rinv * rinv;
+          double r3inv = r2inv * rinv;
+          double r4inv = r2inv * r2inv;
+          double r5inv = r2inv * r3inv;
+          double r9inv = r4inv * r5inv;
+
+          if(nx != 0 || ny != 0 || nz != 0)
+            {
+              // derivatives of f(r)  = -Erfc[alpha * r] /r
+
+              double ar   = alpha * r;
+              double ar2  = ar * ar;
+              double ar3  = ar * ar * ar;
+              double ar5  = ar3 * ar * ar;
+              double ar7  = ar5 * ar * ar;
+              double ar9  = ar7 * ar * ar;
+              double ar11 = ar9 * ar * ar;
+              double ar13 = ar11 * ar * ar;
+              double ar15 = ar13 * ar * ar;
+              double xir2 = pow(dx * rinv, 2);
+              double xir4 = pow(dx * rinv, 4);
+              double xir6 = pow(dx * rinv, 6);
+              double xir8 = pow(dx * rinv, 8);
+
+              double yir2 = pow(dy * rinv, 2);
+              double yir4 = pow(dy * rinv, 4);
+              double zir2 = pow(dz * rinv, 2);
+
+              P8.XXXXXXXX +=
+                  (-(22050.0 * ar + 14700.0 * ar3 + 5880.0 * ar5 + 1680.0 * ar7) +
+                   xir2 * (793800.0 * ar + 529200.0 * ar3 + 211680.0 * ar5 + 60480.0 * ar7 + 13440.0 * ar9) -
+                   xir4 * (4365900.0 * ar + 2910600.0 * ar3 + 1164240.0 * ar5 + 332640.0 * ar7 + 73920.0 * ar9 + 13440.0 * ar11) +
+                   xir6 * (7567560.0 * ar + 5045040.0 * ar3 + 2018016.0 * ar5 + 576576.0 * ar7 + 128128.0 * ar9 + 23296.0 * ar11 +
+                           3584 * ar13) -
+                   xir8 * (4054050.0 * ar + 2702700.0 * ar3 + 1081080.0 * ar5 + 308880.0 * ar7 + 68640.0 * ar9 + 12480.0 * ar11 +
+                           1920.0 * ar13 + 256.0 * ar15)) /
+                      sqrt(M_PI) * exp(-ar2) * r9inv +
+                  erfc(ar) * (-11025.0 + 396900.0 * xir2 - 2182950.0 * xir4 + 3783780.0 * xir6 - 2027025.0 * xir8) * r9inv;
+
+              P8.XXXXXXYY +=
+                  (-(3150.0 * ar + 2100.0 * ar3 + 840.0 * ar5 + 240.0 * ar7) +
+                   xir2 * (85050.0 * ar + 56700.0 * ar3 + 22680.0 * ar5 + 6480.0 * ar7 + 1440.0 * ar9) -
+                   xir4 * (311850.0 * ar + 207900.0 * ar3 + 83160.0 * ar5 + 23760.0 * ar7 + 5280.0 * ar9 + 960.0 * ar11) +
+                   xir6 *
+                       (270270.0 * ar + 180180.0 * ar3 + 72072.0 * ar5 + 20592.0 * ar7 + 4576.0 * ar9 + 832.0 * ar11 + 128.0 * ar13) +
+                   yir2 * (28350.0 * ar + 18900 * ar3 + 7560.0 * ar5 + 2160 * ar7 + 480 * ar9) -
+                   xir2 * yir2 * (935550 * ar + 623700.0 * ar3 + 249480.0 * ar5 + 71280.0 * ar7 + 15840.0 * ar9 + 2880.0 * ar11) +
+                   xir4 * yir2 *
+                       (4054050.0 * ar + 2702700.0 * ar3 + 1081080.0 * ar5 + 308880.0 * ar7 + 68640.0 * ar9 + 12480.0 * ar11 +
+                        1920.0 * ar13) -
+                   xir6 * yir2 *
+                       (4054050.0 * ar + 2702700.0 * ar3 + 1081080.0 * ar5 + 308880.0 * ar7 + 68640.0 * ar9 + 12480.0 * ar11 +
+                        1920.0 * ar13 + 256.0 * ar15)) /
+                      sqrt(M_PI) * exp(-ar2) * r9inv +
+                  erfc(ar) *
+                      (-1575.0 + 42525.0 * xir2 - 155925.0 * xir4 + 135135.0 * xir6 + 14175.0 * yir2 - 467775.0 * xir2 * yir2 +
+                       2027025.0 * xir4 * yir2 - 2027025 * xir6 * yir2) *
+                      r9inv;
+
+              P8.XXXXYYYY +=
+                  (-(1890.0 * ar + 1260.0 * ar3 + 504.0 * ar5 + 144.0 * ar7) +
+                   (xir2 + yir2) * (34020.0 * ar + 22680.0 * ar3 + 9072.0 * ar5 + 2592.0 * ar7 + 576.0 * ar9) -
+                   (xir4 + yir4) * (62370.0 * ar + 41580.0 * ar3 + 16632.0 * ar5 + 4752.0 * ar7 + 1056.0 * ar9 + 192.0 * ar11) +
+                   -xir2 * yir2 * (748440.0 * ar + 498960.0 * ar3 + 199584.0 * ar5 + 57024.0 * ar7 + 12672.0 * ar9 + 2304.0 * ar11) +
+
+                   (xir4 * yir2 + xir2 * yir4) * (1621620.0 * ar + 1081080.0 * ar3 + 432432.0 * ar5 + 123552.0 * ar7 + 27456.0 * ar9 +
+                                                  4992.0 * ar11 + 768.0 * ar13) -
+                   xir4 * yir4 *
+                       (4054050.0 * ar + 2702700.0 * ar3 + 1081080.0 * ar5 + 308880.0 * ar7 + 68640.0 * ar9 + 12480.0 * ar11 +
+                        1920.0 * ar13 + 256.0 * ar15)) /
+                      sqrt(M_PI) * exp(-ar2) * r9inv +
+                  erfc(ar) *
+                      (-945.0 + 17010.0 * (xir2 + yir2) - 31185.0 * (xir4 + yir4) - 374220.0 * xir2 * yir2 +
+                       810810.0 * (xir4 * yir2 + xir2 * yir4) - 2027025.0 * xir4 * yir4) *
+                      r9inv;
+
+              P8.XXXXYYZZ += (-(630.0 * ar + 420.0 * ar3 + 168.0 * ar5 + 48.0 * ar7) +
+                             (xir2) * (11340.0 * ar + 7560.0 * ar3 + 3024.0 * ar5 + 864.0 * ar7 + 192.0 * ar9) -
+                             (xir4) * (20790.0 * ar + 13860.0 * ar3 + 5544.0 * ar5 + 1584.0 * ar7 + 352.0 * ar9 + 64.0 * ar11) +
+                             +(yir2 + zir2) * (5670.0 * ar + 3780.0 * ar3 + 1512.0 * ar5 + 432.0 * ar7 + 96.0 * ar9) -
+                             (xir2 * yir2 + xir2 * zir2) *
+                                 (124740 * ar + 83160.0 * ar3 + 33264.0 * ar5 + 9504 * ar7 + 2112.0 * ar9 + 384.0 * ar11) +
+                             (xir4 * yir2 + xir4 * zir2) * (270270.0 * ar + 180180.0 * ar3 + 72072 * ar5 + 20592.0 * ar7 +
+                                                            4576.0 * ar9 + 832.0 * ar11 + 128.0 * ar13) -
+                             yir2 * zir2 * (62370.0 * ar + 41580 * ar3 + 16632.0 * ar5 + 4752 * ar7 + 1056.0 * ar9 + 192.0 * ar11) +
+                             (xir2 * yir2 * zir2) * (1621620.0 * ar + 1081080.0 * ar3 + 432432.0 * ar5 + 123552.0 * ar7 +
+                                                     27456.0 * ar9 + 4992.0 * ar11 + 768.0 * ar13) -
+                             xir4 * yir2 * zir2 *
+                                 (4054050.0 * ar + 2702700.0 * ar3 + 1081080.0 * ar5 + 308880.0 * ar7 + 68640.0 * ar9 +
+                                  12480.0 * ar11 + 1920.0 * ar13 + 256.0 * ar15)) /
+                                sqrt(M_PI) * exp(-ar2) * r9inv +
+                            erfc(ar) *
+                                (-315.0 + 5670.0 * xir2 - 10395.0 * xir4 + 2835.0 * (yir2 + zir2) -
+                                 62370.0 * (xir2 * yir2 + xir2 * zir2) + 135135.0 * xir4 * (yir2 + zir2) - 31185.0 * yir2 * zir2 +
+                                 810810.0 * xir2 * yir2 * zir2 - 2027025 * xir4 * yir2 * zir2) *
+                                r9inv;
+            }
+          else
+            {
+              /* we add the 1/r term here to the (0|0|0) entry, followed by differentiation, and the limit r->0 to obtain accurate
+               * results at the origin
+               */
+
+              /* Note, for small r:
+               *
+               *   [1 - erfc(a r)]/r  =  erf(a r)/r  =  2 a/sqrt(pi) * [ 1 - (a r)^2/3 + (a r)^4 / 10 - (a r)^6 / 42 + (a r)^8 / 216 - ...]
+               *
+               */
+
+              P8.XXXXXXXX += 1120.0 * pow(alpha, 9) / (3.0 * sqrt(M_PI));
+              P8.XXXXXXYY += 160.0 * pow(alpha, 9) / (3.0 * sqrt(M_PI));
+              P8.XXXXYYYY += 0;
+              P8.XXXXYYZZ += 32.0 * pow(alpha, 9) / (3.0 * sqrt(M_PI));
+            }
+        }
+
+  for(int nx = -nxmax; nx <= nxmax; nx++)
+    for(int ny = -nymax; ny <= nymax; ny++)
+      for(int nz = -nzmax; nz <= nzmax; nz++)
+        {
+          if(nx != 0 || ny != 0 || nz != 0)
+            {
+              double kx = (2.0 * M_PI * LONG_X) * nx;
+              double ky = (2.0 * M_PI * LONG_Y) * ny;
+              double kz = (2.0 * M_PI * LONG_Z) * nz;
+              double k2 = kx * kx + ky * ky + kz * kz;
+
+              double val = -4.0 * M_PI * (LONG_X * LONG_Y * LONG_Z) / k2 * exp(-k2 / (4.0 * alpha2));
+
+              P8.XXXXXXXX += val * pow(kx, 8);
+              P8.XXXXXXYY += val * pow(kx, 6) * pow(ky, 2);
+              P8.XXXXYYYY += val * pow(kx, 4) * pow(ky, 4);
+              P8.XXXXYYZZ += val * pow(kx, 4) * pow(ky, 2) * pow(kz, 2);
+            }
+        }
+
+  return P8;
+}
+
+ewaldtensor10<double> ewald::ewald_P10(void)
+{
+#ifdef GRAVITY_TALLBOX
+  Terminate("GRAVITY_TALLBOX is not implemented");
+#endif
+
+  ewaldtensor10<double> P10 = 0.0;
+
+  double leff   = pow((1.0 / LONG_X) * (1.0 / LONG_Y) * (1.0 / LONG_Z), 1.0 / 3);
+  double alpha  = 2.0 / leff;
+  double alpha2 = alpha * alpha;
+
+  int qxmax = (int)(8.0 * LONG_X / alpha + 0.5);
+  int qymax = (int)(8.0 * LONG_Y / alpha + 0.5);
+  int qzmax = (int)(8.0 * LONG_Z / alpha + 0.5);
+
+  int nxmax = (int)(2.0 * alpha / LONG_X + 0.5);
+  int nymax = (int)(2.0 * alpha / LONG_Y + 0.5);
+  int nzmax = (int)(2.0 * alpha / LONG_Z + 0.5);
+
+  for(int nx = -qxmax; nx <= qxmax; nx++)
+    for(int ny = -qymax; ny <= qymax; ny++)
+      for(int nz = -qzmax; nz <= qzmax; nz++)
+        {
+          double dx = -nx * (1.0 / LONG_X);
+          double dy = -ny * (1.0 / LONG_Y);
+          double dz = -nz * (1.0 / LONG_Z);
+
+          vector<double> dxyz(dx, dy, dz);
+
+          double r2 = dx * dx + dy * dy + dz * dz;
+          double r  = sqrt(r2);
+
+          double rinv   = (r > 0) ? 1.0 / r : 0.0;
+          double r2inv  = rinv * rinv;
+          double r3inv  = r2inv * rinv;
+          double r4inv  = r2inv * r2inv;
+          double r7inv  = r3inv * r4inv;
+          double r11inv = r4inv * r7inv;
+
+          if(nx != 0 || ny != 0 || nz != 0)
+            {
+              double ar   = alpha * r;
+              double ar2  = ar * ar;
+              double ar3  = ar * ar * ar;
+              double ar5  = ar3 * ar * ar;
+              double ar7  = ar5 * ar * ar;
+              double ar9  = ar7 * ar * ar;
+              double ar11 = ar9 * ar * ar;
+              double ar13 = ar11 * ar * ar;
+              double ar15 = ar13 * ar * ar;
+              double ar17 = ar15 * ar * ar;
+              double ar19 = ar17 * ar * ar;
+
+              double xir2  = pow(dx * rinv, 2);
+              double xir4  = pow(dx * rinv, 4);
+              double xir6  = pow(dx * rinv, 6);
+              double xir8  = pow(dx * rinv, 8);
+              double xir10 = pow(dx * rinv, 10);
+
+              double yir2 = pow(dy * rinv, 2);
+              double yir4 = pow(dy * rinv, 4);
+              double zir2 = pow(dz * rinv, 2);
+
+              P10.XXXXXXXXXX +=
+                  ((1786050.0 * ar + 1190700.0 * ar3 + 476280.0 * ar5 + 136080.0 * ar7 + 30240 * ar9) -
+                   xir2 * (98232750.0 * ar + 65488500.0 * ar3 + 26195400.0 * ar5 + 7484400.0 * ar7 + 1663200.0 * ar9 + 302400 * ar11) +
+                   xir4 * (851350500.0 * ar + 567567000.0 * ar3 + 227026800.0 * ar5 + 64864800.0 * ar7 + 14414400.0 * ar9 +
+                           2620800.0 * ar11 + 403200 * ar13) -
+                   xir6 * (2554051500.0 * ar + 1702701000.0 * ar3 + 681080400.0 * ar5 + 194594400.0 * ar7 + 43243200.0 * ar9 +
+                           7862400.0 * ar11 + 1209600.0 * ar13 + 161280 * ar15) +
+                   xir8 * (3101348250.0 * ar + 2067565500.0 * ar3 + 827026200.0 * ar5 + 236293200.0 * ar7 + 52509600.0 * ar9 +
+                           9547200.0 * ar11 + 1468800.0 * ar13 + 195840.0 * ar15 + 23040 * ar17) -
+                   xir10 * (1309458150 * ar + 872972100 * ar3 + 349188840 * ar5 + 99768240 * ar7 + 22170720 * ar9 + 4031040 * ar11 +
+                            620160 * ar13 + 82688 * ar15 + 9728 * ar17 + 1024.0 * ar19)) /
+                      sqrt(M_PI) * exp(-ar2) * r11inv +
+                  erfc(ar) *
+                      (893025.0 - 49116375.0 * xir2 + 425675250.0 * xir4 - 1277025750 * xir6 + 1550674125.0 * xir8 -
+                       654729075.0 * xir10) *
+                      r11inv;
+
+              P10.XXXXXXXXYY +=
+                  ((198450.0 * ar + 132300.0 * ar3 + 52920.0 * ar5 + 15120.0 * ar7 + 3360.0 * ar9) -
+                   xir2 * (8731800.0 * ar + 5821200.0 * ar3 + 2328480.0 * ar5 + 665280.0 * ar7 + 147840.0 * ar9 + 26880.0 * ar11) +
+                   xir4 * (56756700.0 * ar + 37837800.0 * ar3 + 15135120.0 * ar5 + 4324320.0 * ar7 + 960960.0 * ar9 + 174720.0 * ar11 +
+                           26880.0 * ar13) -
+                   xir6 * (113513400.0 * ar + 75675600.0 * ar3 + 30270240.0 * ar5 + 8648640.0 * ar7 + 1921920.0 * ar9 +
+                           349440.0 * ar11 + 53760.0 * ar13 + 7168.0 * ar15) +
+                   xir8 * (68918850.0 * ar + 45945900.0 * ar3 + 18378360.0 * ar5 + 5250960.0 * ar7 + 1166880.0 * ar9 +
+                           212160.0 * ar11 + 32640.0 * ar13 + 4352.0 * ar15 + 512.0 * ar17) -
+                   yir2 * (2182950.0 * ar + 1455300.0 * ar3 + 582120.0 * ar5 + 166320.0 * ar7 + 36960.0 * ar9 + 6720.0 * ar11) +
+                   xir2 * yir2 *
+                       (113513400.0 * ar + 75675600.0 * ar3 + 30270240.0 * ar5 + 8648640.0 * ar7 + 1921920.0 * ar9 + 349440.0 * ar11 +
+                        53760.0 * ar13) -
+                   xir4 * yir2 *
+                       (851350500.0 * ar + 567567000.0 * ar3 + 227026800.0 * ar5 + 64864800.0 * ar7 + 14414400.0 * ar9 +
+                        2620800.0 * ar11 + 403200.0 * ar13 + 53760.0 * ar15) +
+                   xir6 * yir2 *
+                       (1929727800.0 * ar + 1286485200.0 * ar3 + 514594080.0 * ar5 + 147026880.0 * ar7 + 32672640.0 * ar9 +
+                        5940480.0 * ar11 + 913920.0 * ar13 + 121856.0 * ar15 + 14336.0 * ar17) -
+                   xir8 * yir2 *
+                       (1309458150.0 * ar + 872972100.0 * ar3 + 349188840.0 * ar5 + 99768240 * ar7 + 22170720 * ar9 + 4031040 * ar11 +
+                        620160 * ar13 + 82688 * ar15 + 9728 * ar17 + 1024.0 * ar19)) /
+                      sqrt(M_PI) * exp(-ar2) * r11inv +
+                  erfc(ar) *
+                      (99225.0 - 4365900.0 * xir2 + 28378350.0 * xir4 - 56756700 * xir6 + 34459425.0 * xir8 - 1091475.0 * yir2 +
+                       56756700.0 * xir2 * yir2 - 425675250.0 * xir4 * yir2 + 964863900.0 * xir6 * yir2 - 654729075.0 * xir8 * yir2) *
+                      r11inv;
+
+              P10.XXXXXXYYYY +=
+                  ((85050.0 * ar + 56700.0 * ar3 + 22680.0 * ar5 + 6480.0 * ar7 + 1440.0 * ar9) -
+                   xir2 * (2806650.0 * ar + 1871100.0 * ar3 + 748440.0 * ar5 + 213840.0 * ar7 + 47520.0 * ar9 + 8640.0 * ar11) +
+                   xir4 * (12162150.0 * ar + 8108100.0 * ar3 + 3243240.0 * ar5 + 926640.0 * ar7 + 205920.0 * ar9 + 37440.0 * ar11 +
+                           5760.0 * ar13) -
+                   xir6 * (12162150.0 * ar + 8108100.0 * ar3 + 3243240.0 * ar5 + 926640.0 * ar7 + 205920.0 * ar9 + 37440.0 * ar11 +
+                           5760.0 * ar13 + 768.0 * ar15) -
+                   yir2 * (1871100.0 * ar + 1247400.0 * ar3 + 498960.0 * ar5 + 142560.0 * ar7 + 31680.0 * ar9 + 5760.0 * ar11) +
+                   xir2 * yir2 *
+                       (72972900.0 * ar + 48648600 * ar3 + 19459440.0 * ar5 + 5559840.0 * ar7 + 1235520.0 * ar9 + 224640.0 * ar11 +
+                        34560.0 * ar13) -
+                   xir4 * yir2 *
+                       (364864500.0 * ar + 243243000.0 * ar3 + 97297200.0 * ar5 + 27799200.0 * ar7 + 6177600.0 * ar9 +
+                        1123200.0 * ar11 + 172800.0 * ar13 + 23040.0 * ar15) +
+                   xir6 * yir2 *
+                       (413513100.0 * ar + 275675400.0 * ar3 + 110270160.0 * ar5 + 31505760.0 * ar7 + 7001280.0 * ar9 +
+                        1272960.0 * ar11 + 195840.0 * ar13 + 26112.0 * ar15 + 3072.0 * ar17) +
+                   yir4 * (4054050.0 * ar + 2702700.0 * ar3 + 1081080.0 * ar5 + 308880.0 * ar7 + 68640.0 * ar9 + 12480.0 * ar11 +
+                           1920 * ar13) -
+                   xir2 * yir4 *
+                       (182432250.0 * ar + 121621500.0 * ar3 + 48648600.0 * ar5 + 13899600 * ar7 + 3088800 * ar9 + 561600 * ar11 +
+                        86400 * ar13 + 11520 * ar15) +
+                   xir4 * yir4 *
+                       (1033782750.0 * ar + 689188500.0 * ar3 + 275675400.0 * ar5 + 78764400 * ar7 + 17503200 * ar9 + 3182400 * ar11 +
+                        489600 * ar13 + 65280 * ar15 + 7680 * ar17) -
+                   xir6 * yir4 *
+                       (1309458150.0 * ar + 872972100.0 * ar3 + 349188840.0 * ar5 + 99768240.0 * ar7 + 22170720.0 * ar9 +
+                        4031040.0 * ar11 + 620160.0 * ar13 + 82688.0 * ar15 + 9728.0 * ar17 + 1024.0 * ar19)) /
+                      sqrt(M_PI) * exp(-ar2) * r11inv +
+                  erfc(ar) *
+                      (42525.0 - 1403325 * xir2 + 6081075.0 * xir4 - 6081075 * xir6 - 935550.0 * yir2 + 36486450.0 * xir2 * yir2 -
+                       182432250.0 * xir4 * yir2 + 206756550.0 * xir6 * yir2 + 2027025.0 * yir4 - 91216125.0 * xir2 * yir4 +
+                       516891375.0 * xir4 * yir4 - 654729075.0 * xir6 * yir4) *
+                      r11inv;
+
+              P10.XXXXXXYYZZ +=
+                  ((28350 * ar + 18900 * ar3 + 7560 * ar5 + 2160 * ar7 + 480 * ar9) -
+                   xir2 * (935550 * ar + 623700 * ar3 + 249480 * ar5 + 71280 * ar7 + 15840 * ar9 + 2880 * ar11) +
+                   xir4 * (4054050 * ar + 2702700 * ar3 + 1081080 * ar5 + 308880 * ar7 + 68640 * ar9 + 12480 * ar11 + 1920 * ar13) -
+                   xir6 * (4054050 * ar + 2702700 * ar3 + 1081080 * ar5 + 308880 * ar7 + 68640 * ar9 + 12480 * ar11 + 1920 * ar13 +
+                           256 * ar15) -
+                   (yir2 + zir2) * (311850 * ar + 207900 * ar3 + 83160 * ar5 + 23760 * ar7 + 5280 * ar9 + 960 * ar11) +
+                   (xir2 * yir2 + xir2 * zir2) *
+                       (12162150 * ar + 8108100 * ar3 + 3243240 * ar5 + 926640 * ar7 + 205920 * ar9 + 37440 * ar11 + 5760 * ar13) -
+                   (xir4 * yir2 + xir4 * zir2) * (60810750 * ar + 40540500 * ar3 + 16216200 * ar5 + 4633200 * ar7 + 1029600 * ar9 +
+                                                  187200 * ar11 + 28800 * ar13 + 3840 * ar15) +
+                   (xir6 * yir2 + xir6 * zir2) * (68918850 * ar + 45945900 * ar3 + 18378360 * ar5 + 5250960 * ar7 + 1166880 * ar9 +
+                                                  212160 * ar11 + 32640 * ar13 + 4352 * ar15 + 512 * ar17) +
+                   yir2 * zir2 *
+                       (4054050 * ar + 2702700 * ar3 + 1081080.0 * ar5 + 308880.0 * ar7 + 68640.0 * ar9 + 12480.0 * ar11 +
+                        1920 * ar13) -
+                   xir2 * yir2 * zir2 *
+                       (182432250.0 * ar + 121621500.0 * ar3 + 48648600.0 * ar5 + 13899600 * ar7 + 3088800 * ar9 + 561600 * ar11 +
+                        86400 * ar13 + 11520 * ar15) +
+                   xir4 * yir2 * zir2 *
+                       (1033782750.0 * ar + 689188500.0 * ar3 + 275675400.0 * ar5 + 78764400 * ar7 + 17503200 * ar9 + 3182400 * ar11 +
+                        489600 * ar13 + 65280 * ar15 + 7680 * ar17) -
+                   xir6 * yir2 * zir2 *
+                       (1309458150.0 * ar + 872972100.0 * ar3 + 349188840.0 * ar5 + 99768240.0 * ar7 + 22170720.0 * ar9 +
+                        4031040.0 * ar11 + 620160.0 * ar13 + 82688.0 * ar15 + 9728.0 * ar17 + 1024.0 * ar19)) /
+                      sqrt(M_PI) * exp(-ar2) * r11inv +
+                  erfc(ar) *
+                      (14175.0 - 467775 * xir2 + 2027025 * xir4 - 2027025 * xir6 - 155925.0 * (yir2 + zir2) +
+                       6081075.0 * xir2 * (yir2 + zir2) - 30405375.0 * xir4 * (yir2 + zir2) + 34459425.0 * xir6 * (yir2 + zir2) +
+                       2027025.0 * yir2 * zir2 - 91216125.0 * xir2 * yir2 * zir2 + 516891375.0 * xir4 * yir2 * zir2 -
+                       654729075.0 * xir6 * yir2 * zir2) *
+                      r11inv;
+
+              P10.XXXXYYYYZZ +=
+                  ((17010 * ar + 11340 * ar3 + 4536 * ar5 + 1296 * ar7 + 288 * ar9) -
+                   (xir2 + yir2) * (374220 * ar + 249480 * ar3 + 99792 * ar5 + 28512 * ar7 + 6336 * ar9 + 1152 * ar11) +
+                   (xir4 + yir4) * (810810 * ar + 540540 * ar3 + 216216 * ar5 + 61776 * ar7 + 13728 * ar9 + 2496 * ar11 + 384 * ar13) +
+                   xir2 * yir2 *
+                       (9729720 * ar + 6486480 * ar3 + 2594592 * ar5 + 741312 * ar7 + 164736 * ar9 + 29952 * ar11 + 4608 * ar13) -
+                   (xir4 * yir2 + xir2 * yir4) * (24324300 * ar + 16216200 * ar3 + 6486480 * ar5 + 1853280 * ar7 + 411840 * ar9 +
+                                                  74880 * ar11 + 11520 * ar13 + 1536 * ar15) +
+                   xir4 * yir4 *
+                       (68918850 * ar + 45945900 * ar3 + 18378360 * ar5 + 5250960 * ar7 + 1166880 * ar9 + 212160 * ar11 +
+                        32640 * ar13 + 4352 * ar15 + 512.0 * ar17) -
+                   zir2 * (187110 * ar + 124740 * ar3 + 49896 * ar5 + 14256 * ar7 + 3168 * ar9 + 576 * ar11) +
+                   (xir2 + yir2) * zir2 *
+                       (4864860 * ar + 3243240 * ar3 + 1297296 * ar5 + 370656 * ar7 + 82368 * ar9 + 14976 * ar11 + 2304 * ar13) -
+                   (xir4 + yir4) * zir2 *
+                       (12162150 * ar + 8108100 * ar3 + 3243240 * ar5 + 926640 * ar7 + 205920 * ar9 + 37440 * ar11 + 5760 * ar13 +
+                        768 * ar15) -
+                   xir2 * yir2 * zir2 *
+                       (145945800 * ar + 97297200 * ar3 + 38918880 * ar5 + 11119680 * ar7 + 2471040 * ar9 + 449280 * ar11 +
+                        69120 * ar13 + 9216 * ar15) +
+                   (xir4 * yir2 + xir2 * yir4) * zir2 *
+                       (413513100 * ar + 275675400 * ar3 + 110270160 * ar5 + 31505760 * ar7 + 7001280 * ar9 + 1272960 * ar11 +
+                        195840 * ar13 + 26112 * ar15 + 3072 * ar17) -
+                   xir4 * yir4 * zir2 *
+                       (1309458150.0 * ar + 872972100.0 * ar3 + 349188840.0 * ar5 + 99768240.0 * ar7 + 22170720.0 * ar9 +
+                        4031040.0 * ar11 + 620160.0 * ar13 + 82688.0 * ar15 + 9728.0 * ar17 + 1024.0 * ar19)) /
+                      sqrt(M_PI) * exp(-ar2) * r11inv +
+                  erfc(ar) *
+                      (8505.0 - 187110 * (xir2 + yir2) + 405405 * (xir4 + yir4) + 4864860 * xir2 * yir2 -
+                       12162150 * (xir4 * yir2 + xir2 * yir4) + 34459425 * xir4 * yir4 - 93555 * zir2 +
+                       2432430 * (xir2 + yir2) * zir2 - 6081075 * (xir4 + yir4) * zir2 - 72972900 * xir2 * yir2 * zir2 +
+                       206756550 * (xir4 * yir2 + xir2 * yir4) * zir2 - 654729075.0 * xir4 * yir4 * zir2) *
+                      r11inv;
+            }
+          else
+            {
+              /* we add the 1/r term here to the (0|0|0) entry, followed by differentiation, and the limit r->0 to obtain accurate
+               * results at the origin
+               */
+
+              /* Note, for small r:
+               *
+               *   [1 - erfc(a r)]/r  =  2 a/sqrt(pi) * [ 1 - (a r)^2/3 + (a r)^4 / 10 - (a r)^6 / 42 + (a r)^8 / 216 - ...]
+               *
+               */
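+              /* For example, the XXXXXXXXXX limit below follows from the next term of this series, -(a r)^10/1320:
+               * since d^10/dx^10 r^10 = 10! at the origin, the tenth x-derivative tends to
+               * -2 * 10!/1320 * a^11/sqrt(pi) = -60480/11 * a^11/sqrt(pi). The mixed components follow in the
+               * same way from the multinomial coefficients of (x^2+y^2+z^2)^5.
+               */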
+
+              P10.XXXXXXXXXX += (-60480.0) * pow(alpha, 11) / (11.0 * sqrt(M_PI));
+              P10.XXXXXXXXYY += (-6720.0) * pow(alpha, 11) / (11.0 * sqrt(M_PI));
+              P10.XXXXXXYYYY += (-2880.0) * pow(alpha, 11) / (11.0 * sqrt(M_PI));
+              P10.XXXXXXYYZZ += (-960.0) * pow(alpha, 11) / (11.0 * sqrt(M_PI));
+              P10.XXXXYYYYZZ += (-576.0) * pow(alpha, 11) / (11.0 * sqrt(M_PI));
+            }
+        }
+
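+  /* now add the reciprocal-space (Fourier) part of the Ewald sum: each k-vector contributes
+   * 4 pi/k^2 * exp(-k^2/(4 alpha^2)), with LONG_X*LONG_Y*LONG_Z playing the role of the inverse box
+   * volume in these units, and the tenth derivatives at the origin pick up the corresponding
+   * products of k-components */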
+  for(int nx = -nxmax; nx <= nxmax; nx++)
+    for(int ny = -nymax; ny <= nymax; ny++)
+      for(int nz = -nzmax; nz <= nzmax; nz++)
+        {
+          if(nx != 0 || ny != 0 || nz != 0)
+            {
+              double kx = (2.0 * M_PI * LONG_X) * nx;
+              double ky = (2.0 * M_PI * LONG_Y) * ny;
+              double kz = (2.0 * M_PI * LONG_Z) * nz;
+              double k2 = kx * kx + ky * ky + kz * kz;
+
+              double val = 4.0 * M_PI * (LONG_X * LONG_Y * LONG_Z) / k2 * exp(-k2 / (4.0 * alpha2));
+
+              P10.XXXXXXXXXX += val * pow(kx, 10);
+              P10.XXXXXXXXYY += val * pow(kx, 8) * pow(ky, 2);
+              P10.XXXXXXYYYY += val * pow(kx, 6) * pow(ky, 4);
+              P10.XXXXXXYYZZ += val * pow(kx, 6) * pow(ky, 2) * pow(kz, 2);
+              P10.XXXXYYYYZZ += val * pow(kx, 4) * pow(ky, 4) * pow(kz, 2);
+            }
+        }
+
+  return P10;
+}
+
+#endif
diff --git a/src/gravity/ewaldtensors.h b/src/gravity/ewaldtensors.h
new file mode 100644
index 0000000000000000000000000000000000000000..bcdcb6df30995599d6275c9e9d987f7ad5a6a20e
--- /dev/null
+++ b/src/gravity/ewaldtensors.h
@@ -0,0 +1,650 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file ewaldtensors.h
+ *
+ *  \brief defines derivative tensors with cubic symmetry for Ewald correction
+ */
+
+#ifndef GRAVITY_EWALDTENSORS_H
+#define GRAVITY_EWALDTENSORS_H
+
+#include "../data/symtensors.h"
+
+// derivative tensors for Ewald correction - they have few independent elements due to cubic symmetry
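+// (e.g. for the rank-4 tensor, cubic symmetry implies YYYY = ZZZZ = XXXX and XXZZ = YYZZ = XXYY, while all
+//  components in which a coordinate appears an odd number of times vanish, so only XXXX and XXYY are stored)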
+
+template <typename T>
+class ewaldtensor0
+{
+ public:
+  T x0;
+
+  // constructor
+  ewaldtensor0() {}
+
+  // constructor
+  ewaldtensor0(const T x) { x0 = x; }
+
+  inline ewaldtensor0 &operator+=(const ewaldtensor0 &right)
+  {
+    x0 += right.x0;
+
+    return *this;
+  }
+};
+
+template <typename T>
+class ewaldtensor2
+{
+ public:
+  T XX;
+
+  // constructor
+  ewaldtensor2() {}
+
+  // constructor
+  ewaldtensor2(const T x) { XX = x; }
+
+  inline ewaldtensor2 &operator+=(const ewaldtensor2 &right)
+  {
+    XX += right.XX;
+
+    return *this;
+  }
+};
+
+template <typename T>
+class ewaldtensor4
+{
+ public:
+  T XXXX;
+  T XXYY;
+
+  // constructor
+  ewaldtensor4() {}
+
+  // constructor
+  ewaldtensor4(const T x)
+  {
+    XXXX = x;
+    XXYY = x;
+  }
+
+  inline ewaldtensor4 &operator+=(const ewaldtensor4 &right)
+  {
+    XXXX += right.XXXX;
+    XXYY += right.XXYY;
+
+    return *this;
+  }
+};
+
+template <typename T>
+class ewaldtensor6
+{
+ public:
+  T XXXXXX;
+  T XXXXYY;
+  T XXYYZZ;
+
+  // constructor
+  ewaldtensor6() {}
+
+  // constructor
+  ewaldtensor6(const T x)
+  {
+    XXXXXX = x;
+    XXXXYY = x;
+    XXYYZZ = x;
+  }
+
+  inline ewaldtensor6 &operator+=(const ewaldtensor6 &right)
+  {
+    XXXXXX += right.XXXXXX;
+    XXXXYY += right.XXXXYY;
+    XXYYZZ += right.XXYYZZ;
+
+    return *this;
+  }
+};
+
+template <typename T>
+class ewaldtensor8
+{
+ public:
+  T XXXXXXXX;
+  T XXXXXXYY;
+  T XXXXYYYY;
+  T XXXXYYZZ;
+
+  // constructor
+  ewaldtensor8() {}
+
+  // constructor
+  ewaldtensor8(const T x)
+  {
+    XXXXXXXX = x;
+    XXXXXXYY = x;
+    XXXXYYYY = x;
+    XXXXYYZZ = x;
+  }
+
+  inline ewaldtensor8 &operator+=(const ewaldtensor8 &right)
+  {
+    XXXXXXXX += right.XXXXXXXX;
+    XXXXXXYY += right.XXXXXXYY;
+    XXXXYYYY += right.XXXXYYYY;
+    XXXXYYZZ += right.XXXXYYZZ;
+
+    return *this;
+  }
+};
+
+template <typename T>
+class ewaldtensor10
+{
+ public:
+  T XXXXXXXXXX;
+  T XXXXXXXXYY;
+  T XXXXXXYYYY;
+  T XXXXXXYYZZ;
+  T XXXXYYYYZZ;
+
+  // constructor
+  ewaldtensor10() {}
+
+  // constructor
+  ewaldtensor10(const T x)
+  {
+    XXXXXXXXXX = x;
+    XXXXXXXXYY = x;
+    XXXXXXYYYY = x;
+    XXXXXXYYZZ = x;
+    XXXXYYYYZZ = x;
+  }
+
+  inline ewaldtensor10 &operator+=(const ewaldtensor10 &right)
+  {
+    XXXXXXXXXX += right.XXXXXXXXXX;
+    XXXXXXXXYY += right.XXXXXXXXYY;
+    XXXXXXYYYY += right.XXXXXXYYYY;
+    XXXXXXYYZZ += right.XXXXXXYYZZ;
+    XXXXYYYYZZ += right.XXXXYYYYZZ;
+
+    return *this;
+  }
+};
+
+// multiply with a scalar factor
+template <typename T>
+inline ewaldtensor6<T> operator*(const double fac, const ewaldtensor6<T> &S)
+{
+  ewaldtensor6<T> res;
+
+  res.XXXXXX = fac * S.XXXXXX;
+  res.XXXXYY = fac * S.XXXXYY;
+  res.XXYYZZ = fac * S.XXYYZZ;
+
+  return res;
+}
+
+// multiply with a scalar factor
+template <typename T>
+inline ewaldtensor8<T> operator*(const double fac, const ewaldtensor8<T> &S)
+{
+  ewaldtensor8<T> res;
+
+  res.XXXXXXXX = fac * S.XXXXXXXX;
+  res.XXXXXXYY = fac * S.XXXXXXYY;
+  res.XXXXYYYY = fac * S.XXXXYYYY;
+  res.XXXXYYZZ = fac * S.XXXXYYZZ;
+
+  return res;
+}
+
+// multiply with a scalar factor
+template <typename T>
+inline ewaldtensor10<T> operator*(const double fac, const ewaldtensor10<T> &S)
+{
+  ewaldtensor10<T> res;
+
+  res.XXXXXXXXXX = fac * S.XXXXXXXXXX;
+  res.XXXXXXXXYY = fac * S.XXXXXXXXYY;
+  res.XXXXXXYYYY = fac * S.XXXXXXYYYY;
+  res.XXXXXXYYZZ = fac * S.XXXXXXYYZZ;
+  res.XXXXYYYYZZ = fac * S.XXXXYYYYZZ;
+
+  return res;
+}
+
+// contract a 2-ewaldtensor with a symmetric 2-tensor to yield a scalar
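+// (under cubic symmetry XX = YY = ZZ and the off-diagonal elements vanish, so this is just XX times the trace of D)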
+template <typename T>
+inline T operator*(const ewaldtensor2<T> &S, const symtensor2<T> &D)
+{
+  T res = S.XX * D.da[qZZ] + S.XX * D.da[qYY] + S.XX * D.da[qXX];
+
+  return res;
+}
+
+// contract a 4-ewaldtensor with a symmetric 4-tensor to yield a scalar
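+// (the integer prefactors count the distinct index permutations of each component, e.g. xxyy occurs 4!/(2!2!) = 6
+//  times in the full sum over indices)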
+template <typename T>
+inline T operator*(const ewaldtensor4<T> &S, const symtensor4<T> &D)
+{
+  T res = S.XXXX * D.da[sZZZZ] + 6.0 * S.XXYY * D.da[sYYZZ] + 6.0 * S.XXYY * D.da[sXXZZ] + S.XXXX * D.da[sYYYY] +
+          6.0 * S.XXYY * D.da[sXXYY] + S.XXXX * D.da[sXXXX];
+
+  return res;
+}
+
+// contract a 4-ewaldtensor with a symmetric 3-tensor to yield a vector
+template <typename T>
+inline vector<T> operator*(const ewaldtensor4<T> &S, const symtensor3<T> &D)
+{
+  vector<T> res;
+
+  res.da[0] = S.XXXX * D.da[dZZZ] + 3.0 * S.XXYY * D.da[dYYZ] + 3.0 * S.XXYY * D.da[dXXZ];
+
+  res.da[1] = 3.0 * S.XXYY * D.da[dYZZ] + S.XXXX * D.da[dYYY] + 3.0 * S.XXYY * D.da[dXXY];
+
+  res.da[2] = 3.0 * S.XXYY * D.da[dXZZ] + 3.0 * S.XXYY * D.da[dXYY] + S.XXXX * D.da[dXXX];
+
+  return res;
+}
+
+// contract a 6-ewaldtensor with a symmetric 5-tensor to yield a vector
+template <typename T>
+inline vector<T> operator*(const ewaldtensor6<T> &S, const symtensor5<T> &D)
+{
+  vector<T> res;
+
+  res.da[0] = S.XXXXXX * D.da[rZZZZZ] + 10.0 * S.XXXXYY * D.da[rYYZZZ] + 10.0 * S.XXXXYY * D.da[rXXZZZ] +
+              5.0 * S.XXXXYY * D.da[rYYYYZ] + 30.0 * S.XXYYZZ * D.da[rXXYYZ] + 5.0 * S.XXXXYY * D.da[rXXXXZ];
+
+  res.da[1] = 5.0 * S.XXXXYY * D.da[rYZZZZ] + 10.0 * S.XXXXYY * D.da[rYYYZZ] + 30.0 * S.XXYYZZ * D.da[rXXYZZ] +
+              S.XXXXXX * D.da[rYYYYY] + 10.0 * S.XXXXYY * D.da[rXXYYY] + 5.0 * S.XXXXYY * D.da[rXXXXY];
+
+  res.da[2] = 5.0 * S.XXXXYY * D.da[rXZZZZ] + 30.0 * S.XXYYZZ * D.da[rXYYZZ] + 10.0 * S.XXXXYY * D.da[rXXXZZ] +
+              5.0 * S.XXXXYY * D.da[rXYYYY] + 10.0 * S.XXXXYY * D.da[rXXXYY] + S.XXXXXX * D.da[rXXXXX];
+
+  return res;
+}
+
+// contract a 2-ewaldtensor with a 0-tensor to yield a 2-tensor
+template <typename T>
+inline symtensor2<T> operator*(const ewaldtensor2<T> &S, const T &Q0)
+{
+  symtensor2<T> res;
+
+  res.da[qXX] = S.XX * Q0;
+
+  res.da[qXY] = 0.0;
+
+  res.da[qXZ] = 0.0;
+
+  res.da[qYY] = S.XX * Q0;
+
+  res.da[qYZ] = 0.0;
+
+  res.da[qZZ] = S.XX * Q0;
+
+  return res;
+}
+
+// contract a 4-ewaldtensor with a symmetric 2-tensor to yield a 2-tensor
+template <typename T>
+inline symtensor2<T> operator*(const ewaldtensor4<T> &S, const symtensor2<T> &D)
+{
+  symtensor2<T> res;
+
+  res.da[qXX] = S.XXXX * D.da[qZZ] + S.XXYY * D.da[qYY] + S.XXYY * D.da[qXX];
+
+  res.da[qXY] = 2.0 * S.XXYY * D.da[qYZ];
+
+  res.da[qXZ] = 2.0 * S.XXYY * D.da[qXZ];
+
+  res.da[qYY] = S.XXYY * D.da[qZZ] + S.XXXX * D.da[qYY] + S.XXYY * D.da[qXX];
+
+  res.da[qYZ] = 2.0 * S.XXYY * D.da[qXY];
+
+  res.da[qZZ] = S.XXYY * D.da[qZZ] + S.XXYY * D.da[qYY] + S.XXXX * D.da[qXX];
+
+  return res;
+}
+
+// contract a 6-ewaldtensor with a symmetric 4-tensor to yield a 2-tensor
+template <typename T>
+inline symtensor2<T> operator*(const ewaldtensor6<T> &S, const symtensor4<T> &D)
+{
+  symtensor2<T> res;
+
+  res.da[qXX] = S.XXXXXX * D.da[sZZZZ] + 6.0 * S.XXXXYY * D.da[sYYZZ] + 6.0 * S.XXXXYY * D.da[sXXZZ] + S.XXXXYY * D.da[sYYYY] +
+                6.0 * S.XXYYZZ * D.da[sXXYY] + S.XXXXYY * D.da[sXXXX];
+
+  res.da[qXY] = 4.0 * S.XXXXYY * D.da[sYZZZ] + 4.0 * S.XXXXYY * D.da[sYYYZ] + 12.0 * S.XXYYZZ * D.da[sXXYZ];
+
+  res.da[qXZ] = 4.0 * S.XXXXYY * D.da[sXZZZ] + 12.0 * S.XXYYZZ * D.da[sXYYZ] + 4.0 * S.XXXXYY * D.da[sXXXZ];
+
+  res.da[qYY] = S.XXXXYY * D.da[sZZZZ] + 6.0 * S.XXXXYY * D.da[sYYZZ] + 6.0 * S.XXYYZZ * D.da[sXXZZ] + S.XXXXXX * D.da[sYYYY] +
+                6.0 * S.XXXXYY * D.da[sXXYY] + S.XXXXYY * D.da[sXXXX];
+
+  res.da[qYZ] = 12.0 * S.XXYYZZ * D.da[sXYZZ] + 4.0 * S.XXXXYY * D.da[sXYYY] + 4.0 * S.XXXXYY * D.da[sXXXY];
+
+  res.da[qZZ] = S.XXXXYY * D.da[sZZZZ] + 6.0 * S.XXYYZZ * D.da[sYYZZ] + 6.0 * S.XXXXYY * D.da[sXXZZ] + S.XXXXYY * D.da[sYYYY] +
+                6.0 * S.XXXXYY * D.da[sXXYY] + S.XXXXXX * D.da[sXXXX];
+
+  return res;
+}
+
+// contract a 6-ewaldtensor with a symmetric 3-tensor to yield a 3-tensor
+template <typename T>
+inline symtensor3<T> operator*(const ewaldtensor6<T> &S, const symtensor3<T> &D)
+{
+  symtensor3<T> res;
+
+  res.da[dXXX] = S.XXXXXX * D.da[dZZZ] + 3.0 * S.XXXXYY * D.da[dYYZ] + 3.0 * S.XXXXYY * D.da[dXXZ];
+
+  res.da[dXXY] = 3.0 * S.XXXXYY * D.da[dYZZ] + S.XXXXYY * D.da[dYYY] + 3.0 * S.XXYYZZ * D.da[dXXY];
+
+  res.da[dXXZ] = 3.0 * S.XXXXYY * D.da[dXZZ] + 3.0 * S.XXYYZZ * D.da[dXYY] + S.XXXXYY * D.da[dXXX];
+
+  res.da[dXYY] = S.XXXXYY * D.da[dZZZ] + 3.0 * S.XXXXYY * D.da[dYYZ] + 3.0 * S.XXYYZZ * D.da[dXXZ];
+
+  res.da[dXYZ] = 6.0 * S.XXYYZZ * D.da[dXYZ];
+
+  res.da[dXZZ] = S.XXXXYY * D.da[dZZZ] + 3.0 * S.XXYYZZ * D.da[dYYZ] + 3.0 * S.XXXXYY * D.da[dXXZ];
+
+  res.da[dYYY] = 3.0 * S.XXXXYY * D.da[dYZZ] + S.XXXXXX * D.da[dYYY] + 3.0 * S.XXXXYY * D.da[dXXY];
+
+  res.da[dYYZ] = 3.0 * S.XXYYZZ * D.da[dXZZ] + 3.0 * S.XXXXYY * D.da[dXYY] + S.XXXXYY * D.da[dXXX];
+
+  res.da[dYZZ] = 3.0 * S.XXYYZZ * D.da[dYZZ] + S.XXXXYY * D.da[dYYY] + 3.0 * S.XXXXYY * D.da[dXXY];
+
+  res.da[dZZZ] = 3.0 * S.XXXXYY * D.da[dXZZ] + 3.0 * S.XXXXYY * D.da[dXYY] + S.XXXXXX * D.da[dXXX];
+
+  return res;
+}
+
+// contract an 8-ewaldtensor with a symmetric 5-tensor to yield a 3-tensor
+template <typename T>
+inline symtensor3<T> operator*(const ewaldtensor8<T> &S, const symtensor5<T> &D)
+{
+  symtensor3<T> res;
+
+  res.da[dXXX] = S.XXXXXXXX * D.da[rZZZZZ] + 10.0 * S.XXXXXXYY * D.da[rYYZZZ] + 10.0 * S.XXXXXXYY * D.da[rXXZZZ] +
+                 5.0 * S.XXXXYYYY * D.da[rYYYYZ] + 30.0 * S.XXXXYYZZ * D.da[rXXYYZ] + 5.0 * S.XXXXYYYY * D.da[rXXXXZ];
+
+  res.da[dXXY] = 5.0 * S.XXXXXXYY * D.da[rYZZZZ] + 10.0 * S.XXXXYYYY * D.da[rYYYZZ] + 30.0 * S.XXXXYYZZ * D.da[rXXYZZ] +
+                 S.XXXXXXYY * D.da[rYYYYY] + 10.0 * S.XXXXYYZZ * D.da[rXXYYY] + 5.0 * S.XXXXYYZZ * D.da[rXXXXY];
+
+  res.da[dXXZ] = 5.0 * S.XXXXXXYY * D.da[rXZZZZ] + 30.0 * S.XXXXYYZZ * D.da[rXYYZZ] + 10.0 * S.XXXXYYYY * D.da[rXXXZZ] +
+                 5.0 * S.XXXXYYZZ * D.da[rXYYYY] + 10.0 * S.XXXXYYZZ * D.da[rXXXYY] + S.XXXXXXYY * D.da[rXXXXX];
+
+  res.da[dXYY] = S.XXXXXXYY * D.da[rZZZZZ] + 10.0 * S.XXXXYYYY * D.da[rYYZZZ] + 10.0 * S.XXXXYYZZ * D.da[rXXZZZ] +
+                 5.0 * S.XXXXXXYY * D.da[rYYYYZ] + 30.0 * S.XXXXYYZZ * D.da[rXXYYZ] + 5.0 * S.XXXXYYZZ * D.da[rXXXXZ];
+
+  res.da[dXYZ] = 20.0 * S.XXXXYYZZ * D.da[rXYZZZ] + 20.0 * S.XXXXYYZZ * D.da[rXYYYZ] + 20.0 * S.XXXXYYZZ * D.da[rXXXYZ];
+
+  res.da[dXZZ] = S.XXXXXXYY * D.da[rZZZZZ] + 10.0 * S.XXXXYYZZ * D.da[rYYZZZ] + 10.0 * S.XXXXYYYY * D.da[rXXZZZ] +
+                 5.0 * S.XXXXYYZZ * D.da[rYYYYZ] + 30.0 * S.XXXXYYZZ * D.da[rXXYYZ] + 5.0 * S.XXXXXXYY * D.da[rXXXXZ];
+
+  res.da[dYYY] = 5.0 * S.XXXXYYYY * D.da[rYZZZZ] + 10.0 * S.XXXXXXYY * D.da[rYYYZZ] + 30.0 * S.XXXXYYZZ * D.da[rXXYZZ] +
+                 S.XXXXXXXX * D.da[rYYYYY] + 10.0 * S.XXXXXXYY * D.da[rXXYYY] + 5.0 * S.XXXXYYYY * D.da[rXXXXY];
+
+  res.da[dYYZ] = 5.0 * S.XXXXYYZZ * D.da[rXZZZZ] + 30.0 * S.XXXXYYZZ * D.da[rXYYZZ] + 10.0 * S.XXXXYYZZ * D.da[rXXXZZ] +
+                 5.0 * S.XXXXXXYY * D.da[rXYYYY] + 10.0 * S.XXXXYYYY * D.da[rXXXYY] + S.XXXXXXYY * D.da[rXXXXX];
+
+  res.da[dYZZ] = 5.0 * S.XXXXYYZZ * D.da[rYZZZZ] + 10.0 * S.XXXXYYZZ * D.da[rYYYZZ] + 30.0 * S.XXXXYYZZ * D.da[rXXYZZ] +
+                 S.XXXXXXYY * D.da[rYYYYY] + 10.0 * S.XXXXYYYY * D.da[rXXYYY] + 5.0 * S.XXXXXXYY * D.da[rXXXXY];
+
+  res.da[dZZZ] = 5.0 * S.XXXXYYYY * D.da[rXZZZZ] + 30.0 * S.XXXXYYZZ * D.da[rXYYZZ] + 10.0 * S.XXXXXXYY * D.da[rXXXZZ] +
+                 5.0 * S.XXXXYYYY * D.da[rXYYYY] + 10.0 * S.XXXXXXYY * D.da[rXXXYY] + S.XXXXXXXX * D.da[rXXXXX];
+
+  return res;
+}
+
+// contract a 4-ewaldtensor with a symmetric 0-tensor to yield a 4-tensor
+template <typename T>
+inline symtensor4<T> operator*(const ewaldtensor4<T> &S, const T &Q0)
+{
+  symtensor4<T> res;
+
+  res.da[sXXXX] = S.XXXX * Q0;
+
+  res.da[sXXXY] = 0.0;
+
+  res.da[sXXXZ] = 0.0;
+
+  res.da[sXXYY] = S.XXYY * Q0;
+
+  res.da[sXXYZ] = 0.0;
+
+  res.da[sXXZZ] = S.XXYY * Q0;
+
+  res.da[sXYYY] = 0.0;
+
+  res.da[sXYYZ] = 0.0;
+
+  res.da[sXYZZ] = 0.0;
+
+  res.da[sXZZZ] = 0.0;
+
+  res.da[sYYYY] = S.XXXX * Q0;
+
+  res.da[sYYYZ] = 0.0;
+
+  res.da[sYYZZ] = S.XXYY * Q0;
+
+  res.da[sYZZZ] = 0.0;
+
+  res.da[sZZZZ] = S.XXXX * Q0;
+
+  return res;
+}
+
+// contract a 6-ewaldtensor with a symmetric 2-tensor to yield a 4-tensor
+template <typename T>
+inline symtensor4<T> operator*(const ewaldtensor6<T> &S, const symtensor2<T> &D)
+{
+  symtensor4<T> res;
+
+  res.da[sXXXX] = S.XXXXXX * D.da[qZZ] + S.XXXXYY * D.da[qYY] + S.XXXXYY * D.da[qXX];
+
+  res.da[sXXXY] = 2.0 * S.XXXXYY * D.da[qYZ];
+
+  res.da[sXXXZ] = 2.0 * S.XXXXYY * D.da[qXZ];
+
+  res.da[sXXYY] = S.XXXXYY * D.da[qZZ] + S.XXXXYY * D.da[qYY] + S.XXYYZZ * D.da[qXX];
+
+  res.da[sXXYZ] = 2.0 * S.XXYYZZ * D.da[qXY];
+
+  res.da[sXXZZ] = S.XXXXYY * D.da[qZZ] + S.XXYYZZ * D.da[qYY] + S.XXXXYY * D.da[qXX];
+
+  res.da[sXYYY] = 2.0 * S.XXXXYY * D.da[qYZ];
+
+  res.da[sXYYZ] = 2.0 * S.XXYYZZ * D.da[qXZ];
+
+  res.da[sXYZZ] = 2.0 * S.XXYYZZ * D.da[qYZ];
+
+  res.da[sXZZZ] = 2.0 * S.XXXXYY * D.da[qXZ];
+
+  res.da[sYYYY] = S.XXXXYY * D.da[qZZ] + S.XXXXXX * D.da[qYY] + S.XXXXYY * D.da[qXX];
+
+  res.da[sYYYZ] = 2.0 * S.XXXXYY * D.da[qXY];
+
+  res.da[sYYZZ] = S.XXYYZZ * D.da[qZZ] + S.XXXXYY * D.da[qYY] + S.XXXXYY * D.da[qXX];
+
+  res.da[sYZZZ] = 2.0 * S.XXXXYY * D.da[qXY];
+
+  res.da[sZZZZ] = S.XXXXYY * D.da[qZZ] + S.XXXXYY * D.da[qYY] + S.XXXXXX * D.da[qXX];
+  return res;
+}
+
+// contract an 8-ewaldtensor with a symmetric 4-tensor to yield a 4-tensor
+template <typename T>
+inline symtensor4<T> operator*(const ewaldtensor8<T> &S, const symtensor4<T> &D)
+{
+  symtensor4<T> res;
+
+  res.da[sXXXX] = S.XXXXXXXX * D.da[sZZZZ] + 6.0 * S.XXXXXXYY * D.da[sYYZZ] + 6.0 * S.XXXXXXYY * D.da[sXXZZ] +
+                  S.XXXXYYYY * D.da[sYYYY] + 6.0 * S.XXXXYYZZ * D.da[sXXYY] + S.XXXXYYYY * D.da[sXXXX];
+
+  res.da[sXXXY] = 4.0 * S.XXXXXXYY * D.da[sYZZZ] + 4.0 * S.XXXXYYYY * D.da[sYYYZ] + 12.0 * S.XXXXYYZZ * D.da[sXXYZ];
+
+  res.da[sXXXZ] = 4.0 * S.XXXXXXYY * D.da[sXZZZ] + 12.0 * S.XXXXYYZZ * D.da[sXYYZ] + 4.0 * S.XXXXYYYY * D.da[sXXXZ];
+
+  res.da[sXXYY] = S.XXXXXXYY * D.da[sZZZZ] + 6.0 * S.XXXXYYYY * D.da[sYYZZ] + 6.0 * S.XXXXYYZZ * D.da[sXXZZ] +
+                  S.XXXXXXYY * D.da[sYYYY] + 6.0 * S.XXXXYYZZ * D.da[sXXYY] + S.XXXXYYZZ * D.da[sXXXX];
+
+  res.da[sXXYZ] = 12.0 * S.XXXXYYZZ * D.da[sXYZZ] + 4.0 * S.XXXXYYZZ * D.da[sXYYY] + 4.0 * S.XXXXYYZZ * D.da[sXXXY];
+
+  res.da[sXXZZ] = S.XXXXXXYY * D.da[sZZZZ] + 6.0 * S.XXXXYYZZ * D.da[sYYZZ] + 6.0 * S.XXXXYYYY * D.da[sXXZZ] +
+                  S.XXXXYYZZ * D.da[sYYYY] + 6.0 * S.XXXXYYZZ * D.da[sXXYY] + S.XXXXXXYY * D.da[sXXXX];
+
+  res.da[sXYYY] = 4.0 * S.XXXXYYYY * D.da[sYZZZ] + 4.0 * S.XXXXXXYY * D.da[sYYYZ] + 12.0 * S.XXXXYYZZ * D.da[sXXYZ];
+
+  res.da[sXYYZ] = 4.0 * S.XXXXYYZZ * D.da[sXZZZ] + 12.0 * S.XXXXYYZZ * D.da[sXYYZ] + 4.0 * S.XXXXYYZZ * D.da[sXXXZ];
+
+  res.da[sXYZZ] = 4.0 * S.XXXXYYZZ * D.da[sYZZZ] + 4.0 * S.XXXXYYZZ * D.da[sYYYZ] + 12.0 * S.XXXXYYZZ * D.da[sXXYZ];
+
+  res.da[sXZZZ] = 4.0 * S.XXXXYYYY * D.da[sXZZZ] + 12.0 * S.XXXXYYZZ * D.da[sXYYZ] + 4.0 * S.XXXXXXYY * D.da[sXXXZ];
+
+  res.da[sYYYY] = S.XXXXYYYY * D.da[sZZZZ] + 6.0 * S.XXXXXXYY * D.da[sYYZZ] + 6.0 * S.XXXXYYZZ * D.da[sXXZZ] +
+                  S.XXXXXXXX * D.da[sYYYY] + 6.0 * S.XXXXXXYY * D.da[sXXYY] + S.XXXXYYYY * D.da[sXXXX];
+
+  res.da[sYYYZ] = 12.0 * S.XXXXYYZZ * D.da[sXYZZ] + 4.0 * S.XXXXXXYY * D.da[sXYYY] + 4.0 * S.XXXXYYYY * D.da[sXXXY];
+
+  res.da[sYYZZ] = S.XXXXYYZZ * D.da[sZZZZ] + 6.0 * S.XXXXYYZZ * D.da[sYYZZ] + 6.0 * S.XXXXYYZZ * D.da[sXXZZ] +
+                  S.XXXXXXYY * D.da[sYYYY] + 6.0 * S.XXXXYYYY * D.da[sXXYY] + S.XXXXXXYY * D.da[sXXXX];
+
+  res.da[sYZZZ] = 12.0 * S.XXXXYYZZ * D.da[sXYZZ] + 4.0 * S.XXXXYYYY * D.da[sXYYY] + 4.0 * S.XXXXXXYY * D.da[sXXXY];
+
+  res.da[sZZZZ] = S.XXXXYYYY * D.da[sZZZZ] + 6.0 * S.XXXXYYZZ * D.da[sYYZZ] + 6.0 * S.XXXXXXYY * D.da[sXXZZ] +
+                  S.XXXXYYYY * D.da[sYYYY] + 6.0 * S.XXXXXXYY * D.da[sXXYY] + S.XXXXXXXX * D.da[sXXXX];
+
+  return res;
+}
+
+// contract an 8-ewaldtensor with a symmetric 3-tensor to yield a 5-tensor
+template <typename T>
+inline symtensor5<T> operator*(const ewaldtensor8<T> &S, const symtensor3<T> &D)
+{
+  symtensor5<T> res;
+
+  res.da[rXXXXX] = S.XXXXXXXX * D.da[dZZZ] + 3.0 * S.XXXXXXYY * D.da[dYYZ] + 3.0 * S.XXXXXXYY * D.da[dXXZ];
+
+  res.da[rXXXXY] = 3.0 * S.XXXXXXYY * D.da[dYZZ] + S.XXXXYYYY * D.da[dYYY] + 3.0 * S.XXXXYYZZ * D.da[dXXY];
+
+  res.da[rXXXXZ] = 3.0 * S.XXXXXXYY * D.da[dXZZ] + 3.0 * S.XXXXYYZZ * D.da[dXYY] + S.XXXXYYYY * D.da[dXXX];
+
+  res.da[rXXXYY] = S.XXXXXXYY * D.da[dZZZ] + 3.0 * S.XXXXYYYY * D.da[dYYZ] + 3.0 * S.XXXXYYZZ * D.da[dXXZ];
+
+  res.da[rXXXYZ] = 6.0 * S.XXXXYYZZ * D.da[dXYZ];
+
+  res.da[rXXXZZ] = S.XXXXXXYY * D.da[dZZZ] + 3.0 * S.XXXXYYZZ * D.da[dYYZ] + 3.0 * S.XXXXYYYY * D.da[dXXZ];
+
+  res.da[rXXYYY] = 3.0 * S.XXXXYYYY * D.da[dYZZ] + S.XXXXXXYY * D.da[dYYY] + 3.0 * S.XXXXYYZZ * D.da[dXXY];
+
+  res.da[rXXYYZ] = 3.0 * S.XXXXYYZZ * D.da[dXZZ] + 3.0 * S.XXXXYYZZ * D.da[dXYY] + S.XXXXYYZZ * D.da[dXXX];
+
+  res.da[rXXYZZ] = 3.0 * S.XXXXYYZZ * D.da[dYZZ] + S.XXXXYYZZ * D.da[dYYY] + 3.0 * S.XXXXYYZZ * D.da[dXXY];
+
+  res.da[rXXZZZ] = 3.0 * S.XXXXYYYY * D.da[dXZZ] + 3.0 * S.XXXXYYZZ * D.da[dXYY] + S.XXXXXXYY * D.da[dXXX];
+
+  res.da[rXYYYY] = S.XXXXYYYY * D.da[dZZZ] + 3.0 * S.XXXXXXYY * D.da[dYYZ] + 3.0 * S.XXXXYYZZ * D.da[dXXZ];
+
+  res.da[rXYYYZ] = 6.0 * S.XXXXYYZZ * D.da[dXYZ];
+
+  res.da[rXYYZZ] = S.XXXXYYZZ * D.da[dZZZ] + 3.0 * S.XXXXYYZZ * D.da[dYYZ] + 3.0 * S.XXXXYYZZ * D.da[dXXZ];
+
+  res.da[rXYZZZ] = 6.0 * S.XXXXYYZZ * D.da[dXYZ];
+
+  res.da[rXZZZZ] = S.XXXXYYYY * D.da[dZZZ] + 3.0 * S.XXXXYYZZ * D.da[dYYZ] + 3.0 * S.XXXXXXYY * D.da[dXXZ];
+
+  res.da[rYYYYY] = 3.0 * S.XXXXXXYY * D.da[dYZZ] + S.XXXXXXXX * D.da[dYYY] + 3.0 * S.XXXXXXYY * D.da[dXXY];
+
+  res.da[rYYYYZ] = 3.0 * S.XXXXYYZZ * D.da[dXZZ] + 3.0 * S.XXXXXXYY * D.da[dXYY] + S.XXXXYYYY * D.da[dXXX];
+
+  res.da[rYYYZZ] = 3.0 * S.XXXXYYZZ * D.da[dYZZ] + S.XXXXXXYY * D.da[dYYY] + 3.0 * S.XXXXYYYY * D.da[dXXY];
+
+  res.da[rYYZZZ] = 3.0 * S.XXXXYYZZ * D.da[dXZZ] + 3.0 * S.XXXXYYYY * D.da[dXYY] + S.XXXXXXYY * D.da[dXXX];
+
+  res.da[rYZZZZ] = 3.0 * S.XXXXYYZZ * D.da[dYZZ] + S.XXXXYYYY * D.da[dYYY] + 3.0 * S.XXXXXXYY * D.da[dXXY];
+
+  res.da[rZZZZZ] = 3.0 * S.XXXXXXYY * D.da[dXZZ] + 3.0 * S.XXXXXXYY * D.da[dXYY] + S.XXXXXXXX * D.da[dXXX];
+
+  return res;
+}
+
+// contract a 10-ewaldtensor with a symmetric 5-tensor to yield a 5-tensor
+template <typename T>
+inline symtensor5<T> operator*(const ewaldtensor10<T> &S, const symtensor5<T> &D)
+{
+  symtensor5<T> res;
+
+  res.da[rXXXXX] = S.XXXXXXXXXX * D.da[rZZZZZ] + 10.0 * S.XXXXXXXXYY * D.da[rYYZZZ] + 10.0 * S.XXXXXXXXYY * D.da[rXXZZZ] +
+                   5.0 * S.XXXXXXYYYY * D.da[rYYYYZ] + 30.0 * S.XXXXXXYYZZ * D.da[rXXYYZ] + 5.0 * S.XXXXXXYYYY * D.da[rXXXXZ];
+
+  res.da[rXXXXY] = 5.0 * S.XXXXXXXXYY * D.da[rYZZZZ] + 10.0 * S.XXXXXXYYYY * D.da[rYYYZZ] + 30.0 * S.XXXXXXYYZZ * D.da[rXXYZZ] +
+                   S.XXXXXXYYYY * D.da[rYYYYY] + 10.0 * S.XXXXYYYYZZ * D.da[rXXYYY] + 5.0 * S.XXXXYYYYZZ * D.da[rXXXXY];
+
+  res.da[rXXXXZ] = 5.0 * S.XXXXXXXXYY * D.da[rXZZZZ] + 30.0 * S.XXXXXXYYZZ * D.da[rXYYZZ] + 10.0 * S.XXXXXXYYYY * D.da[rXXXZZ] +
+                   5.0 * S.XXXXYYYYZZ * D.da[rXYYYY] + 10.0 * S.XXXXYYYYZZ * D.da[rXXXYY] + S.XXXXXXYYYY * D.da[rXXXXX];
+
+  res.da[rXXXYY] = S.XXXXXXXXYY * D.da[rZZZZZ] + 10.0 * S.XXXXXXYYYY * D.da[rYYZZZ] + 10.0 * S.XXXXXXYYZZ * D.da[rXXZZZ] +
+                   5.0 * S.XXXXXXYYYY * D.da[rYYYYZ] + 30.0 * S.XXXXYYYYZZ * D.da[rXXYYZ] + 5.0 * S.XXXXYYYYZZ * D.da[rXXXXZ];
+
+  res.da[rXXXYZ] = 20.0 * S.XXXXXXYYZZ * D.da[rXYZZZ] + 20.0 * S.XXXXYYYYZZ * D.da[rXYYYZ] + 20.0 * S.XXXXYYYYZZ * D.da[rXXXYZ];
+
+  res.da[rXXXZZ] = S.XXXXXXXXYY * D.da[rZZZZZ] + 10.0 * S.XXXXXXYYZZ * D.da[rYYZZZ] + 10.0 * S.XXXXXXYYYY * D.da[rXXZZZ] +
+                   5.0 * S.XXXXYYYYZZ * D.da[rYYYYZ] + 30.0 * S.XXXXYYYYZZ * D.da[rXXYYZ] + 5.0 * S.XXXXXXYYYY * D.da[rXXXXZ];
+
+  res.da[rXXYYY] = 5.0 * S.XXXXXXYYYY * D.da[rYZZZZ] + 10.0 * S.XXXXXXYYYY * D.da[rYYYZZ] + 30.0 * S.XXXXYYYYZZ * D.da[rXXYZZ] +
+                   S.XXXXXXXXYY * D.da[rYYYYY] + 10.0 * S.XXXXXXYYZZ * D.da[rXXYYY] + 5.0 * S.XXXXYYYYZZ * D.da[rXXXXY];
+
+  res.da[rXXYYZ] = 5.0 * S.XXXXXXYYZZ * D.da[rXZZZZ] + 30.0 * S.XXXXYYYYZZ * D.da[rXYYZZ] + 10.0 * S.XXXXYYYYZZ * D.da[rXXXZZ] +
+                   5.0 * S.XXXXXXYYZZ * D.da[rXYYYY] + 10.0 * S.XXXXYYYYZZ * D.da[rXXXYY] + S.XXXXXXYYZZ * D.da[rXXXXX];
+
+  res.da[rXXYZZ] = 5.0 * S.XXXXXXYYZZ * D.da[rYZZZZ] + 10.0 * S.XXXXYYYYZZ * D.da[rYYYZZ] + 30.0 * S.XXXXYYYYZZ * D.da[rXXYZZ] +
+                   S.XXXXXXYYZZ * D.da[rYYYYY] + 10.0 * S.XXXXYYYYZZ * D.da[rXXYYY] + 5.0 * S.XXXXXXYYZZ * D.da[rXXXXY];
+
+  res.da[rXXZZZ] = 5.0 * S.XXXXXXYYYY * D.da[rXZZZZ] + 30.0 * S.XXXXYYYYZZ * D.da[rXYYZZ] + 10.0 * S.XXXXXXYYYY * D.da[rXXXZZ] +
+                   5.0 * S.XXXXYYYYZZ * D.da[rXYYYY] + 10.0 * S.XXXXXXYYZZ * D.da[rXXXYY] + S.XXXXXXXXYY * D.da[rXXXXX];
+
+  res.da[rXYYYY] = S.XXXXXXYYYY * D.da[rZZZZZ] + 10.0 * S.XXXXXXYYYY * D.da[rYYZZZ] + 10.0 * S.XXXXYYYYZZ * D.da[rXXZZZ] +
+                   5.0 * S.XXXXXXXXYY * D.da[rYYYYZ] + 30.0 * S.XXXXXXYYZZ * D.da[rXXYYZ] + 5.0 * S.XXXXYYYYZZ * D.da[rXXXXZ];
+
+  res.da[rXYYYZ] = 20.0 * S.XXXXYYYYZZ * D.da[rXYZZZ] + 20.0 * S.XXXXXXYYZZ * D.da[rXYYYZ] + 20.0 * S.XXXXYYYYZZ * D.da[rXXXYZ];
+
+  res.da[rXYYZZ] = S.XXXXXXYYZZ * D.da[rZZZZZ] + 10.0 * S.XXXXYYYYZZ * D.da[rYYZZZ] + 10.0 * S.XXXXYYYYZZ * D.da[rXXZZZ] +
+                   5.0 * S.XXXXXXYYZZ * D.da[rYYYYZ] + 30.0 * S.XXXXYYYYZZ * D.da[rXXYYZ] + 5.0 * S.XXXXXXYYZZ * D.da[rXXXXZ];
+
+  res.da[rXYZZZ] = 20.0 * S.XXXXYYYYZZ * D.da[rXYZZZ] + 20.0 * S.XXXXYYYYZZ * D.da[rXYYYZ] + 20.0 * S.XXXXXXYYZZ * D.da[rXXXYZ];
+
+  res.da[rXZZZZ] = S.XXXXXXYYYY * D.da[rZZZZZ] + 10.0 * S.XXXXYYYYZZ * D.da[rYYZZZ] + 10.0 * S.XXXXXXYYYY * D.da[rXXZZZ] +
+                   5.0 * S.XXXXYYYYZZ * D.da[rYYYYZ] + 30.0 * S.XXXXXXYYZZ * D.da[rXXYYZ] + 5.0 * S.XXXXXXXXYY * D.da[rXXXXZ];
+
+  res.da[rYYYYY] = 5.0 * S.XXXXXXYYYY * D.da[rYZZZZ] + 10.0 * S.XXXXXXXXYY * D.da[rYYYZZ] + 30.0 * S.XXXXXXYYZZ * D.da[rXXYZZ] +
+                   S.XXXXXXXXXX * D.da[rYYYYY] + 10.0 * S.XXXXXXXXYY * D.da[rXXYYY] + 5.0 * S.XXXXXXYYYY * D.da[rXXXXY];
+
+  res.da[rYYYYZ] = 5.0 * S.XXXXYYYYZZ * D.da[rXZZZZ] + 30.0 * S.XXXXXXYYZZ * D.da[rXYYZZ] + 10.0 * S.XXXXYYYYZZ * D.da[rXXXZZ] +
+                   5.0 * S.XXXXXXXXYY * D.da[rXYYYY] + 10.0 * S.XXXXXXYYYY * D.da[rXXXYY] + S.XXXXXXYYYY * D.da[rXXXXX];
+
+  res.da[rYYYZZ] = 5.0 * S.XXXXYYYYZZ * D.da[rYZZZZ] + 10.0 * S.XXXXXXYYZZ * D.da[rYYYZZ] + 30.0 * S.XXXXYYYYZZ * D.da[rXXYZZ] +
+                   S.XXXXXXXXYY * D.da[rYYYYY] + 10.0 * S.XXXXXXYYYY * D.da[rXXYYY] + 5.0 * S.XXXXXXYYYY * D.da[rXXXXY];
+
+  res.da[rYYZZZ] = 5.0 * S.XXXXYYYYZZ * D.da[rXZZZZ] + 30.0 * S.XXXXYYYYZZ * D.da[rXYYZZ] + 10.0 * S.XXXXXXYYZZ * D.da[rXXXZZ] +
+                   5.0 * S.XXXXXXYYYY * D.da[rXYYYY] + 10.0 * S.XXXXXXYYYY * D.da[rXXXYY] + S.XXXXXXXXYY * D.da[rXXXXX];
+
+  res.da[rYZZZZ] = 5.0 * S.XXXXYYYYZZ * D.da[rYZZZZ] + 10.0 * S.XXXXYYYYZZ * D.da[rYYYZZ] + 30.0 * S.XXXXXXYYZZ * D.da[rXXYZZ] +
+                   S.XXXXXXYYYY * D.da[rYYYYY] + 10.0 * S.XXXXXXYYYY * D.da[rXXYYY] + 5.0 * S.XXXXXXXXYY * D.da[rXXXXY];
+
+  res.da[rZZZZZ] = 5.0 * S.XXXXXXYYYY * D.da[rXZZZZ] + 30.0 * S.XXXXXXYYZZ * D.da[rXYYZZ] + 10.0 * S.XXXXXXXXYY * D.da[rXXXZZ] +
+                   5.0 * S.XXXXXXYYYY * D.da[rXYYYY] + 10.0 * S.XXXXXXXXYY * D.da[rXXXYY] + S.XXXXXXXXXX * D.da[rXXXXX];
+
+  return res;
+}
+
+#endif
diff --git a/src/gravity/grav_direct.cc b/src/gravity/grav_direct.cc
new file mode 100644
index 0000000000000000000000000000000000000000..607a833eb1ef6b23a833c68bfd47a0896cb73b8b
--- /dev/null
+++ b/src/gravity/grav_direct.cc
@@ -0,0 +1,325 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file grav_direct.cc
+ *
+ *  \brief calculates forces through direct summation
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdlib.h>
+#include <string.h>
+#include <algorithm>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../gravtree/gravtree.h"
+#include "../logs/logs.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../pm/pm.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+
+#ifdef ALLOW_DIRECT_SUMMATION
+
+/*! \brief This function computes the gravitational forces for all active particles through direct summation.
+ *
+ */
+template <>
+void gravtree<simparticles>::gravity_direct(simparticles *Sp, domain<simparticles> *D, int timebin)
+{
+  TIMER_START(CPU_TREEDIRECT);
+
+  D->mpi_printf("GRAVDIRECT: direct summation.  (presently allocated=%g MB)\n", Mem.getAllocatedBytesInMB());
+
+  double tstart = Logs.second();
+
+  int *Send_count  = (int *)Mem.mymalloc_movable(&Send_count, "Send_count", sizeof(int) * D->NTask);
+  int *Send_offset = (int *)Mem.mymalloc_movable(&Send_offset, "Send_offset", sizeof(int) * D->NTask);
+  int *Recv_count  = (int *)Mem.mymalloc_movable(&Recv_count, "Recv_count", sizeof(int) * D->NTask);
+  int *Recv_offset = (int *)Mem.mymalloc_movable(&Recv_offset, "Recv_offset", sizeof(int) * D->NTask);
+
+  struct directdata
+  {
+    MyIntPosType IntPos[3];
+    MyDouble Mass;
+    unsigned char Type;
+#if NSOFTCLASSES > 1
+    unsigned char SofteningClass;
+#endif
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+    unsigned char InsideOutsideFlag : 1;
+#endif
+  };
+  directdata *DirectDataIn, *DirectDataAll;
+
+  struct accdata
+  {
+    MyFloat Acc[3];
+#ifdef EVALPOTENTIAL
+    MyFloat Potential;
+#endif
+  };
+  accdata *DirectAccOut, *DirectAccIn;
+
+  DirectDataIn = (directdata *)Mem.mymalloc("DirectDataIn", Sp->TimeBinsGravity.NActiveParticles * sizeof(directdata));
+
+  int nforces = 0;
+
+  for(int idx = 0; idx < Sp->TimeBinsGravity.NActiveParticles; idx++)
+    {
+      int i = Sp->TimeBinsGravity.ActiveParticleList[idx];
+
+      for(int k = 0; k < 3; k++)
+        DirectDataIn[nforces].IntPos[k] = Sp->P[i].IntPos[k];
+
+      DirectDataIn[nforces].Mass = Sp->P[i].getMass();
+
+      DirectDataIn[nforces].Type = Sp->P[i].getType();
+#if NSOFTCLASSES > 1
+      DirectDataIn[nforces].SofteningClass = Sp->P[i].getSofteningClass();
+#endif
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+      DirectDataIn[nforces].InsideOutsideFlag = Sp->P[i].InsideOutsideFlag;
+#endif
+      nforces++;
+    }
+
+  MPI_Allgather(&nforces, 1, MPI_INT, Recv_count, 1, MPI_INT, D->Communicator);
+
+  int nimport    = 0;
+  Recv_offset[0] = 0;
+
+  for(int j = 0; j < D->NTask; j++)
+    {
+      nimport += Recv_count[j];
+
+      if(j > 0)
+        Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
+    }
+
+  DirectDataAll = (directdata *)Mem.mymalloc("DirectDataAll", nimport * sizeof(directdata));
+
+  for(int j = 0; j < D->NTask; j++)
+    {
+      Send_count[j]  = Recv_count[j] * sizeof(directdata);
+      Send_offset[j] = Recv_offset[j] * sizeof(directdata);
+    }
+
+  MPI_Allgatherv(DirectDataIn, nforces * sizeof(directdata), MPI_BYTE, DirectDataAll, Send_count, Send_offset, MPI_BYTE,
+                 D->Communicator);
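+
+  /* after this gather, every task holds the complete list of active particles from all tasks in DirectDataAll;
+   * Recv_offset[] marks where each task's contribution starts in this concatenated list */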
+
+  /* subdivide the work evenly */
+  int first, count;
+  subdivide_evenly(nimport, D->NTask, D->ThisTask, &first, &count);
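+  /* i.e. this task computes the forces for the contiguous chunk [first, first + count) of DirectDataAll, so
+   * that the O(N^2) pairwise work is spread evenly over the tasks, independent of where the particles live */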
+
+  DirectAccOut = (accdata *)Mem.mymalloc("DirectDataOut", count * sizeof(accdata));
+
+  /* now calculate the forces */
+  for(int i = 0; i < count; i++)
+    {
+      int target     = i + first;
+      int result_idx = i;
+
+      vector<double> acc = 0.0;
+#ifdef EVALPOTENTIAL
+      double pot = 0.0;
+#endif
+
+#if NSOFTCLASSES > 1
+      double h_i = All.ForceSoftening[DirectDataAll[target].SofteningClass];
+#else
+      double h_i = All.ForceSoftening[0];
+#endif
+
+      for(int j = 0; j < nimport; j++)
+        {
+#if NSOFTCLASSES > 1
+          double h_j = All.ForceSoftening[DirectDataAll[j].SofteningClass];
+#else
+          double h_j = All.ForceSoftening[0];
+#endif
+          double hmax = (h_j > h_i) ? h_j : h_i;
+
+          vector<double> dxyz;
+          Sp->nearest_image_intpos_to_pos(DirectDataAll[j].IntPos, DirectDataAll[target].IntPos,
+                                          dxyz.da); /* converts the integer distance to floating point */
+
+          double r2 = dxyz[0] * dxyz[0] + dxyz[1] * dxyz[1] + dxyz[2] * dxyz[2];
+
+          double mass = DirectDataAll[j].Mass;
+
+          /* now evaluate the force component */
+
+          double r = sqrt(r2);
+
+          double rinv = (r > 0) ? 1.0 / r : 0;
+
+          gravtree<simparticles>::gfactors gfac;
+
+#ifdef PMGRID
+          mesh_factors *mfp = &mf[LOW_MESH];
+#if defined(PLACEHIGHRESREGION)
+          if((DoPM & TREE_ACTIVE_CUTTOFF_HIGHRES_PM))
+            {
+              if(DirectDataAll[j].InsideOutsideFlag == FLAG_INSIDE && DirectDataAll[target].InsideOutsideFlag == FLAG_INSIDE)
+                mfp = &mf[HIGH_MESH];
+            }
+#endif
+          if((DoPM & (TREE_ACTIVE_CUTTOFF_BASE_PM + TREE_ACTIVE_CUTTOFF_HIGHRES_PM)))
+            {
+              if(modify_gfactors_pm_monopole(gfac, r, rinv, mfp))
+                continue;  // if we are outside the cut-off radius, this pair has no short-range interaction
+            }
+#endif
+          get_gfactors_monopole(gfac, r, hmax, rinv);
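+          /* gfac.fac0 and gfac.fac1 are the (softened, and with PM possibly truncated) potential and force
+           * kernels; in the unsoftened Newtonian limit fac0 -> 1/r and fac1 -> 1/r^2, as used below */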
+
+#ifdef EVALPOTENTIAL
+          pot -= mass * gfac.fac0;
+#endif
+          acc -= (mass * gfac.fac1 * rinv) * dxyz;
+
+          if(DoEwald)
+            {
+              // EWALD treatment, only done for periodic boundaries in case PM is not active
+
+              ewald_data ew;
+              Ewald.ewald_gridlookup(DirectDataAll[j].IntPos, DirectDataAll[target].IntPos, ewald::POINTMASS, ew);
+
+#ifdef EVALPOTENTIAL
+              pot += mass * ew.D0phi;
+#endif
+              acc += mass * ew.D1phi;
+            }
+        }
+
+      DirectAccOut[result_idx].Acc[0] = acc[0];
+      DirectAccOut[result_idx].Acc[1] = acc[1];
+      DirectAccOut[result_idx].Acc[2] = acc[2];
+#ifdef EVALPOTENTIAL
+      DirectAccOut[result_idx].Potential = pot;
+#endif
+    }
+
+  /* now send the forces to the right places */
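+  /* the forces were computed on an even partition of the gathered list, but each result must be returned to
+   * the task owning the particle, whose range in the gathered list is [Recv_offset[task], Recv_offset[task] +
+   * Recv_count[task]). The loop below walks both partitions in parallel and, for every overlapping interval,
+   * the computing task posts an Isend and the owning task the matching Irecv. */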
+
+  DirectAccIn = (accdata *)Mem.mymalloc("DirectDataIn", nforces * sizeof(accdata));
+
+  MPI_Request *requests = (MPI_Request *)Mem.mymalloc_movable(&requests, "requests", 2 * D->NTask * sizeof(MPI_Request));
+  int n_requests        = 0;
+
+  int recvTask = 0;
+  int sendTask = 0;
+  int send_first, send_count;
+  subdivide_evenly(nimport, D->NTask, sendTask, &send_first, &send_count);
+
+  while(recvTask < D->NTask && sendTask < D->NTask) /* go through both lists */
+    {
+      while(send_first + send_count < Recv_offset[recvTask])
+        {
+          if(sendTask >= D->NTask - 1)
+            Terminate("sendTask >= NTask  recvTask=%d sendTask=%d", recvTask, sendTask);
+
+          sendTask++;
+          subdivide_evenly(nimport, D->NTask, sendTask, &send_first, &send_count);
+        }
+
+      while(Recv_offset[recvTask] + Recv_count[recvTask] < send_first)
+        {
+          if(recvTask >= D->NTask - 1)
+            Terminate("recvTask >= NTask  recvTask=%d sendTask=%d", recvTask, sendTask);
+
+          recvTask++;
+        }
+
+      int start = std::max<int>(Recv_offset[recvTask], send_first);
+      int next  = std::min<int>(Recv_offset[recvTask] + Recv_count[recvTask], send_first + send_count);
+
+      if(next - start >= 1)
+        {
+          if(D->ThisTask == sendTask)
+            MPI_Isend(DirectAccOut + start - send_first, (next - start) * sizeof(accdata), MPI_BYTE, recvTask, TAG_PDATA_SPH,
+                      D->Communicator, &requests[n_requests++]);
+
+          if(D->ThisTask == recvTask)
+            MPI_Irecv(DirectAccIn + start - Recv_offset[recvTask], (next - start) * sizeof(accdata), MPI_BYTE, sendTask, TAG_PDATA_SPH,
+                      D->Communicator, &requests[n_requests++]);
+        }
+
+      if(next == Recv_offset[recvTask] + Recv_count[recvTask])
+        recvTask++;
+      else
+        {
+          sendTask++;
+          if(sendTask >= D->NTask)
+            break;
+
+          subdivide_evenly(nimport, D->NTask, sendTask, &send_first, &send_count);
+        }
+    }
+
+  MPI_Waitall(n_requests, requests, MPI_STATUSES_IGNORE);
+  Mem.myfree(requests);
+
+  nforces = 0;
+
+  for(int idx = 0; idx < Sp->TimeBinsGravity.NActiveParticles; idx++)
+    {
+      int i = Sp->TimeBinsGravity.ActiveParticleList[idx];
+
+      for(int k = 0; k < 3; k++)
+        Sp->P[i].GravAccel[k] = DirectAccIn[nforces].Acc[k];
+
+#ifdef EVALPOTENTIAL
+      Sp->P[i].Potential = DirectAccIn[nforces].Potential;
+#endif
+      nforces++;
+    }
+
+  Mem.myfree(DirectAccIn);
+  Mem.myfree(DirectAccOut);
+  Mem.myfree(DirectDataAll);
+  Mem.myfree(DirectDataIn);
+
+  Mem.myfree(Recv_offset);
+  Mem.myfree(Recv_count);
+  Mem.myfree(Send_offset);
+  Mem.myfree(Send_count);
+
+  D->mpi_printf("GRAVDIRECT: force is done.\n");
+
+  All.TotNumDirectForces += Sp->TimeBinsGravity.GlobalNActiveParticles;
+
+  double tend = Logs.second();
+
+  double timedirect, sumt;
+  timedirect = tend - tstart;
+
+  MPI_Reduce(&timedirect, &sumt, 1, MPI_DOUBLE, MPI_SUM, 0, D->Communicator);
+
+  if(D->ThisTask == 0)
+    {
+      fprintf(Logs.FdTimings, "Nf=%9lld  timebin=%d  active part/task: avg=%g   total-Nf=%lld\n",
+              Sp->TimeBinsGravity.GlobalNActiveParticles, timebin, ((double)Sp->TimeBinsGravity.GlobalNActiveParticles) / D->NTask,
+              All.TotNumDirectForces);
+      fprintf(Logs.FdTimings, "  (direct) took=%g sec part/sec:  %g   ia/sec: %g\n", timedirect,
+              Sp->TimeBinsGravity.GlobalNActiveParticles / (sumt + 1.0e-20),
+              Sp->TimeBinsGravity.GlobalNActiveParticles / (sumt + 1.0e-20) * Sp->TimeBinsGravity.GlobalNActiveParticles);
+      myflush(Logs.FdTimings);
+    }
+
+  TIMER_STOP(CPU_TREEDIRECT);
+}
+
+#endif
diff --git a/src/gravity/grav_forcetest.cc b/src/gravity/grav_forcetest.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ad99a5fa99185bf018d8270101abd3f31f867a02
--- /dev/null
+++ b/src/gravity/grav_forcetest.cc
@@ -0,0 +1,726 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file grav_forcetest.cc
+ *
+ *  \brief routines for testing the force accuracy through comparison with direct summation
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef FORCETEST
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../gravity/ewald.h"
+#include "../gravity/grav_forcetest.h"
+#include "../gravtree/gravtree.h"
+#include "../logs/logs.h"
+#include "../logs/timer.h"
+#include "../main/main.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/generic_comm.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../ngbtree/ngbtree.h"
+#include "../pm/pm.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+
+/*  This function computes the gravitational forces for all active particles.
+ *  A new tree is constructed if the number of force computations since
+ *  its last construction exceeds some fraction of the total
+ *  particle number; otherwise tree nodes are dynamically updated if needed.
+ */
+
+/*  FORCETEST_TESTFORCELAW=1   for special test to check force law for TreePM
+ *  FORCETEST_TESTFORCELAW=2   for special test to check force law for TreePM+PLACEHIGHRESREGION
+ */
+
+/* local data structure for collecting particle/cell data that is sent to other processors if needed */
+struct frctest_in : data_in_generic
+{
+  MyIntPosType IntPos[3];
+  unsigned char Type;
+#if NSOFTCLASSES > 1
+  unsigned char SofteningClass;
+#endif
+};
+
+/* local data structure that holds results acquired on remote processors */
+struct frctest_out
+{
+  double Acc[3];
+  double Pot;
+  double DistToID1;
+#if defined(PMGRID) && defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+  double AccShortRange[3];
+  double PotShortRange;
+#ifdef PLACEHIGHRESREGION
+  double AccVeryShortRange[3];
+  double PotVeryShortRange;
+#endif
+#endif
+};
+
+typedef generic_comm<frctest_in, frctest_out, gravtree<simparticles>, domain<simparticles>, simparticles> my_comm;
+
+class frctest_comm : public my_comm
+{
+ public:
+  // need to implement a constructor that calls the constructor of the base class
+  frctest_comm(domain<simparticles> *dptr, gravtree<simparticles> *tptr, simparticles *pptr, gravtest *gtptr)
+      : my_comm(dptr, tptr, pptr)
+  {
+  }
+
+  using my_comm::D;
+  using my_comm::Thread;
+  using my_comm::Tp;
+  using my_comm::Tree;
+
+  void particle2in(frctest_in *in, int i) override
+  {
+    for(int j = 0; j < 3; j++)
+      in->IntPos[j] = Tp->P[i].IntPos[j];
+
+    in->Type = Tp->P[i].getType();
+#if NSOFTCLASSES > 1
+    in->SofteningClass = Tp->P[i].getSofteningClass();
+#endif
+  }
+
+  void out2particle(frctest_out *out, int i, int mode) override
+  {
+    if(mode == MODE_LOCAL_PARTICLES) /* initial store */
+      {
+        Tp->P[i].GravAccelDirect[0] = out->Acc[0];
+        Tp->P[i].GravAccelDirect[1] = out->Acc[1];
+        Tp->P[i].GravAccelDirect[2] = out->Acc[2];
+        Tp->P[i].PotentialDirect    = out->Pot;
+
+        Tp->P[i].DistToID1 = out->DistToID1;
+#if defined(PMGRID) && defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+        Tp->P[i].GravAccelShortRange[0] = out->AccShortRange[0];
+        Tp->P[i].GravAccelShortRange[1] = out->AccShortRange[1];
+        Tp->P[i].GravAccelShortRange[2] = out->AccShortRange[2];
+        Tp->P[i].PotentialShortRange    = out->PotShortRange;
+#ifdef PLACEHIGHRESREGION
+        Tp->P[i].GravAccelVeryShortRange[0] = out->AccVeryShortRange[0];
+        Tp->P[i].GravAccelVeryShortRange[1] = out->AccVeryShortRange[1];
+        Tp->P[i].GravAccelVeryShortRange[2] = out->AccVeryShortRange[2];
+        Tp->P[i].PotentialVeryShortRange    = out->PotVeryShortRange;
+#endif
+#endif
+      }
+    else /* combine */
+      {
+        Tp->P[i].GravAccelDirect[0] += out->Acc[0];
+        Tp->P[i].GravAccelDirect[1] += out->Acc[1];
+        Tp->P[i].GravAccelDirect[2] += out->Acc[2];
+        Tp->P[i].PotentialDirect += out->Pot;
+        if(out->DistToID1 > 0)
+          Tp->P[i].DistToID1 = out->DistToID1;
+#if defined(PMGRID) && defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+        Tp->P[i].GravAccelShortRange[0] += out->AccShortRange[0];
+        Tp->P[i].GravAccelShortRange[1] += out->AccShortRange[1];
+        Tp->P[i].GravAccelShortRange[2] += out->AccShortRange[2];
+        Tp->P[i].PotentialShortRange += out->PotShortRange;
+#ifdef PLACEHIGHRESREGION
+        Tp->P[i].GravAccelVeryShortRange[0] += out->AccVeryShortRange[0];
+        Tp->P[i].GravAccelVeryShortRange[1] += out->AccVeryShortRange[1];
+        Tp->P[i].GravAccelVeryShortRange[2] += out->AccVeryShortRange[2];
+        Tp->P[i].PotentialVeryShortRange += out->PotVeryShortRange;
+#endif
+#endif
+      }
+  }
+
+  int evaluate(int target, int mode, int thread_id, int action, frctest_in *in, int numnodes, node_info *firstnode,
+               frctest_out &out) override
+  {
+    /* make sure that the particle is exported to all other tasks exactly once */
+    if(mode == MODE_LOCAL_PARTICLES)
+      {
+        for(int n = 0; n < this->D->NTopleaves; n++)
+          {
+            int task = D->TaskOfLeaf[n];
+
+            if(task == D->ThisTask)
+              continue;
+
+            if(Thread.Exportflag[task] == target)
+              continue;
+
+            int no = n + this->Tree->MaxPart + this->Tree->MaxNodes; /* a pseudo node for this task */
+
+            this->Tree->tree_export_node_threads(no, target, &Thread);
+          }
+      }
+
+    MyIntPosType *intpos = in->IntPos;
+
+    out.Pot = 0;
+
+#if defined(PMGRID) && defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+    out.PotShortRange = 0;
+#ifdef PLACEHIGHRESREGION
+    out.PotVeryShortRange = 0;
+#endif
+#endif
+    for(int i = 0; i < 3; i++)
+      {
+        out.Acc[i] = 0;
+#if defined(PMGRID) && defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+        out.AccShortRange[i] = 0;
+#ifdef PLACEHIGHRESREGION
+        out.AccVeryShortRange[i] = 0;
+#endif
+#endif
+      }
+
+    double disttoid1 = 0;
+
+    for(int idx = 0; idx < Tp->nsource; idx++)
+      {
+        int j = Tp->indexlist[idx];
+
+        double hmax;
+#if NSOFTCLASSES > 1
+        double h_i = All.ForceSoftening[in->SofteningClass];
+        double h_j = All.ForceSoftening[Tp->P[j].getSofteningClass()];
+
+        if(h_j > h_i)
+          hmax = h_j;
+        else
+          hmax = h_i;
+#else
+        hmax = All.ForceSoftening[0];
+#endif
+
+        double dxyz[3];
+        Tp->nearest_image_intpos_to_pos(Tp->P[j].IntPos, intpos, dxyz); /* converts the integer distance to floating point */
+
+        double r2 = dxyz[0] * dxyz[0] + dxyz[1] * dxyz[1] + dxyz[2] * dxyz[2];
+
+        double mass = Tp->P[j].getMass();
+
+        /* now evaluate the multipole moment */
+
+        double r = sqrt(r2);
+
+        if(Tp->P[j].ID.get() == 1)
+          disttoid1 = r;
+
+        /* we compute 3 different forces:
+         * (1) The correct direct summation force, if needed with Ewald correction: ftrue
+         * In the case of PM:
+         * (2) The short range direct summation force with only the erfc cut-off (this is what the tree can at best deliver): fsr
+         * (3) The expected PM force based on the long-range part of the Ewald sum. This is equal to ftrue - fsr - fsr_periodic_images
+         * */
+
+        double wp_newton, fac_newton;
+
+        if(r > 0)
+          {
+            fac_newton = mass / (r2 * r);
+            wp_newton  = -mass / r;
+          }
+        else
+          {
+            fac_newton = 0;
+            wp_newton  = 0;
+          }
+
+        double fac, wp;
+
+        if(r >= hmax)
+          {
+            fac = fac_newton;
+            wp  = wp_newton;
+          }
+        else
+          {
+            double h_inv  = 1.0 / hmax;
+            double h3_inv = h_inv * h_inv * h_inv;
+            double u      = r * h_inv;
+
+            if(u < 0.5)
+              {
+                double u2 = u * u;
+                fac       = mass * h3_inv * (SOFTFAC1 + u2 * (SOFTFAC2 * u + SOFTFAC3));
+                wp        = mass * h_inv * (SOFTFAC4 + u2 * (SOFTFAC5 + u2 * (SOFTFAC6 * u + SOFTFAC7)));
+              }
+            else
+              {
+                double u2 = u * u, u3 = u2 * u;
+                fac = mass * h3_inv * (SOFTFAC8 + SOFTFAC9 * u + SOFTFAC10 * u2 + SOFTFAC11 * u3 + SOFTFAC12 / u3);
+                wp  = mass * h_inv * (SOFTFAC13 + SOFTFAC14 / u + u2 * (SOFTFAC1 + u * (SOFTFAC15 + u * (SOFTFAC16 + SOFTFAC17 * u))));
+              }
+          }
+
+          // The Newtonian force is:      fac * dxyz
+          // The Newtonian potential is:  wp
+
+#if defined(PMGRID) && defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+        {
+          double asmth = Tp->Asmth[0];
+          double u     = 0.5 / asmth * r;
+
+          double factor_force = (erfc(u) + 2.0 * u / sqrt(M_PI) * exp(-u * u) - 1.0);
+          double factor_pot   = erfc(u);
+
+          double facs = fac + fac_newton * factor_force;
+          double wps  = wp + (r > 0 ? wp_newton * (factor_pot - 1.0) : mass / (asmth * sqrt(M_PI)));
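+          /* with u = r/(2 asmth), the short-range force of a point mass is (erfc(u) + 2u/sqrt(pi) e^{-u^2}) * mass/r^2,
+           * so factor_force is its difference to the Newtonian force in units of the latter, and facs adds this
+           * difference to the softened fac. The r=0 branch of wps is the finite limit of wp_newton * (erfc(u) - 1),
+           * i.e. of mass * erf(u) / r. */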
+
+          double acc_short_x = dxyz[0] * facs;
+          double acc_short_y = dxyz[1] * facs;
+          double acc_short_z = dxyz[2] * facs;
+
+#ifndef GRAVITY_TALLBOX
+          double alpha = 0.5 / asmth;
+          double pot_short =
+              wps + mass * M_PI / (alpha * alpha * All.BoxSize * All.BoxSize * All.BoxSize) * (LONG_X * LONG_Y * LONG_Z);
+#else
+          double pot_short = wps; /* the constant potential term is here computed as part of the long-range force and not the
+                                     short-range force, unlike in the ordinary periodic case */
+#endif
+          out.AccShortRange[0] += acc_short_x;
+          out.AccShortRange[1] += acc_short_y;
+          out.AccShortRange[2] += acc_short_z;
+          out.PotShortRange += pot_short;
+
+#ifdef PLACEHIGHRESREGION
+          if(Tp->check_high_res_point_location(Tp->P[j].IntPos) == FLAG_INSIDE &&
+             Tp->check_high_res_point_location(intpos) == FLAG_INSIDE)
+            {
+              double asmth = Tp->Asmth[1];
+              double u     = 0.5 / asmth * r;
+
+              double factor_force = (erfc(u) + 2.0 * u / sqrt(M_PI) * exp(-u * u) - 1.0);
+              double factor_pot   = erfc(u);
+
+              double facs = fac + fac_newton * factor_force;
+              double wps  = wp + (r > 0 ? wp_newton * (factor_pot - 1.0) : mass / (asmth * sqrt(M_PI)));
+
+              double alpha = 0.5 / asmth;
+
+              acc_short_x = dxyz[0] * facs;
+              acc_short_y = dxyz[1] * facs;
+              acc_short_z = dxyz[2] * facs;
+
+              pot_short = wps + mass * M_PI / (alpha * alpha * All.BoxSize * All.BoxSize * All.BoxSize) * (LONG_X * LONG_Y * LONG_Z);
+            }
+
+          out.AccVeryShortRange[0] += acc_short_x;
+          out.AccVeryShortRange[1] += acc_short_y;
+          out.AccVeryShortRange[2] += acc_short_z;
+          out.PotVeryShortRange += pot_short;
+#endif
+        }
+#endif
+
+        // the direct force is the ordinary Newtonian force
+        out.Acc[0] += fac * dxyz[0];
+        out.Acc[1] += fac * dxyz[1];
+        out.Acc[2] += fac * dxyz[2];
+        out.Pot += wp;
+
+#ifdef PERIODIC
+        // and in the periodic case, we add in the correction potential/force
+        ewald_data ew;
+        Ewald.ewald_gridlookup(Tp->P[j].IntPos, intpos, ewald::POINTMASS, ew);
+
+        out.Pot += mass * ew.D0phi;
+        out.Acc[0] += mass * ew.D1phi[0];
+        out.Acc[1] += mass * ew.D1phi[1];
+        out.Acc[2] += mass * ew.D1phi[2];
+#endif
+      }
+
+    out.DistToID1 = disttoid1;
+
+    return 0;
+  }
+};
+
+void gravtest::gravity_forcetest(int timebin)
+{
+  TIMER_START(CPU_FORCETEST);
+
+  int *TargetList = (int *)Mem.mymalloc("TargetList", Sp->NumPart * sizeof(int));
+  int nloc        = 0;
+
+  particle_data *P = Sp->P;
+
+  // create a random selection of target particles for which we compute direct summation forces
+  for(int idx = 0; idx < Sp->TimeBinsGravity.NActiveParticles; idx++)
+    {
+      int target = Sp->TimeBinsGravity.ActiveParticleList[idx];
+
+      if(target < 0)
+        continue;
+
+#ifdef FORCETEST_FIXEDPARTICLESET
+      if(All.NumCurrentTiStep == 0)
+        {
+          if(get_random_number() < FORCETEST)
+            P[target].SelectedFlag = true;
+          else
+            P[target].SelectedFlag = false;
+        }
+      if(P[target].SelectedFlag)
+        TargetList[nloc++] = target;
+#else
+      if(get_random_number() < FORCETEST)
+        TargetList[nloc++] = target;
+#endif
+    }
+
+  long long ntot;
+  sumup_large_ints(1, &nloc, &ntot, D->Communicator);
+
+  /* we pull out a separate list of the particles with non-zero masses to accelerate some of our special tests
+   * where there are potentially many particles with zero masses
+   */
+
+  Sp->nsource   = 0;
+  Sp->indexlist = (int *)Mem.mymalloc("indexlist", Sp->NumPart * sizeof(int));
+
+#ifdef HIERARCHICAL_GRAVITY
+  int numidx = Sp->TimeBinsGravity.NActiveParticles;
+#else
+  int numidx = Sp->NumPart;
+#endif
+
+  for(int idx = 0; idx < numidx; idx++)
+    {
+#ifdef HIERARCHICAL_GRAVITY
+      int target = Sp->TimeBinsGravity.ActiveParticleList[idx];
+
+      if(target < 0)
+        continue;
+#else
+      int target = idx;
+#endif
+
+      if(P[target].getMass() == 0)
+        continue;
+
+      Sp->indexlist[Sp->nsource++] = target;
+    }
+
+  D->mpi_printf("FORCETEST: Testing forces for %lld particles out of %lld active ones.\n", ntot,
+                Sp->TimeBinsGravity.GlobalNActiveParticles);
+
+  frctest_comm commpattern(this->D, this->GravTree, this->Sp, this);
+
+  double t0 = Logs.second();
+
+  commpattern.execute(nloc, TargetList, MODE_DEFAULT);
+
+  double t1   = Logs.second();
+  double maxt = Logs.timediff(t0, t1);
+
+  D->mpi_printf("FORCETEST: Testing forces took %g sec.\n", maxt);
+
+  Mem.myfree(Sp->indexlist);
+  Sp->indexlist = NULL;
+
+  /* now add things for comoving integration */
+
+  if(All.ComovingIntegrationOn)
+    {
+#ifndef PERIODIC
+      double fac1 = 0.5 * All.Hubble * All.Hubble * All.Omega0 / All.G;
+
+      for(int idx = 0; idx < nloc; idx++)
+        {
+          int i = TargetList[idx];
+
+          double pos[3];
+          Sp->intpos_to_pos(Sp->P[i].IntPos, pos);
+
+          for(int j = 0; j < 3; j++)
+            Sp->P[i].GravAccelDirect[j] += fac1 * pos[j];
+        }
+#endif
+    }
+
+  /*  multiply by G */
+  for(int idx = 0; idx < nloc; idx++)
+    {
+      int i = TargetList[idx];
+
+      for(int j = 0; j < 3; j++)
+        {
+          Sp->P[i].GravAccelDirect[j] *= All.G;
+#if defined(PMGRID) && defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+          Sp->P[i].GravAccelShortRange[j] *= All.G;
+#ifdef PLACEHIGHRESREGION
+          Sp->P[i].GravAccelVeryShortRange[j] *= All.G;
+#endif
+#endif
+#if(FORCETEST_TESTFORCELAW == 3)
+          Sp->P[i].GravAccelDirectTest[j] *= All.G;
+#endif
+        }
+
+#if NSOFTCLASSES > 1
+      double selfpot = Sp->P[i].getMass() / (All.ForceSoftening[Sp->P[i].getSofteningClass()] / 2.8);
+#else
+      double selfpot = Sp->P[i].getMass() / (All.ForceSoftening[0] / 2.8);
+#endif
+
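+      /* the spline-softened potential of a particle at zero separation is -2.8 * mass/h = -mass/(h/2.8), with h the
+       * spline softening length, i.e. minus the value of selfpot above, so adding selfpot removes this self term */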
+      Sp->P[i].PotentialDirect += selfpot; /* remove self-potential */
+      Sp->P[i].PotentialDirect *= All.G;
+
+#if defined(PMGRID) && defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+      Sp->P[i].PotentialShortRange += selfpot; /* remove self-potential */
+      Sp->P[i].PotentialShortRange *= All.G;
+
+#ifdef PLACEHIGHRESREGION
+      Sp->P[i].PotentialVeryShortRange += selfpot; /* remove self-potential */
+      Sp->P[i].PotentialVeryShortRange *= All.G;
+#endif
+#endif
+#if(FORCETEST_TESTFORCELAW == 3)
+      Sp->P[i].PotentialDirectTest += selfpot; /* remove self-potential */
+      Sp->P[i].PotentialDirectTest *= All.G;
+#endif
+    }
+
+  /* Finally, the following factor allows the computation of a cosmological simulation
+     with vacuum energy in physical coordinates */
+
+  if(All.ComovingIntegrationOn == 0)
+    {
+      double fac1 = All.OmegaLambda * All.Hubble * All.Hubble;
+
+      for(int idx = 0; idx < nloc; idx++)
+        {
+          int i = TargetList[idx];
+
+          double pos[3];
+          Sp->intpos_to_pos(Sp->P[i].IntPos, pos);
+
+          for(int j = 0; j < 3; j++)
+            Sp->P[i].GravAccelDirect[j] += fac1 * pos[j];
+        }
+    }
+
+  /* now output the forces to a file */
+
+  int *nloc_tab = (int *)Mem.mymalloc("nloc_tab", D->NTask * sizeof(int));
+  MPI_Allgather(&nloc, 1, MPI_INT, nloc_tab, 1, MPI_INT, D->Communicator);
+
+  for(int nthis = 0; nthis < D->NTask; nthis++)
+    {
+      if(nloc_tab[nthis] > 0)
+        {
+          if(nthis == D->ThisTask)
+            {
+              char buf[MAXLEN_PATH_EXTRA];
+              sprintf(buf, "%s%s", All.OutputDir, "forcetest.txt");
+
+              if(!(Logs.FdForceTest = fopen(buf, "a")))
+                Terminate("error in opening file '%s'\n", buf);
+
+              for(int idx = 0; idx < nloc; idx++)
+                {
+                  int i = TargetList[idx];
+
+                  double pos[3];
+                  Sp->intpos_to_pos(Sp->P[i].IntPos, pos);
+
+#if defined(PMGRID) && defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+
+#ifdef PLACEHIGHRESREGION
+                  int flaginside = Sp->check_high_res_point_location(P[i].IntPos);
+                  fprintf(
+                      Logs.FdForceTest,
+                      "%d %d %lld  %g  %17.12g %17.12g %17.12g  %17.12g  %12.12g %17.12g %17.12g  %17.12g %17.12g %17.12g  %17.12g "
+                      "%17.12g %17.12g  %17.12g %17.12g %17.12g  %17.12g %17.12g %17.12g  %17.12g "
+                      "%17.12g %17.12g     %17.12g %17.12g %17.12g %17.12g %17.12g %17.12g   %17.12g\n",
+                      P[i].getType(), flaginside, (long long)P[i].ID.get(), All.Time, pos[0], pos[1], pos[2], P[i].DistToID1,
+                      P[i].GravAccelDirect[0], P[i].GravAccelDirect[1], P[i].GravAccelDirect[2], P[i].GravAccelShortRange[0],
+                      P[i].GravAccelShortRange[1], P[i].GravAccelShortRange[2], P[i].GravAccelVeryShortRange[0],
+                      P[i].GravAccelVeryShortRange[1], P[i].GravAccelVeryShortRange[2], P[i].GravAccel[0], P[i].GravAccel[1],
+                      P[i].GravAccel[2], P[i].GravPM[0], P[i].GravPM[1], P[i].GravPM[2], P[i].GravAccelHPM[0], P[i].GravAccelHPM[1],
+                      P[i].GravAccelHPM[2], P[i].PotentialDirect, P[i].PotentialShortRange, P[i].PotentialVeryShortRange,
+                      P[i].Potential, P[i].PM_Potential, P[i].PotentialHPM, Sp->Asmth[1]);
+#else
+                  fprintf(
+                      Logs.FdForceTest,
+                      "%d %d %lld  %g  %17.12g %17.12g %17.12g  %17.12g  %17.12g %17.12g %17.12g  %17.12g %17.12g %17.12g  %17.12g "
+                      "%17.12g "
+                      "%17.12g  %17.12g %17.12g %17.12g  %17.12g %17.12g %17.12g  %17.12g\n",
+                      P[i].getType(), timebin, (long long)P[i].ID.get(), All.Time, pos[0], pos[1], pos[2], P[i].DistToID1,
+                      P[i].GravAccelDirect[0], P[i].GravAccelDirect[1], P[i].GravAccelDirect[2], P[i].GravAccelShortRange[0],
+                      P[i].GravAccelShortRange[1], P[i].GravAccelShortRange[2], P[i].GravAccel[0], P[i].GravAccel[1],
+                      P[i].GravAccel[2], P[i].GravPM[0], P[i].GravPM[1], P[i].GravPM[2], P[i].PotentialDirect,
+                      P[i].PotentialShortRange, P[i].Potential, P[i].PM_Potential);
+#endif
+#else
+                  fprintf(Logs.FdForceTest,
+                          "%d %d %lld %g  %17.12g %17.12g %17.12g  %17.12g  %17.12g %17.12g %17.12g  %17.12g %17.12g %17.12g  %17.12g "
+                          "%17.12g\n",
+                          P[i].getType(), timebin, (long long)P[i].ID.get(), All.Time, pos[0], pos[1], pos[2], P[i].DistToID1,
+                          P[i].GravAccelDirect[0], P[i].GravAccelDirect[1], P[i].GravAccelDirect[2], Sp->P[i].GravAccel[0],
+                          Sp->P[i].GravAccel[1], Sp->P[i].GravAccel[2], P[i].Potential, P[i].PotentialDirect);
+#endif
+                }
+
+              fclose(Logs.FdForceTest);
+            }
+
+          MPI_Barrier(D->Communicator);
+        }
+    }
+  Mem.myfree(nloc_tab);
+
+  /* Now the force computation is finished */
+  if(D->ThisTask == 0)
+    {
+      double costtotal = Sp->NumPart * ntot;
+
+      fprintf(Logs.FdTimings, "DIRECT Nf= %lld  step=%d  timebin=%d  part/sec=%g | %g  ia/part=%g\n\n", ntot, All.NumCurrentTiStep,
+              timebin, ((double)ntot) / (Sp->NTask * maxt + 1.0e-20), ntot / ((maxt + 1.0e-20) * Sp->NTask),
+              ((double)(costtotal)) / (ntot + 1.0e-20));
+
+      myflush(Logs.FdTimings);
+    }
+
+  Mem.myfree(TargetList);
+
+  TIMER_STOP(CPU_FORCETEST);
+}
+
+#ifdef FORCETEST_TESTFORCELAW /* in this option we assume NSOFTCLASSES >= 2 for FORCETEST_TESTFORCELAW=1, and  NSOFTCLASSES >= 3 for \
+                                 FORCETEST_TESTFORCELAW=2*/
+void sim::gravity_forcetest_testforcelaw(void)
+{
+  int Ncycles = 40;
+  double xyz[6], eps;
+
+  NgbTree.treefree();
+  Sp.mark_active_timebins();
+
+  for(int cycle = 0; cycle < Ncycles; cycle++)
+    {
+      Domain.mpi_printf("\nTEST-FORCE-LAW: cycle=%d|%d ----------------------------------\n\n", cycle, Ncycles);
+
+      double epsloc = 0, xyzloc[6] = {0, 0, 0, 0, 0, 0};
+
+      /* set particle with ID=1 to new random coordinate in box */
+      for(int n = 0; n < Sp.NumPart; n++)
+        {
+          Sp.P[n].setType(1);
+          Sp.P[n].setSofteningClass(1);
+
+          if(Sp.P[n].ID.get() == 1)
+            {
+              xyzloc[0] = All.BoxSize / LONG_X * get_random_number();
+              xyzloc[1] = All.BoxSize / LONG_Y * get_random_number();
+              xyzloc[2] = All.BoxSize / LONG_Z * get_random_number();
+
+              for(int i = 0; i < 3; i++)
+                xyzloc[3 + i] = xyzloc[i];
+
+#if defined(PLACEHIGHRESREGION) && (FORCETEST_TESTFORCELAW == 2)
+              if(get_random_number() < 0.5)
+                {
+                  xyzloc[3] = All.BoxSize / LONG_X * get_random_number();
+                  xyzloc[4] = All.BoxSize / LONG_Y * get_random_number();
+                  xyzloc[5] = All.BoxSize / LONG_Z * get_random_number();
+                }
+#endif
+              Sp.pos_to_intpos(&xyzloc[3], Sp.P[n].IntPos);
+
+              epsloc = All.ForceSoftening[Sp.P[n].getSofteningClass()];
+            }
+        }
+
+      MPI_Allreduce(xyzloc, xyz, 6, MPI_DOUBLE, MPI_SUM, Communicator);
+      MPI_Allreduce(&epsloc, &eps, 1, MPI_DOUBLE, MPI_SUM, Communicator);
+
+      double rmin = 0.01 * eps;
+      double rmax = sqrt(pow(0.5 * All.BoxSize / LONG_X, 2) + pow(0.5 * All.BoxSize / LONG_Y, 2) + pow(0.5 * All.BoxSize / LONG_Z, 2));
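+
+      /* the test particles below are placed at log-uniformly distributed random radii between 1% of the
+       * softening length and half the box diagonal, so the force law is sampled from deep inside the
+       * softened regime out to the largest separations that fit in the (possibly stretched) box */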
+
+      for(int n = 0; n < Sp.NumPart; n++)
+        {
+          if(Sp.P[n].ID.get() != 1)
+            {
+              double r     = exp(log(rmin) + (log(rmax) - log(rmin)) * get_random_number());
+              double theta = acos(2 * get_random_number() - 1);
+              double phi   = 2 * M_PI * get_random_number();
+
+#if defined(PLACEHIGHRESREGION) && (FORCETEST_TESTFORCELAW == 2)
+              if(get_random_number() < 0.5)
+                {
+                  r = exp(log(rmin) + (log(rmax / 100) - log(rmin)) * get_random_number());
+                  Sp.P[n].setType(2);
+                }
+              else
+                {
+                  Sp.P[n].setType(3);
+                }
+#endif
+
+              double dx = r * sin(theta) * cos(phi);
+              double dy = r * sin(theta) * sin(phi);
+              double dz = r * cos(theta);
+
+              double pos[3];
+              pos[0] = xyz[0] + dx;
+              pos[1] = xyz[1] + dy;
+              pos[2] = xyz[2] + dz;
+
+#if defined(PLACEHIGHRESREGION) && (FORCETEST_TESTFORCELAW == 2)
+              if(Sp.P[n].getType() == 3)
+                {
+                  pos[0] = xyz[3] + dx;
+                  pos[1] = xyz[4] + dy;
+                  pos[2] = xyz[5] + dz;
+                }
+#endif
+              Sp.pos_to_intpos(pos, Sp.P[n].IntPos);
+              Sp.constrain_intpos(Sp.P[n].IntPos);
+            }
+        }
+
+      Domain.domain_free();
+      Domain.domain_decomposition(STANDARD); /* do domain decomposition if needed */
+
+      /* allocate space for gravity accelerations */
+
+      Sp.TimeBinsGravity.NActiveParticles = 0;
+      for(int timebin = All.HighestSynchronizedTimeBin; timebin >= 0; timebin--)
+        {
+          for(int i = Sp.TimeBinsGravity.FirstInTimeBin[timebin]; i >= 0; i = Sp.TimeBinsGravity.NextInTimeBin[i])
+            Sp.TimeBinsGravity.ActiveParticleList[Sp.TimeBinsGravity.NActiveParticles++] = i;
+        }
+
+#if defined(PMGRID) && defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+      gravity_long_range_force();
+#endif
+
+      compute_grav_accelerations(All.HighestActiveTimeBin);
+    }
+
+  endrun();
+}
+#endif
+
+#endif
diff --git a/src/gravity/grav_forcetest.h b/src/gravity/grav_forcetest.h
new file mode 100644
index 0000000000000000000000000000000000000000..bbbd959f67cfb34e2ec822f32d1a604daf949864
--- /dev/null
+++ b/src/gravity/grav_forcetest.h
@@ -0,0 +1,75 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file grav_forcetest.h
+ *
+ *  \brief declares a class needed for the force test calculations
+ */
+
+#ifndef GRAV_FORCETEST_H
+#define GRAV_FORCETEST_H
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/simparticles.h"
+#include "../domain/domain.h"
+#include "../gravtree/gravtree.h"
+
+#define TESTGRID 384
+
+#ifdef LONG_X_BITS
+#if TESTGRID != ((TESTGRID / LONG_X) * LONG_X)
+#error "TESTGRID must be a multiple of the stretch factor in the x-direction"
+#endif
+#endif
+
+#ifdef LONG_Y_BITS
+#if TESTGRID != ((TESTGRID / LONG_Y) * LONG_Y)
+#error "TESTGRID must be a multiple of the stretch factor in the y-direction"
+#endif
+#endif
+
+#ifdef LONG_Z_BITS
+#if TESTGRID != ((TESTGRID / LONG_Z) * LONG_Z)
+#error "TESTGRID must be a multiple of the stretch factor in the x-direction"
+#endif
+#endif
+
+#define TESTGRIDX ((TESTGRID / LONG_X))
+#define TESTGRIDY ((TESTGRID / LONG_Y))
+#define TESTGRIDZ ((TESTGRID / LONG_Z))
+
+#define TESTGRIDZ2 (TESTGRIDZ / 2 + 1)
+
+#define ASMTH_DIRECT 30.0
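+
+/* TESTGRID sets the size of the FFT mesh used for the reference force computation in the force test;
+ * ASMTH_DIRECT is presumably the corresponding dimensionless smoothing scale (in units of the mesh cell
+ * size) used for the long-range/short-range split of the reference force, chosen much larger than the
+ * production value so that the split is very well resolved on the test mesh */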
+
+class gravtest
+{
+ private:
+  simparticles *Sp;
+  gravtree<simparticles> *GravTree;
+  domain<simparticles> *D;
+
+ public:
+  gravtest(simparticles *Sp_ptr, gravtree<simparticles> *GravTree_ptr, domain<simparticles> *Domain_ptr)
+  {
+    Sp       = Sp_ptr;
+    GravTree = GravTree_ptr;
+    D        = Domain_ptr;
+  }
+
+  void gravity_forcetest(int timebin);
+};
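+
+/* Usage sketch (mirroring how the force test is invoked from sim::compute_grav_accelerations): the gravity
+ * tree must be allocated before the test is run and freed again afterwards, e.g.
+ *
+ *   GravTree.treeallocate(Sp.NumPart, &Sp, &Domain);
+ *   GravTest.gravity_forcetest(timebin);   // GravTest constructed with (&Sp, &GravTree, &Domain)
+ *   GravTree.treefree();
+ */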
+
+#endif /* GRAV_FORCETEST_H */
diff --git a/src/gravity/gravity.cc b/src/gravity/gravity.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8cdc1797bed3b60bc29acb4b0f6641c732340d2d
--- /dev/null
+++ b/src/gravity/gravity.cc
@@ -0,0 +1,460 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file gravity.cc
+ *
+ * \brief main driver routines for computing the gravitational accelerations for all active particles
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../fmm/fmm.h"
+#include "../gravtree/gravtree.h"
+#include "../logs/logs.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../pm/pm.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+
+/*! \brief This routine computes the gravitational accelerations for all active particles.
+ *
+ * If the particle mesh is used and the current time step
+ * requires a PM force computation, new long-range forces are
+ * computed by gravity_long_range_force(). Then the short-range tree forces
+ * are computed by gravity(). The force tree is rebuilt every time step.
+ */
+void sim::compute_grav_accelerations(int timebin)
+{
+  sumup_large_ints(1, &Sp.TimeBinsGravity.NActiveParticles, &Sp.TimeBinsGravity.GlobalNActiveParticles, Communicator);
+
+  mpi_printf("ACCEL: Start tree gravity force computation... (%lld particles)\n", Sp.TimeBinsGravity.GlobalNActiveParticles);
+
+  if(Sp.TimeBinsGravity.GlobalNActiveParticles > 0)
+    {
+      GravTree.DoEwald = 0;
+
+#ifdef PMGRID
+      GravTree.DoPM = 0; /* default value */
+#endif
+
+#if defined(PMGRID) && defined(PERIODIC) && \
+    !defined(TREEPM_NOTIMESPLIT) /* classic TreePM in periodic box with time integration split */
+      GravTree.DoEwald = 0;
+      GravTree.DoPM    = TREE_ACTIVE_CUTTOFF_BASE_PM;
+
+#ifdef PLACEHIGHRESREGION
+      if(Sp.TimeBinsGravity.GlobalNActiveParticles > All.ActivePartFracForPMinsteadOfEwald * Sp.TotNumPart)
+        GravTree.DoPM += TREE_DO_HIGHRES_PM + TREE_ACTIVE_CUTTOFF_HIGHRES_PM;
+#endif
+
+#else /* everything else */
+
+#if defined(PMGRID) /* with PM acceleration (we force here TREEPM_NOTIMESPLIT to be set) */
+
+      if(Sp.TimeBinsGravity.GlobalNActiveParticles > All.ActivePartFracForPMinsteadOfEwald * Sp.TotNumPart)
+        {
+          GravTree.DoEwald = 0;
+#ifdef PLACEHIGHRESREGION
+          GravTree.DoPM    = TREE_DO_BASE_PM + TREE_DO_HIGHRES_PM + TREE_ACTIVE_CUTTOFF_BASE_PM + TREE_ACTIVE_CUTTOFF_HIGHRES_PM;
+#else
+          GravTree.DoPM = TREE_DO_BASE_PM + TREE_ACTIVE_CUTTOFF_BASE_PM;
+#endif
+        }
+      else
+        {
+          GravTree.DoPM    = 0;
+          GravTree.DoEwald = 1;
+        }
+
+#else /* here no PM acceleration is used */
+
+#if defined(PERIODIC)
+      GravTree.DoEwald = 1; /* periodic boundaries with Ewald summation */
+#endif
+
+#endif
+
+#endif
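+
+      /* In summary: for a periodic TreePM run with time-split integration the tree only ever computes the
+       * short-range force (the cut-off flags in DoPM are set and Ewald summation stays off).  If PMGRID is
+       * used without time splitting, the PM contribution is only added when a sufficiently large fraction
+       * of the particles is active; otherwise Ewald summation is enabled and the tree supplies the full
+       * force.  Without PMGRID, periodic boundaries always rely on Ewald summation. */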
+
+#ifdef SECOND_ORDER_LPT_ICS
+      if(All.Ti_Current == 0)
+        second_order_ic_correction();
+#endif
+
+      if(All.TypeOfOpeningCriterion == 1 && All.Ti_Current == 0 && All.RelOpeningCriterionInUse == 0)
+        {
+          /* For the first timestep, we do one gravity calculation up front
+           * with the Barnes & Hut criterion to allow usage of the relative opening
+           * criterion with consistent accuracy.
+           */
+#if defined(PMGRID) && defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+          gravity_long_range_force();
+#endif
+          gravity(timebin);
+
+          gravity_set_oldacc(timebin);
+
+          /* now may switch on relative opening criterion since we have an old acceleration */
+          if(All.TypeOfOpeningCriterion == 1)
+            All.RelOpeningCriterionInUse = 1;
+        }
+
+      gravity(timebin); /* computes gravity acceleration */
+
+#ifdef FORCETEST
+#if defined(PMGRID) && defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+      if(timebin == All.HighestOccupiedGravTimeBin)
+#endif
+        {
+          GravTree.treeallocate(Sp.NumPart, &Sp, &Domain);
+          GravTest.gravity_forcetest(timebin);
+          GravTree.treefree();
+
+          if(FORCETEST >= 2.0) /* this is for a special test where we repeat the calculation */
+            {
+              for(int i = 0; i < (int)(FORCETEST)-1; i++)
+                {
+                  NgbTree.treefree();
+                  Domain.domain_free();
+                  Domain.domain_decomposition(STANDARD);
+                  NgbTree.treeallocate(Sp.NumGas, &Sp, &Domain);
+                  NgbTree.treebuild(Sp.NumGas, NULL);
+
+                  gravity(timebin);
+
+                  GravTree.treeallocate(Sp.NumPart, &Sp, &Domain);
+                  GravTest.gravity_forcetest(timebin);
+                  GravTree.treefree();
+                }
+            }
+
+          if(FORCETEST > 1.0)
+            endrun();
+        }
+#endif
+    }
+
+  mpi_printf("ACCEL: tree force computation done.\n");
+
+  if(All.TimeLimitCPU == 0)
+    endrun();
+}
+
+/*! \brief main driver routine of the gravity tree/FMM force calculation
+ *
+ *  This routine handles the whole tree force calculation. First it
+ *  builds a new force tree with treebuild() every timestep. This tree is then
+ *  used to calculate a new tree force for every active particle (gravity_tree() or gravity_fmm()).
+ *  The passed variable 'timebin' only informs about the largest timebin
+ *  of the particles in the list of active particles.
+ *
+ *  The tree is constructed for the NActiveParticles particles listed in the
+ *  array TimeBinsGravity.ActiveParticleList[].
+ */
+void sim::gravity(int timebin)
+{
+  if(timebin == All.HighestOccupiedGravTimeBin)
+    GravTree.MeasureCostFlag = 1;
+  else
+    GravTree.MeasureCostFlag = 0;
+
+  /* let's first initialize the results */
+  for(int i = 0; i < Sp.TimeBinsGravity.NActiveParticles; i++)
+    {
+      int target = Sp.TimeBinsGravity.ActiveParticleList[i];
+#ifdef EVALPOTENTIAL
+      Sp.P[target].Potential = 0;
+#endif
+      for(int j = 0; j < 3; j++)
+        Sp.P[target].GravAccel[j] = 0;
+
+      if(GravTree.MeasureCostFlag)
+        Sp.P[target].GravCost = 0;
+    }
+
+#ifdef SELFGRAVITY
+  /* set new softening lengths on global steps to take into account possible cosmological time variation */
+  if(timebin == All.HighestOccupiedGravTimeBin)
+    GravTree.set_softenings();
+
+#if defined(PMGRID) && (defined(TREEPM_NOTIMESPLIT) || defined(PLACEHIGHRESREGION))
+  if((GravTree.DoPM & (TREE_DO_BASE_PM + TREE_DO_HIGHRES_PM)))
+    gravity_pm(timebin);
+
+#if defined(FORCETEST) && defined(PLACEHIGHRESREGION)
+  for(int i = 0; i < Sp.TimeBinsGravity.NActiveParticles; i++)
+    {
+      int target                = Sp.TimeBinsGravity.ActiveParticleList[i];
+      Sp.P[target].PotentialHPM = All.G * Sp.P[target].Potential;
+      for(int j = 0; j < 3; j++)
+        Sp.P[target].GravAccelHPM[j] = All.G * Sp.P[target].GravAccel[j];
+    }
+#endif
+
+#endif
+
+#ifdef ALLOW_DIRECT_SUMMATION
+  if(Sp.TimeBinsGravity.GlobalNActiveParticles < DIRECT_SUMMATION_THRESHOLD)
+    {
+      GravTree.gravity_direct(&Sp, &Domain, timebin);
+    }
+  else
+#endif
+    {
+      GravTree.treeallocate(Sp.NumPart, &Sp, &Domain);
+
+#ifdef HIERARCHICAL_GRAVITY
+      GravTree.treebuild(Sp.TimeBinsGravity.NActiveParticles, Sp.TimeBinsGravity.ActiveParticleList);
+#else
+    GravTree.treebuild(Sp.NumPart, NULL);
+#endif
+
+#ifdef FMM
+      GravTree.gravity_fmm(timebin);
+#else
+    GravTree.gravity_tree(timebin);
+#endif
+
+      GravTree.treefree();
+    }
+
+  /* now multiply by G and add the terms needed for comoving integration */
+  gravity_comoving_factors(timebin);
+
+#endif
+
+#ifdef EXTERNALGRAVITY
+  gravity_external();
+#endif
+}
+
+void sim::gravity_set_oldacc(int timebin)
+{
+#ifdef HIERARCHICAL_GRAVITY
+  if(timebin == All.HighestOccupiedGravTimeBin)
+#endif
+    {
+      mpi_printf("GRAVTREE/FMM: Setting OldAcc!\n");
+
+      particle_data *P = Sp.P;
+
+      double ginv = 1 / All.G;
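+
+      /* OldAcc is stored as the magnitude of the previous total acceleration divided by G, presumably so
+       * that the relative opening criterion can compare it directly against the raw tree/FMM partial
+       * forces, which are only multiplied with G at the end in gravity_comoving_factors() */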
+
+      for(int idx = 0; idx < Sp.TimeBinsGravity.NActiveParticles; idx++)
+        {
+          int target = Sp.TimeBinsGravity.ActiveParticleList[idx];
+#if defined(PMGRID) && defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+          double ax = P[target].GravAccel[0] + P[target].GravPM[0];
+          double ay = P[target].GravAccel[1] + P[target].GravPM[1];
+          double az = P[target].GravAccel[2] + P[target].GravPM[2];
+#else
+        double ax = P[target].GravAccel[0];
+        double ay = P[target].GravAccel[1];
+        double az = P[target].GravAccel[2];
+#endif
+          P[target].OldAcc = sqrt(ax * ax + ay * ay + az * az) * ginv;
+        }
+    }
+}
+
+/* CHECK: should this function still contain the communication of the forces in Tree.ResultsActiveImported? */
+void sim::gravity_comoving_factors(int timebin)
+{
+  particle_data *P = Sp.P;
+
+#ifndef PERIODIC
+#ifndef PMGRID
+  if(All.ComovingIntegrationOn)
+    {
+      double fac = 0.5 * All.Hubble * All.Hubble * All.Omega0 / All.G;
+
+      for(int i = 0; i < Sp.TimeBinsGravity.NActiveParticles; i++)
+        {
+          int target = Sp.TimeBinsGravity.ActiveParticleList[i];
+
+          double pos[3];
+          Sp.intpos_to_pos(P[target].IntPos, pos); /* converts the integer position to floating point */
+
+          for(int j = 0; j < 3; j++)
+            P[target].GravAccel[j] += fac * pos[j];
+        }
+    }
+#endif
+#endif
+
+  /* multiply by G */
+  for(int idx = 0; idx < Sp.TimeBinsGravity.NActiveParticles; idx++)
+    {
+      int target = Sp.TimeBinsGravity.ActiveParticleList[idx];
+
+      for(int j = 0; j < 3; j++)
+        P[target].GravAccel[j] *= All.G;
+
+#if defined(EVALPOTENTIAL) && defined(PMGRID) && defined(PERIODIC)
+        /* To get correct zero point in potential for TreePM calculation, need to add this term,
+         * because we cannot include the pi/(V*alpha^2) term in the correction potential in real space
+         * since we only touch a restricted set of particles in the tree calculation
+         */
+#ifndef GRAVITY_TALLBOX
+      /* note: for the tallbox, the constant potential term is computed as part of the long-range force and not the short-range
+       * force, unlike in the ordinary periodic case */
+      if(GravTree.DoPM)
+        {
+          double alpha = 0.5 / Sp.Asmth[0];
+          P[target].Potential +=
+              All.TotalMass * M_PI / (alpha * alpha * All.BoxSize * All.BoxSize * All.BoxSize) * (LONG_X * LONG_Y * LONG_Z);
+        }
+#endif
+#endif
+
+#if defined(EVALPOTENTIAL)
+#ifndef FMM
+        /* remove self-interaction */
+#if NSOFTCLASSES > 1
+      P[target].Potential += P[target].getMass() / (All.ForceSoftening[P[target].getSofteningClass()] / 2.8);
+#else
+      P[target].Potential += P[target].getMass() / (All.ForceSoftening[0] / 2.8);
+#endif
+
+#endif
+
+#if defined(FMM) && defined(PERIODIC) && !defined(PMGRID)
+      /* in FMM case, add in interaction with other images in periodic grid */
+      P[target].Potential += P[target].getMass() * Ewald.ewald_gridlookup_origin_D0();
+#endif
+
+      P[target].Potential *= All.G;
+
+#if defined(PMGRID) && !defined(FORCETEST_TESTFORCELAW)
+      P[target].Potential += P[target].PM_Potential; /* add in long-range potential */
+#endif
+#endif
+
+      if(All.ComovingIntegrationOn)
+        {
+#ifndef PERIODIC
+#ifdef EVALPOTENTIAL
+          double fac = -0.5 * All.Omega0 * All.Hubble * All.Hubble;
+
+          double pos[3];
+          Sp.intpos_to_pos(P[target].IntPos, pos); /* converts the integer distance to floating point */
+
+          double r2 = 0;
+          for(int k = 0; k < 3; k++)
+            r2 += pos[k] * pos[k];
+          P[target].Potential += fac * r2;
+#endif
+#endif
+        }
+      else
+        {
+          if(All.OmegaLambda != 0)
+            {
+#ifndef PERIODIC
+              /* Finally, the following factor allows a computation of a cosmological simulation
+                   with vacuum energy in physical coordinates */
+
+              double pos[3];
+              Sp.intpos_to_pos(P[target].IntPos, pos); /* converts the integer distance to floating point */
+
+              double fac = All.OmegaLambda * All.Hubble * All.Hubble;
+
+              for(int j = 0; j < 3; j++)
+                Sp.P[target].GravAccel[j] += fac * pos[j];
+
+#ifdef EVALPOTENTIAL
+              double r2 = 0;
+              for(int k = 0; k < 3; k++)
+                r2 += pos[k] * pos[k];
+              P[target].Potential -= 0.5 * fac * r2;
+#endif
+#endif
+            }
+        }
+    }
+}
+
+#if defined(PMGRID) && (defined(TREEPM_NOTIMESPLIT) || defined(PLACEHIGHRESREGION))
+void sim::gravity_pm(int timebin)
+{
+  double tstart = Logs.second();
+  TIMER_START(CPU_PM_GRAVITY);
+  mpi_printf("TREEPM: Starting PM part of force calculation. (timebin=%d)\n", timebin);
+
+#if !defined(PERIODIC) || defined(PLACEHIGHRESREGION)
+  PM.pm_init_regionsize();
+#endif
+
+  if((GravTree.DoPM & TREE_DO_BASE_PM))
+    {
+#ifdef PERIODIC
+      PM.pmforce_periodic(LOW_MESH, NULL);
+#else
+      /* non periodic PM mesh */
+      PM.pmforce_nonperiodic(LOW_MESH);
+#endif
+    }
+
+#ifdef PLACEHIGHRESREGION
+  if((GravTree.DoPM & TREE_DO_HIGHRES_PM))
+    PM.pmforce_nonperiodic(HIGH_MESH);
+#endif
+
+  mpi_printf("TREEPM: Finished PM part of force calculation.\n");
+  TIMER_STOP(CPU_PM_GRAVITY);
+  double tend               = Logs.second();
+  All.CPUForLastPMExecution = Logs.timediff(tstart, tend);
+}
+#endif
+
+/*! \brief This function computes the long-range PM force for all the particles.
+ *
+ */
+#if defined(PMGRID) && defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+void sim::gravity_long_range_force(void)
+{
+#ifndef SELFGRAVITY
+  return;
+#endif
+
+  double tstart = Logs.second();
+  TIMER_START(CPU_PM_GRAVITY);
+
+  for(int i = 0; i < Sp.NumPart; i++)
+    {
+      Sp.P[i].GravPM[0] = Sp.P[i].GravPM[1] = Sp.P[i].GravPM[2] = 0;
+#ifdef EVALPOTENTIAL
+      Sp.P[i].PM_Potential = 0;
+#endif
+    }
+
+  PM.pmforce_periodic(0, NULL);
+
+  /* multiply with the gravitational constant */
+  for(int i = 0; i < Sp.NumPart; i++)
+    {
+      for(int j = 0; j < 3; j++)
+        Sp.P[i].GravPM[j] *= All.G;
+#ifdef EVALPOTENTIAL
+      Sp.P[i].PM_Potential *= All.G;
+#endif
+    }
+
+  TIMER_STOP(CPU_PM_GRAVITY);
+  double tend               = Logs.second();
+  All.CPUForLastPMExecution = Logs.timediff(tstart, tend);
+
+  Sp.find_long_range_step_constraint();
+}
+#endif
diff --git a/src/gravity/second_order_ics.cc b/src/gravity/second_order_ics.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e35ba836fa6d006057afe9483303a7e9e556783f
--- /dev/null
+++ b/src/gravity/second_order_ics.cc
@@ -0,0 +1,147 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file second_order_ics.cc
+ *
+ *  \brief produce actual ICs from special 2nd-order LPT ICs created by Adrian Jenkins' code
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../gravtree/gravtree.h"
+#include "../io/io.h"
+#include "../main/simulation.h"
+#include "../system/system.h"
+#include "../time_integration/driftfac.h"
+
+#ifdef SECOND_ORDER_LPT_ICS
+
+double sim::F1_Omega(double a)
+{
+  double omega_a = All.Omega0 / (All.Omega0 + a * (1 - All.Omega0 - All.OmegaLambda) + a * a * a * All.OmegaLambda);
+
+  return pow(omega_a, 5.0 / 9);
+}
+
+double sim::F2_Omega(double a)
+{
+  double omega_a = All.Omega0 / (All.Omega0 + a * (1 - All.Omega0 - All.OmegaLambda) + a * a * a * All.OmegaLambda);
+
+  return 2.0 * pow(omega_a, 6.0 / 11);
+}
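+
+/* F1_Omega and F2_Omega are the standard approximations to the first- and second-order linear growth
+ * rates, f1 ~= Omega(a)^(5/9) and f2 ~= 2 * Omega(a)^(6/11) (e.g. Bouchet et al. 1995), evaluated with the
+ * matter density parameter Omega(a) at expansion factor a */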
+
+void sim::second_order_ic_correction(void)
+{
+  if(executed)
+    return;
+
+  mpi_printf("SECOND_ORDER_LPT_ICS: Now producing ICs based on 2nd-order LPT\n");
+
+  particle_data *P = Sp.P;
+
+  /* first, back up masses and set special 2nd LPT masses (which have been read into the field OldAcc) */
+
+  double *mass_bak = (double *)Mem.mymalloc("mass_bak", Sp.NumPart * sizeof(double));
+
+  for(int i = 0; i < Sp.NumPart; i++)
+    {
+      mass_bak[i] = P[i].getMass();
+      P[i].setMass(P[i].OldAcc);
+    }
+
+    /* now do the gravity computation */
+
+#if defined(PMGRID) && defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+  gravity_long_range_force();
+#endif
+  gravity(0);
+
+  if(All.TypeOfOpeningCriterion == 1)
+    gravity(0);
+
+  /* correct the ICs accordingly */
+
+  double a = All.TimeBegin;
+
+  double hubble = Driftfac.hubble_function(a);
+
+  double fac1 = 1.0 / (a * a * hubble * F1_Omega(a)); /* this factor converts the Zeldovich peculiar velocity
+                                                          (expressed as a^2*dX/dt) to a comoving displacement */
+  double fac2 = All.LptScalingfactor;
+
+  double fac3 = fac2 * a * a * hubble * F2_Omega(a);
+
+  mpi_printf("SECOND_ORDER_LPT_ICS: fac1=%g  fac2=%g  fac3=%g\n", fac1, fac2, fac3);
+
+  for(int i = 0; i < Sp.NumPart; i++)
+    {
+      double posdiff[3];
+      for(int j = 0; j < 3; j++)
+        posdiff[j] = fac1 * P[i].Vel[j]; /* Zeldovich displacement */
+
+      MyIntPosType delta[3];
+      Sp.pos_to_signedintpos(posdiff, (MySignedIntPosType *)delta);
+
+      for(int j = 0; j < 3; j++)
+        P[i].IntPos[j] += delta[j];
+
+      double acc[3];
+#if defined(PMGRID) && defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+      for(int j = 0; j < 3; j++)
+        acc[j] = P[i].GravPM[j] + P[i].GravAccel[j];
+#else
+      for(int j = 0; j < 3; j++)
+        acc[j] = P[i].GravAccel[j];
+#endif
+
+      for(int j = 0; j < 3; j++)
+        posdiff[j] = fac2 * acc[j]; /* second order lpt displacement */
+
+      Sp.pos_to_signedintpos(posdiff, (MySignedIntPosType *)delta);
+
+      for(int j = 0; j < 3; j++)
+        P[i].IntPos[j] += delta[j];
+
+      for(int j = 0; j < 3; j++)
+        P[i].Vel[j] += fac3 * acc[j]; /* second order lpt velocity correction */
+    }
+
+  /* now restore the correct masses */
+
+  for(int i = 0; i < Sp.NumPart; i++)
+    {
+      P[i].setMass(mass_bak[i]);
+      P[i].OldAcc = 0;
+    }
+
+  Mem.myfree(mass_bak);
+
+  mpi_printf("SECOND_ORDER_LPT_ICS: finished,\n");
+
+  executed = 1;
+
+  if(All.TypeOfOpeningCriterion == 1)
+    All.RelOpeningCriterionInUse = 0;
+
+  /* put in an extra domain decomposition because particle positions have been shifted */
+
+  NgbTree.treefree();
+  Domain.domain_free();
+  Domain.domain_decomposition(STANDARD);
+  NgbTree.treeallocate(Sp.NumGas, &Sp, &Domain);
+  NgbTree.treebuild(Sp.NumGas, NULL);
+}
+
+#endif
diff --git a/src/gravtree/gravtree.cc b/src/gravtree/gravtree.cc
new file mode 100644
index 0000000000000000000000000000000000000000..af9b91f45457f0e69eb25b7efe7061a1b95be71f
--- /dev/null
+++ b/src/gravtree/gravtree.cc
@@ -0,0 +1,220 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file gravtree.cc
+ *
+ *  \brief driver routines for computing the (short-range) gravity
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdlib.h>
+#include <string.h>
+#include <atomic>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../gravity/ewald.h"
+#include "../gravtree/gravtree.h"
+#include "../gravtree/gwalk.h"
+#include "../logs/logs.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../mpi_utils/shared_mem_handler.h"
+#include "../pm/pm.h"
+#include "../sort/cxxsort.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+
+/*!
+ *  This file contains the code for the gravitational force computation by
+ *  means of the tree algorithm. To this end, a tree force is computed for all
+ *  active local particles, and particles are exported to other processors if
+ *  needed, where they can receive additional force contributions. If the
+ *  TreePM algorithm is enabled, the force computed will only be the
+ *  short-range part.
+ */
+
+#ifdef PMGRID
+
+/*! \brief initializes the short range table
+ *
+ * The short range table contains the complementary error function
+ * needed for the computation of the short range part of the gravity
+ * force/potential in case of the TreePM algorithm.
+ */
+template <typename partset>
+void gravtree<partset>::short_range_init(void)
+{
+  for(int i = 0; i <= NTAB; i++)
+    {
+      double u = (RCUT / 2.0) / NTAB * i;
+
+      /*
+       *  fac0 contains    g = g(u) = (erfc(u) - 1)/u
+       *  fac1 contains    g'
+       *  fac2 contains    g'' - g'/u
+       *  fac3 contains    g''' - 3g''/u + 3g'/u^2
+       *  fac4 contains    g'''' - 6*g'''/u + 15*g''/u^2 - 15*g'/u^3
+       *  fac5 contains    g''''' - 10*g''''/u + 45*g'''/u^2 - 105*g''/u^3 +  105*g'/u^4
+       */
+
+      shortrange_factors[i].fac0 = (u > 0) ? (erfc(u) - 1.0) / u : -2.0 / sqrt(M_PI);
+      shortrange_factors[i].fac1 = (u > 0) ? ((1.0 - erfc(u)) - 2.0 * u / sqrt(M_PI) * exp(-u * u)) / (u * u) : 0;
+#if(MULTIPOLE_ORDER >= 2)
+      shortrange_factors[i].fac2 =
+          (u > 0) ? (-3.0 * (1.0 - erfc(u)) + (6 * u + 4 * pow(u, 3)) / sqrt(M_PI) * exp(-u * u)) / pow(u, 3) : 0;
+#endif
+#if(MULTIPOLE_ORDER >= 3)
+      shortrange_factors[i].fac3 =
+          (u > 0) ? (15.0 * (1.0 - erfc(u)) - (30 * u + 20 * pow(u, 3) + 8 * pow(u, 5)) / sqrt(M_PI) * exp(-u * u)) / pow(u, 4) : 0;
+#endif
+#if(MULTIPOLE_ORDER >= 4)
+      shortrange_factors[i].fac4 =
+          (u > 0) ? (-105.0 * (1.0 - erfc(u)) +
+                     (210.0 * u + 140.0 * pow(u, 3) + 56.0 * pow(u, 5) + 16.0 * pow(u, 7)) / sqrt(M_PI) * exp(-u * u)) /
+                        pow(u, 5)
+                  : 0;
+#endif
+#if(MULTIPOLE_ORDER >= 5)
+      shortrange_factors[i].fac5 = (u > 0) ? (945.0 * (1.0 - erfc(u)) - (1890.0 * u + 1260.0 * pow(u, 3) + 504.0 * pow(u, 5) +
+                                                                         144.0 * pow(u, 7) + 32.0 * pow(u, 9)) /
+                                                                            sqrt(M_PI) * exp(-u * u)) /
+                                                 pow(u, 6)
+                                           : 0;
+#endif
+    }
+}
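+
+/* A sketch of how the table is used (see modify_gfactors_pm_multipole()/..._monopole()): for a separation r
+ * the dimensionless argument is u = 0.5 * r / Asmth, and the table is indexed with u * NTAB / (RCUT/2).
+ * The interpolated factors, scaled by the corresponding powers of 1/(2*Asmth), are added to the Newtonian
+ * ones; e.g. adding asmthinv1 * fac0 = (erfc(u) - 1)/r to the Newtonian 1/r gives erfc(u)/r, the potential
+ * factor of the short-range TreePM force */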
+
+template <typename partset>
+void gravtree<partset>::set_mesh_factors(void)
+{
+  for(int i = 0; i < 2; i++)
+    {
+      mf[i].rcut2       = Tp->Rcut[i] * Tp->Rcut[i];
+      double dblrcut[3] = {Tp->Rcut[i], Tp->Rcut[i],
+                           Tp->Rcut[i]}; /* for stretched boxes, the conversion may be different in each dimension */
+      Tp->pos_to_signedintpos(dblrcut, (MySignedIntPosType *)mf[i].intrcut);
+
+      if(Tp->Asmth[i] > 0)
+        mf[i].asmthinv1 = 0.5 / Tp->Asmth[i];
+      else
+        mf[i].asmthinv1 = 0;
+
+      mf[i].asmthinv2 = mf[i].asmthinv1 * mf[i].asmthinv1;
+#if(MULTIPOLE_ORDER >= 2)
+      mf[i].asmthinv3 = mf[i].asmthinv2 * mf[i].asmthinv1;
+#endif
+#if(MULTIPOLE_ORDER >= 3)
+      mf[i].asmthinv4 = mf[i].asmthinv3 * mf[i].asmthinv1;
+#endif
+#if(MULTIPOLE_ORDER >= 4)
+      mf[i].asmthinv5 = mf[i].asmthinv4 * mf[i].asmthinv1;
+#endif
+#if(MULTIPOLE_ORDER >= 5)
+      mf[i].asmthinv6 = mf[i].asmthinv5 * mf[i].asmthinv1;
+#endif
+
+      mf[i].asmthfac = mf[i].asmthinv1 * (NTAB / (RCUT / 2.0));
+    }
+}
+
+#endif
+
+template <typename partset>
+void gravtree<partset>::gravity_exchange_forces(void)
+{
+  int *send_count  = (int *)Mem.mymalloc_movable(&send_count, "send_count", sizeof(int) * D->NTask);
+  int *send_offset = (int *)Mem.mymalloc_movable(&send_offset, "send_offset", sizeof(int) * D->NTask);
+  int *recv_count  = (int *)Mem.mymalloc_movable(&recv_count, "recv_count", sizeof(int) * D->NTask);
+  int *recv_offset = (int *)Mem.mymalloc_movable(&recv_offset, "recv_offset", sizeof(int) * D->NTask);
+
+  /* now communicate the forces in ResultsActiveImported */
+  for(int j = 0; j < D->NTask; j++)
+    recv_count[j] = 0;
+
+  int n = 0, k = 0;
+
+  for(int i = 0; i < D->NTask; i++)
+    for(int j = 0; j < Recv_count[i]; j++, n++) /* Note that we access Tree.Recv_count here */
+      {
+#ifndef HIERARCHICAL_GRAVITY
+        if(Points[n].ActiveFlag)
+#endif
+          {
+            ResultsActiveImported[k].index = Points[n].index;
+            recv_count[i]++;
+            k++;
+          }
+      }
+  MPI_Alltoall(recv_count, 1, MPI_INT, send_count, 1, MPI_INT, D->Communicator);
+
+  recv_offset[0] = 0;
+  send_offset[0] = 0;
+
+  int Nexport = 0;
+  int Nimport = 0;
+
+  for(int j = 0; j < D->NTask; j++)
+    {
+      Nexport += send_count[j];
+      Nimport += recv_count[j];
+      if(j > 0)
+        {
+          send_offset[j] = send_offset[j - 1] + send_count[j - 1];
+          recv_offset[j] = recv_offset[j - 1] + recv_count[j - 1];
+        }
+    }
+
+  resultsactiveimported_data *tmp_results =
+      (resultsactiveimported_data *)Mem.mymalloc("tmp_results", Nexport * sizeof(resultsactiveimported_data));
+
+  /* exchange  data */
+  for(int ngrp = 1; ngrp < (1 << D->PTask); ngrp++)
+    {
+      int recvTask = D->ThisTask ^ ngrp;
+
+      if(recvTask < D->NTask)
+        {
+          if(send_count[recvTask] > 0 || recv_count[recvTask] > 0)
+            {
+              MPI_Sendrecv(&ResultsActiveImported[recv_offset[recvTask]], recv_count[recvTask] * sizeof(resultsactiveimported_data),
+                           MPI_BYTE, recvTask, TAG_FOF_A, &tmp_results[send_offset[recvTask]],
+                           send_count[recvTask] * sizeof(resultsactiveimported_data), MPI_BYTE, recvTask, TAG_FOF_A, D->Communicator,
+                           MPI_STATUS_IGNORE);
+            }
+        }
+    }
+  for(int i = 0; i < Nexport; i++)
+    {
+      int target = tmp_results[i].index;
+
+      for(int k = 0; k < 3; k++)
+        Tp->P[target].GravAccel[k] += tmp_results[i].GravAccel[k];
+#ifdef EVALPOTENTIAL
+      Tp->P[target].Potential += tmp_results[i].Potential;
+#endif
+
+      if(MeasureCostFlag)
+        Tp->P[target].GravCost += tmp_results[i].GravCost;
+    }
+  Mem.myfree(tmp_results);
+
+  Mem.myfree(recv_offset);
+  Mem.myfree(recv_count);
+  Mem.myfree(send_offset);
+  Mem.myfree(send_count);
+}
+
+/* make sure that we instantiate the template */
+#include "../data/simparticles.h"
+template class gravtree<simparticles>;
diff --git a/src/gravtree/gravtree.h b/src/gravtree/gravtree.h
new file mode 100644
index 0000000000000000000000000000000000000000..046ade24094c3a50a49e75c76de82e3187241f5d
--- /dev/null
+++ b/src/gravtree/gravtree.h
@@ -0,0 +1,688 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file gravtree.h
+ *
+ *  \brief declares the gravitational tree node structure and a class for gravity tree calculation
+ */
+
+#ifndef FORCETREE_H
+#define FORCETREE_H
+
+#include "gadgetconfig.h"
+
+#ifndef NTAB
+#define NTAB 256 /* size of short-range look up */
+#endif
+
+#define TREE_DO_BASE_PM 1
+#define TREE_DO_HIGHRES_PM 2
+#define TREE_ACTIVE_CUTTOFF_BASE_PM 4
+#define TREE_ACTIVE_CUTTOFF_HIGHRES_PM 8
+
+#define TREE_MIN_WORKSTACK_SIZE 100000
+#define TREE_EXPECTED_CYCLES 80
+
+#define NODE_TYPE_LOCAL_PARTICLE 0
+#define NODE_TYPE_TREEPOINT_PARTICLE 1
+#define NODE_TYPE_FETCHED_PARTICLE 2
+#define NODE_TYPE_LOCAL_NODE 3
+#define NODE_TYPE_FETCHED_NODE 4
+
+#define NODE_USE 0
+#define NODE_OPEN 1
+#define NODE_DISCARD 2
+
+#define MAX_TREE_ALLOC_FACTOR 30.0
+
+#define TAKE_NSLOTS_IN_ONE_GO 32
+
+#include <string.h>
+
+#include "../data/simparticles.h"
+#include "../data/symtensors.h"
+#include "../domain/domain.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../tree/tree.h"
+
+/** The tree node data structure. The Nodes array points to the actual memory
+ allocated for the internal nodes, but is shifted such that
+ Nodes[Sp.MaxPart] gives the first allocated node. Note that node
+ numbers less than Sp.MaxPart are the leaf nodes that contain a
+ single particle, and node numbers >= MaxPart+MaxNodes are "pseudo
+ particles" that hang off the toplevel leaf nodes belonging to
+ other tasks. These are not represented by this structure. Instead,
+ the tree traversal for these is saved in the Nextnode, Prevnode
+ and Father arrays, indexed with the node number in the case of
+ real particles and by nodenumber-MaxNodes for pseudo
+ particles.  */
+
+struct gravnode : public basenode
+{
+  MyDouble mass;          /**< mass of node */
+  vector<MyIntPosType> s; /**< center of mass of node (in integer coordinates!) */
+#if(MULTIPOLE_ORDER >= 3) || (MULTIPOLE_ORDER >= 2 && defined(EXTRAPOTTERM))
+  symtensor2<MyDouble> Q2Tensor; /**< quadrupole tensor */
+#endif
+#if(MULTIPOLE_ORDER >= 4) || (MULTIPOLE_ORDER >= 3 && defined(EXTRAPOTTERM))
+  symtensor3<MyDouble> Q3Tensor; /**< octupole tensor */
+#endif
+#if(MULTIPOLE_ORDER >= 5) || (MULTIPOLE_ORDER >= 4 && defined(EXTRAPOTTERM))
+  symtensor4<MyDouble> Q4Tensor; /**< hexadecupole tensor */
+#endif
+#if(MULTIPOLE_ORDER >= 5 && defined(EXTRAPOTTERM))
+  symtensor5<MyDouble> Q5Tensor; /**< triakontadipole tensor */
+#endif
+#ifdef FMM
+  float MinOldAcc; /**< minimum magnitude of old gravitational force. Used in relative opening criterion */
+#endif
+#if(NSOFTCLASSES > 1)
+  unsigned char maxsofttype; /**< hold the maximum gravitational softening of particles */
+  unsigned char minsofttype; /**< hold the minimum gravitational softening of particles */
+#endif
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+  unsigned char overlap_flag : 2;
+#endif
+
+  inline int getSofteningClass(void)
+  {
+#if NSOFTCLASSES > 1
+    return maxsofttype;
+#else
+    return 0;
+#endif
+  }
+};
+
+struct gravpoint_data
+{
+  MyIntPosType IntPos[3];
+  MyDouble Mass;
+  float OldAcc;
+  int index;
+  int no;
+  unsigned char Type;
+#if NSOFTCLASSES > 1
+  unsigned char SofteningClass : 7;
+#endif
+#ifndef HIERARCHICAL_GRAVITY
+  unsigned char ActiveFlag : 1; /* we don't need this for hierarchical gravity as then the particles are always active */
+#endif
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+  unsigned char InsideOutsideFlag : 1;
+#endif
+
+  inline unsigned char getSofteningClass(void)
+  {
+#if NSOFTCLASSES > 1
+    return SofteningClass;
+#else
+    return 0;
+#endif
+  }
+};
+
+struct foreign_gravpoint_data
+{
+  MyIntPosType IntPos[3];
+  MyDouble Mass;
+  int Nextnode;
+  unsigned char Nextnode_shmrank;
+  unsigned char Type;
+  float OldAcc;
+#if NSOFTCLASSES > 1
+  unsigned char SofteningClass;
+#endif
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+  unsigned char InsideOutsideFlag : 1;
+#endif
+
+  inline unsigned char getSofteningClass(void)
+  {
+#if NSOFTCLASSES > 1
+    return SofteningClass;
+#else
+    return 0;
+#endif
+  }
+};
+
+#define HIGHEST_NEEDEDORDER_EWALD_DPHI 1
+
+#if(MULTIPOLE_ORDER >= 3) || (MULTIPOLE_ORDER >= 2 && defined(EXTRAPOTTERM) || (MULTIPOLE_ORDER >= 2 && defined(FMM)))
+#undef HIGHEST_NEEDEDORDER_EWALD_DPHI
+#define HIGHEST_NEEDEDORDER_EWALD_DPHI 2
+#endif
+
+#if(MULTIPOLE_ORDER >= 4) || (MULTIPOLE_ORDER >= 3 && defined(EXTRAPOTTERM) || (MULTIPOLE_ORDER >= 3 && defined(FMM)))
+#undef HIGHEST_NEEDEDORDER_EWALD_DPHI
+#define HIGHEST_NEEDEDORDER_EWALD_DPHI 3
+#endif
+
+#if(MULTIPOLE_ORDER >= 5) || (MULTIPOLE_ORDER >= 4 && defined(EXTRAPOTTERM) || (MULTIPOLE_ORDER >= 4 && defined(FMM)))
+#undef HIGHEST_NEEDEDORDER_EWALD_DPHI
+#define HIGHEST_NEEDEDORDER_EWALD_DPHI 4
+#endif
+
+#if((MULTIPOLE_ORDER >= 5 && defined(EXTRAPOTTERM) && defined(EVALPOTENTIAL)) || (MULTIPOLE_ORDER >= 5 && defined(FMM)) || \
+    defined(EWALD_TEST))
+#undef HIGHEST_NEEDEDORDER_EWALD_DPHI
+#define HIGHEST_NEEDEDORDER_EWALD_DPHI 5
+#endif
+
+#ifdef EXTRA_HIGH_EWALD_ACCURACY
+#define EWALD_TAYLOR_ORDER 3
+#else
+#define EWALD_TAYLOR_ORDER 2
+#endif
+
+/* variables for Ewald correction lookup table */
+struct ewald_data
+{
+  MyReal D0phi;
+  vector<MyReal> D1phi;
+  symtensor2<MyReal> D2phi;
+  symtensor3<MyReal> D3phi;
+#if(HIGHEST_NEEDEDORDER_EWALD_DPHI + EWALD_TAYLOR_ORDER) >= 4
+  symtensor4<MyReal> D4phi;
+#endif
+#if(HIGHEST_NEEDEDORDER_EWALD_DPHI + EWALD_TAYLOR_ORDER) >= 5
+  symtensor5<MyReal> D5phi;
+#endif
+#if(HIGHEST_NEEDEDORDER_EWALD_DPHI + EWALD_TAYLOR_ORDER) >= 6
+  symtensor6<MyReal> D6phi;
+#endif
+#if(HIGHEST_NEEDEDORDER_EWALD_DPHI + EWALD_TAYLOR_ORDER) >= 7
+  symtensor7<MyReal> D7phi;
+#endif
+};
+
+template <typename partset> /* partset will either be 'simparticles' or 'lightconeparticles'  */
+class gravtree : public tree<gravnode, partset, gravpoint_data, foreign_gravpoint_data>
+{
+ public:
+  typedef tree<gravnode, partset, gravpoint_data, foreign_gravpoint_data> basetree;
+  using basetree::Buildtime;
+  using basetree::D;  // this avoids having to write "this->" when accessing these variables from the base class
+  using basetree::EndOfForeignNodes;
+  using basetree::EndOfTreePoints;
+  using basetree::Father;
+  using basetree::FirstNonTopLevelNode;
+  using basetree::Foreign_Nodes;
+  using basetree::get_nodep;
+  using basetree::ImportedNodeOffset;
+  using basetree::IndexList;
+  using basetree::MaxForeignNodes;
+  using basetree::MaxNodes;
+  using basetree::MaxPart;
+  using basetree::Nextnode;
+  using basetree::Ninsert;
+  using basetree::NodeIndex;
+  using basetree::Nodes;
+  using basetree::NumForeignNodes;
+  using basetree::NumForeignPoints;
+  using basetree::NumNodes;
+  using basetree::NumPartExported;
+  using basetree::NumPartImported;
+  using basetree::Points;
+  using basetree::Recv_count;
+  using basetree::Recv_offset;
+  using basetree::ResultIndexList;
+  using basetree::Send_count;
+  using basetree::Send_offset;
+  using basetree::TopNodes;
+  using basetree::Tp;
+  using basetree::TreeForeign_Nodes_offsets;
+  using basetree::TreeForeign_Points_offsets;
+  using basetree::TreeNextnode_offsets;
+  using basetree::TreeNodes_offsets;
+  using basetree::TreeP_offsets;
+  using basetree::TreePoints_offsets;
+  using basetree::TreePS_offsets;
+  using basetree::TreeSharedMem_ThisTask;
+  using basetree::TreeSharedMemBaseAddr;
+  using basetree::TreeSharedMemComm;
+  using basetree::TreeSphP_offsets;
+
+  struct mesh_factors
+  {
+    double rcut2;
+    double asmthfac;
+
+    double asmthinv1;
+    double asmthinv2;
+#if(MULTIPOLE_ORDER >= 2)
+    double asmthinv3;
+#endif
+#if(MULTIPOLE_ORDER >= 3)
+    double asmthinv4;
+#endif
+#if(MULTIPOLE_ORDER >= 4)
+    double asmthinv5;
+#endif
+#if(MULTIPOLE_ORDER >= 5)
+    double asmthinv6;
+#endif
+    MyIntPosType intrcut[3];
+  };
+  mesh_factors mf[2];
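+
+  /* mf[0] presumably holds the short-range cut-off factors for the base PM mesh and mf[1] those for the
+   * high-resolution mesh (when PLACEHIGHRESREGION is used); both are filled in set_mesh_factors() from
+   * Tp->Rcut[] and Tp->Asmth[] */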
+
+  struct resultsactiveimported_data
+  {
+    vector<MyFloat> GravAccel;
+#ifdef EVALPOTENTIAL
+    MyFloat Potential;
+#endif
+    int GravCost;
+    int index;
+  };
+
+  struct index_data
+  {
+    int p;
+    int subnode;
+  };
+
+  /** Variables for gravitational tree */
+
+  char MeasureCostFlag;
+
+  resultsactiveimported_data *ResultsActiveImported;
+
+  ewald_data PotTaylor;
+  int num_layers = 0;
+
+ private:
+  /** Gives next node in tree walk for the "particle" nodes. Entries 0
+   -- MaxPart-1 are the real particles, and the "pseudoparticles" are
+   indexed by the node number-MaxNodes. */
+
+  /** Gives previous node in tree walk for the leaf (particle)
+   nodes. Entries 0 -- MaxPart-1 are the real particles, and the
+   "pseudoparticles" are indexed by the node number-MaxNodes. */
+
+ private:
+  /* private member functions */
+
+  void update_node_recursive(int no, int sib, int mode) override;
+  void exchange_topleafdata(void) override;
+  void fill_in_export_points(gravpoint_data *exp_point, int i, int no) override;
+  void report_log_message(void) override;
+
+ public:
+  void set_softenings(void);
+
+#ifdef ALLOW_DIRECT_SUMMATION
+  void gravity_direct(partset *Pptr, domain<partset> *Dptr, int timebin);
+#endif
+
+  void gravity_exchange_forces(void);
+
+  /** public functions */
+ public:
+#ifdef PMGRID
+  void short_range_init(void);
+  void set_mesh_factors(void);
+#endif
+
+ public:
+#ifdef PMGRID
+  /*! \brief variable for short-range lookup table
+   *
+   *  contains the factor needed for the short range
+   *  contribution of the tree to the gravity force
+   */
+  struct
+  {
+    float fac0;
+    float fac1;
+#if(MULTIPOLE_ORDER >= 2)
+    float fac2;
+#endif
+#if(MULTIPOLE_ORDER >= 3)
+    float fac3;
+#endif
+#if(MULTIPOLE_ORDER >= 4)
+    float fac4;
+#endif
+#if(MULTIPOLE_ORDER >= 5)
+    float fac5;
+#endif
+  } shortrange_factors[NTAB + 1];
+#endif
+
+ public:
+#if defined(PMGRID)
+  char DoPM;
+#endif
+  char DoEwald;
+
+  struct gfactors
+  {
+    MyReal rinv2;
+#if(MULTIPOLE_ORDER >= 2)
+    MyReal rinv3;
+#endif
+
+    MyReal fac0;  //    g
+    MyReal fac1;  //    g'
+#if(MULTIPOLE_ORDER >= 2)
+    MyReal fac2;  //    (g'' - g'/r)
+#endif
+#if(MULTIPOLE_ORDER >= 3)
+    MyReal fac3;  //    (g''' - 3*g''/r + 3 * g'/r^2) = (g''' - (3/r) * fac2)
+#endif
+#if(MULTIPOLE_ORDER >= 4)
+    MyReal fac4;  //    (g'''' - 6*g'''/r + 15*g''/r^2 - 15*g'/r^3)  =  (g'''' - (6/r) * fac3 - (3/r^2) * fac2)
+#endif
+#if(MULTIPOLE_ORDER >= 5)
+    MyReal fac5;  //    (g''''' - 10*g''''/r + 45*g'''/r^2 - 105*g''/r^3 +  105 g'/r^4)  =  (g''''' - (10/r) * fac4 - (15/r^2) * fac3)
+#endif
+
+    gfactors() /* constructor, initialize the factors to zero */
+    {
+      fac0 = 0;
+      fac1 = 0;
+#if(MULTIPOLE_ORDER >= 2)
+      fac2 = 0;
+#endif
+#if(MULTIPOLE_ORDER >= 3)
+      fac3 = 0;
+#endif
+#if(MULTIPOLE_ORDER >= 4)
+      fac4 = 0;
+#endif
+#if(MULTIPOLE_ORDER >= 5)
+      fac5 = 0;
+#endif
+    }
+  };
+
+  template <typename T>
+  inline void get_gfactors_multipole(gfactors &res, const T r, const T h_max, const T rinv)
+  {
+    res.rinv2 = rinv * rinv;
+
+#if(MULTIPOLE_ORDER >= 2)
+    res.rinv3 = res.rinv2 * rinv;
+#endif
+
+    if(r >= h_max)
+      {
+#ifdef EVALPOTENTIAL
+        res.fac0 += rinv;
+#endif
+        res.fac1 -= res.rinv2;
+#if(MULTIPOLE_ORDER >= 2)
+        res.fac2 += 3 * res.rinv3;
+#endif
+#if(MULTIPOLE_ORDER >= 3)
+        res.fac3 -= 15 * res.rinv3 * rinv;
+#endif
+#if(MULTIPOLE_ORDER >= 4)
+        res.fac4 += 105 * res.rinv3 * res.rinv2;
+#endif
+#if(MULTIPOLE_ORDER >= 5)
+        res.fac5 -= 945 * res.rinv3 * res.rinv3;
+#endif
+      }
+    else
+      {
+        T h_inv  = 1 / h_max;
+        T h2_inv = h_inv * h_inv;
+        T u      = r * h_inv;
+        T fac1;
+#if(MULTIPOLE_ORDER >= 2)
+        T h3_inv = h_inv * h2_inv;
+        T gpp;
+#endif
+#if(MULTIPOLE_ORDER >= 3)
+        T gppp;
+#endif
+#if(MULTIPOLE_ORDER >= 4)
+        T gpppp;
+#endif
+#if(MULTIPOLE_ORDER >= 5)
+        T gppppp;
+#endif
+        if(u < static_cast<T>(0.5))
+          {
+            T u2 = u * u;
+#ifdef EVALPOTENTIAL
+            res.fac0 -= h_inv * (static_cast<T>(SOFTFAC4) +
+                                 u2 * (static_cast<T>(SOFTFAC5) + u2 * (static_cast<T>(SOFTFAC6) * u + static_cast<T>(SOFTFAC7))));
+#endif
+            fac1 = -h2_inv * u * (static_cast<T>(SOFTFAC1) + u2 * (static_cast<T>(SOFTFAC2) * u + static_cast<T>(SOFTFAC3)));
+            res.fac1 += fac1;
+#if(MULTIPOLE_ORDER >= 2)
+            gpp = -h3_inv * (static_cast<T>(SOFTFAC30) + (static_cast<T>(SOFTFAC31) + static_cast<T>(SOFTFAC32) * u) * u2);
+#endif
+#if(MULTIPOLE_ORDER >= 3)
+            gppp = -h3_inv * h_inv * (static_cast<T>(SOFTFAC33) * u + static_cast<T>(SOFTFAC34) * u2);
+#endif
+#if(MULTIPOLE_ORDER >= 4)
+            gpppp = -h3_inv * h2_inv * (static_cast<T>(SOFTFAC33) + static_cast<T>(SOFTFAC35) * u);
+#endif
+#if(MULTIPOLE_ORDER >= 5)
+            gppppp = -h3_inv * h3_inv * static_cast<T>(SOFTFAC35);
+#endif
+          }
+        else
+          {
+            T u2    = u * u;
+            T u3    = u2 * u;
+            T u3inv = 1 / u3;
+#ifdef EVALPOTENTIAL
+            res.fac0 -=
+                h_inv * (static_cast<T>(SOFTFAC13) + static_cast<T>(SOFTFAC14) / u +
+                         u2 * (static_cast<T>(SOFTFAC1) +
+                               u * (static_cast<T>(SOFTFAC15) + u * (static_cast<T>(SOFTFAC16) + static_cast<T>(SOFTFAC17) * u))));
+#endif
+            fac1 = -h2_inv * u *
+                   (static_cast<T>(SOFTFAC8) + static_cast<T>(SOFTFAC9) * u + static_cast<T>(SOFTFAC10) * u2 +
+                    static_cast<T>(SOFTFAC11) * u3 + static_cast<T>(SOFTFAC12) * u3inv);
+            res.fac1 += fac1;
+#if(MULTIPOLE_ORDER >= 2)
+            gpp = -h3_inv * (static_cast<T>(SOFTFAC40) + static_cast<T>(SOFTFAC41) / (u * u2) + static_cast<T>(SOFTFAC42) * u +
+                             static_cast<T>(SOFTFAC43) * u2 + static_cast<T>(SOFTFAC44) * u2 * u);
+#endif
+#if(MULTIPOLE_ORDER >= 3)
+            gppp = -h3_inv * h_inv *
+                   (static_cast<T>(SOFTFAC45) + static_cast<T>(SOFTFAC46) / (u2 * u2) + static_cast<T>(SOFTFAC47) * u +
+                    static_cast<T>(SOFTFAC48) * u2);
+#endif
+#if(MULTIPOLE_ORDER >= 4)
+            gpppp =
+                -h3_inv * h2_inv * (static_cast<T>(SOFTFAC49) / (u2 * u3) + static_cast<T>(SOFTFAC47) + static_cast<T>(SOFTFAC50) * u);
+#endif
+#if(MULTIPOLE_ORDER >= 5)
+            gppppp = -h3_inv * h3_inv * (static_cast<T>(SOFTFAC51) / (u3 * u3) + static_cast<T>(SOFTFAC50));
+#endif
+          }
+#if(MULTIPOLE_ORDER >= 2)
+        T fac2 = (gpp - rinv * fac1);
+        res.fac2 += fac2;
+#endif
+#if(MULTIPOLE_ORDER >= 3)
+        T fac3 = (gppp - 3 * rinv * fac2);
+        res.fac3 += fac3;
+#endif
+#if(MULTIPOLE_ORDER >= 4)
+        T fac4 = (gpppp - 6 * rinv * fac3 - 3 * rinv * rinv * fac2);
+        res.fac4 += fac4;
+#endif
+#if(MULTIPOLE_ORDER >= 5)
+        T fac5 = (gppppp - 10 * rinv * fac4 - 15 * rinv * rinv * fac3);
+        res.fac5 += fac5;
+#endif
+      }
+  }
+
+  template <typename T>
+  inline void get_gfactors_monopole(gfactors &res, const T r, const T h_max, const T rinv)
+  {
+    if(r > h_max)
+      {
+        res.fac1 -= rinv * rinv;
+#ifdef EVALPOTENTIAL
+        res.fac0 += rinv;
+#endif
+      }
+    else
+      {
+        T h_inv  = 1 / h_max;
+        T h2_inv = h_inv * h_inv;
+        T u      = r * h_inv;
+
+        if(u < 0.5f)
+          {
+            T u2 = u * u;
+            res.fac1 -= h2_inv * u * (static_cast<T>(SOFTFAC1) + u2 * (static_cast<T>(SOFTFAC2) * u + static_cast<T>(SOFTFAC3)));
+#ifdef EVALPOTENTIAL
+            res.fac0 -= h_inv * (static_cast<T>(SOFTFAC4) +
+                                 u2 * (static_cast<T>(SOFTFAC5) + u2 * (static_cast<T>(SOFTFAC6) * u + static_cast<T>(SOFTFAC7))));
+#endif
+          }
+        else
+          {
+            T u2 = u * u;
+            T u3 = u2 * u;
+            res.fac1 -= h2_inv * u *
+                        (static_cast<T>(SOFTFAC8) + static_cast<T>(SOFTFAC9) * u + static_cast<T>(SOFTFAC10) * u2 +
+                         static_cast<T>(SOFTFAC11) * u3 + static_cast<T>(SOFTFAC12) / u3);
+#ifdef EVALPOTENTIAL
+            res.fac0 -=
+                h_inv * (static_cast<T>(SOFTFAC13) + static_cast<T>(SOFTFAC14) / u +
+                         u2 * (static_cast<T>(SOFTFAC1) +
+                               u * (static_cast<T>(SOFTFAC15) + u * (static_cast<T>(SOFTFAC16) + static_cast<T>(SOFTFAC17) * u))));
+#endif
+          }
+      }
+  }
+
+  template <typename T>
+  inline void get_gfactors_potential(gfactors &res, const T r, const T hmax, const T rinv)
+  {
+    if(r >= hmax)
+      {
+        res.fac0 = rinv;
+#if(MULTIPOLE_ORDER >= 3)
+        T rinv3  = rinv * rinv * rinv;
+        res.fac1 = -rinv * rinv;
+        res.fac2 = 3 * rinv3;
+#endif
+      }
+    else
+      {
+        T h_inv = 1 / hmax;
+#if(MULTIPOLE_ORDER >= 3)
+        T h2_inv = h_inv * h_inv;
+        T h3_inv = h2_inv * h_inv;
+#endif
+        T u = r * h_inv;
+#if(MULTIPOLE_ORDER >= 3)
+        T fac1;
+#endif
+        if(u < 0.5)
+          {
+            T u2 = u * u;
+#if(MULTIPOLE_ORDER >= 3)
+            fac1 = -h2_inv * u * (static_cast<T>(SOFTFAC1) + u2 * (static_cast<T>(SOFTFAC2) * u + static_cast<T>(SOFTFAC3)));
+            res.fac1 += fac1;
+#endif
+            res.fac0 = -h_inv * (static_cast<T>(SOFTFAC4) +
+                                 u2 * (static_cast<T>(SOFTFAC5) + u2 * (static_cast<T>(SOFTFAC6) * u + static_cast<T>(SOFTFAC7))));
+          }
+        else
+          {
+            T u2 = u * u;
+#if(MULTIPOLE_ORDER >= 3)
+            T u3 = u2 * u;
+            fac1 = -h2_inv * u *
+                   (static_cast<T>(SOFTFAC8) + static_cast<T>(SOFTFAC9) * u + static_cast<T>(SOFTFAC10) * u2 +
+                    static_cast<T>(SOFTFAC11) * u3 + static_cast<T>(SOFTFAC12) / u3);
+            res.fac1 += fac1;
+#endif
+            res.fac0 =
+                -h_inv * (static_cast<T>(SOFTFAC13) + static_cast<T>(SOFTFAC14) / u +
+                          u2 * (static_cast<T>(SOFTFAC1) +
+                                u * (static_cast<T>(SOFTFAC15) + u * (static_cast<T>(SOFTFAC16) + static_cast<T>(SOFTFAC17) * u))));
+          }
+
+#if(MULTIPOLE_ORDER >= 3)
+        T gpp;
+        T u2 = u * u;
+
+        if(u < 0.5)
+          gpp = -h3_inv * (static_cast<T>(SOFTFAC30) + (static_cast<T>(SOFTFAC31) + static_cast<T>(SOFTFAC32) * u) * u2);
+        else
+          gpp = -h3_inv * (static_cast<T>(SOFTFAC40) + static_cast<T>(SOFTFAC41) / (u * u2) + static_cast<T>(SOFTFAC42) * u +
+                           static_cast<T>(SOFTFAC43) * u2 + static_cast<T>(SOFTFAC44) * u2 * u);
+
+        res.fac2 = (gpp - rinv * fac1);
+#endif
+      }
+  }
+
+#ifdef PMGRID
+  inline bool modify_gfactors_pm_multipole(gfactors &res, const double r, const double rinv, const mesh_factors *mfp)
+  {
+    double tabentry = mfp->asmthfac * r;
+    int tabindex    = (int)tabentry;
+
+    if(tabindex < NTAB)
+      {
+        double w1 = tabentry - tabindex;
+        double w0 = 1 - w1;
+
+#ifdef EVALPOTENTIAL
+        res.fac0 += mfp->asmthinv1 * (w0 * shortrange_factors[tabindex].fac0 + w1 * shortrange_factors[tabindex + 1].fac0);
+#endif
+        res.fac1 += mfp->asmthinv2 * (w0 * shortrange_factors[tabindex].fac1 + w1 * shortrange_factors[tabindex + 1].fac1);
+
+#if(MULTIPOLE_ORDER >= 2)
+        res.fac2 += mfp->asmthinv3 * (w0 * shortrange_factors[tabindex].fac2 + w1 * shortrange_factors[tabindex + 1].fac2);
+#endif
+#if(MULTIPOLE_ORDER >= 3)
+        res.fac3 += mfp->asmthinv4 * (w0 * shortrange_factors[tabindex].fac3 + w1 * shortrange_factors[tabindex + 1].fac3);
+#endif
+#if(MULTIPOLE_ORDER >= 4)
+        res.fac4 += mfp->asmthinv5 * (w0 * shortrange_factors[tabindex].fac4 + w1 * shortrange_factors[tabindex + 1].fac4);
+#endif
+#if(MULTIPOLE_ORDER >= 5)
+        res.fac5 += mfp->asmthinv6 * (w0 * shortrange_factors[tabindex].fac5 + w1 * shortrange_factors[tabindex + 1].fac5);
+#endif
+        return false;
+      }
+    else
+      return true;
+  }
+
+  inline bool modify_gfactors_pm_monopole(gfactors &res, const double r, const double rinv, const mesh_factors *mfp)
+  {
+    double tabentry = mfp->asmthfac * r;
+    int tabindex    = (int)tabentry;
+
+    if(tabindex < NTAB)
+      {
+        double w1 = tabentry - tabindex;
+        double w0 = 1 - w1;
+
+#ifdef EVALPOTENTIAL
+        res.fac0 += mfp->asmthinv1 * (w0 * shortrange_factors[tabindex].fac0 + w1 * shortrange_factors[tabindex + 1].fac0);
+#endif
+        res.fac1 += mfp->asmthinv2 * (w0 * shortrange_factors[tabindex].fac1 + w1 * shortrange_factors[tabindex + 1].fac1);
+
+        return false;
+      }
+    else
+      return true;
+  }
+
+#endif
+};
+
+#endif
diff --git a/src/gravtree/gravtree_build.cc b/src/gravtree/gravtree_build.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7fb360db5f700d4448084902654f5d847df42e22
--- /dev/null
+++ b/src/gravtree/gravtree_build.cc
@@ -0,0 +1,634 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file gravtree_build.cc
+ *
+ *  \brief routines for building the gravitational tree
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../gravtree/gravtree.h"
+#include "../io/io.h"
+#include "../logs/logs.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../ngbtree/ngbtree.h"
+#include "../pm/pm.h"
+#include "../sort/cxxsort.h"
+#include "../sort/peano.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+
+/*!
+ *  This file contains the construction of the tree used for calculating the gravitational force.
+ *  The type of tree implemented is a geometrical oct-tree, starting from a cube encompassing
+ *  all particles. This cube is automatically found in the domain decomposition, which also
+ *  splits up the global "top-level" tree along node boundaries, moving the particles
+ *  of different parts of the tree to separate processors. In this version of the code, the tree
+ *  construction may be repeated every timestep without a renewed domain decomposition.
+ *  If particles are on the "wrong" processor because a new domain decomposition has not been
+ *  carried out, they are sent as temporary points to the right insertion processor according
+ *  to the layout of the top-level nodes. In addition, the mapping of the top-level nodes to
+ *  processors may be readjusted in order to improve work-load balance for the current time step.
+ *
+ */
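+
+/* Index ranges used below when walking the tree (as implied by the range checks in
+ * update_node_recursive() and related routines): indices in [0, MaxPart) refer to
+ * local particles, [MaxPart, MaxPart + MaxNodes) to tree nodes (with the first
+ * D->NTopnodes of them being the top-level nodes), the following D->NTopleaves
+ * entries to pseudo particles standing in for top-level leaves, and indices at or
+ * above ImportedNodeOffset to points imported from other tasks. */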
+
+template <typename partset>
+void gravtree<partset>::report_log_message(void)
+{
+  TIMER_START(CPU_LOGS);
+
+  int max_imported;
+  long long tot_imported;
+  sumup_large_ints(1, &NumPartImported, &tot_imported, D->Communicator);
+  MPI_Reduce(&NumPartImported, &max_imported, 1, MPI_INT, MPI_MAX, 0, D->Communicator);
+
+  MyReal numnodes = NumNodes, tot_numnodes;
+  MPI_Reduce(&numnodes, &tot_numnodes, 1, MPI_DOUBLE, MPI_SUM, 0, D->Communicator);
+
+  if(Ninsert == Tp->NumPart)
+    D->mpi_printf(
+        "GRAVTREE: Tree construction done. took %g sec  <numnodes>=%g  NTopnodes=%d NTopleaves=%d tree-build-scalability=%g\n",
+        Buildtime, tot_numnodes / D->NTask, D->NTopnodes, D->NTopleaves,
+        ((double)((tot_numnodes - D->NTask * ((double)D->NTopnodes)) + D->NTopnodes)) / (tot_numnodes > 0 ? tot_numnodes : 1));
+
+  TIMER_STOP(CPU_LOGS);
+}
+
+template <>
+void gravtree<simparticles>::fill_in_export_points(gravpoint_data *exp_point, int i, int no)
+{
+  /* this point has to go to another task */
+  for(int j = 0; j < 3; j++)
+    exp_point->IntPos[j] = Tp->P[i].IntPos[j];
+
+  exp_point->Mass   = Tp->P[i].getMass();
+  exp_point->OldAcc = Tp->P[i].OldAcc;
+  exp_point->index  = i;
+  exp_point->Type   = Tp->P[i].getType();
+#if NSOFTCLASSES > 1
+  exp_point->SofteningClass = Tp->P[i].getSofteningClass();
+#endif
+  exp_point->no = no;
+#ifndef HIERARCHICAL_GRAVITY
+  if(Tp->TimeBinSynchronized[Tp->P[i].TimeBinGrav])
+    exp_point->ActiveFlag = 1;
+  else
+    exp_point->ActiveFlag = 0;
+#endif
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+  exp_point->InsideOutsideFlag = Tp->P[i].InsideOutsideFlag;
+#endif
+}
+
+#if defined(LIGHTCONE) && (defined(LIGHTCONE_PARTICLES_GROUPS) || defined(LIGHTCONE_IMAGE_COMP_HSML_VELDISP))
+
+template <>
+void gravtree<lcparticles>::fill_in_export_points(gravpoint_data *exp_point, int i, int no)
+{
+  /* this point has to go to another task */
+  for(int j = 0; j < 3; j++)
+    exp_point->IntPos[j] = Tp->P[i].IntPos[j];
+
+  exp_point->Mass   = Tp->P[i].getMass();
+  exp_point->OldAcc = 0;
+  exp_point->index  = i;
+  exp_point->Type   = Tp->P[i].getType();
+#if NSOFTCLASSES > 1
+  exp_point->SofteningClass = Tp->P[i].getSofteningClass();
+#endif
+  exp_point->no = no;
+}
+
+#endif
+
+/*! This function communicates the values of the multipole moments of the
+ *  top-level tree-nodes of the domain grid.  This data can then be used to
+ *  update the pseudo-particles on each CPU accordingly.
+ */
+template <typename partset>
+void gravtree<partset>::exchange_topleafdata(void)
+{
+  struct leafnode_data
+  {
+    vector<MyIntPosType> s;
+    MyDouble mass;
+#if(MULTIPOLE_ORDER >= 3) || (MULTIPOLE_ORDER >= 2 && defined(EXTRAPOTTERM))
+    symtensor2<MyDouble> Q2Tensor;
+#endif
+#if(MULTIPOLE_ORDER >= 4) || (MULTIPOLE_ORDER >= 3 && defined(EXTRAPOTTERM))
+    symtensor3<MyDouble> Q3Tensor;
+#endif
+#if(MULTIPOLE_ORDER >= 5) || (MULTIPOLE_ORDER >= 4 && defined(EXTRAPOTTERM))
+    symtensor4<MyDouble> Q4Tensor;
+#endif
+#if(MULTIPOLE_ORDER >= 5 && defined(EXTRAPOTTERM))
+    symtensor5<MyDouble> Q5Tensor;
+#endif
+#ifdef FMM
+    float MinOldAcc;
+#endif
+    unsigned char not_empty;
+#if NSOFTCLASSES > 1
+    unsigned char maxsofttype;
+    unsigned char minsofttype;
+#endif
+    unsigned char level;
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+    unsigned char overlap_flag : 2;
+#endif
+  };
+  leafnode_data *glob_leaf_node_data, *loc_leaf_node_data;
+
+  glob_leaf_node_data = (leafnode_data *)Mem.mymalloc("glob_leaf_node_data", D->NTopleaves * sizeof(leafnode_data));
+
+  /* share the pseudo-particle data across CPUs */
+  int *recvcounts = (int *)Mem.mymalloc("recvcounts", sizeof(int) * D->NTask);
+  int *recvoffset = (int *)Mem.mymalloc("recvoffset", sizeof(int) * D->NTask);
+  int *bytecounts = (int *)Mem.mymalloc("bytecounts", sizeof(int) * D->NTask);
+  int *byteoffset = (int *)Mem.mymalloc("byteoffset", sizeof(int) * D->NTask);
+
+  for(int task = 0; task < D->NTask; task++)
+    recvcounts[task] = 0;
+
+  for(int n = 0; n < D->NTopleaves; n++)
+    recvcounts[D->TaskOfLeaf[n]]++;
+
+  for(int task = 0; task < D->NTask; task++)
+    bytecounts[task] = recvcounts[task] * sizeof(leafnode_data);
+
+  recvoffset[0] = 0;
+  byteoffset[0] = 0;
+
+  for(int task = 1; task < D->NTask; task++)
+    {
+      recvoffset[task] = recvoffset[task - 1] + recvcounts[task - 1];
+      byteoffset[task] = byteoffset[task - 1] + bytecounts[task - 1];
+    }
+
+  loc_leaf_node_data = (leafnode_data *)Mem.mymalloc("loc_leaf_node_data", recvcounts[D->ThisTask] * sizeof(leafnode_data));
+
+  int idx = 0;
+
+  for(int n = 0; n < D->NTopleaves; n++)
+    {
+      if(D->TaskOfLeaf[n] == D->ThisTask)
+        {
+          int no        = NodeIndex[n];
+          gravnode *nop = &TopNodes[no];
+
+          leafnode_data *locp = &loc_leaf_node_data[idx];
+
+          /* read out the multipole moments from the local base cells */
+          locp->s    = nop->s;
+          locp->mass = nop->mass;
+#if(MULTIPOLE_ORDER >= 3) || (MULTIPOLE_ORDER >= 2 && defined(EXTRAPOTTERM))
+          locp->Q2Tensor = nop->Q2Tensor;
+#endif
+#if(MULTIPOLE_ORDER >= 4) || (MULTIPOLE_ORDER >= 3 && defined(EXTRAPOTTERM))
+          locp->Q3Tensor = nop->Q3Tensor;
+#endif
+#if(MULTIPOLE_ORDER >= 5) || (MULTIPOLE_ORDER >= 4 && defined(EXTRAPOTTERM))
+          locp->Q4Tensor = nop->Q4Tensor;
+#endif
+#if(MULTIPOLE_ORDER >= 5 && defined(EXTRAPOTTERM))
+          locp->Q5Tensor = nop->Q5Tensor;
+#endif
+#if NSOFTCLASSES > 1
+          locp->maxsofttype = nop->maxsofttype;
+          locp->minsofttype = nop->minsofttype;
+#endif
+#ifdef FMM
+          locp->MinOldAcc = nop->MinOldAcc;
+#endif
+          locp->not_empty = nop->not_empty;
+          locp->level     = nop->level;
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+          locp->overlap_flag = nop->overlap_flag;
+#endif
+          idx++;
+        }
+    }
+
+  // optimise this step - only need to update this once per shared memory node
+
+  MPI_Allgatherv(loc_leaf_node_data, bytecounts[D->ThisTask], MPI_BYTE, glob_leaf_node_data, bytecounts, byteoffset, MPI_BYTE,
+                 D->Communicator);
+
+  for(int task = 0; task < D->NTask; task++)
+    recvcounts[task] = 0;
+
+  for(int n = 0; n < D->NTopleaves; n++)
+    {
+      int task = D->TaskOfLeaf[n];
+      if(task != D->ThisTask)
+        {
+          int no        = NodeIndex[n];
+          gravnode *nop = &TopNodes[no];
+
+          idx                  = recvoffset[task] + recvcounts[task]++;
+          leafnode_data *globp = &glob_leaf_node_data[idx];
+
+          nop->s    = globp->s;
+          nop->mass = globp->mass;
+#if(MULTIPOLE_ORDER >= 3) || (MULTIPOLE_ORDER >= 2 && defined(EXTRAPOTTERM))
+          nop->Q2Tensor = globp->Q2Tensor;
+#endif
+#if(MULTIPOLE_ORDER >= 4) || (MULTIPOLE_ORDER >= 3 && defined(EXTRAPOTTERM))
+          nop->Q3Tensor = globp->Q3Tensor;
+#endif
+#if(MULTIPOLE_ORDER >= 5) || (MULTIPOLE_ORDER >= 4 && defined(EXTRAPOTTERM))
+          nop->Q4Tensor = globp->Q4Tensor;
+#endif
+#if(MULTIPOLE_ORDER >= 5 && defined(EXTRAPOTTERM))
+          nop->Q5Tensor = globp->Q5Tensor;
+#endif
+#if NSOFTCLASSES > 1
+          nop->maxsofttype = globp->maxsofttype;
+          nop->minsofttype = globp->minsofttype;
+#endif
+          nop->not_empty = globp->not_empty;
+#ifdef FMM
+          nop->MinOldAcc = globp->MinOldAcc;
+#endif
+          nop->level = globp->level;
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+          nop->overlap_flag = globp->overlap_flag;
+#endif
+        }
+    }
+
+  Mem.myfree(loc_leaf_node_data);
+  Mem.myfree(byteoffset);
+  Mem.myfree(bytecounts);
+  Mem.myfree(recvoffset);
+  Mem.myfree(recvcounts);
+  Mem.myfree(glob_leaf_node_data);
+}
+
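+/* A note on how the moments are assembled in the routine below: the sums over
+ * daughter particles/nodes are first accumulated relative to the geometric node
+ * centre and only shifted to the centre of mass s at the very end.  For the
+ * quadrupole this is the familiar parallel-axis correction,
+ *
+ *     Q2 = sum_i m_i dx_i (x) dx_i  -  M s (x) s ,
+ *
+ * where dx_i is the offset of contribution i from the node centre, M the total node
+ * mass and (x) the symmetric outer product; the higher-order tensors are shifted in
+ * the analogous way through the outer_prod_sum() terms. */
+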
+/*! this routine determines the multipole moments for a given internal node
+ *  and all its subnodes using a recursive computation.  The result is
+ *  stored in the Nodes[] structure in the sequence of this tree-walk.
+ *  'no' is the node for which the moments shall be found, and
+ *  'sib' is the sibling of this node.
+ */
+template <typename partset>
+void gravtree<partset>::update_node_recursive(int no, int sib, int mode)
+{
+  if(!(no >= MaxPart && no < MaxPart + MaxNodes)) /* are we an internal node? */
+    Terminate("no internal node\n");
+
+  gravnode *nop = get_nodep(no);
+
+  if(mode == TREE_MODE_TOPLEVEL)
+    {
+      int p = nop->nextnode;
+
+      /* if the next node is not a top-level node, we have reached a leaf node, and we need to do nothing */
+      if(p < MaxPart || p >= FirstNonTopLevelNode)
+        return;
+    }
+
+  MyReal mass = 0;
+  vector<MyReal> s(0.0);
+
+#if(MULTIPOLE_ORDER >= 3) || (MULTIPOLE_ORDER >= 2 && defined(EXTRAPOTTERM))
+  symtensor2<MyReal> Q2Tensor(0.0); /**< quadrupole tensor */
+#endif
+#if(MULTIPOLE_ORDER >= 4) || (MULTIPOLE_ORDER >= 3 && defined(EXTRAPOTTERM))
+  symtensor3<MyReal> Q3Tensor(0.0); /**< octupole tensor */
+#endif
+#if(MULTIPOLE_ORDER >= 5) || (MULTIPOLE_ORDER >= 4 && defined(EXTRAPOTTERM))
+  symtensor4<MyReal> Q4Tensor(0.0); /**< hexadecupole tensor */
+#endif
+#if(MULTIPOLE_ORDER >= 5 && defined(EXTRAPOTTERM))
+  symtensor5<MyReal> Q5Tensor(0.0); /**< triakontadipole tensor */
+#endif
+#if NSOFTCLASSES > 1
+  unsigned char maxsofttype = NSOFTCLASSES + NSOFTCLASSES_HYDRO;
+  unsigned char minsofttype = NSOFTCLASSES + NSOFTCLASSES_HYDRO + 1;
+#endif
+  unsigned char not_empty = 0;
+#ifdef FMM
+  float minOldAcc = MAX_FLOAT_NUMBER;
+#endif
+
+  int p = nop->nextnode;
+
+  while(p != nop->sibling)
+    {
+      if(p >= 0)
+        {
+          if(p >= MaxPart && p < MaxPart + MaxNodes) /* we have an internal node */
+            {
+              int nextsib = get_nodep(p)->sibling;
+
+              update_node_recursive(p, nextsib, mode);
+            }
+
+          if(p < MaxPart) /* a particle */
+            {
+              vector<MyReal> dxyz;
+              Tp->nearest_image_intpos_to_pos(Tp->P[p].IntPos, nop->center.da, dxyz.da);
+
+              MyReal m = Tp->P[p].getMass();
+
+              mass += m;
+              s += m * dxyz;
+
+#if(MULTIPOLE_ORDER >= 3) || (MULTIPOLE_ORDER >= 2 && defined(EXTRAPOTTERM))
+              vector<MyReal> mdxyz = m * dxyz;
+              symtensor2<MyReal> mdxyz2(mdxyz, dxyz);
+              Q2Tensor += mdxyz2;
+#endif
+#if(MULTIPOLE_ORDER >= 4) || (MULTIPOLE_ORDER >= 3 && defined(EXTRAPOTTERM))
+              symtensor3<MyReal> mdxyz3(dxyz, mdxyz2);
+              Q3Tensor += mdxyz3;
+#endif
+#if(MULTIPOLE_ORDER >= 5) || (MULTIPOLE_ORDER >= 4 && defined(EXTRAPOTTERM))
+              symtensor4<MyReal> mdxyz4(dxyz, mdxyz3);
+              Q4Tensor += mdxyz4;
+#endif
+#if(MULTIPOLE_ORDER >= 5 && defined(EXTRAPOTTERM))
+              symtensor5<MyReal> mdxyz5(dxyz, mdxyz4);
+              Q5Tensor += mdxyz5;
+#endif
+              not_empty = 1;
+#ifndef HIERARCHICAL_GRAVITY
+              if(Tp->getTimeBinSynchronized(Tp->P[p].getTimeBinGrav()))
+#endif
+                {
+#ifdef FMM
+                  if(minOldAcc > Tp->P[p].getOldAcc())
+                    minOldAcc = Tp->P[p].getOldAcc();
+#endif
+                }
+
+#if NSOFTCLASSES > 1
+              if(All.ForceSoftening[maxsofttype] < All.ForceSoftening[Tp->P[p].getSofteningClass()])
+                maxsofttype = Tp->P[p].getSofteningClass();
+              if(All.ForceSoftening[minsofttype] > All.ForceSoftening[Tp->P[p].getSofteningClass()])
+                minsofttype = Tp->P[p].getSofteningClass();
+#endif
+              p = Nextnode[p];
+            }
+          else if(p < MaxPart + MaxNodes) /* an internal node  */
+            {
+              gravnode *noptr = get_nodep(p);
+
+              vector<MyReal> dxyz;
+              Tp->nearest_image_intpos_to_pos(noptr->s.da, nop->center.da, dxyz.da);
+
+              MyReal m = noptr->mass;
+
+              mass += m;
+              s += m * dxyz;
+
+#if(MULTIPOLE_ORDER >= 3) || (MULTIPOLE_ORDER >= 2 && defined(EXTRAPOTTERM))
+              Q2Tensor += noptr->Q2Tensor;
+
+              vector<MyReal> mdxyz = m * dxyz;
+              symtensor2<MyReal> mdxyz2(mdxyz, dxyz);
+              Q2Tensor += mdxyz2;
+#endif
+#if(MULTIPOLE_ORDER >= 4) || (MULTIPOLE_ORDER >= 3 && defined(EXTRAPOTTERM))
+              Q3Tensor += noptr->Q3Tensor;
+
+              symtensor3<MyReal> mdxyz3(dxyz, mdxyz2);
+              Q3Tensor += mdxyz3;
+
+              Q3Tensor += outer_prod_sum(noptr->Q2Tensor, dxyz);
+#endif
+#if(MULTIPOLE_ORDER >= 5) || (MULTIPOLE_ORDER >= 4 && defined(EXTRAPOTTERM))
+              Q4Tensor += noptr->Q4Tensor;
+
+              symtensor4<MyReal> mdxyz4(dxyz, mdxyz3);
+              Q4Tensor += mdxyz4;
+
+              Q4Tensor += outer_prod_sum(noptr->Q3Tensor, dxyz);
+              symtensor2<MyReal> dxyz2(dxyz, dxyz);
+              Q4Tensor += outer_prod_sum(noptr->Q2Tensor, dxyz2);
+#endif
+#if(MULTIPOLE_ORDER >= 5 && defined(EXTRAPOTTERM))
+              Q5Tensor += noptr->Q5Tensor;
+
+              symtensor5<MyReal> mdxyz5(dxyz, mdxyz4);
+              Q5Tensor += mdxyz5;
+
+              Q5Tensor += outer_prod_sum(noptr->Q4Tensor, dxyz);
+              Q5Tensor += outer_prod_sum(noptr->Q3Tensor, dxyz2);
+              symtensor3<MyReal> dxyz3(dxyz, dxyz2);
+              Q5Tensor += outer_prod_sum(dxyz3, noptr->Q2Tensor);
+#endif
+
+#if NSOFTCLASSES > 1
+              if(All.ForceSoftening[maxsofttype] < All.ForceSoftening[noptr->maxsofttype])
+                maxsofttype = noptr->maxsofttype;
+              if(All.ForceSoftening[minsofttype] > All.ForceSoftening[noptr->minsofttype])
+                minsofttype = noptr->minsofttype;
+#endif
+              not_empty |= noptr->not_empty;
+#ifdef FMM
+              if(minOldAcc > noptr->MinOldAcc)
+                minOldAcc = noptr->MinOldAcc;
+#endif
+
+              p = noptr->sibling;
+            }
+          else if(p < MaxPart + MaxNodes + D->NTopleaves) /* a pseudo particle */
+            {
+              /* we are processing a local leaf-node which does not have any particles.
+               * can continue to the next element, which should end the work.
+               */
+              p = Nextnode[p - MaxNodes];
+            }
+          else
+            {
+              /* an imported point */
+              int n = p - ImportedNodeOffset;
+
+              if(n >= NumPartImported)
+                Terminate("n=%d >= NumPartImported=%d   MaxPart=%d MaxNodes=%d  D->NTopleaves=%d", n, NumPartImported, MaxPart,
+                          MaxNodes, D->NTopleaves);
+
+              vector<MyReal> dxyz;
+              Tp->nearest_image_intpos_to_pos(Points[n].IntPos, nop->center.da, dxyz.da);
+
+              MyReal m = Points[n].Mass;
+
+              mass += m;
+              s += m * dxyz;
+#if(MULTIPOLE_ORDER >= 3) || (MULTIPOLE_ORDER >= 2 && defined(EXTRAPOTTERM))
+              vector<MyReal> mdxyz = m * dxyz;
+              symtensor2<MyReal> mdxyz2(mdxyz, dxyz);
+              Q2Tensor += mdxyz2;
+#endif
+#if(MULTIPOLE_ORDER >= 4) || (MULTIPOLE_ORDER >= 3 && defined(EXTRAPOTTERM))
+              symtensor3<MyReal> mdxyz3(dxyz, mdxyz2);
+              Q3Tensor += mdxyz3;
+#endif
+#if(MULTIPOLE_ORDER >= 5) || (MULTIPOLE_ORDER >= 4 && defined(EXTRAPOTTERM))
+              symtensor4<MyReal> mdxyz4(dxyz, mdxyz3);
+              Q4Tensor += mdxyz4;
+#endif
+#if(MULTIPOLE_ORDER >= 5 && defined(EXTRAPOTTERM))
+              symtensor5<MyReal> mdxyz5(dxyz, mdxyz4);
+              Q5Tensor += mdxyz5;
+#endif
+              not_empty = 1;
+#ifndef HIERARCHICAL_GRAVITY
+              if(Points[n].ActiveFlag)
+#endif
+                {
+#ifdef FMM
+                  if(minOldAcc > Points[n].OldAcc)
+                    minOldAcc = Points[n].OldAcc;
+#endif
+                }
+#if NSOFTCLASSES > 1
+              if(All.ForceSoftening[maxsofttype] < All.ForceSoftening[Points[n].SofteningClass])
+                maxsofttype = Points[n].SofteningClass;
+              if(All.ForceSoftening[minsofttype] > All.ForceSoftening[Points[n].SofteningClass])
+                minsofttype = Points[n].SofteningClass;
+#endif
+              p = Nextnode[p - MaxNodes];
+            }
+        }
+    }
+
+  if(mass)
+    {
+      s *= (1 / mass);
+    }
+
+  nop->mass = mass;
+
+  vector<MySignedIntPosType> off;
+  Tp->pos_to_signedintpos(s.da, off.da);
+
+  nop->s[0] = off[0] + nop->center[0];
+  nop->s[1] = off[1] + nop->center[1];
+  nop->s[2] = off[2] + nop->center[2];
+
+#if(MULTIPOLE_ORDER >= 3) || (MULTIPOLE_ORDER >= 2 && defined(EXTRAPOTTERM))
+  vector<MyReal> ms = mass * s;
+  symtensor2<MyReal> ms2(ms, s);
+  Q2Tensor -= ms2;
+  nop->Q2Tensor = Q2Tensor;
+#endif
+
+#if(MULTIPOLE_ORDER >= 4) || (MULTIPOLE_ORDER >= 3 && defined(EXTRAPOTTERM))
+  symtensor3<MyReal> ms3(s, ms2);
+  Q3Tensor -= ms3;
+  Q3Tensor -= outer_prod_sum(Q2Tensor, s);
+  nop->Q3Tensor = Q3Tensor;
+#endif
+
+#if(MULTIPOLE_ORDER >= 5) || (MULTIPOLE_ORDER >= 4 && defined(EXTRAPOTTERM))
+  symtensor4<MyReal> ms4(s, ms3);
+  Q4Tensor -= ms4;
+  Q4Tensor -= outer_prod_sum(Q3Tensor, s);
+  symtensor2<MyReal> s2(s, s);
+  Q4Tensor -= outer_prod_sum(Q2Tensor, s2);
+  nop->Q4Tensor = Q4Tensor;
+#endif
+
+#if(MULTIPOLE_ORDER >= 5 && defined(EXTRAPOTTERM))
+  symtensor5<MyReal> ms5(s, ms4);
+  Q5Tensor -= ms5;
+  Q5Tensor -= outer_prod_sum(Q4Tensor, s);
+  Q5Tensor -= outer_prod_sum(Q3Tensor, s2);
+  symtensor3<MyReal> s3(s, s2);
+  Q5Tensor -= outer_prod_sum(s3, Q2Tensor);
+  nop->Q5Tensor = Q5Tensor;
+#endif
+
+#if NSOFTCLASSES > 1
+  nop->maxsofttype = maxsofttype;
+  nop->minsofttype = minsofttype;
+#endif
+  nop->cannot_be_opened_locally = 0;
+  nop->not_empty                = not_empty;
+#ifdef FMM
+  nop->MinOldAcc = minOldAcc;
+#endif
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+  MyIntPosType halflen = ((MyIntPosType)1) << ((BITS_FOR_POSITIONS - 1) - nop->level);
+  nop->overlap_flag    = Tp->check_high_res_overlap(nop->center.da, halflen);
+#endif
+}
+
+/*! \brief This function sets the (comoving) softening length of all particle
+ *  types in the table All.SofteningTable[...].
+ *
+ *  A check is performed to ensure that the physical
+ *  softening length is bounded by the corresponding SofteningMaxPhys values.
+ */
+template <typename partset>
+void gravtree<partset>::set_softenings(void)
+{
+  if(All.ComovingIntegrationOn)
+    {
+      for(int i = 0; i < NSOFTCLASSES; i++)
+        if(All.SofteningComoving[i] * All.Time > All.SofteningMaxPhys[i])
+          All.SofteningTable[i] = All.SofteningMaxPhys[i] / All.Time;
+        else
+          All.SofteningTable[i] = All.SofteningComoving[i];
+    }
+  else
+    {
+      for(int i = 0; i < NSOFTCLASSES; i++)
+        All.SofteningTable[i] = All.SofteningComoving[i];
+    }
+
+#ifdef ADAPTIVE_HYDRO_SOFTENING
+  for(int i = 0; i < NSOFTCLASSES_HYDRO; i++)
+    All.SofteningTable[i + NSOFTCLASSES] = All.MinimumComovingHydroSoftening * pow(All.AdaptiveHydroSofteningSpacing, i);
+
+  if(All.AdaptiveHydroSofteningSpacing < 1)
+    Terminate("All.AdaptiveHydroSofteningSpacing < 1");
+
+  /* we check that type=0 has its own slot 0 in the softening types, so that only gas masses are stored there */
+  if(All.SofteningClassOfPartType[0] != 0)
+    Terminate("All.SofteningClassOfPartType[0] != 0");
+
+  for(int i = 1; i < NTYPES; i++)
+    if(All.SofteningClassOfPartType[i] == All.SofteningClassOfPartType[0])
+      Terminate("i=%d: All.SofteningClassOfPartType[i] == All.SofteningClassOfPartType[0]", i);
+#endif
+
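+  /* the factor 2.8 converts the Plummer-equivalent softening epsilon into the spline
+   * softening length h = 2.8 * epsilon, at which the spline-softened force becomes
+   * exactly Newtonian */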
+  for(int i = 0; i < NSOFTCLASSES + NSOFTCLASSES_HYDRO; i++)
+    All.ForceSoftening[i] = 2.8 * All.SofteningTable[i];
+
+  All.ForceSoftening[NSOFTCLASSES + NSOFTCLASSES_HYDRO] =
+      0; /* important - this entry is actually used in the tree construction for the search of the maximum softening in a node */
+  All.ForceSoftening[NSOFTCLASSES + NSOFTCLASSES_HYDRO + 1] =
+      MAX_FLOAT_NUMBER; /* important - this entry is actually used in the tree construction for the search of the minimum softening in
+                           a node */
+}
+
+/* make sure that we instantiate the template */
+#include "../data/simparticles.h"
+template class gravtree<simparticles>;
+
+/* make sure that we instantiate the template */
+#if defined(LIGHTCONE) && (defined(LIGHTCONE_PARTICLES_GROUPS) || defined(LIGHTCONE_IMAGE_COMP_HSML_VELDISP))
+#include "../data/lcparticles.h"
+template class gravtree<lcparticles>;
+#endif
diff --git a/src/gravtree/gwalk.cc b/src/gravtree/gwalk.cc
new file mode 100644
index 0000000000000000000000000000000000000000..71206410d84651538aa30d126f78744a4bedb692
--- /dev/null
+++ b/src/gravtree/gwalk.cc
@@ -0,0 +1,937 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  gwalk.cc
+ *
+ *  \brief implements the routines for walking the gravity tree and accumulating forces
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../gravity/ewald.h"
+#include "../gravtree/gravtree.h"
+#include "../gravtree/gwalk.h"
+#include "../logs/logs.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../pm/pm.h"
+#include "../sort/cxxsort.h"
+#include "../sort/peano.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+
+/*! This file contains the code for the gravitational force computation by
+ *  means of the tree algorithm. To this end, a tree force is computed for all
+ *  active local particles, and particles are exported to other processors if
+ *  needed, where they can receive additional force contributions. If the
+ *  TreePM algorithm is enabled, the force computed will only be the
+ *  short-range part.
+ */
+
+inline void gwalk::evaluate_particle_particle_interaction(const pinfo &pdat, const int no, const char jtype, int shmrank)
+{
+#ifdef PRESERVE_SHMEM_BINARY_INVARIANCE
+  if(skip_actual_force_computation)
+    return;
+#endif
+
+  /* first, let's get the relevant information of our partner particle, which can be in three different lists */
+
+  MyIntPosType *intpos;
+  MyReal mass;
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+  int overlap_check;
+#endif
+
+  MyReal hmax = pdat.h_i;
+
+  if(jtype == NODE_TYPE_LOCAL_PARTICLE)
+    {
+      particle_data *P = get_Pp(no, shmrank);
+
+      intpos = P->IntPos;
+      mass   = P->getMass();
+
+#if NSOFTCLASSES > 1
+      MyReal h_j = All.ForceSoftening[P->getSofteningClass()];
+      if(h_j > hmax)
+        hmax = h_j;
+#endif
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+      overlap_check = P->InsideOutsideFlag;
+#endif
+    }
+  else if(jtype == NODE_TYPE_TREEPOINT_PARTICLE)
+    {
+      int n = no - ImportedNodeOffset;
+
+      gravpoint_data *Pointp = get_pointsp(n, shmrank);
+
+      intpos = Pointp->IntPos;
+      mass   = Pointp->Mass;
+
+#if NSOFTCLASSES > 1
+      MyReal h_j = All.ForceSoftening[Pointp->getSofteningClass()];
+      if(h_j > hmax)
+        hmax = h_j;
+#endif
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+      overlap_check = Pointp->InsideOutsideFlag;
+#endif
+    }
+  else /* a point that was fetched */
+    {
+      int n = no - EndOfForeignNodes;
+
+      foreign_gravpoint_data *foreignpoint = get_foreignpointsp(n, shmrank);
+
+      intpos = foreignpoint->IntPos;
+      mass   = foreignpoint->Mass;
+#if NSOFTCLASSES > 1
+      MyReal h_j = All.ForceSoftening[foreignpoint->getSofteningClass()];
+      if(h_j > hmax)
+        hmax = h_j;
+#endif
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+      overlap_check = foreignpoint->InsideOutsideFlag;
+#endif
+    }
+
+#ifdef PMGRID
+  mesh_factors *mfp = &mf[LOW_MESH];
+#if defined(PLACEHIGHRESREGION)
+  if((DoPM & TREE_ACTIVE_CUTTOFF_HIGHRES_PM))
+    {
+      if(overlap_check == FLAG_INSIDE && pdat.InsideOutsideFlag == FLAG_INSIDE)
+        mfp = &mf[HIGH_MESH];
+    }
+#endif
+#endif
+
+  vector<MyReal> dxyz;
+  Tp->nearest_image_intpos_to_pos(intpos, pdat.intpos, dxyz.da);
+
+  MyReal r2   = dxyz.r2();
+  MyReal r    = sqrt(r2);
+  MyReal rinv = (r > 0) ? 1 / r : 0;
+
+  gravtree<simparticles>::gfactors gfac;
+
+#ifdef PMGRID
+  if((DoPM & (TREE_ACTIVE_CUTTOFF_BASE_PM + TREE_ACTIVE_CUTTOFF_HIGHRES_PM)))
+    {
+      if(modify_gfactors_pm_monopole(gfac, r, rinv, mfp))
+        return;  // if we are outside the cut-off radius, we have no interaction
+    }
+#endif
+
+  get_gfactors_monopole(gfac, r, hmax, rinv);
+
+#ifdef EVALPOTENTIAL
+  *pdat.pot -= mass * gfac.fac0;
+#endif
+  *pdat.acc -= (mass * gfac.fac1 * rinv) * dxyz;
+
+  if(DoEwald)
+    {
+      // EWALD treatment, only done for periodic boundaries in case PM is not active
+
+      ewald_data ew;
+      Ewald.ewald_gridlookup(intpos, pdat.intpos, ewald::POINTMASS, ew);
+
+#ifdef EVALPOTENTIAL
+      *pdat.pot += mass * ew.D0phi;
+#endif
+      *pdat.acc += mass * ew.D1phi;
+    }
+
+  if(MeasureCostFlag)
+    *pdat.GravCost += 1;
+
+  interactioncountPP += 1;
+}
+
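+/* The following routine combines the cell-opening test with the evaluation of the
+ * multipole interaction.  It returns NODE_OPEN if the cell has to be opened further,
+ * NODE_USE if the multipole approximation can be used (and the interaction has been
+ * accumulated), and NODE_DISCARD if the cell lies entirely beyond the PM cut-off and
+ * can be skipped. */
+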
+inline int gwalk::evaluate_particle_node_opening_criterion_and_interaction(const pinfo &pdat, gravnode *nop)
+{
+  if(nop->level == 0)  // always open the root node (note: full node length does not fit in the integer type)
+    return NODE_OPEN;
+
+  MyIntPosType halflen = ((MyIntPosType)1) << ((BITS_FOR_POSITIONS - 1) - nop->level);
+  MyIntPosType intlen  = halflen << 1;
+
+#if defined(PMGRID) || !defined(TREE_NO_SAFETY_BOX)
+  MyIntPosType dist[3];
+  Tp->nearest_image_intpos_to_absolute_intdist(nop->center.da, pdat.intpos, dist);
+#endif
+
+#ifndef TREE_NO_SAFETY_BOX
+  // if we are close to the node, open it to protect against worst-case force errors
+  if(dist[0] < intlen && dist[1] < intlen && dist[2] < intlen)
+    return NODE_OPEN;
+#endif
+
+#ifdef PMGRID
+  mesh_factors *mfp = &mf[LOW_MESH];
+
+  if((DoPM & (TREE_ACTIVE_CUTTOFF_BASE_PM + TREE_ACTIVE_CUTTOFF_HIGHRES_PM)))
+    {
+#ifdef PLACEHIGHRESREGION
+      if((DoPM & TREE_ACTIVE_CUTTOFF_HIGHRES_PM))
+        {
+          int overlap_check = nop->overlap_flag;
+
+          if(pdat.InsideOutsideFlag == FLAG_INSIDE && overlap_check == FLAG_BOUNDARYOVERLAP)
+            Terminate(
+                "this case should not happen:  node center=(%g|%g|%g)  len=%g   particle=(%g|%g|%g)  "
+                "rel-dist=(%g|%g|%g)\n",
+                nop->center[0] * Tp->FacIntToCoord, nop->center[1] * Tp->FacIntToCoord, nop->center[2] * Tp->FacIntToCoord,
+                intlen * Tp->FacIntToCoord, pdat.intpos[0] * Tp->FacIntToCoord, pdat.intpos[1] * Tp->FacIntToCoord,
+                pdat.intpos[2] * Tp->FacIntToCoord, pdat.intpos[0] * Tp->FacIntToCoord - nop->center[0] * Tp->FacIntToCoord,
+                pdat.intpos[1] * Tp->FacIntToCoord - nop->center[1] * Tp->FacIntToCoord,
+                pdat.intpos[2] * Tp->FacIntToCoord - nop->center[2] * Tp->FacIntToCoord);
+
+          if(overlap_check == FLAG_INSIDE && pdat.InsideOutsideFlag == FLAG_INSIDE)
+            mfp = &mf[HIGH_MESH];
+        }
+#endif
+
+      /* check whether we can stop walking along this branch */
+      if(dist[0] > mfp->intrcut[0] + halflen)
+        return NODE_DISCARD;
+
+      /* check whether we can stop walking along this branch */
+      if(dist[1] > mfp->intrcut[1] + halflen)
+        return NODE_DISCARD;
+
+      /* check whether we can stop walking along this branch */
+      if(dist[2] > mfp->intrcut[2] + halflen)
+        return NODE_DISCARD;
+    }
+#endif
+
+  /* converts the integer distance to floating point */
+  vector<MyReal> dxyz;
+  Tp->nearest_image_intpos_to_pos(nop->s.da, pdat.intpos, dxyz.da);
+
+  MyReal r2 = dxyz.r2();
+
+  MyReal mass = nop->mass;
+
+  MyReal len  = intlen * Tp->FacIntToCoord;
+  MyReal len2 = len * len;
+
+  if(All.RelOpeningCriterionInUse == 0) /* check Barnes-Hut opening criterion */
+    {
+      if(len2 > r2 * theta2)
+        return NODE_OPEN;
+    }
+  else /* check relative opening criterion */
+    {
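+      /* the criterion implemented below is  M * len^p / r^(p+2) > ErrTolForceAcc * |a_old|
+       * with p = MULTIPOLE_ORDER; the individual branches are algebraically rearranged
+       * (squared where an odd power of r appears) so that no square root is needed */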
+#if(MULTIPOLE_ORDER <= 2)
+      if(mass * len2 > r2 * r2 * errTolForceAcc * pdat.aold)
+        return NODE_OPEN;
+#elif(MULTIPOLE_ORDER == 3)
+      if(square(mass * len * len2) > r2 * square(r2 * r2 * errTolForceAcc * pdat.aold))
+        return NODE_OPEN;
+#elif(MULTIPOLE_ORDER == 4)
+      if(mass * len2 * len2 > r2 * r2 * r2 * errTolForceAcc * pdat.aold)
+        return NODE_OPEN;
+#elif(MULTIPOLE_ORDER == 5)
+      if(square(mass * len2 * len2 * len) > r2 * square(r2 * r2 * r2 * errTolForceAcc * pdat.aold))
+        return NODE_OPEN;
+#endif
+      // carry out an additional test to protect against pathological force errors for very large opening angles
+      if(len2 > r2 * thetamax2)
+        return NODE_OPEN;
+    }
+
+  MyReal hmax = pdat.h_i;
+
+#if NSOFTCLASSES > 1
+  MyReal h_j = All.ForceSoftening[nop->maxsofttype];
+
+  if(h_j > hmax)
+    {
+      if(r2 < h_j * h_j)
+        {
+          if(All.ForceSoftening[nop->minsofttype] < All.ForceSoftening[nop->maxsofttype])
+            return NODE_OPEN;
+        }
+      hmax = h_j;
+    }
+#endif
+
+    /**************************/
+
+    // now evaluate the multipole moment interaction
+
+#ifdef PRESERVE_SHMEM_BINARY_INVARIANCE
+  if(skip_actual_force_computation)
+    return NODE_USE;
+#endif
+
+  MyReal r    = sqrt(r2);
+  MyReal rinv = (r > 0) ? 1 / r : 0;
+
+  gravtree<simparticles>::gfactors gfac;
+
+#ifdef PMGRID
+  if((DoPM & (TREE_ACTIVE_CUTTOFF_BASE_PM + TREE_ACTIVE_CUTTOFF_HIGHRES_PM)))
+    {
+      if(modify_gfactors_pm_multipole(gfac, r, rinv, mfp))
+        return NODE_DISCARD;  // if we are outside the cut-off radius, we have no interaction
+    }
+#endif
+
+  get_gfactors_multipole(gfac, r, hmax, rinv);
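+
+  // gfac.fac0 ... fac5 now hold the expansion coefficients (essentially successive radial
+  // derivatives of the softened and, with PM active, short-range truncated interaction
+  // kernel); below they are contracted with the node's multipole moments to form the
+  // successive terms of the expansion.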
+
+#ifdef EVALPOTENTIAL
+  MyReal g0 = gfac.fac0;
+  *pdat.pot -= mass * g0;  //                       monopole potential
+#endif
+
+  MyReal g1 = gfac.fac1 * rinv;
+  *pdat.acc -= (mass * g1) * dxyz;  //              monopole force
+
+#if(MULTIPOLE_ORDER >= 3) || (MULTIPOLE_ORDER >= 2 && defined(EXTRAPOTTERM))
+  MyReal g2             = gfac.fac2 * gfac.rinv2;
+  vector<MyReal> Q2dxyz = nop->Q2Tensor * dxyz;
+  MyReal Q2dxyz2        = Q2dxyz * dxyz;
+  MyReal Q2trace        = nop->Q2Tensor.trace();
+#if(MULTIPOLE_ORDER >= 3)
+  MyReal g3 = gfac.fac3 * gfac.rinv3;
+  *pdat.acc -= static_cast<MyReal>(0.5) * (g2 * Q2trace + g3 * Q2dxyz2) * dxyz + g2 * Q2dxyz;  //  quadrupole force
+#endif
+#ifdef EVALPOTENTIAL
+  *pdat.pot -= static_cast<MyReal>(0.5) * (g1 * Q2trace + g2 * Q2dxyz2);  //  quadrupole potential
+#endif
+#endif
+
+#if(MULTIPOLE_ORDER >= 4) || (MULTIPOLE_ORDER >= 3 && defined(EXTRAPOTTERM))
+  symtensor3<MyDouble> &Q3 = nop->Q3Tensor;
+
+  symtensor2<MyDouble> Q3dxyz = Q3 * dxyz;
+  vector<MyDouble> Q3dxyz2    = Q3dxyz * dxyz;
+  MyReal Q3dxyz3              = Q3dxyz2 * dxyz;
+  MyReal Q3dxyzTrace          = Q3dxyz.trace();
+
+  vector<MyDouble> Q3vec;
+  Q3vec[0] = Q3[dXXX] + Q3[dXYY] + Q3[dXZZ];
+  Q3vec[1] = Q3[dYXX] + Q3[dYYY] + Q3[dYZZ];
+  Q3vec[2] = Q3[dZXX] + Q3[dZYY] + Q3[dZZZ];
+
+#if(MULTIPOLE_ORDER >= 4)
+  MyReal g4 = gfac.fac4 * gfac.rinv2 * gfac.rinv2;
+  *pdat.acc -=
+      static_cast<MyReal>(0.5) *
+      (g2 * Q3vec + g3 * Q3dxyz2 + (static_cast<MyReal>(1.0 / 3) * g4 * Q3dxyz3 + g3 * Q3dxyzTrace) * dxyz);  //      octupole force
+#endif
+#ifdef EVALPOTENTIAL
+  *pdat.pot -= static_cast<MyReal>(1.0 / 6) * (3 * g2 * Q3dxyzTrace + g3 * Q3dxyz3);  //       octupole potential
+#endif
+#endif
+
+#if(MULTIPOLE_ORDER >= 5) || (MULTIPOLE_ORDER >= 4 && defined(EXTRAPOTTERM))
+  // now compute the hexadecupole force
+  symtensor4<MyDouble> &Q4 = nop->Q4Tensor;
+
+  symtensor3<MyDouble> Q4dxyz  = Q4 * dxyz;
+  symtensor2<MyDouble> Q4dxyz2 = Q4dxyz * dxyz;
+  vector<MyDouble> Q4dxyz3     = Q4dxyz2 * dxyz;
+  MyReal Q4dxyz4               = Q4dxyz3 * dxyz;
+  MyReal Q4dxyz2trace          = Q4dxyz2.trace();
+
+  symtensor2<MyDouble> QT;
+  QT[qXX]        = Q4[sXXXX] + Q4[sXXYY] + Q4[sXXZZ];
+  QT[qYY]        = Q4[sYYXX] + Q4[sYYYY] + Q4[sYYZZ];
+  QT[qZZ]        = Q4[sZZXX] + Q4[sZZYY] + Q4[sZZZZ];
+  QT[qXY]        = Q4[sXYXX] + Q4[sXYYY] + Q4[sXYZZ];
+  QT[qXZ]        = Q4[sXZXX] + Q4[sXZYY] + Q4[sXZZZ];
+  QT[qYZ]        = Q4[sYZXX] + Q4[sYZYY] + Q4[sYZZZ];
+  MyReal QTtrace = QT.trace();
+
+#if(MULTIPOLE_ORDER >= 5)
+  vector<MyDouble> QTdxyz = QT * dxyz;
+  MyReal g5               = gfac.fac5 * gfac.rinv2 * gfac.rinv3;
+  *pdat.acc -=
+      static_cast<MyReal>(1.0 / 24) * (g3 * (3 * QTtrace * dxyz + 12 * QTdxyz) + g4 * (6 * Q4dxyz2trace * dxyz + 4 * Q4dxyz3) +
+                                       g5 * Q4dxyz4 * dxyz);  //  hexadecupole force
+#endif
+#ifdef EVALPOTENTIAL
+  *pdat.pot -= static_cast<MyReal>(1.0 / 24) * (g2 * 3 * QTtrace + g3 * 6 * Q4dxyz2trace + g4 * Q4dxyz4);  //  hexadecupole potential
+#endif
+#endif
+
+#if(MULTIPOLE_ORDER >= 5 && defined(EXTRAPOTTERM) && defined(EVALPOTENTIAL))
+  symtensor5<MyDouble> &Q5 = nop->Q5Tensor;
+
+  symtensor4<MyDouble> Q5dxyz  = Q5 * dxyz;
+  symtensor3<MyDouble> Q5dxyz2 = Q5dxyz * dxyz;
+  symtensor2<MyDouble> Q5dxyz3 = Q5dxyz2 * dxyz;
+  vector<MyDouble> Q5dxyz4     = Q5dxyz3 * dxyz;
+  MyReal Q5dxyz5               = Q5dxyz4 * dxyz;
+
+  MyReal Q5dxyzTtrace = Q5dxyz[sXXXX] + Q5dxyz[sYYYY] + Q5dxyz[sZZZZ] + 2 * (Q5dxyz[sXXYY] + Q5dxyz[sXXZZ] + Q5dxyz[sYYZZ]);
+
+  // now compute the triakontadipole  potential term
+  *pdat.pot -= static_cast<MyReal>(1.0 / 120) * (g3 * 15 * Q5dxyzTtrace + g4 * 10 * Q5dxyz3.trace() + g5 * Q5dxyz5);
+#endif
+
+  if(DoEwald)
+    {
+      // EWALD treatment, only done for periodic boundaries in case PM is not active
+
+      ewald_data ew;
+      Ewald.ewald_gridlookup(nop->s.da, pdat.intpos, ewald::MULTIPOLES, ew);
+
+#ifdef EVALPOTENTIAL
+      *pdat.pot += mass * ew.D0phi;
+#if(MULTIPOLE_ORDER >= 3) || (MULTIPOLE_ORDER >= 2 && defined(EXTRAPOTTERM))
+      *pdat.pot += static_cast<MyReal>(0.5) * (nop->Q2Tensor * ew.D2phi);
+#endif
+#if(MULTIPOLE_ORDER >= 4) || (MULTIPOLE_ORDER >= 3 && defined(EXTRAPOTTERM))
+      *pdat.pot += static_cast<MyReal>(1.0 / 6) * (nop->Q3Tensor * ew.D3phi);
+#endif
+#if(MULTIPOLE_ORDER >= 5) || (MULTIPOLE_ORDER >= 4 && defined(EXTRAPOTTERM))
+      *pdat.pot += static_cast<MyReal>(1.0 / 24) * (nop->Q4Tensor * ew.D4phi);
+#endif
+#if(MULTIPOLE_ORDER >= 5 && defined(EXTRAPOTTERM) && defined(EVALPOTENTIAL))
+      *pdat.pot += static_cast<MyReal>(1.0 / 120) * (nop->Q5Tensor * ew.D5phi);
+#endif
+#endif
+      *pdat.acc += mass * ew.D1phi;
+#if(MULTIPOLE_ORDER >= 3)
+      *pdat.acc += static_cast<MyReal>(0.5) * (ew.D3phi * nop->Q2Tensor);
+#endif
+#if(MULTIPOLE_ORDER >= 4)
+      *pdat.acc += static_cast<MyReal>(1.0 / 6) * (ew.D4phi * nop->Q3Tensor);
+#endif
+#if(MULTIPOLE_ORDER >= 5)
+      *pdat.acc += static_cast<MyReal>(1.0 / 24) * (ew.D5phi * nop->Q4Tensor);
+#endif
+    }
+
+  interactioncountPN += 1;
+
+  if(MeasureCostFlag)
+    *pdat.GravCost += 1;
+
+  return NODE_USE;
+}
+
+inline void gwalk::gwalk_open_node(const pinfo &pdat, int i, char ptype, gravnode *nop, int mintopleafnode, int committed)
+{
+  /* open node */
+  int p                 = nop->nextnode;
+  unsigned char shmrank = nop->nextnode_shmrank;
+
+  while(p != nop->sibling || (shmrank != nop->sibling_shmrank && nop->sibling >= MaxPart + D->NTopnodes))
+    {
+      if(p < 0)
+        Terminate(
+            "p=%d < 0  nop->sibling=%d nop->nextnode=%d shmrank=%d nop->sibling_shmrank=%d nop->foreigntask=%d  mass=%g  "
+            "first_nontoplevelnode=%d",
+            p, nop->sibling, nop->nextnode, shmrank, nop->sibling_shmrank, nop->OriginTask, nop->mass, MaxPart + D->NTopnodes);
+
+      int next;
+      unsigned char next_shmrank;
+      char type;
+
+      if(p < MaxPart) /* a local particle */
+        {
+          /* note: here shmrank cannot change */
+          next         = get_nextnodep(shmrank)[p];
+          next_shmrank = shmrank;
+          type         = NODE_TYPE_LOCAL_PARTICLE;
+        }
+      else if(p < MaxPart + MaxNodes) /* an internal node  */
+        {
+          gravnode *nop = get_nodep(p, shmrank);
+          next          = nop->sibling;
+          next_shmrank  = nop->sibling_shmrank;
+          type          = NODE_TYPE_LOCAL_NODE;
+        }
+      else if(p >= ImportedNodeOffset && p < EndOfTreePoints) /* an imported Treepoint particle  */
+        {
+          /* note: here shmrank cannot change */
+          next         = get_nextnodep(shmrank)[p - MaxNodes];
+          next_shmrank = shmrank;
+          type         = NODE_TYPE_TREEPOINT_PARTICLE;
+        }
+      else if(p >= EndOfTreePoints && p < EndOfForeignNodes) /* an imported tree node */
+        {
+          gravnode *nop = get_nodep(p, shmrank);
+          next          = nop->sibling;
+          next_shmrank  = nop->sibling_shmrank;
+          type          = NODE_TYPE_FETCHED_NODE;
+        }
+      else if(p >= EndOfForeignNodes) /* an imported particle below an imported tree node */
+        {
+          foreign_gravpoint_data *foreignpoint = get_foreignpointsp(p - EndOfForeignNodes, shmrank);
+
+          next         = foreignpoint->Nextnode;
+          next_shmrank = foreignpoint->Nextnode_shmrank;
+          type         = NODE_TYPE_FETCHED_PARTICLE;
+        }
+      else
+        {
+          /* a pseudo point */
+          Terminate(
+              "should not happen: p=%d MaxPart=%d MaxNodes=%d  ImportedNodeOffset=%d  EndOfTreePoints=%d  EndOfForeignNodes=%d "
+              "shmrank=%d",
+              p, MaxPart, MaxNodes, ImportedNodeOffset, EndOfTreePoints, EndOfForeignNodes, shmrank);
+        }
+
+      gravity_force_interact(pdat, i, p, ptype, type, shmrank, mintopleafnode, committed);
+
+      p       = next;
+      shmrank = next_shmrank;
+    }
+}
+
+void gwalk::gravity_force_interact(const pinfo &pdat, int i, int no, char ptype, char no_type, unsigned char shmrank,
+                                   int mintopleafnode, int committed)
+{
+  if(no_type <= NODE_TYPE_FETCHED_PARTICLE)  // we are interacting with a particle
+    {
+      evaluate_particle_particle_interaction(pdat, no, no_type, shmrank);
+    }
+  else  // we are interacting with a node
+    {
+      gravnode *nop = get_nodep(no, shmrank);
+
+      if(nop->not_empty == 0)
+        return;
+
+      if(no < MaxPart + MaxNodes)                // we have a top-level node
+        if(nop->nextnode >= MaxPart + MaxNodes)  // if the next node is not a top-level node, we have a leaf node
+          {
+            mintopleafnode = no;
+
+#ifdef PRESERVE_SHMEM_BINARY_INVARIANCE
+            // if the leaf node is on this shared memory node, we already have all the data, so we certainly don't need to import
+            // anything on this branch
+            if(skip_actual_force_computation)
+              if(Shmem.GetNodeIDForSimulCommRank[nop->OriginTask] == Shmem.GetNodeIDForSimulCommRank[D->ThisTask])
+                return;
+#endif
+          }
+
+      int openflag = evaluate_particle_node_opening_criterion_and_interaction(pdat, nop);
+
+      if(openflag == NODE_OPEN) /* cell can't be used, need to open it */
+        {
+          if(nop->cannot_be_opened_locally.load(std::memory_order_acquire))
+            {
+              // are we in the same shared memory node?
+              if(Shmem.GetNodeIDForSimulCommRank[nop->OriginTask] == Shmem.GetNodeIDForSimulCommRank[D->ThisTask])
+                {
+                  Terminate("this should not happen any more");
+                }
+              else
+                {
+                  tree_add_to_fetch_stack(nop, no, shmrank);  // will only add unique copies
+
+                  tree_add_to_work_stack(i, no, shmrank, mintopleafnode);
+                }
+            }
+          else
+            {
+              int min_buffer_space =
+                  std::min<int>(MaxOnWorkStack - (NumOnWorkStack + NewOnWorkStack), MaxOnFetchStack - NumOnFetchStack);
+
+              if(min_buffer_space >= committed + 8 * TREE_NUM_BEFORE_NODESPLIT)
+                gwalk_open_node(pdat, i, ptype, nop, mintopleafnode, committed + 8 * TREE_NUM_BEFORE_NODESPLIT);
+              else
+                tree_add_to_work_stack(i, no, shmrank, mintopleafnode);
+            }
+        }
+    }
+}
+
+/*! \brief This function computes the gravitational forces for all active particles.
+ *
+ * The tree walk is done in two phases: First the local part of the force tree is processed (gravity_primary_loop() ).
+ * Whenever an external node is encountered during the walk, this node is saved on a list.
+ * This node list along with data about the particles is then exchanged among tasks.
+ * In the second phase (gravity_secondary_loop() ) each task now continues the tree walk for
+ * the imported particles. Finally, the resulting partial forces are sent back to the original task
+ * and are summed up there to complete the tree force calculation.
+ *
+ * Particles are only exported to other processors when really needed, thereby allowing a
+ * good use of the communication buffer. Every particle is sent at most once to a given processor
+ * together with the complete list of relevant tree nodes to be checked on the other task.
+ *
+ * Particles which drifted into the domain of another task are sent to this task for the force computation.
+ * Afterwards the resulting force is sent back to the originating task.
+ *
+ * In order to improve the work load balancing during a domain decomposition, the work done by each
+ * node/particle is measured. The work is measured for the interaction partners (i.e. the nodes or particles)
+ * and not for the particles themselves that require a force computation. This way, work done for imported
+ * particles is accounted for at the task where the work was actually incurred. The cost measurement is
+ * only done for the "GRAVCOSTLEVELS" highest occupied time bins. The variable #MeasureCostFlag will state whether a
+ * measurement is done at the present time step.
+ *
+ * The tree imbalance can be further reduced using chunking. The particles requiring a force computation
+ * are split into chunks of size #Nchunksize. A set of every #Nchunk -th chunk is processed first.
+ * Then the process is repeated, processing the next set of chunks. This way the amount of exported particles
+ * is more balanced, as communication heavy regions are mixed with less communication intensive regions.
+ *
+ */
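+
+/* In the present implementation the export/import of tree data is organised through a
+ * work stack: particles (local and imported) are pushed onto WorkStack, the walk is
+ * carried out as far as locally possible, nodes and points that are needed but not
+ * locally available are requested via tree_fetch_foreign_nodes(), and the loop is
+ * repeated in cycles until the stack is empty. */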
+
+void gwalk::gravity_tree(int timebin)
+{
+  interactioncountPP = 0;
+  interactioncountPN = 0;
+
+  TIMER_STORE;
+  TIMER_START(CPU_TREE);
+
+  D->mpi_printf("GRAVTREE: Begin tree force. timebin=%d (presently allocated=%g MB)\n", timebin, Mem.getAllocatedBytesInMB());
+
+#ifdef PMGRID
+  set_mesh_factors();
+#endif
+
+  TIMER_START(CPU_TREESTACK);
+
+  // Create the list of targets (the work queue). There are initially two possible sources of points: local ones and imported ones.
+
+  NumOnWorkStack         = 0;
+  AllocWorkStackBaseLow  = std::max<int>(1.5 * (Tp->NumPart + NumPartImported), TREE_MIN_WORKSTACK_SIZE);
+  AllocWorkStackBaseHigh = AllocWorkStackBaseLow + TREE_EXPECTED_CYCLES * TREE_MIN_WORKSTACK_SIZE;
+  MaxOnWorkStack         = AllocWorkStackBaseLow;
+
+  WorkStack       = (workstack_data *)Mem.mymalloc("WorkStack", AllocWorkStackBaseHigh * sizeof(workstack_data));
+  ResultIndexList = (int *)Mem.mymalloc("ResultIndexList", NumPartImported * sizeof(int));
+
+  for(int i = 0; i < Tp->TimeBinsGravity.NActiveParticles; i++)
+    {
+      int target = Tp->TimeBinsGravity.ActiveParticleList[i];
+
+      // if we have exported particles, we need to explicitly check whether this particle is among them
+      if(NumPartExported > 0)
+        {
+          MyIntPosType xxb       = Tp->P[target].IntPos[0];
+          MyIntPosType yyb       = Tp->P[target].IntPos[1];
+          MyIntPosType zzb       = Tp->P[target].IntPos[2];
+          MyIntPosType mask      = (((MyIntPosType)1) << (BITS_FOR_POSITIONS - 1));
+          unsigned char shiftx   = (BITS_FOR_POSITIONS - 3);
+          unsigned char shifty   = (BITS_FOR_POSITIONS - 2);
+          unsigned char shiftz   = (BITS_FOR_POSITIONS - 1);
+          unsigned char level    = 0;
+          unsigned char rotation = 0;
+
+          int no = 0;
+          while(D->TopNodes[no].Daughter >= 0) /* walk down top tree to find correct leaf */
+            {
+              unsigned char pix     = (((unsigned char)((xxb & mask) >> (shiftx--))) | ((unsigned char)((yyb & mask) >> (shifty--))) |
+                                   ((unsigned char)((zzb & mask) >> (shiftz--))));
+              unsigned char subnode = peano_incremental_key(pix, &rotation);
+              mask >>= 1;
+              level++;
+              no = D->TopNodes[no].Daughter + subnode;
+            }
+
+          no       = D->TopNodes[no].Leaf;
+          int task = D->TaskOfLeaf[no];
+
+          if(task == D->ThisTask)
+            {
+              WorkStack[NumOnWorkStack].Target         = target;
+              WorkStack[NumOnWorkStack].Node           = MaxPart;
+              WorkStack[NumOnWorkStack].ShmRank        = Shmem.Island_ThisTask;
+              WorkStack[NumOnWorkStack].MinTopLeafNode = MaxPart + D->NTopnodes;
+              NumOnWorkStack++;
+            }
+        }
+      else
+        {
+          WorkStack[NumOnWorkStack].Target         = target;
+          WorkStack[NumOnWorkStack].Node           = MaxPart;
+          WorkStack[NumOnWorkStack].ShmRank        = Shmem.Island_ThisTask;
+          WorkStack[NumOnWorkStack].MinTopLeafNode = MaxPart + D->NTopnodes;
+          NumOnWorkStack++;
+        }
+
+      /* let's do a safety check here to protect against accidental use of zero softening lengths */
+      int softtype = Tp->P[target].getSofteningClass();
+      if(All.ForceSoftening[softtype] == 0)
+        Terminate("Particle with ID=%lld of type=%d and softening type=%d was assigned zero softening\n",
+                  (long long)Tp->P[target].ID.get(), Tp->P[target].getType(), softtype);
+    }
+
+  int ncount = 0;
+
+  for(int i = 0; i < NumPartImported; i++)
+    {
+#ifndef HIERARCHICAL_GRAVITY
+      if(Points[i].ActiveFlag)
+#endif
+        {
+          ResultIndexList[i] = ncount++;
+
+          WorkStack[NumOnWorkStack].Target         = i + ImportedNodeOffset;
+          WorkStack[NumOnWorkStack].Node           = MaxPart;
+          WorkStack[NumOnWorkStack].ShmRank        = Shmem.Island_ThisTask;
+          WorkStack[NumOnWorkStack].MinTopLeafNode = MaxPart + D->NTopnodes;
+          NumOnWorkStack++;
+        }
+    }
+
+#ifdef PRESERVE_SHMEM_BINARY_INVARIANCE
+  workstack_data *WorkStackBak = (workstack_data *)Mem.mymalloc("WorkStackBak", NumOnWorkStack * sizeof(workstack_data));
+  int NumOnWorkStackBak        = NumOnWorkStack;
+  memcpy(WorkStackBak, WorkStack, NumOnWorkStack * sizeof(workstack_data));
+#endif
+
+  ResultsActiveImported =
+      (resultsactiveimported_data *)Mem.mymalloc_clear("ResultsActiveImported", ncount * sizeof(resultsactiveimported_data));
+
+  /******************************************/
+  /* now execute the tree walk calculations */
+  /******************************************/
+
+  theta2         = All.ErrTolTheta * All.ErrTolTheta;
+  thetamax2      = All.ErrTolThetaMax * All.ErrTolThetaMax;
+  errTolForceAcc = All.ErrTolForceAcc;
+
+  sum_NumForeignNodes  = 0;
+  sum_NumForeignPoints = 0;
+
+  // set a default size of the fetch stack equal to a tenth of the number of local and imported particles (this may still be somewhat too large)
+  MaxOnFetchStack = std::max<int>(0.1 * (Tp->NumPart + NumPartImported), TREE_MIN_WORKSTACK_SIZE);
+  StackToFetch    = (fetch_data *)Mem.mymalloc_movable(&StackToFetch, "StackToFetch", MaxOnFetchStack * sizeof(fetch_data));
+
+  // let's grab at most half the still available memory for imported points and nodes
+  int nspace = (0.5 * Mem.FreeBytes) / (sizeof(gravnode) + 8 * sizeof(foreign_gravpoint_data));
+
+  MaxForeignNodes  = nspace;
+  MaxForeignPoints = 8 * nspace;
+  NumForeignNodes  = 0;
+  NumForeignPoints = 0;
+
+  /* the following two arrays hold imported tree nodes and imported points to augment the local tree */
+  Foreign_Nodes  = (gravnode *)Mem.mymalloc_movable(&Foreign_Nodes, "Foreign_Nodes", MaxForeignNodes * sizeof(gravnode));
+  Foreign_Points = (foreign_gravpoint_data *)Mem.mymalloc_movable(&Foreign_Points, "Foreign_Points",
+                                                                  MaxForeignPoints * sizeof(foreign_gravpoint_data));
+
+  tree_initialize_leaf_node_access_info();
+
+  TIMER_STOP(CPU_TREESTACK);
+
+  double t0       = Logs.second();
+  int max_ncycles = 0;
+
+  prepare_shared_memory_access();
+
+#ifdef PRESERVE_SHMEM_BINARY_INVARIANCE
+  for(int rep = 0; rep < 2; rep++)
+    {
+      if(rep == 0)
+        {
+          skip_actual_force_computation = true;
+        }
+      else
+        {
+          skip_actual_force_computation = false;
+          NumOnWorkStack                = NumOnWorkStackBak;
+          memcpy(WorkStack, WorkStackBak, NumOnWorkStack * sizeof(workstack_data));
+        }
+#endif
+
+      while(NumOnWorkStack > 0)  // repeat until we are out of work
+        {
+          NewOnWorkStack  = 0;  // gives the new entries
+          NumOnFetchStack = 0;
+          MaxOnWorkStack  = std::min<int>(AllocWorkStackBaseLow + max_ncycles * TREE_MIN_WORKSTACK_SIZE, AllocWorkStackBaseHigh);
+
+          TIMER_START(CPU_TREEWALK);
+
+          int item = 0;
+
+          while(item < NumOnWorkStack)
+            {
+              int committed = 8 * TREE_NUM_BEFORE_NODESPLIT;
+              int min_buffer_space =
+                  std::min<int>(MaxOnWorkStack - (NumOnWorkStack + NewOnWorkStack), MaxOnFetchStack - NumOnFetchStack);
+              if(min_buffer_space >= committed)
+                {
+                  int target     = WorkStack[item].Target;
+                  int no         = WorkStack[item].Node;
+                  int shmrank    = WorkStack[item].ShmRank;
+                  int mintopleaf = WorkStack[item].MinTopLeafNode;
+                  item++;
+
+                  pinfo pdat;
+                  int ptype = get_pinfo(target, pdat);
+
+                  if(no == MaxPart)
+                    {
+                      // we have a pristine particle that's processed for the first time
+                      gravity_force_interact(pdat, target, no, ptype, NODE_TYPE_LOCAL_NODE, shmrank, mintopleaf, committed);
+                    }
+                  else
+                    {
+                      // we have a node that we previously could not open
+                      gravnode *nop = get_nodep(no, shmrank);
+
+                      if(nop->cannot_be_opened_locally)
+                        {
+                          Terminate("item=%d:  no=%d  now we should be able to open it!", item, no);
+                        }
+                      else
+                        gwalk_open_node(pdat, target, ptype, nop, mintopleaf, committed);
+                    }
+                }
+              else
+                break;
+            }
+
+          if(item == 0 && NumOnWorkStack > 0)
+            Terminate("Can't even process a single particle");
+
+          TIMER_STOP(CPU_TREEWALK);
+
+          TIMER_START(CPU_TREEFETCH);
+
+          tree_fetch_foreign_nodes(FETCH_GRAVTREE);
+
+          TIMER_STOP(CPU_TREEFETCH);
+
+          TIMER_START(CPU_TREESTACK);
+
+          /* now reorder the workstack such that we are first going to do residual pristine particles, and then
+           * imported nodes that hang below the first leaf nodes */
+          NumOnWorkStack = NumOnWorkStack - item + NewOnWorkStack;
+          memmove(WorkStack, WorkStack + item, NumOnWorkStack * sizeof(workstack_data));
+
+          /* now let's sort such that we can go deep on top-level node branches, allowing us to clear them out eventually */
+          mycxxsort(WorkStack, WorkStack + NumOnWorkStack, compare_workstack);
+
+          TIMER_STOP(CPU_TREESTACK);
+
+          max_ncycles++;
+        }
+
+#ifdef PRESERVE_SHMEM_BINARY_INVARIANCE
+    }
+#endif
+
+  TIMER_START(CPU_TREEIMBALANCE);
+
+  MPI_Allreduce(MPI_IN_PLACE, &max_ncycles, 1, MPI_INT, MPI_MAX, D->Communicator);
+
+  TIMER_STOP(CPU_TREEIMBALANCE);
+
+  cleanup_shared_memory_access();
+
+  /* free temporary buffers */
+
+  Mem.myfree(Foreign_Points);
+  Mem.myfree(Foreign_Nodes);
+  Mem.myfree(StackToFetch);
+
+  double t1 = Logs.second();
+
+  D->mpi_printf("GRAVTREE: tree-forces are calculated, with %d cycles took %g sec\n", max_ncycles, Logs.timediff(t0, t1));
+
+  /* now communicate the forces in ResultsActiveImported */
+  gravity_exchange_forces();
+
+  Mem.myfree(ResultsActiveImported);
+#ifdef PRESERVE_SHMEM_BINARY_INVARIANCE
+  Mem.myfree(WorkStackBak);
+#endif
+  Mem.myfree(ResultIndexList);
+  Mem.myfree(WorkStack);
+
+  TIMER_STOP(CPU_TREE);
+
+  D->mpi_printf("GRAVTREE: tree-force is done.\n");
+
+  /*  gather some diagnostic information */
+
+  TIMER_START(CPU_LOGS);
+
+  struct detailed_timings
+  {
+    double tree, wait, fetch, stack, all, lastpm;
+    double costtotal, numnodes;
+    double interactioncountPP, interactioncountPN;
+    double NumForeignNodes, NumForeignPoints;
+    double fillfacFgnNodes, fillfacFgnPoints;
+  };
+  detailed_timings timer, tisum, timax;
+
+  timer.tree               = TIMER_DIFF(CPU_TREEWALK);
+  timer.wait               = TIMER_DIFF(CPU_TREEIMBALANCE);
+  timer.fetch              = TIMER_DIFF(CPU_TREEFETCH);
+  timer.stack              = TIMER_DIFF(CPU_TREESTACK);
+  timer.all                = timer.tree + timer.wait + timer.fetch + timer.stack + TIMER_DIFF(CPU_TREE);
+  timer.lastpm             = All.CPUForLastPMExecution;
+  timer.costtotal          = interactioncountPP + interactioncountPN;
+  timer.numnodes           = NumNodes;
+  timer.interactioncountPP = interactioncountPP;
+  timer.interactioncountPN = interactioncountPN;
+  timer.NumForeignNodes    = NumForeignNodes;
+  timer.NumForeignPoints   = NumForeignPoints;
+  timer.fillfacFgnNodes    = NumForeignNodes / ((double)MaxForeignNodes);
+  timer.fillfacFgnPoints   = NumForeignPoints / ((double)MaxForeignPoints);
+
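+  /* reduce the timing structure as a flat array of doubles: task 0 receives the element-wise sum in tisum
+   * and the element-wise maximum in timax */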
+  MPI_Reduce((double *)&timer, (double *)&tisum, (int)(sizeof(detailed_timings) / sizeof(double)), MPI_DOUBLE, MPI_SUM, 0,
+             D->Communicator);
+  MPI_Reduce((double *)&timer, (double *)&timax, (int)(sizeof(detailed_timings) / sizeof(double)), MPI_DOUBLE, MPI_MAX, 0,
+             D->Communicator);
+
+  All.TotNumOfForces += Tp->TimeBinsGravity.GlobalNActiveParticles;
+
+  if(D->ThisTask == 0)
+    {
+      fprintf(Logs.FdTimings, "Nf=%9lld  timebin=%d  total-Nf=%lld\n", Tp->TimeBinsGravity.GlobalNActiveParticles, timebin,
+              All.TotNumOfForces);
+      fprintf(Logs.FdTimings, "   work-load balance: %g   part/sec: raw=%g, effective=%g     ia/part: avg=%g   (%g|%g)\n",
+              timax.tree / ((tisum.tree + 1e-20) / D->NTask), Tp->TimeBinsGravity.GlobalNActiveParticles / (tisum.tree + 1.0e-20),
+              Tp->TimeBinsGravity.GlobalNActiveParticles / ((timax.tree + 1.0e-20) * D->NTask),
+              tisum.costtotal / (Tp->TimeBinsGravity.GlobalNActiveParticles + 1.0e-20),
+              tisum.interactioncountPP / (Tp->TimeBinsGravity.GlobalNActiveParticles + 1.0e-20),
+              tisum.interactioncountPN / (Tp->TimeBinsGravity.GlobalNActiveParticles + 1.0e-20));
+      fprintf(Logs.FdTimings,
+              "   maximum number of nodes: %g, filled: %g  NumForeignNodes: max=%g avg=%g fill=%g NumForeignPoints: max=%g avg=%g "
+              "fill=%g  cycles=%d\n",
+              timax.numnodes, timax.numnodes / MaxNodes, timax.NumForeignNodes, tisum.NumForeignNodes / D->NTask,
+              timax.fillfacFgnNodes, timax.NumForeignPoints, tisum.NumForeignPoints / D->NTask, timax.fillfacFgnPoints, max_ncycles);
+      fprintf(Logs.FdTimings,
+              "   avg times: <all>=%g  <tree>=%g  <wait>=%g  <fetch>=%g  <stack>=%g  "
+              "(lastpm=%g) sec\n",
+              tisum.all / D->NTask, tisum.tree / D->NTask, tisum.wait / D->NTask, tisum.fetch / D->NTask, tisum.stack / D->NTask,
+              tisum.lastpm / D->NTask);
+      fprintf(Logs.FdTimings, "   total interaction cost: %g  (imbalance=%g)\n", tisum.costtotal,
+              timax.costtotal / (tisum.costtotal / D->NTask));
+      myflush(Logs.FdTimings);
+    }
+
+  TIMER_STOP(CPU_LOGS);
+}
+
+/* make sure that we instantiate the template */
+#include "../data/simparticles.h"
+template class gravtree<simparticles>;
diff --git a/src/gravtree/gwalk.h b/src/gravtree/gwalk.h
new file mode 100644
index 0000000000000000000000000000000000000000..0b0085503d4c223e37eac8d3610528253c611145
--- /dev/null
+++ b/src/gravtree/gwalk.h
@@ -0,0 +1,119 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  gwalk.h
+ *
+ *  \brief defines a class for walking the gravitational tree
+ */
+
+#ifndef GRAVTREE_WALK_H
+#define GRAVTREE_WALK_H
+
+#include "../mpi_utils/shared_mem_handler.h"
+
+class gwalk : public gravtree<simparticles>
+{
+ public:
+  void gravity_tree(int timebin);
+
+ private:
+  long long interactioncountPP;
+  long long interactioncountPN;
+
+  MyReal theta2;
+  MyReal thetamax2;
+  MyReal errTolForceAcc;
+
+#ifdef PRESERVE_SHMEM_BINARY_INVARIANCE
+  bool skip_actual_force_computation;
+#endif
+
+  struct pinfo
+  {
+    MyIntPosType *intpos;
+    MyReal aold;
+    MyReal h_i;
+    int Type;
+#if NSOFTCLASSES > 1
+    int SofteningClass;
+#endif
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+    int InsideOutsideFlag;
+#endif
+
+    vector<MyFloat> *acc;
+    MyFloat *pot;
+    int *GravCost;
+  };
+
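+  /* gather the data needed for the force computation of particle/point i into pdat, distinguishing between a
+   * local particle and an imported tree point, and return the corresponding node type */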
+  inline int get_pinfo(int i, pinfo &pdat)
+  {
+    int ptype;
+
+    if(i < Tp->NumPart)
+      {
+        ptype = NODE_TYPE_LOCAL_PARTICLE;
+
+        pdat.intpos = Tp->P[i].IntPos;
+
+        pdat.Type = Tp->P[i].getType();
+#if NSOFTCLASSES > 1
+        pdat.SofteningClass = Tp->P[i].getSofteningClass();
+#endif
+        pdat.aold = Tp->P[i].OldAcc;
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+        pdat.InsideOutsideFlag = Tp->P[i].InsideOutsideFlag;
+#endif
+
+        pdat.acc = &Tp->P[i].GravAccel;
+#ifdef EVALPOTENTIAL
+        pdat.pot = &Tp->P[i].Potential;
+#endif
+        pdat.GravCost = &Tp->P[i].GravCost;
+      }
+    else
+      {
+        ptype = NODE_TYPE_TREEPOINT_PARTICLE;
+
+        int n = i - ImportedNodeOffset;
+
+        pdat.intpos = Points[n].IntPos;
+
+        pdat.Type = Points[n].Type;
+#if NSOFTCLASSES > 1
+        pdat.SofteningClass = Points[n].SofteningClass;
+#endif
+        pdat.aold = Points[n].OldAcc;
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+        pdat.InsideOutsideFlag = Points[n].InsideOutsideFlag;
+#endif
+
+        int idx  = ResultIndexList[n];
+        pdat.acc = &ResultsActiveImported[idx].GravAccel;
+#ifdef EVALPOTENTIAL
+        pdat.pot = &ResultsActiveImported[idx].Potential;
+#endif
+        pdat.GravCost = &ResultsActiveImported[idx].GravCost;
+      }
+
+#if NSOFTCLASSES > 1
+    pdat.h_i = All.ForceSoftening[pdat.SofteningClass];
+#else
+    pdat.h_i = All.ForceSoftening[0];
+#endif
+
+    return ptype;
+  }
+
+  inline void gwalk_open_node(const pinfo &pdat, int i, char ptype, gravnode *nop, int mintopleafnode, int committed);
+  void gravity_force_interact(const pinfo &pdat, int i, int no, char ptype, char no_type, unsigned char shmrank, int mintopleafnode,
+                              int committed);
+
+  inline int evaluate_particle_node_opening_criterion_and_interaction(const pinfo &pdat, gravnode *nop);
+  inline void evaluate_particle_particle_interaction(const pinfo &pdat, const int no, const char jtype, int no_task);
+};
+
+#endif
diff --git a/src/half/ChangeLog.txt b/src/half/ChangeLog.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9100b6ab74d3aa529b86884f9b73acb8fbb21ebf
--- /dev/null
+++ b/src/half/ChangeLog.txt
@@ -0,0 +1,184 @@
+Release Notes											{#changelog}
+=============
+
+1.12.0 release (2017-03-06):
+----------------------------
+
+- Changed behaviour of `half_cast` to perform conversions to/from `double` 
+  and `long double` directly according to specified rounding mode, without an 
+  intermediate `float` conversion.
+- Added `noexcept` specifiers to constructors.
+- Fixed minor portability problem with `logb` and `ilogb`.
+- Tested for *VC++ 2015*.
+
+
+1.11.0 release (2013-11-16):
+----------------------------
+
+- Made tie-breaking behaviour in round to nearest configurable by 
+  `HALF_ROUND_TIES_TO_EVEN` macro.
+- Completed support for all C++11 mathematical functions even if single-
+  precision versions from `<cmath>` are unsupported.
+- Fixed inability to disable support for C++11 mathematical functions on 
+  *VC++ 2013*.
+
+
+1.10.0 release (2013-11-09):
+----------------------------
+
+- Made default rounding mode configurable by `HALF_ROUND_STYLE` macro.
+- Added support for non-IEEE single-precision implementations.
+- Added `HALF_ENABLE_CPP11_TYPE_TRAITS` preprocessor flag for checking 
+  support for C++11 type traits and TMP features.
+- Restricted `half_cast` to support built-in arithmetic types only.
+- Changed behaviour of `half_cast` to respect rounding mode when casting 
+  to/from integer types.
+
+
+1.9.2 release (2013-11-01):
+---------------------------
+
+- Tested for *gcc 4.8*.
+- Tested and fixed for *VC++ 2013*.
+- Removed unnecessary warnings in *MSVC*.
+
+
+1.9.1 release (2013-08-08):
+---------------------------
+
+- Fixed problems with older gcc and MSVC versions.
+- Small fix to non-C++11 implementations of `remainder` and `remquo`.
+
+
+1.9.0 release (2013-08-07):
+---------------------------
+
+- Changed behaviour of `nearbyint`, `rint`, `lrint` and `llrint` to use 
+  rounding mode of half-precision implementation (which is 
+  truncating/indeterminate) instead of single-precision rounding mode.
+- Added support for more C++11 mathematical functions even if single-
+  precision versions from `<cmath>` are unsupported, in particular 
+  `remainder`, `remquo` and `cbrt`.
+- Minor implementation changes.
+
+
+1.8.1 release (2013-01-22):
+---------------------------
+
+- Fixed bug resulting in multiple definitions of the `nanh` function due to 
+  a missing `inline` specification.
+
+
+1.8.0 release (2013-01-19):
+---------------------------
+
+- Added support for more C++11 mathematical functions even if single-
+  precision versions from `<cmath>` are unsupported, in particular 
+  exponential and logarithm functions, hyperbolic area functions and the 
+  hypotenuse function.
+- Made the `fma` function use the default implementation if the single-precision 
+  version from `<cmath>` is not faster, so that `FP_FAST_FMAH` is now always defined.
+- Fixed overload resolution issues when invoking certain mathematical 
+  functions by unqualified calls.
+
+
+1.7.0 release (2012-10-26):
+---------------------------
+
+- Added support for C++11 `noexcept` specifiers.
+- Changed C++11 `long long` to be supported on *VC++ 2003* and up.
+
+
+1.6.1 release (2012-09-13):
+---------------------------
+
+- Made `fma` and `fdim` functions available even if corresponding 
+  single-precision functions are not.
+
+
+1.6.0 release (2012-09-12):
+---------------------------
+
+- Added `HALF_ENABLE_CPP11_LONG_LONG` to control support for `long long` 
+  integers and corresponding mathematical functions.
+- Fixed C++98 compatibility on non-VC compilers.
+
+
+1.5.1 release (2012-08-17):
+---------------------------
+
+- Recorrected `std::numeric_limits::round_style` to always return 
+  `std::round_indeterminate`, due to overflow-handling deviating from 
+  correct round-toward-zero behaviour.
+
+
+1.5.0 release (2012-08-16):
+---------------------------
+
+- Added `half_cast` for explicitly casting between half and any type 
+  convertible to/from `float` and allowing the explicit specification of 
+  the rounding mode to use.
+
+
+1.4.0 release (2012-08-12):
+---------------------------
+
+- Added support for C++11 generalized constant expressions (`constexpr`).
+
+
+1.3.1 release (2012-08-11):
+---------------------------
+
+- Fixed requirement for `std::signbit` and `std::isnan` (even if C++11 
+  `<cmath>` functions disabled) on non-VC compilers.
+
+
+1.3.0 release (2012-08-10):
+---------------------------
+
+- Made requirement for `<cstdint>` and `static_assert` optional and thus 
+  made the library C++98-compatible.
+- Made support for C++11 features user-overridable through explicit 
+  definition of corresponding preprocessor symbols to either 0 or 1.
+- Renamed `HALF_ENABLE_HASH` to `HALF_ENABLE_CPP11_HASH` in correspondence 
+  with other C++11 preprocessor symbols.
+
+
+1.2.0 release (2012-08-07):
+---------------------------
+
+- Added proper preprocessor definitions for `HUGE_VALH` and `FP_FAST_FMAH` 
+  in correspondence with their single-precision counterparts from `<cmath>`.
+- Fixed internal preprocessor macros to be properly undefined after use.
+
+
+1.1.2 release (2012-08-07):
+---------------------------
+
+- Revised `std::numeric_limits::round_style` to return 
+  `std::round_toward_zero` if the `float` version also does and 
+  `std::round_indeterminate` otherwise.
+- Fixed `std::numeric_limits::round_error` to reflect worst-case round 
+  toward zero behaviour.
+
+
+1.1.1 release (2012-08-06):
+---------------------------
+
+- Fixed `std::numeric_limits::min` to return smallest positive normal 
+  number, instead of subnormal number.
+- Fixed `std::numeric_limits::round_style` to return 
+  `std::round_indeterminate` due to mixture of separately rounded 
+  single-precision arithmetics with truncating single-to-half conversions.
+
+
+1.1.0 release (2012-08-06):
+---------------------------
+
+- Added half-precision literals.
+
+
+1.0.0 release (2012-08-05):
+---------------------------
+
+- First release.
diff --git a/src/half/LICENSE.txt b/src/half/LICENSE.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ee9dfd78a362ae533cefb7f8ca70cfa7ff278015
--- /dev/null
+++ b/src/half/LICENSE.txt
@@ -0,0 +1,20 @@
+The MIT License
+
+Copyright (c) 2012 - 2017 Christian Rau
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation 
+files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the 
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 
+WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/src/half/README.txt b/src/half/README.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7e485180738e9b5fb2e63eb61a773b813ed9626f
--- /dev/null
+++ b/src/half/README.txt
@@ -0,0 +1,286 @@
+HALF-PRECISION FLOATING POINT LIBRARY (Version 1.12.0)
+------------------------------------------------------
+
+This is a C++ header-only library to provide an IEEE 754 conformant 16-bit 
+half-precision floating point type along with corresponding arithmetic 
+operators, type conversions and common mathematical functions. It aims for both 
+efficiency and ease of use, trying to accurately mimic the behaviour of the 
+builtin floating point types at the best performance possible.
+
+
+INSTALLATION AND REQUIREMENTS
+-----------------------------
+
+Conveniently, the library consists of just a single header file containing 
+all the functionality, which can be included directly in your projects 
+without the need to build or link against anything.
+
+Whereas this library is fully C++98-compatible, it can profit from certain 
+C++11 features. Support for those features is checked automatically at compile 
+(or rather preprocessing) time, but can be explicitly enabled or disabled by 
+defining the corresponding preprocessor symbols to either 1 or 0 yourself. This 
+is useful when the automatic detection fails (for more exotic implementations) 
+or when a feature should be explicitly disabled:
+
+  - 'long long' integer type for mathematical functions returning 'long long' 
+    results (enabled for VC++ 2003 and newer, gcc and clang, overridable with 
+    'HALF_ENABLE_CPP11_LONG_LONG').
+
+  - Static assertions for extended compile-time checks (enabled for VC++ 2010, 
+    gcc 4.3, clang 2.9 and newer, overridable with 'HALF_ENABLE_CPP11_STATIC_ASSERT').
+
+  - Generalized constant expressions (enabled for VC++ 2015, gcc 4.6, clang 3.1 
+    and newer, overridable with 'HALF_ENABLE_CPP11_CONSTEXPR').
+
+  - noexcept exception specifications (enabled for VC++ 2015, gcc 4.6, clang 3.0 
+    and newer, overridable with 'HALF_ENABLE_CPP11_NOEXCEPT').
+
+  - User-defined literals for half-precision literals to work (enabled for 
+    VC++ 2015, gcc 4.7, clang 3.1 and newer, overridable with 
+    'HALF_ENABLE_CPP11_USER_LITERALS').
+
+  - Type traits and template meta-programming features from <type_traits> 
+    (enabled for VC++ 2010, libstdc++ 4.3, libc++ and newer, overridable with 
+    'HALF_ENABLE_CPP11_TYPE_TRAITS').
+
+  - Special integer types from <cstdint> (enabled for VC++ 2010, libstdc++ 4.3, 
+    libc++ and newer, overridable with 'HALF_ENABLE_CPP11_CSTDINT').
+
+  - Certain C++11 single-precision mathematical functions from <cmath> for 
+    an improved implementation of their half-precision counterparts to work 
+    (enabled for VC++ 2013, libstdc++ 4.3, libc++ and newer, overridable with 
+    'HALF_ENABLE_CPP11_CMATH').
+
+  - Hash functor 'std::hash' from <functional> (enabled for VC++ 2010, 
+    libstdc++ 4.3, libc++ and newer, overridable with 'HALF_ENABLE_CPP11_HASH').
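+
+For example, to force one of these features off regardless of the automatic 
+detection, define the corresponding symbol to 0 before including the header. 
+A minimal sketch (here disabling the C++11 <cmath> functions):
+
+    #define HALF_ENABLE_CPP11_CMATH 0
+    #include <half.hpp>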
+
+The library has been tested successfully with Visual C++ 2005-2015, gcc 4.4-4.8 
+and clang 3.1. Please contact me if you have any problems, suggestions or even 
+just success testing it on other platforms.
+
+
+DOCUMENTATION
+-------------
+
+What follows are some general remarks on the usage of the library and its 
+implementation. For complete documentation of its interface see the 
+corresponding website http://half.sourceforge.net. You may also generate the 
+full developer documentation from the doxygen comments in the library's single 
+header file, but this is more relevant to developers than to mere users (for 
+reasons described below).
+
+BASIC USAGE
+
+To make use of the library just include its only header file half.hpp, which 
+defines all half-precision functionality inside the 'half_float' namespace. The 
+actual 16-bit half-precision data type is represented by the 'half' type. This 
+type behaves like the builtin floating point types as much as possible, 
+supporting the usual arithmetic, comparison and streaming operators, which 
+makes its use pretty straight-forward:
+
+    using half_float::half;
+    half a(3.4), b(5);
+    half c = a * b;
+    c += 3;
+    if(c > a)
+      std::cout << c << std::endl;
+
+Additionally the 'half_float' namespace also defines half-precision versions 
+for all mathematical functions of the C++ standard library, which can be used 
+directly through ADL:
+
+    half a(-3.14159);
+    half s = sin(abs(a));
+    long l = lround(s);
+
+You may also specify explicit half-precision literals, since the library 
+provides a user-defined literal inside the 'half_float::literal' namespace, 
+which you just need to import (assuming support for C++11 user-defined literals):
+
+    using namespace half_float::literal;
+    half x = 1.0_h;
+
+Furthermore the library provides proper specializations for 
+'std::numeric_limits', defining various implementation properties, and 
+'std::hash' for hashing half-precision numbers (assuming support for C++11 
+'std::hash'). Similar to the corresponding preprocessor symbols from <cmath> 
+the library also defines the 'HUGE_VALH' constant and maybe the 'FP_FAST_FMAH' 
+symbol.
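+
+A minimal sketch of querying these specializations (assuming the C++11 
+'std::hash' support mentioned above is enabled):
+
+    using half_float::half;
+    half eps = std::numeric_limits<half>::epsilon();
+    half inf = HUGE_VALH;
+    std::size_t hash_value = std::hash<half>()(eps);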
+
+CONVERSIONS AND ROUNDING
+
+The half type is explicitly constructible/convertible from a single-precision 
+float argument. Thus it is also explicitly constructible/convertible from any 
+type implicitly convertible to float, but constructing it from types like 
+double or int will trigger the usual warnings about implicit conversion to 
+float and the associated loss of precision. On the one hand those warnings are 
+intentional, because converting those types to half necessarily also reduces 
+precision. On the other hand they are raised even for explicit conversions, 
+where the user knows what they are doing. If those warnings bother you, either 
+convert explicitly to float before converting to half, or use the 'half_cast' 
+described below. In addition you can also directly assign float values to 
+halfs.
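+
+A small sketch of these conversion rules (whether the warnings actually appear 
+depends on your compiler and warning level):
+
+    half a(3.14159f);                     // explicit construction from float, no warning
+    half b(3.14159);                      // construction from double may warn about lost precision
+    half c(static_cast<float>(3.14159));  // pre-converting to float silences the warning
+    a = 0.5f;                             // direct assignment from float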
+
+In contrast to the float-to-half conversion, which reduces precision, the 
+conversion from half to float (and thus to any other type implicitly 
+convertible from float) is implicit, because all values representable in 
+half-precision are also representable in single-precision. The half-to-float 
+conversion therefore behaves similarly to the builtin float-to-double 
+conversion, and all arithmetic expressions involving both half-precision and 
+single-precision arguments are of single-precision type. This also lets you 
+use the mathematical functions of the C++ standard library directly, though 
+you will then invoke the single-precision versions, which return 
+single-precision values; even if they may perform the exact same computation 
+(see below), this is not as conceptually clean when working in a 
+half-precision environment.
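+
+For illustration, a small sketch of the resulting types (assuming the 'using' 
+declaration from above):
+
+    half x(1.5f);
+    float f = x + 1.0f;        // x converts implicitly to float, the expression is single-precision
+    double d = std::sqrt(x);   // invokes the single-precision std::sqrt via the implicit conversion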
+
+The default rounding mode for conversions from float to half uses truncation 
+(round toward zero, but mapping overflows to infinity) for rounding values not 
+representable exactly in half-precision. This is the fastest rounding possible 
+and is usually sufficient. But by redefining the 'HALF_ROUND_STYLE' 
+preprocessor symbol (before including half.hpp) this default can be overridden 
+with one of the other standard rounding modes using their respective constants 
+or the equivalent values of 'std::float_round_style' (it can even be 
+synchronized with the underlying single-precision implementation by defining it 
+to 'std::numeric_limits<float>::round_style'):
+
+  - 'std::round_indeterminate' or -1 for the fastest rounding (default).
+
+  - 'std::round_toward_zero' or 0 for rounding toward zero.
+
+  - 'std::round_to_nearest' or 1 for rounding to the nearest value.
+
+  - 'std::round_toward_infinity' or 2 for rounding toward positive infinity.
+
+  - 'std::round_toward_neg_infinity' or 3 for rounding toward negative infinity.
+
+In addition to changing the overall default rounding mode one can also use the 
+'half_cast'. This converts between half and any built-in arithmetic type using 
+a configurable rounding mode (or the default rounding mode if none is 
+specified). In addition to a configurable rounding mode, 'half_cast' has 
+another big difference to a mere 'static_cast': Any conversions are performed 
+directly using the given rounding mode, without any intermediate conversion 
+to/from 'float'. This is especially relevant for conversions to integer types, 
+which don't necessarily truncate anymore. But also for conversions from 
+'double' or 'long double' this may produce more precise results than a 
+pre-conversion to 'float' using the single-precision implementation's current 
+rounding mode would.
+
+    half a = half_cast<half>(4.2);
+    half b = half_cast<half, std::numeric_limits<float>::round_style>(4.2f);
+    assert(half_cast<int, std::round_to_nearest>(0.7_h) == 1);
+    assert(half_cast<half, std::round_toward_zero>(4097) == 4096.0_h);
+    assert(half_cast<half, std::round_toward_infinity>(4097) == 4100.0_h);
+    assert(half_cast<half, std::round_toward_infinity>(std::numeric_limits<double>::min()) > 0.0_h);
+
+When using round to nearest (either as default or through 'half_cast') ties 
+are by default resolved by rounding them away from zero (and thus equal to the 
+behaviour of the 'round' function). But by redefining the 
+'HALF_ROUND_TIES_TO_EVEN' preprocessor symbol to 1 (before including half.hpp) 
+this default can be changed to the slightly slower but less biased and more 
+IEEE-conformant behaviour of rounding half-way cases to the nearest even value.
+
+    #define HALF_ROUND_TIES_TO_EVEN 1
+    #include <half.hpp>
+    ...
+    assert(half_cast<int, std::round_to_nearest>(3.5_h) == half_cast<int, std::round_to_nearest>(4.5_h));
+
+IMPLEMENTATION
+
+For performance reasons (and ease of implementation) many of the mathematical 
+functions provided by the library as well as all arithmetic operations are 
+actually carried out in single-precision under the hood, calling to the C++ 
+standard library implementations of those functions whenever appropriate, 
+meaning the arguments are converted to floats and the result back to half. But 
+to reduce the conversion overhead as much as possible any temporary values 
+inside of lengthy expressions are kept in single-precision as long as possible, 
+while still maintaining a strong half-precision type to the outside world. Only 
+when finally assigning the value to a half or calling a function that works 
+directly on halfs is the actual conversion done (or never, when further 
+converting the result to float).
+
+This approach has two implications. First of all you have to treat the 
+library's documentation at http://half.sourceforge.net as a simplified version, 
+describing the behaviour of the library as if implemented this way. The actual 
+argument and return types of functions and operators may involve other internal 
+types (feel free to generate the exact developer documentation from the Doxygen 
+comments in the library's header file if you really need to). But nevertheless 
+the behaviour is exactly as specified in the documentation. The other 
+implication is that in the presence of rounding errors or over-/underflows 
+arithmetic expressions may produce different results when compared to 
+converting to half-precision after each individual operation:
+
+    half a = std::numeric_limits<half>::max() * 2.0_h / 2.0_h;       // a = MAX
+    half b = half(std::numeric_limits<half>::max() * 2.0_h) / 2.0_h; // b = INF
+    assert( a != b );
+
+But this should only be a problem in very few cases. A final word on 
+performance: even with its efforts to reduce conversion overhead as much as 
+possible, the software half-precision implementation can most probably not 
+beat the direct use of single-precision computations. The recommended approach 
+is usually to use actual float values for all computations and temporaries and 
+to use halfs only for storage (see the short sketch at the end of this 
+section). Admittedly this makes the provided mathematical functions somewhat 
+redundant (especially in light of the implicit conversion from half to float), 
+but the goal of this library was to provide a complete and conceptually clean 
+half-precision implementation, to which the standard mathematical functions 
+belong, even if they are usually not needed.
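+
+A minimal sketch of that recommended pattern; 'data' and 'n' are hypothetical 
+(an array of halfs and its length), and the explicit 'half_cast' at the end is 
+optional, since a plain assignment would use the default rounding mode:
+
+    float sum = 0.0f;                      // compute in single precision ...
+    for(std::size_t i = 0; i < n; ++i)
+      sum += static_cast<float>(data[i]);  // ... reading halfs used only for storage
+    half result = half_cast<half, std::round_to_nearest>(sum);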
+
+IEEE CONFORMANCE
+
+The half type uses the standard IEEE representation with 1 sign bit, 5 exponent 
+bits and 10 mantissa bits (11 when counting the hidden bit). It supports all 
+types of special values, like subnormal values, infinity and NaNs. But there 
+are some limitations to the complete conformance to the IEEE 754 standard:
+
+  - The implementation does not differentiate between signalling and quiet 
+    NaNs; this means operations on halfs are not specified to trap on 
+    signalling NaNs (though they may, see the last point).
+
+  - Though arithmetic operations are internally rounded to single-precision 
+    using the underlying single-precision implementation's current rounding 
+    mode, those values are then converted to half-precision using the default 
+    half-precision rounding mode (changed by defining 'HALF_ROUND_STYLE' 
+    accordingly). This mixture of rounding modes is also the reason why 
+    'std::numeric_limits<half>::round_style' may actually return 
+    'std::round_indeterminate' when half- and single-precision rounding modes 
+    don't match.
+
+  - Because of internal truncation it may also be that certain single-precision 
+    NaNs will be wrongly converted to half-precision infinity, though this is 
+    very unlikely to happen, since most single-precision implementations don't 
+    tend to only set the lowest bits of a NaN mantissa.
+
+  - The implementation does not provide any floating point exceptions, thus 
+    arithmetic operations or mathematical functions are not specified to invoke 
+    proper floating point exceptions. But due to many functions implemented in 
+    single-precision, those may still invoke floating point exceptions of the 
+    underlying single-precision implementation.
+
+Some of those points could have been circumvented by controlling the floating 
+point environment using <cfenv> or by implementing a similar exception 
+mechanism. But this would have required excessive runtime checks, giving too 
+high an impact on performance for something that is rarely ever needed. If you 
+really need to rely on proper floating point exceptions, it is recommended to 
+explicitly perform computations using the built-in floating point types to be 
+on the safe side. In the same way, if you really need to rely on a particular 
+rounding behaviour, it is recommended either to use single-precision 
+computations and explicitly convert the result to half-precision using 
+'half_cast' with the desired rounding mode, or to synchronize the default 
+half-precision rounding mode with the rounding mode of the single-precision 
+implementation (most likely 'HALF_ROUND_STYLE=1', 'HALF_ROUND_TIES_TO_EVEN=1'). 
+But this is really an expert scenario that should be used only when necessary, 
+since actually working with half-precision usually comes with a certain 
+tolerance/ignorance of exactness considerations, and proper rounding comes 
+with a certain performance cost.
+
+
+CREDITS AND CONTACT
+-------------------
+
+This library is developed by CHRISTIAN RAU and released under the MIT License 
+(see LICENSE.txt). If you have any questions or problems with it, feel free to 
+contact me at rauy@users.sourceforge.net.
+
+Additional credit goes to JEROEN VAN DER ZIJP for his paper on "Fast Half Float 
+Conversions", whose algorithms have been used in the library for converting 
+between half-precision and single-precision values.
diff --git a/src/half/half.hpp b/src/half/half.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..acdbdf30f6c76a074280c95be445a7949d5e231f
--- /dev/null
+++ b/src/half/half.hpp
@@ -0,0 +1,3496 @@
+
+/*! \file half.hpp
+ *
+ *  \brief Implements half precision functionality.
+ */
+
+// half - IEEE 754-based half-precision floating point library.
+//
+// Copyright (c) 2012-2017 Christian Rau <rauy@users.sourceforge.net>
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+// Version 1.12.0
+
+#ifndef HALF_HALF_HPP
+#define HALF_HALF_HPP
+
+/// Combined gcc version number.
+#define HALF_GNUC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+// check C++11 language features
+#if defined(__clang__)  // clang
+#if __has_feature(cxx_static_assert) && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT)
+#define HALF_ENABLE_CPP11_STATIC_ASSERT 1
+#endif
+#if __has_feature(cxx_constexpr) && !defined(HALF_ENABLE_CPP11_CONSTEXPR)
+#define HALF_ENABLE_CPP11_CONSTEXPR 1
+#endif
+#if __has_feature(cxx_noexcept) && !defined(HALF_ENABLE_CPP11_NOEXCEPT)
+#define HALF_ENABLE_CPP11_NOEXCEPT 1
+#endif
+#if __has_feature(cxx_user_literals) && !defined(HALF_ENABLE_CPP11_USER_LITERALS)
+#define HALF_ENABLE_CPP11_USER_LITERALS 1
+#endif
+#if(defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L) && !defined(HALF_ENABLE_CPP11_LONG_LONG)
+#define HALF_ENABLE_CPP11_LONG_LONG 1
+#endif
+/*#elif defined(__INTEL_COMPILER)								//Intel C++
+        #if __INTEL_COMPILER >= 1100 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT)		????????
+                #define HALF_ENABLE_CPP11_STATIC_ASSERT 1
+        #endif
+        #if __INTEL_COMPILER >= 1300 && !defined(HALF_ENABLE_CPP11_CONSTEXPR)			????????
+                #define HALF_ENABLE_CPP11_CONSTEXPR 1
+        #endif
+        #if __INTEL_COMPILER >= 1300 && !defined(HALF_ENABLE_CPP11_NOEXCEPT)			????????
+                #define HALF_ENABLE_CPP11_NOEXCEPT 1
+        #endif
+        #if __INTEL_COMPILER >= 1100 && !defined(HALF_ENABLE_CPP11_LONG_LONG)			????????
+                #define HALF_ENABLE_CPP11_LONG_LONG 1
+        #endif*/
+#elif defined(__GNUC__)  // gcc
+#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L
+#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT)
+#define HALF_ENABLE_CPP11_STATIC_ASSERT 1
+#endif
+#if HALF_GNUC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_CONSTEXPR)
+#define HALF_ENABLE_CPP11_CONSTEXPR 1
+#endif
+#if HALF_GNUC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_NOEXCEPT)
+#define HALF_ENABLE_CPP11_NOEXCEPT 1
+#endif
+#if HALF_GNUC_VERSION >= 407 && !defined(HALF_ENABLE_CPP11_USER_LITERALS)
+#define HALF_ENABLE_CPP11_USER_LITERALS 1
+#endif
+#if !defined(HALF_ENABLE_CPP11_LONG_LONG)
+#define HALF_ENABLE_CPP11_LONG_LONG 1
+#endif
+#endif
+#elif defined(_MSC_VER)  // Visual C++
+#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_CONSTEXPR)
+#define HALF_ENABLE_CPP11_CONSTEXPR 1
+#endif
+#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_NOEXCEPT)
+#define HALF_ENABLE_CPP11_NOEXCEPT 1
+#endif
+#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_USER_LITERALS)
+#define HALF_ENABLE_CPP11_USER_LITERALS 1
+#endif
+#if _MSC_VER >= 1600 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT)
+#define HALF_ENABLE_CPP11_STATIC_ASSERT 1
+#endif
+#if _MSC_VER >= 1310 && !defined(HALF_ENABLE_CPP11_LONG_LONG)
+#define HALF_ENABLE_CPP11_LONG_LONG 1
+#endif
+#define HALF_POP_WARNINGS 1
+#pragma warning(push)
+#pragma warning(disable : 4099 4127 4146)  // struct vs class, constant in if, negative unsigned
+#endif
+
+// check C++11 library features
+#include <utility>
+#if defined(_LIBCPP_VERSION)  // libc++
+#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103
+#ifndef HALF_ENABLE_CPP11_TYPE_TRAITS
+#define HALF_ENABLE_CPP11_TYPE_TRAITS 1
+#endif
+#ifndef HALF_ENABLE_CPP11_CSTDINT
+#define HALF_ENABLE_CPP11_CSTDINT 1
+#endif
+#ifndef HALF_ENABLE_CPP11_CMATH
+#define HALF_ENABLE_CPP11_CMATH 1
+#endif
+#ifndef HALF_ENABLE_CPP11_HASH
+#define HALF_ENABLE_CPP11_HASH 1
+#endif
+#endif
+#elif defined(__GLIBCXX__)  // libstdc++
+#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103
+#ifdef __clang__
+#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_TYPE_TRAITS)
+#define HALF_ENABLE_CPP11_TYPE_TRAITS 1
+#endif
+#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CSTDINT)
+#define HALF_ENABLE_CPP11_CSTDINT 1
+#endif
+#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CMATH)
+#define HALF_ENABLE_CPP11_CMATH 1
+#endif
+#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_HASH)
+#define HALF_ENABLE_CPP11_HASH 1
+#endif
+#else
+#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CSTDINT)
+#define HALF_ENABLE_CPP11_CSTDINT 1
+#endif
+#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CMATH)
+#define HALF_ENABLE_CPP11_CMATH 1
+#endif
+#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_HASH)
+#define HALF_ENABLE_CPP11_HASH 1
+#endif
+#endif
+#endif
+#elif defined(_CPPLIB_VER)  // Dinkumware/Visual C++
+#if _CPPLIB_VER >= 520
+#ifndef HALF_ENABLE_CPP11_TYPE_TRAITS
+#define HALF_ENABLE_CPP11_TYPE_TRAITS 1
+#endif
+#ifndef HALF_ENABLE_CPP11_CSTDINT
+#define HALF_ENABLE_CPP11_CSTDINT 1
+#endif
+#ifndef HALF_ENABLE_CPP11_HASH
+#define HALF_ENABLE_CPP11_HASH 1
+#endif
+#endif
+#if _CPPLIB_VER >= 610
+#ifndef HALF_ENABLE_CPP11_CMATH
+#define HALF_ENABLE_CPP11_CMATH 1
+#endif
+#endif
+#endif
+#undef HALF_GNUC_VERSION
+
+// support constexpr
+#if HALF_ENABLE_CPP11_CONSTEXPR
+#define HALF_CONSTEXPR constexpr
+#define HALF_CONSTEXPR_CONST constexpr
+#else
+#define HALF_CONSTEXPR
+#define HALF_CONSTEXPR_CONST const
+#endif
+
+// support noexcept
+#if HALF_ENABLE_CPP11_NOEXCEPT
+#define HALF_NOEXCEPT noexcept
+#define HALF_NOTHROW noexcept
+#else
+#define HALF_NOEXCEPT
+#define HALF_NOTHROW throw()
+#endif
+
+#include <algorithm>
+#include <climits>
+#include <cmath>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#if HALF_ENABLE_CPP11_TYPE_TRAITS
+#include <type_traits>
+#endif
+#if HALF_ENABLE_CPP11_CSTDINT
+#include <cstdint>
+#endif
+#if HALF_ENABLE_CPP11_HASH
+#include <functional>
+#endif
+
+/// Default rounding mode.
+/// This specifies the rounding mode used for all conversions between [half](\ref half_float::half)s and `float`s as well as
+/// for the half_cast() if not specifying a rounding mode explicitly. It can be redefined (before including half.hpp) to one
+/// of the standard rounding modes using their respective constants or the equivalent values of `std::float_round_style`:
+///
+/// `std::float_round_style`         | value | rounding
+/// ---------------------------------|-------|-------------------------
+/// `std::round_indeterminate`       | -1    | fastest (default)
+/// `std::round_toward_zero`         | 0     | toward zero
+/// `std::round_to_nearest`          | 1     | to nearest
+/// `std::round_toward_infinity`     | 2     | toward positive infinity
+/// `std::round_toward_neg_infinity` | 3     | toward negative infinity
+///
+/// By default this is set to `-1` (`std::round_indeterminate`), which uses truncation (round toward zero, but with overflows
+/// set to infinity) and is the fastest rounding mode possible. It can even be set to `std::numeric_limits<float>::round_style`
+/// to synchronize the rounding mode with that of the underlying single-precision implementation.
+#ifndef HALF_ROUND_STYLE
+#define HALF_ROUND_STYLE -1  // = std::round_indeterminate
+#endif
+
+/// Tie-breaking behaviour for round to nearest.
+/// This specifies if ties in round to nearest should be resolved by rounding to the nearest even value. By default this is
+/// defined to `0` resulting in the faster but slightly more biased behaviour of rounding away from zero in half-way cases (and
+/// thus equal to the round() function), but can be redefined to `1` (before including half.hpp) if more IEEE-conformant
+/// behaviour is needed.
+#ifndef HALF_ROUND_TIES_TO_EVEN
+#define HALF_ROUND_TIES_TO_EVEN 0  // ties away from zero
+#endif
+
+/// Value signaling overflow.
+/// In correspondence with `HUGE_VAL[F|L]` from `<cmath>` this symbol expands to a positive value signaling the overflow of an
+/// operation, in particular it just evaluates to positive infinity.
+#define HUGE_VALH std::numeric_limits<half_float::half>::infinity()
+
+/// Fast half-precision fma function.
+/// This symbol is only defined if the fma() function generally executes as fast as, or faster than, a separate
+/// half-precision multiplication followed by an addition. Due to the internal single-precision implementation of all
+/// arithmetic operations, this is in fact always the case.
+#define FP_FAST_FMAH 1
+
+#ifndef FP_ILOGB0
+#define FP_ILOGB0 INT_MIN
+#endif
+#ifndef FP_ILOGBNAN
+#define FP_ILOGBNAN INT_MAX
+#endif
+#ifndef FP_SUBNORMAL
+#define FP_SUBNORMAL 0
+#endif
+#ifndef FP_ZERO
+#define FP_ZERO 1
+#endif
+#ifndef FP_NAN
+#define FP_NAN 2
+#endif
+#ifndef FP_INFINITE
+#define FP_INFINITE 3
+#endif
+#ifndef FP_NORMAL
+#define FP_NORMAL 4
+#endif
+
+/// Main namespace for half precision functionality.
+/// This namespace contains all the functionality provided by the library.
+namespace half_float
+{
+class half;
+
+#if HALF_ENABLE_CPP11_USER_LITERALS
+/// Library-defined half-precision literals.
+/// Import this namespace to enable half-precision floating point literals:
+/// ~~~~{.cpp}
+/// using namespace half_float::literal;
+/// half_float::half x = 4.2_h;
+/// ~~~~
+namespace literal
+{
+half operator""_h(long double);
+}
+#endif
+
+/// \internal
+/// \brief Implementation details.
+namespace detail
+{
+#if HALF_ENABLE_CPP11_TYPE_TRAITS
+/// Conditional type.
+template <bool B, typename T, typename F>
+struct conditional : std::conditional<B, T, F>
+{
+};
+
+/// Helper for tag dispatching.
+template <bool B>
+struct bool_type : std::integral_constant<bool, B>
+{
+};
+using std::false_type;
+using std::true_type;
+
+/// Type traits for floating point types.
+template <typename T>
+struct is_float : std::is_floating_point<T>
+{
+};
+#else
+/// Conditional type.
+template <bool, typename T, typename>
+struct conditional
+{
+  typedef T type;
+};
+template <typename T, typename F>
+struct conditional<false, T, F>
+{
+  typedef F type;
+};
+
+/// Helper for tag dispatching.
+template <bool>
+struct bool_type
+{
+};
+typedef bool_type<true> true_type;
+typedef bool_type<false> false_type;
+
+/// Type traits for floating point types.
+template <typename>
+struct is_float : false_type
+{
+};
+template <typename T>
+struct is_float<const T> : is_float<T>
+{
+};
+template <typename T>
+struct is_float<volatile T> : is_float<T>
+{
+};
+template <typename T>
+struct is_float<const volatile T> : is_float<T>
+{
+};
+template <>
+struct is_float<float> : true_type
+{
+};
+template <>
+struct is_float<double> : true_type
+{
+};
+template <>
+struct is_float<long double> : true_type
+{
+};
+#endif
+
+/// Type traits for floating point bits.
+template <typename T>
+struct bits
+{
+  typedef unsigned char type;
+};
+template <typename T>
+struct bits<const T> : bits<T>
+{
+};
+template <typename T>
+struct bits<volatile T> : bits<T>
+{
+};
+template <typename T>
+struct bits<const volatile T> : bits<T>
+{
+};
+
+#if HALF_ENABLE_CPP11_CSTDINT
+/// Unsigned integer of (at least) 16 bits width.
+typedef std::uint_least16_t uint16;
+
+/// Unsigned integer of (at least) 32 bits width.
+template <>
+struct bits<float>
+{
+  typedef std::uint_least32_t type;
+};
+
+/// Unsigned integer of (at least) 64 bits width.
+template <>
+struct bits<double>
+{
+  typedef std::uint_least64_t type;
+};
+#else
+/// Unsigned integer of (at least) 16 bits width.
+typedef unsigned short uint16;
+
+/// Unsigned integer of (at least) 32 bits width.
+template <>
+struct bits<float> : conditional<std::numeric_limits<unsigned int>::digits >= 32, unsigned int, unsigned long>
+{
+};
+
+#if HALF_ENABLE_CPP11_LONG_LONG
+/// Unsigned integer of (at least) 64 bits width.
+template <>
+struct bits<double> : conditional<std::numeric_limits<unsigned long>::digits >= 64, unsigned long, unsigned long long>
+{
+};
+#else
+/// Unsigned integer of (at least) 64 bits width.
+template <>
+struct bits<double>
+{
+  typedef unsigned long type;
+};
+#endif
+#endif
+
+/// Tag type for binary construction.
+struct binary_t
+{
+};
+
+/// Tag for binary construction.
+HALF_CONSTEXPR_CONST binary_t binary = binary_t();
+
+/// Temporary half-precision expression.
+/// This class represents a half-precision expression which just stores a single-precision value internally.
+struct expr
+{
+  /// Conversion constructor.
+  /// \param f single-precision value to convert
+  explicit HALF_CONSTEXPR expr(float f) HALF_NOEXCEPT : value_(f) {}
+
+  /// Conversion to single-precision.
+  /// \return single precision value representing expression value
+  HALF_CONSTEXPR operator float() const HALF_NOEXCEPT { return value_; }
+
+ private:
+  /// Internal expression value stored in single-precision.
+  float value_;
+};
+
+/// SFINAE helper for generic half-precision functions.
+/// This class template has to be specialized for each valid combination of argument types to provide a corresponding
+/// `type` member equivalent to \a T.
+/// \tparam T type to return
+template <typename T, typename, typename = void, typename = void>
+struct enable
+{
+};
+template <typename T>
+struct enable<T, half, void, void>
+{
+  typedef T type;
+};
+template <typename T>
+struct enable<T, expr, void, void>
+{
+  typedef T type;
+};
+template <typename T>
+struct enable<T, half, half, void>
+{
+  typedef T type;
+};
+template <typename T>
+struct enable<T, half, expr, void>
+{
+  typedef T type;
+};
+template <typename T>
+struct enable<T, expr, half, void>
+{
+  typedef T type;
+};
+template <typename T>
+struct enable<T, expr, expr, void>
+{
+  typedef T type;
+};
+template <typename T>
+struct enable<T, half, half, half>
+{
+  typedef T type;
+};
+template <typename T>
+struct enable<T, half, half, expr>
+{
+  typedef T type;
+};
+template <typename T>
+struct enable<T, half, expr, half>
+{
+  typedef T type;
+};
+template <typename T>
+struct enable<T, half, expr, expr>
+{
+  typedef T type;
+};
+template <typename T>
+struct enable<T, expr, half, half>
+{
+  typedef T type;
+};
+template <typename T>
+struct enable<T, expr, half, expr>
+{
+  typedef T type;
+};
+template <typename T>
+struct enable<T, expr, expr, half>
+{
+  typedef T type;
+};
+template <typename T>
+struct enable<T, expr, expr, expr>
+{
+  typedef T type;
+};
+
+/// Return type for specialized generic 2-argument half-precision functions.
+/// This class template has to be specialized for each valid combination of argument types to provide a corresponding
+/// `type` member denoting the appropriate return type.
+/// \tparam T first argument type
+/// \tparam U second argument type
+template <typename T, typename U>
+struct result : enable<expr, T, U>
+{
+};
+template <>
+struct result<half, half>
+{
+  typedef half type;
+};
+
+/// \name Classification helpers
+/// \{
+
+/// Check for infinity.
+/// \tparam T argument type (builtin floating point type)
+/// \param arg value to query
+/// \retval true if infinity
+/// \retval false else
+template <typename T>
+bool builtin_isinf(T arg)
+{
+#if HALF_ENABLE_CPP11_CMATH
+  return std::isinf(arg);
+#elif defined(_MSC_VER)
+  return !::_finite(static_cast<double>(arg)) && !::_isnan(static_cast<double>(arg));
+#else
+  return arg == std::numeric_limits<T>::infinity() || arg == -std::numeric_limits<T>::infinity();
+#endif
+}
+
+/// Check for NaN.
+/// \tparam T argument type (builtin floating point type)
+/// \param arg value to query
+/// \retval true if not a number
+/// \retval false else
+template <typename T>
+bool builtin_isnan(T arg)
+{
+#if HALF_ENABLE_CPP11_CMATH
+  return std::isnan(arg);
+#elif defined(_MSC_VER)
+  return ::_isnan(static_cast<double>(arg)) != 0;
+#else
+  return arg != arg;
+#endif
+}
+
+/// Check sign.
+/// \tparam T argument type (builtin floating point type)
+/// \param arg value to query
+/// \retval true if signbit set
+/// \retval false else
+template <typename T>
+bool builtin_signbit(T arg)
+{
+#if HALF_ENABLE_CPP11_CMATH
+  return std::signbit(arg);
+#else
+  return arg < T() || (arg == T() && T(1) / arg < T());
+#endif
+}
+
+/// \}
+/// \name Conversion
+/// \{
+
+/// Convert IEEE single-precision to half-precision.
+/// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf).
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \param value single-precision value
+/// \return binary representation of half-precision value
+template <std::float_round_style R>
+uint16 float2half_impl(float value, true_type)
+{
+  typedef bits<float>::type uint32;
+  uint32 bits;  // = *reinterpret_cast<uint32*>(&value);		//violating strict aliasing!
+  std::memcpy(&bits, &value, sizeof(float));
+  /*			uint16 hbits = (bits>>16) & 0x8000;
+                          bits &= 0x7FFFFFFF;
+                          int exp = bits >> 23;
+                          if(exp == 255)
+                                  return hbits | 0x7C00 | (0x3FF&-static_cast<unsigned>((bits&0x7FFFFF)!=0));
+                          if(exp > 142)
+                          {
+                                  if(R == std::round_toward_infinity)
+                                          return hbits | 0x7C00 - (hbits>>15);
+                                  if(R == std::round_toward_neg_infinity)
+                                          return hbits | 0x7BFF + (hbits>>15);
+                                  return hbits | 0x7BFF + (R!=std::round_toward_zero);
+                          }
+                          int g, s;
+                          if(exp > 112)
+                          {
+                                  g = (bits>>12) & 1;
+                                  s = (bits&0xFFF) != 0;
+                                  hbits |= ((exp-112)<<10) | ((bits>>13)&0x3FF);
+                          }
+                          else if(exp > 101)
+                          {
+                                  int i = 125 - exp;
+                                  bits = (bits&0x7FFFFF) | 0x800000;
+                                  g = (bits>>i) & 1;
+                                  s = (bits&((1L<<i)-1)) != 0;
+                                  hbits |= bits >> (i+1);
+                          }
+                          else
+                          {
+                                  g = 0;
+                                  s = bits != 0;
+                          }
+                          if(R == std::round_to_nearest)
+                                  #if HALF_ROUND_TIES_TO_EVEN
+                                          hbits += g & (s|hbits);
+                                  #else
+                                          hbits += g;
+                                  #endif
+                          else if(R == std::round_toward_infinity)
+                                  hbits += ~(hbits>>15) & (s|g);
+                          else if(R == std::round_toward_neg_infinity)
+                                  hbits += (hbits>>15) & (g|s);
+  */
+  static const uint16 base_table[512] = {
+      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100,
+      0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00,
+      0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00,
+      0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+      0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+      0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+      0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+      0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+      0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+      0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+      0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+      0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+      0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+      0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+      0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+      0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+      0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100,
+      0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00,
+      0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00,
+      0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
+      0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
+      0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
+      0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
+      0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
+      0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
+      0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00};
+  static const unsigned char shift_table[512] = {
+      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+      24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+      13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13,
+      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+      24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+      13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13};
+  uint16 hbits = base_table[bits >> 23] + static_cast<uint16>((bits & 0x7FFFFF) >> shift_table[bits >> 23]);
+  if(R == std::round_to_nearest)
+    hbits += (((bits & 0x7FFFFF) >> (shift_table[bits >> 23] - 1)) | (((bits >> 23) & 0xFF) == 102)) & ((hbits & 0x7C00) != 0x7C00)
+#if HALF_ROUND_TIES_TO_EVEN
+             & (((((static_cast<uint32>(1) << (shift_table[bits >> 23] - 1)) - 1) & bits) != 0) | hbits)
+#endif
+        ;
+  else if(R == std::round_toward_zero)
+    hbits -= ((hbits & 0x7FFF) == 0x7C00) & ~shift_table[bits >> 23];
+  else if(R == std::round_toward_infinity)
+    hbits += ((((bits & 0x7FFFFF & ((static_cast<uint32>(1) << (shift_table[bits >> 23])) - 1)) != 0) |
+               (((bits >> 23) <= 102) & ((bits >> 23) != 0))) &
+              (hbits < 0x7C00)) -
+             ((hbits == 0xFC00) & ((bits >> 23) != 511));
+  else if(R == std::round_toward_neg_infinity)
+    hbits += ((((bits & 0x7FFFFF & ((static_cast<uint32>(1) << (shift_table[bits >> 23])) - 1)) != 0) |
+               (((bits >> 23) <= 358) & ((bits >> 23) != 256))) &
+              (hbits < 0xFC00) & (hbits >> 15)) -
+             ((hbits == 0x7C00) & ((bits >> 23) != 255));
+  return hbits;
+}
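+
+// Illustrative walk-through (not part of the library) of the table-based conversion above, using values from the tables
+// defined earlier in this file: for the single-precision bit pattern 0x3F800000 (1.0f), bits >> 23 == 127, base_table[127]
+// is 0x3C00 and shift_table[127] is 13, so hbits = 0x3C00 + (0 >> 13) = 0x3C00, the half-precision encoding of 1.0. The
+// rounding corrections then all evaluate to zero, because no mantissa bits are discarded.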
+
+/// Convert IEEE double-precision to half-precision.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \param value double-precision value
+/// \return binary representation of half-precision value
+template <std::float_round_style R>
+uint16 float2half_impl(double value, true_type)
+{
+  typedef bits<float>::type uint32;
+  typedef bits<double>::type uint64;
+  uint64 bits;  // = *reinterpret_cast<uint64*>(&value);		//violating strict aliasing!
+  std::memcpy(&bits, &value, sizeof(double));
+  uint32 hi = bits >> 32, lo = bits & 0xFFFFFFFF;
+  uint16 hbits = (hi >> 16) & 0x8000;
+  hi &= 0x7FFFFFFF;
+  int exp = hi >> 20;
+  if(exp == 2047)
+    return hbits | 0x7C00 | (0x3FF & -static_cast<unsigned>((bits & 0xFFFFFFFFFFFFF) != 0));
+  if(exp > 1038)
+    {
+      if(R == std::round_toward_infinity)
+        return hbits | (0x7C00 - (hbits >> 15));
+      if(R == std::round_toward_neg_infinity)
+        return hbits | (0x7BFF + (hbits >> 15));
+      return hbits | (0x7BFF + (R != std::round_toward_zero));
+    }
+  int g, s = lo != 0;
+  if(exp > 1008)
+    {
+      g = (hi >> 9) & 1;
+      s |= (hi & 0x1FF) != 0;
+      hbits |= ((exp - 1008) << 10) | ((hi >> 10) & 0x3FF);
+    }
+  else if(exp > 997)
+    {
+      int i = 1018 - exp;
+      hi    = (hi & 0xFFFFF) | 0x100000;
+      g     = (hi >> i) & 1;
+      s |= (hi & ((1L << i) - 1)) != 0;
+      hbits |= hi >> (i + 1);
+    }
+  else
+    {
+      g = 0;
+      s |= hi != 0;
+    }
+  if(R == std::round_to_nearest)
+#if HALF_ROUND_TIES_TO_EVEN
+    hbits += g & (s | hbits);
+#else
+    hbits += g;
+#endif
+  else if(R == std::round_toward_infinity)
+    hbits += ~(hbits >> 15) & (s | g);
+  else if(R == std::round_toward_neg_infinity)
+    hbits += (hbits >> 15) & (g | s);
+  return hbits;
+}
+
+/// Convert non-IEEE floating point to half-precision.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam T source type (builtin floating point type)
+/// \param value floating point value
+/// \return binary representation of half-precision value
+template <std::float_round_style R, typename T>
+uint16 float2half_impl(T value, ...)
+{
+  uint16 hbits = static_cast<unsigned>(builtin_signbit(value)) << 15;
+  if(value == T())
+    return hbits;
+  if(builtin_isnan(value))
+    return hbits | 0x7FFF;
+  if(builtin_isinf(value))
+    return hbits | 0x7C00;
+  int exp;
+  std::frexp(value, &exp);
+  if(exp > 16)
+    {
+      if(R == std::round_toward_infinity)
+        return hbits | (0x7C00 - (hbits >> 15));
+      else if(R == std::round_toward_neg_infinity)
+        return hbits | (0x7BFF + (hbits >> 15));
+      return hbits | (0x7BFF + (R != std::round_toward_zero));
+    }
+  if(exp < -13)
+    value = std::ldexp(value, 24);
+  else
+    {
+      value = std::ldexp(value, 11 - exp);
+      hbits |= ((exp + 13) << 10);
+    }
+  T ival, frac = std::modf(value, &ival);
+  hbits += static_cast<uint16>(std::abs(static_cast<int>(ival)));
+  if(R == std::round_to_nearest)
+    {
+      frac = std::abs(frac);
+#if HALF_ROUND_TIES_TO_EVEN
+      hbits += (frac > T(0.5)) | ((frac == T(0.5)) & hbits);
+#else
+      hbits += frac >= T(0.5);
+#endif
+    }
+  else if(R == std::round_toward_infinity)
+    hbits += frac > T();
+  else if(R == std::round_toward_neg_infinity)
+    hbits += frac < T();
+  return hbits;
+}
+
+/// Convert floating point to half-precision.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam T source type (builtin floating point type)
+/// \param value floating point value
+/// \return binary representation of half-precision value
+template <std::float_round_style R, typename T>
+uint16 float2half(T value)
+{
+  return float2half_impl<R>(value, bool_type < std::numeric_limits<T>::is_iec559 && sizeof(typename bits<T>::type) == sizeof(T) > ());
+}
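+
+// Usage sketch (illustrative only, not part of the library; assumes the enclosing detail namespace is in scope and an
+// IEEE-754 single-precision float type):
+//
+//   uint16 one = float2half<std::round_to_nearest>(1.0f);   // 0x3C00
+//   uint16 two = float2half<std::round_to_nearest>(-2.0f);  // 0xC000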
+
+/// Convert integer to half-precision floating point.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam S `true` if value negative, `false` otherwise
+/// \tparam T type to convert (builtin integer type)
+/// \param value non-negative integral value
+/// \return binary representation of half-precision value
+template <std::float_round_style R, bool S, typename T>
+uint16 int2half_impl(T value)
+{
+#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
+  static_assert(std::is_integral<T>::value, "int to half conversion only supports builtin integer types");
+#endif
+  if(S)
+    value = -value;
+  uint16 bits = S << 15;
+  if(value > 0xFFFF)
+    {
+      if(R == std::round_toward_infinity)
+        bits |= 0x7C00 - S;
+      else if(R == std::round_toward_neg_infinity)
+        bits |= 0x7BFF + S;
+      else
+        bits |= 0x7BFF + (R != std::round_toward_zero);
+    }
+  else if(value)
+    {
+      unsigned int m = value, exp = 24;
+      for(; m < 0x400; m <<= 1, --exp)
+        ;
+      for(; m > 0x7FF; m >>= 1, ++exp)
+        ;
+      bits |= (exp << 10) + m;
+      if(exp > 24)
+        {
+          if(R == std::round_to_nearest)
+            bits += (value >> (exp - 25)) & 1
+#if HALF_ROUND_TIES_TO_EVEN
+                    & (((((1 << (exp - 25)) - 1) & value) != 0) | bits)
+#endif
+                ;
+          else if(R == std::round_toward_infinity)
+            bits += ((value & ((1 << (exp - 24)) - 1)) != 0) & !S;
+          else if(R == std::round_toward_neg_infinity)
+            bits += ((value & ((1 << (exp - 24)) - 1)) != 0) & S;
+        }
+    }
+  return bits;
+}
+
+/// Convert integer to half-precision floating point.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam T type to convert (builtin integer type)
+/// \param value integral value
+/// \return binary representation of half-precision value
+template <std::float_round_style R, typename T>
+uint16 int2half(T value)
+{
+  return (value < 0) ? int2half_impl<R, true>(value) : int2half_impl<R, false>(value);
+}
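+
+// Illustrative examples (not part of the library) for the integer-to-half conversion above, assuming the enclosing
+// detail namespace is in scope:
+//
+//   uint16 a = int2half<std::round_toward_zero>(1);       // 0x3C00 (exactly 1.0)
+//   uint16 b = int2half<std::round_toward_zero>(100000);  // 0x7BFF, clamped to the largest finite half (65504)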
+
+/// Convert half-precision to IEEE single-precision.
+/// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf).
+/// \param value binary representation of half-precision value
+/// \return single-precision value
+inline float half2float_impl(uint16 value, float, true_type)
+{
+  typedef bits<float>::type uint32;
+  /*			uint32 bits = static_cast<uint32>(value&0x8000) << 16;
+                          int abs = value & 0x7FFF;
+                          if(abs)
+                          {
+                                  bits |= 0x38000000 << static_cast<unsigned>(abs>=0x7C00);
+                                  for(; abs<0x400; abs<<=1,bits-=0x800000) ;
+                                  bits += static_cast<uint32>(abs) << 13;
+                          }
+  */
+  static const uint32 mantissa_table[2048] = {
+      0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000,
+      0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000, 0x35800000, 0x35880000, 0x35900000, 0x35980000,
+      0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000,
+      0x35F00000, 0x35F80000, 0x36000000, 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000,
+      0x36200000, 0x36240000, 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000, 0x36400000, 0x36440000,
+      0x36480000, 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000,
+      0x36700000, 0x36740000, 0x36780000, 0x367C0000, 0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, 0x368A0000,
+      0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, 0x369C0000, 0x369E0000,
+      0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000,
+      0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000, 0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000,
+      0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000,
+      0x36DC0000, 0x36DE0000, 0x36E00000, 0x36E20000, 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000,
+      0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000, 0x37000000, 0x37010000,
+      0x37020000, 0x37030000, 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000,
+      0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000, 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000,
+      0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000,
+      0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, 0x37280000, 0x37290000,
+      0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000, 0x37300000, 0x37310000, 0x37320000, 0x37330000,
+      0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000,
+      0x373E0000, 0x373F0000, 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000,
+      0x37480000, 0x37490000, 0x374A0000, 0x374B0000, 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000, 0x37500000, 0x37510000,
+      0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000,
+      0x375C0000, 0x375D0000, 0x375E0000, 0x375F0000, 0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000,
+      0x37660000, 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000,
+      0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, 0x37790000,
+      0x377A0000, 0x377B0000, 0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000, 0x37800000, 0x37808000, 0x37810000, 0x37818000,
+      0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000,
+      0x37870000, 0x37878000, 0x37880000, 0x37888000, 0x37890000, 0x37898000, 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000,
+      0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, 0x378F8000, 0x37900000, 0x37908000,
+      0x37910000, 0x37918000, 0x37920000, 0x37928000, 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000,
+      0x37960000, 0x37968000, 0x37970000, 0x37978000, 0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000,
+      0x379B0000, 0x379B8000, 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000,
+      0x37A00000, 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000,
+      0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000, 0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000,
+      0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, 0x37AE0000, 0x37AE8000,
+      0x37AF0000, 0x37AF8000, 0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, 0x37B28000, 0x37B30000, 0x37B38000,
+      0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, 0x37B70000, 0x37B78000, 0x37B80000, 0x37B88000,
+      0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000,
+      0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000, 0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000,
+      0x37C30000, 0x37C38000, 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000,
+      0x37C80000, 0x37C88000, 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000,
+      0x37CD0000, 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000, 0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000,
+      0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, 0x37D68000,
+      0x37D70000, 0x37D78000, 0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, 0x37DB0000, 0x37DB8000,
+      0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000, 0x37E00000, 0x37E08000,
+      0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000,
+      0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000, 0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000,
+      0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000,
+      0x37F00000, 0x37F08000, 0x37F10000, 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000,
+      0x37F50000, 0x37F58000, 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000, 0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000,
+      0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000,
+      0x37FF0000, 0x37FF8000, 0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, 0x3801C000,
+      0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000, 0x38040000, 0x38044000,
+      0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, 0x38064000, 0x38068000, 0x3806C000,
+      0x38070000, 0x38074000, 0x38078000, 0x3807C000, 0x38080000, 0x38084000, 0x38088000, 0x3808C000, 0x38090000, 0x38094000,
+      0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000,
+      0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000,
+      0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, 0x380F8000, 0x380FC000, 0x38100000, 0x38104000, 0x38108000, 0x3810C000,
+      0x38110000, 0x38114000, 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000,
+      0x38138000, 0x3813C000, 0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000,
+      0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000, 0x38180000, 0x38184000,
+      0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, 0x381A8000, 0x381AC000,
+      0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000, 0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000,
+      0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000,
+      0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000,
+      0x38228000, 0x3822C000, 0x38230000, 0x38234000, 0x38238000, 0x3823C000, 0x38240000, 0x38244000, 0x38248000, 0x3824C000,
+      0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000,
+      0x38278000, 0x3827C000, 0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000,
+      0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000, 0x382C0000, 0x382C4000,
+      0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000,
+      0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000, 0x38300000, 0x38304000, 0x38308000, 0x3830C000, 0x38310000, 0x38314000,
+      0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000,
+      0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, 0x38358000, 0x3835C000, 0x38360000, 0x38364000,
+      0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, 0x3837C000, 0x38380000, 0x38384000, 0x38388000, 0x3838C000,
+      0x38390000, 0x38394000, 0x38398000, 0x3839C000, 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000,
+      0x383B8000, 0x383BC000, 0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000,
+      0x383E0000, 0x383E4000, 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000, 0x38400000, 0x38404000,
+      0x38408000, 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000,
+      0x38430000, 0x38434000, 0x38438000, 0x3843C000, 0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, 0x38454000,
+      0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, 0x38478000, 0x3847C000,
+      0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000,
+      0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000, 0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000,
+      0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000,
+      0x384F8000, 0x384FC000, 0x38500000, 0x38504000, 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000,
+      0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000, 0x38540000, 0x38544000,
+      0x38548000, 0x3854C000, 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000,
+      0x38570000, 0x38574000, 0x38578000, 0x3857C000, 0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000,
+      0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000,
+      0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, 0x385E0000, 0x385E4000,
+      0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000, 0x38600000, 0x38604000, 0x38608000, 0x3860C000,
+      0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, 0x38628000, 0x3862C000, 0x38630000, 0x38634000,
+      0x38638000, 0x3863C000, 0x38640000, 0x38644000, 0x38648000, 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000,
+      0x38660000, 0x38664000, 0x38668000, 0x3866C000, 0x38670000, 0x38674000, 0x38678000, 0x3867C000, 0x38680000, 0x38684000,
+      0x38688000, 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000,
+      0x386B0000, 0x386B4000, 0x386B8000, 0x386BC000, 0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000,
+      0x386D8000, 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000,
+      0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, 0x38724000,
+      0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000, 0x38740000, 0x38744000, 0x38748000, 0x3874C000,
+      0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, 0x3876C000, 0x38770000, 0x38774000,
+      0x38778000, 0x3877C000, 0x38780000, 0x38784000, 0x38788000, 0x3878C000, 0x38790000, 0x38794000, 0x38798000, 0x3879C000,
+      0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000, 0x387C0000, 0x387C4000,
+      0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000,
+      0x387F0000, 0x387F4000, 0x387F8000, 0x387FC000, 0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000,
+      0x3800C000, 0x3800E000, 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000,
+      0x38020000, 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000,
+      0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000, 0x38040000, 0x38042000, 0x38044000, 0x38046000,
+      0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, 0x38058000, 0x3805A000,
+      0x3805C000, 0x3805E000, 0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, 0x3806A000, 0x3806C000, 0x3806E000,
+      0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, 0x3807C000, 0x3807E000, 0x38080000, 0x38082000,
+      0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000,
+      0x38098000, 0x3809A000, 0x3809C000, 0x3809E000, 0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000,
+      0x380AC000, 0x380AE000, 0x380B0000, 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000,
+      0x380C0000, 0x380C2000, 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000,
+      0x380D4000, 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000, 0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000,
+      0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, 0x380FA000,
+      0x380FC000, 0x380FE000, 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, 0x3810C000, 0x3810E000,
+      0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000, 0x38120000, 0x38122000,
+      0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, 0x38130000, 0x38132000, 0x38134000, 0x38136000,
+      0x38138000, 0x3813A000, 0x3813C000, 0x3813E000, 0x38140000, 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000,
+      0x3814C000, 0x3814E000, 0x38150000, 0x38152000, 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000,
+      0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000,
+      0x38174000, 0x38176000, 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000, 0x38180000, 0x38182000, 0x38184000, 0x38186000,
+      0x38188000, 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000,
+      0x3819C000, 0x3819E000, 0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, 0x381AE000,
+      0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000, 0x381C0000, 0x381C2000,
+      0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000,
+      0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000, 0x381E0000, 0x381E2000, 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000,
+      0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000,
+      0x38200000, 0x38202000, 0x38204000, 0x38206000, 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000,
+      0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, 0x3821E000, 0x38220000, 0x38222000, 0x38224000, 0x38226000,
+      0x38228000, 0x3822A000, 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000,
+      0x3823C000, 0x3823E000, 0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000,
+      0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000, 0x38260000, 0x38262000,
+      0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, 0x38274000, 0x38276000,
+      0x38278000, 0x3827A000, 0x3827C000, 0x3827E000, 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000,
+      0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000,
+      0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000,
+      0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, 0x382BC000, 0x382BE000, 0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000,
+      0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000,
+      0x382DC000, 0x382DE000, 0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000,
+      0x382F0000, 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000, 0x38300000, 0x38302000,
+      0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, 0x38316000,
+      0x38318000, 0x3831A000, 0x3831C000, 0x3831E000, 0x38320000, 0x38322000, 0x38324000, 0x38326000, 0x38328000, 0x3832A000,
+      0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000,
+      0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, 0x3834C000, 0x3834E000, 0x38350000, 0x38352000,
+      0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000, 0x38360000, 0x38362000, 0x38364000, 0x38366000,
+      0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000,
+      0x3837C000, 0x3837E000, 0x38380000, 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000,
+      0x38390000, 0x38392000, 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000, 0x383A0000, 0x383A2000,
+      0x383A4000, 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000,
+      0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000, 0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000,
+      0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, 0x383DC000, 0x383DE000,
+      0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000,
+      0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000, 0x38400000, 0x38402000, 0x38404000, 0x38406000,
+      0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000,
+      0x3841C000, 0x3841E000, 0x38420000, 0x38422000, 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000,
+      0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000, 0x38440000, 0x38442000,
+      0x38444000, 0x38446000, 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000,
+      0x38458000, 0x3845A000, 0x3845C000, 0x3845E000, 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000,
+      0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000,
+      0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, 0x38490000, 0x38492000,
+      0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000, 0x384A0000, 0x384A2000, 0x384A4000, 0x384A6000,
+      0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000,
+      0x384BC000, 0x384BE000, 0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000,
+      0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000, 0x384E0000, 0x384E2000,
+      0x384E4000, 0x384E6000, 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000,
+      0x384F8000, 0x384FA000, 0x384FC000, 0x384FE000, 0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000,
+      0x3850C000, 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000,
+      0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, 0x38532000,
+      0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000, 0x38540000, 0x38542000, 0x38544000, 0x38546000,
+      0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855A000,
+      0x3855C000, 0x3855E000, 0x38560000, 0x38562000, 0x38564000, 0x38566000, 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000,
+      0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000, 0x38580000, 0x38582000,
+      0x38584000, 0x38586000, 0x38588000, 0x3858A000, 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000,
+      0x38598000, 0x3859A000, 0x3859C000, 0x3859E000, 0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000,
+      0x385AC000, 0x385AE000, 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000,
+      0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000,
+      0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000, 0x385E0000, 0x385E2000, 0x385E4000, 0x385E6000,
+      0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, 0x385F8000, 0x385FA000,
+      0x385FC000, 0x385FE000, 0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, 0x3860E000,
+      0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, 0x3861C000, 0x3861E000, 0x38620000, 0x38622000,
+      0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000,
+      0x38638000, 0x3863A000, 0x3863C000, 0x3863E000, 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000,
+      0x3864C000, 0x3864E000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000,
+      0x38660000, 0x38662000, 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000,
+      0x38674000, 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000, 0x38680000, 0x38682000, 0x38684000, 0x38686000,
+      0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, 0x3869A000,
+      0x3869C000, 0x3869E000, 0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, 0x386AC000, 0x386AE000,
+      0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000, 0x386C0000, 0x386C2000,
+      0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000,
+      0x386D8000, 0x386DA000, 0x386DC000, 0x386DE000, 0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000,
+      0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000,
+      0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000,
+      0x38714000, 0x38716000, 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000, 0x38720000, 0x38722000, 0x38724000, 0x38726000,
+      0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000,
+      0x3873C000, 0x3873E000, 0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, 0x3874E000,
+      0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000, 0x38760000, 0x38762000,
+      0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, 0x38772000, 0x38774000, 0x38776000,
+      0x38778000, 0x3877A000, 0x3877C000, 0x3877E000, 0x38780000, 0x38782000, 0x38784000, 0x38786000, 0x38788000, 0x3878A000,
+      0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000,
+      0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000,
+      0x387B4000, 0x387B6000, 0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000, 0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000,
+      0x387C8000, 0x387CA000, 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000,
+      0x387DC000, 0x387DE000, 0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000,
+      0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000};
+  static const uint32 exponent_table[64] = {
+      0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, 0x03000000, 0x03800000, 0x04000000, 0x04800000,
+      0x05000000, 0x05800000, 0x06000000, 0x06800000, 0x07000000, 0x07800000, 0x08000000, 0x08800000, 0x09000000, 0x09800000,
+      0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000,
+      0x0F000000, 0x47800000, 0x80000000, 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000,
+      0x84000000, 0x84800000, 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, 0x88000000, 0x88800000,
+      0x89000000, 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000,
+      0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000};
+  static const unsigned short offset_table[64] = {0,    1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024,
+                                                  1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024,
+                                                  1024, 1024, 1024, 1024, 1024, 1024, 0,    1024, 1024, 1024, 1024, 1024, 1024,
+                                                  1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024,
+                                                  1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024};
+  uint32 bits = mantissa_table[offset_table[value >> 10] + (value & 0x3FF)] + exponent_table[value >> 10];
+  //			return *reinterpret_cast<float*>(&bits);			//violating strict aliasing!
+  float out;
+  std::memcpy(&out, &bits, sizeof(float));
+  return out;
+}
+
+/// Convert half-precision to IEEE double-precision.
+/// \param value binary representation of half-precision value
+/// \return double-precision value
+inline double half2float_impl(uint16 value, double, true_type)
+{
+  typedef bits<float>::type uint32;
+  typedef bits<double>::type uint64;
+  uint32 hi = static_cast<uint32>(value & 0x8000) << 16;
+  int abs   = value & 0x7FFF;
+  if(abs)
+    {
+      hi |= 0x3F000000 << static_cast<unsigned>(abs >= 0x7C00);
+      for(; abs < 0x400; abs <<= 1, hi -= 0x100000)
+        ;
+      hi += static_cast<uint32>(abs) << 10;
+    }
+  uint64 bits = static_cast<uint64>(hi) << 32;
+  //			return *reinterpret_cast<double*>(&bits);			//violating strict aliasing!
+  double out;
+  std::memcpy(&out, &bits, sizeof(double));
+  return out;
+}
+
+/// Convert half-precision to non-IEEE floating point.
+/// \tparam T type to convert to (builtin floating point type)
+/// \param value binary representation of half-precision value
+/// \return floating point value
+template <typename T>
+T half2float_impl(uint16 value, T, ...)
+{
+  T out;
+  int abs = value & 0x7FFF;
+  if(abs > 0x7C00)
+    out = std::numeric_limits<T>::has_quiet_NaN ? std::numeric_limits<T>::quiet_NaN() : T();
+  else if(abs == 0x7C00)
+    out = std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity() : std::numeric_limits<T>::max();
+  else if(abs > 0x3FF)
+    out = std::ldexp(static_cast<T>((abs & 0x3FF) | 0x400), (abs >> 10) - 25);
+  else
+    out = std::ldexp(static_cast<T>(abs), -24);
+  return (value & 0x8000) ? -out : out;
+}
+
+/// Convert half-precision to floating point.
+/// \tparam T type to convert to (builtin floating point type)
+/// \param value binary representation of half-precision value
+/// \return floating point value
+template <typename T>
+T half2float(uint16 value)
+{
+  return half2float_impl(value, T(),
+                         bool_type < std::numeric_limits<T>::is_iec559 && sizeof(typename bits<T>::type) == sizeof(T) > ());
+}
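+
+// Illustrative examples (not part of the library) for the conversion back to a builtin floating point type, assuming the
+// enclosing detail namespace is in scope:
+//
+//   float  f = half2float<float>(0x3C00);   // 1.0f
+//   double d = half2float<double>(0xC000);  // -2.0
+//   float  i = half2float<float>(0x7C00);   // +infinity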
+
+/// Convert half-precision floating point to integer.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam E `true` for round to even, `false` for round away from zero
+/// \tparam T type to convert to (builtin integer type with at least 16 bits precision, excluding any implicit sign bits)
+/// \param value binary representation of half-precision value
+/// \return integral value
+template <std::float_round_style R, bool E, typename T>
+T half2int_impl(uint16 value)
+{
+#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
+  static_assert(std::is_integral<T>::value, "half to int conversion only supports builtin integer types");
+#endif
+  unsigned int e = value & 0x7FFF;
+  if(e >= 0x7C00)
+    return (value & 0x8000) ? std::numeric_limits<T>::min() : std::numeric_limits<T>::max();
+  if(e < 0x3800)
+    {
+      if(R == std::round_toward_infinity)
+        return T(~(value >> 15) & (e != 0));
+      else if(R == std::round_toward_neg_infinity)
+        return -T(value > 0x8000);
+      return T();
+    }
+  unsigned int m = (value & 0x3FF) | 0x400;
+  e >>= 10;
+  if(e < 25)
+    {
+      if(R == std::round_to_nearest)
+        m += (1 << (24 - e)) - (~(m >> (25 - e)) & E);
+      else if(R == std::round_toward_infinity)
+        m += ((value >> 15) - 1) & ((1 << (25 - e)) - 1U);
+      else if(R == std::round_toward_neg_infinity)
+        m += -(value >> 15) & ((1 << (25 - e)) - 1U);
+      m >>= 25 - e;
+    }
+  else
+    m <<= e - 25;
+  return (value & 0x8000) ? -static_cast<T>(m) : static_cast<T>(m);
+}
+
+/// Convert half-precision floating point to integer.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam T type to convert to (builtin integer type with at least 16 bits precision, excluding any implicit sign bits)
+/// \param value binary representation of half-precision value
+/// \return integral value
+template <std::float_round_style R, typename T>
+T half2int(uint16 value)
+{
+  return half2int_impl<R, HALF_ROUND_TIES_TO_EVEN, T>(value);
+}
+
+/// Convert half-precision floating point to integer using round-to-nearest-away-from-zero.
+/// \tparam T type to convert to (builtin integer type with at least 16 bits precision, excluding any implicit sign bits)
+/// \param value binary representation of half-precision value
+/// \return integral value
+template <typename T>
+T half2int_up(uint16 value)
+{
+  return half2int_impl<std::round_to_nearest, 0, T>(value);
+}
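+
+// Illustrative examples (not part of the library) for the half-to-integer conversions above; 0x3E00 encodes 1.5:
+//
+//   int a = half2int<std::round_toward_zero, int>(0x3E00);  // truncates to 1
+//   int b = half2int_up<int>(0x3E00);                       // nearest, ties away from zero: 2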
+
+/// Round half-precision number to nearest integer value.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \tparam E `true` for round to even, `false` for round away from zero
+/// \param value binary representation of half-precision value
+/// \return half-precision bits for nearest integral value
+template <std::float_round_style R, bool E>
+uint16 round_half_impl(uint16 value)
+{
+  unsigned int e = value & 0x7FFF;
+  uint16 result  = value;
+  if(e < 0x3C00)
+    {
+      result &= 0x8000;
+      if(R == std::round_to_nearest)
+        result |= 0x3C00U & -(e >= (0x3800 + E));
+      else if(R == std::round_toward_infinity)
+        result |= 0x3C00U & -(~(value >> 15) & (e != 0));
+      else if(R == std::round_toward_neg_infinity)
+        result |= 0x3C00U & -(value > 0x8000);
+    }
+  else if(e < 0x6400)
+    {
+      e                 = 25 - (e >> 10);
+      unsigned int mask = (1 << e) - 1;
+      if(R == std::round_to_nearest)
+        result += (1 << (e - 1)) - (~(result >> e) & E);
+      else if(R == std::round_toward_infinity)
+        result += mask & ((value >> 15) - 1);
+      else if(R == std::round_toward_neg_infinity)
+        result += mask & -(value >> 15);
+      result &= ~mask;
+    }
+  return result;
+}
+
+/// Round half-precision number to nearest integer value.
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
+/// \param value binary representation of half-precision value
+/// \return half-precision bits for nearest integral value
+template <std::float_round_style R>
+uint16 round_half(uint16 value)
+{
+  return round_half_impl<R, HALF_ROUND_TIES_TO_EVEN>(value);
+}
+
+/// Round half-precision number to nearest integer value using round-to-nearest-away-from-zero.
+/// \param value binary representation of half-precision value
+/// \return half-precision bits for nearest integral value
+inline uint16 round_half_up(uint16 value) { return round_half_impl<std::round_to_nearest, 0>(value); }
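+
+// Illustrative example (not part of the library): round_half_up(0x3E00) rounds the half value 1.5 to 2.0 and therefore
+// returns 0x4000.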
+/// \}
+
+struct functions;
+template <typename>
+struct unary_specialized;
+template <typename, typename>
+struct binary_specialized;
+template <typename, typename, std::float_round_style>
+struct half_caster;
+}  // namespace detail
+
+/// Half-precision floating point type.
+/// This class implements an IEEE-conformant half-precision floating point type with the usual arithmetic operators and
+/// conversions. It is implicitly convertible to single-precision floating point, so arithmetic expressions and function
+/// calls with mixed-type operands take on the type of the most precise operand. Additionally, all arithmetic operations
+/// (and many mathematical functions) are carried out in single-precision internally. All conversions from single- to
+/// half-precision use the library's default rounding mode, but temporary results inside chained arithmetic expressions
+/// are kept in single-precision as long as possible (while still maintaining a strong half-precision type).
+///
+/// According to the C++98/03 definition, the half type is not a POD type. According to C++11's less strict and extended
+/// definitions, however, it is both a standard-layout type and a trivially copyable type (even if not a POD type), which
+/// means it can be copied via raw binary copies in a standard-conformant way. A few words about the actual size of the
+/// type: although half represents an IEEE 16-bit type, it does not necessarily have to be exactly 16 bits in size. On any
+/// reasonable implementation, though, the binary representation of this type will most probably not involve any additional
+/// "magic" or padding beyond the plain binary representation of the underlying 16-bit IEEE number, even if this is not
+/// strictly guaranteed by the standard. Even then the type only occupies exactly 16 bits if the C++ implementation
+/// provides an unsigned integer type of exactly 16 bits width, which should be the case on nearly any reasonable platform.
+///
+/// So unless your C++ implementation is truly exotic or imposes special alignment requirements, it is a reasonable
+/// assumption that the data of a half is just the 2 bytes of the underlying IEEE representation.
+class half
+{
+  friend struct detail::functions;
+  friend struct detail::unary_specialized<half>;
+  friend struct detail::binary_specialized<half, half>;
+  template <typename, typename, std::float_round_style>
+  friend struct detail::half_caster;
+  friend class std::numeric_limits<half>;
+#if HALF_ENABLE_CPP11_HASH
+  friend struct std::hash<half>;
+#endif
+#if HALF_ENABLE_CPP11_USER_LITERALS
+  friend half literal::operator""_h(long double);
+#endif
+
+ public:
+  /// Default constructor.
+  /// This initializes the half to 0. Although this does not match the builtin types' default-initialization semantics
+  /// and may be less efficient than no initialization, it is needed to provide proper value-initialization semantics.
+  HALF_CONSTEXPR half() HALF_NOEXCEPT : data_() {}
+
+  /// Conversion constructor.
+  /// \param rhs half expression to convert from
+  half(detail::expr rhs) : data_(detail::float2half<round_style>(static_cast<float>(rhs))) {}
+
+  /// Conversion constructor.
+  /// \param rhs float to convert
+  explicit half(float rhs) : data_(detail::float2half<round_style>(rhs)) {}
+
+  /// Conversion to single-precision.
+  /// \return single precision value representing expression value
+  operator float() const { return detail::half2float<float>(data_); }
+
+  /// Assignment operator.
+  /// \param rhs half expression to assign from
+  /// \return reference to this half
+  half &operator=(detail::expr rhs) { return *this = static_cast<float>(rhs); }
+
+  /// Arithmetic assignment.
+  /// \tparam T type of concrete half expression
+  /// \param rhs half expression to add
+  /// \return reference to this half
+  template <typename T>
+  typename detail::enable<half &, T>::type operator+=(T rhs)
+  {
+    return *this += static_cast<float>(rhs);
+  }
+
+  /// Arithmetic assignment.
+  /// \tparam T type of concrete half expression
+  /// \param rhs half expression to subtract
+  /// \return reference to this half
+  template <typename T>
+  typename detail::enable<half &, T>::type operator-=(T rhs)
+  {
+    return *this -= static_cast<float>(rhs);
+  }
+
+  /// Arithmetic assignment.
+  /// \tparam T type of concrete half expression
+  /// \param rhs half expression to multiply with
+  /// \return reference to this half
+  template <typename T>
+  typename detail::enable<half &, T>::type operator*=(T rhs)
+  {
+    return *this *= static_cast<float>(rhs);
+  }
+
+  /// Arithmetic assignment.
+  /// \tparam T type of concrete half expression
+  /// \param rhs half expression to divide by
+  /// \return reference to this half
+  template <typename T>
+  typename detail::enable<half &, T>::type operator/=(T rhs)
+  {
+    return *this /= static_cast<float>(rhs);
+  }
+
+  /// Assignment operator.
+  /// \param rhs single-precision value to copy from
+  /// \return reference to this half
+  half &operator=(float rhs)
+  {
+    data_ = detail::float2half<round_style>(rhs);
+    return *this;
+  }
+
+  /// Arithmetic assignment.
+  /// \param rhs single-precision value to add
+  /// \return reference to this half
+  half &operator+=(float rhs)
+  {
+    data_ = detail::float2half<round_style>(detail::half2float<float>(data_) + rhs);
+    return *this;
+  }
+
+  /// Arithmetic assignment.
+  /// \param rhs single-precision value to subtract
+  /// \return reference to this half
+  half &operator-=(float rhs)
+  {
+    data_ = detail::float2half<round_style>(detail::half2float<float>(data_) - rhs);
+    return *this;
+  }
+
+  /// Arithmetic assignment.
+  /// \param rhs single-precision value to multiply with
+  /// \return reference to this half
+  half &operator*=(float rhs)
+  {
+    data_ = detail::float2half<round_style>(detail::half2float<float>(data_) * rhs);
+    return *this;
+  }
+
+  /// Arithmetic assignment.
+  /// \param rhs single-precision value to divide by
+  /// \return reference to this half
+  half &operator/=(float rhs)
+  {
+    data_ = detail::float2half<round_style>(detail::half2float<float>(data_) / rhs);
+    return *this;
+  }
+
+  /// Prefix increment.
+  /// \return incremented half value
+  half &operator++() { return *this += 1.0f; }
+
+  /// Prefix decrement.
+  /// \return decremented half value
+  half &operator--() { return *this -= 1.0f; }
+
+  /// Postfix increment.
+  /// \return non-incremented half value
+  half operator++(int)
+  {
+    half out(*this);
+    ++*this;
+    return out;
+  }
+
+  /// Postfix decrement.
+  /// \return non-decremented half value
+  half operator--(int)
+  {
+    half out(*this);
+    --*this;
+    return out;
+  }
+
+ private:
+  /// Rounding mode to use
+  static const std::float_round_style round_style = (std::float_round_style)(HALF_ROUND_STYLE);
+
+  /// Constructor.
+  /// \param bits binary representation to set half to
+  HALF_CONSTEXPR half(detail::binary_t, detail::uint16 bits) HALF_NOEXCEPT : data_(bits) {}
+
+  /// Internal binary representation
+  detail::uint16 data_;
+};
+
+#if HALF_ENABLE_CPP11_USER_LITERALS
+namespace literal
+{
+/// Half literal.
+/// While this returns an actual half-precision value, half literals can unfortunately not be constant expressions due
+/// to rather involved conversions.
+/// \param value literal value
+/// \return half with given value (if representable)
+inline half operator""_h(long double value) { return half(detail::binary, detail::float2half<half::round_style>(value)); }
+}  // namespace literal
+#endif
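+
+// Usage sketch (illustrative only, not part of the library; assumes the enclosing library namespace is in scope):
+//
+//   half a(3.5f), b(1.25f);
+//   float sum = a + b;   // arithmetic on halves is carried out in single precision internally
+//   half  c(sum);        // converting back rounds with the library's default rounding mode
+//   using namespace literal;
+//   half  d = 0.5_h;     // user-defined literal (only available with HALF_ENABLE_CPP11_USER_LITERALS)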
+
+namespace detail
+{
+/// Wrapper implementing unspecialized half-precision functions.
+struct functions
+{
+  /// Addition implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \return Half-precision sum stored in single-precision
+  static expr plus(float x, float y) { return expr(x + y); }
+
+  /// Subtraction implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \return Half-precision difference stored in single-precision
+  static expr minus(float x, float y) { return expr(x - y); }
+
+  /// Multiplication implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \return Half-precision product stored in single-precision
+  static expr multiplies(float x, float y) { return expr(x * y); }
+
+  /// Division implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \return Half-precision quotient stored in single-precision
+  static expr divides(float x, float y) { return expr(x / y); }
+
+  /// Output implementation.
+  /// \param out stream to write to
+  /// \param arg value to write
+  /// \return reference to stream
+  template <typename charT, typename traits>
+  static std::basic_ostream<charT, traits> &write(std::basic_ostream<charT, traits> &out, float arg)
+  {
+    return out << arg;
+  }
+
+  /// Input implementation.
+  /// \param in stream to read from
+  /// \param arg half to read into
+  /// \return reference to stream
+  template <typename charT, typename traits>
+  static std::basic_istream<charT, traits> &read(std::basic_istream<charT, traits> &in, half &arg)
+  {
+    float f;
+    if(in >> f)
+      arg = f;
+    return in;
+  }
+
+  /// Modulo implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \return Half-precision division remainder stored in single-precision
+  static expr fmod(float x, float y) { return expr(std::fmod(x, y)); }
+
+  /// Remainder implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \return Half-precision division remainder stored in single-precision
+  static expr remainder(float x, float y)
+  {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::remainder(x, y));
+#else
+    if(builtin_isnan(x) || builtin_isnan(y))
+      return expr(std::numeric_limits<float>::quiet_NaN());
+    float ax = std::fabs(x), ay = std::fabs(y);
+    if(ax >= 65536.0f || ay < std::ldexp(1.0f, -24))
+      return expr(std::numeric_limits<float>::quiet_NaN());
+    if(ay >= 65536.0f)
+      return expr(x);
+    if(ax == ay)
+      return expr(builtin_signbit(x) ? -0.0f : 0.0f);
+    ax       = std::fmod(ax, ay + ay);
+    float y2 = 0.5f * ay;
+    if(ax > y2)
+      {
+        ax -= ay;
+        if(ax >= y2)
+          ax -= ay;
+      }
+    return expr(builtin_signbit(x) ? -ax : ax);
+#endif
+  }
+
+  /// Remainder implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \param quo address to store quotient bits at
+  /// \return Half-precision division remainder stored in single-precision
+  static expr remquo(float x, float y, int *quo)
+  {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::remquo(x, y, quo));
+#else
+    if(builtin_isnan(x) || builtin_isnan(y))
+      return expr(std::numeric_limits<float>::quiet_NaN());
+    bool sign = builtin_signbit(x), qsign = static_cast<bool>(sign ^ builtin_signbit(y));
+    float ax = std::fabs(x), ay = std::fabs(y);
+    if(ax >= 65536.0f || ay < std::ldexp(1.0f, -24))
+      return expr(std::numeric_limits<float>::quiet_NaN());
+    if(ay >= 65536.0f)
+      return expr(x);
+    if(ax == ay)
+      return *quo = qsign ? -1 : 1, expr(sign ? -0.0f : 0.0f);
+    ax       = std::fmod(ax, 8.0f * ay);
+    int cquo = 0;
+    if(ax >= 4.0f * ay)
+      {
+        ax -= 4.0f * ay;
+        cquo += 4;
+      }
+    if(ax >= 2.0f * ay)
+      {
+        ax -= 2.0f * ay;
+        cquo += 2;
+      }
+    float y2 = 0.5f * ay;
+    if(ax > y2)
+      {
+        ax -= ay;
+        ++cquo;
+        if(ax >= y2)
+          {
+            ax -= ay;
+            ++cquo;
+          }
+      }
+    return *quo = qsign ? -cquo : cquo, expr(sign ? -ax : ax);
+#endif
+  }
+
+  /// Positive difference implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \return Positive difference stored in single-precision
+  static expr fdim(float x, float y)
+  {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::fdim(x, y));
+#else
+    return expr((x <= y) ? 0.0f : (x - y));
+#endif
+  }
+
+  /// Fused multiply-add implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \param z third operand
+  /// \return \a x * \a y + \a z stored in single-precision
+  static expr fma(float x, float y, float z)
+  {
+#if HALF_ENABLE_CPP11_CMATH && defined(FP_FAST_FMAF)
+    return expr(std::fma(x, y, z));
+#else
+    return expr(x * y + z);
+#endif
+  }
+
+  /// Get NaN.
+  /// \return Half-precision quiet NaN
+  static half nanh() { return half(binary, 0x7FFF); }
+
+  /// Exponential implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr exp(float arg) { return expr(std::exp(arg)); }
+
+  /// Exponential implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr expm1(float arg)
+  {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::expm1(arg));
+#else
+    return expr(static_cast<float>(std::exp(static_cast<double>(arg)) - 1.0));
+#endif
+  }
+
+  /// Binary exponential implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr exp2(float arg)
+  {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::exp2(arg));
+#else
+    return expr(static_cast<float>(std::exp(arg * 0.69314718055994530941723212145818)));
+#endif
+  }
+
+  /// Logarithm implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr log(float arg) { return expr(std::log(arg)); }
+
+  /// Common logarithm implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr log10(float arg) { return expr(std::log10(arg)); }
+
+  /// Logarithm implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr log1p(float arg)
+  {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::log1p(arg));
+#else
+    return expr(static_cast<float>(std::log(1.0 + arg)));
+#endif
+  }
+
+  /// Binary logarithm implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr log2(float arg)
+  {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::log2(arg));
+#else
+    return expr(static_cast<float>(std::log(static_cast<double>(arg)) * 1.4426950408889634073599246810019));
+#endif
+  }
+
+  /// Square root implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr sqrt(float arg) { return expr(std::sqrt(arg)); }
+
+  /// Cubic root implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr cbrt(float arg)
+  {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::cbrt(arg));
+#else
+    if(builtin_isnan(arg) || builtin_isinf(arg))
+      return expr(arg);
+    return expr(builtin_signbit(arg) ? -static_cast<float>(std::pow(-static_cast<double>(arg), 1.0 / 3.0))
+                                     : static_cast<float>(std::pow(static_cast<double>(arg), 1.0 / 3.0)));
+#endif
+  }
+
+  /// Hypotenuse implementation.
+  /// \param x first argument
+  /// \param y second argument
+  /// \return function value stored in single-precision
+  static expr hypot(float x, float y)
+  {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::hypot(x, y));
+#else
+    return expr((builtin_isinf(x) || builtin_isinf(y))
+                    ? std::numeric_limits<float>::infinity()
+                    : static_cast<float>(std::sqrt(static_cast<double>(x) * x + static_cast<double>(y) * y)));
+#endif
+  }
+
+  /// Power implementation.
+  /// \param base value to exponentiate
+  /// \param exp power to exponentiate to
+  /// \return function value stored in single-precision
+  static expr pow(float base, float exp) { return expr(std::pow(base, exp)); }
+
+  /// Sine implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr sin(float arg) { return expr(std::sin(arg)); }
+
+  /// Cosine implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr cos(float arg) { return expr(std::cos(arg)); }
+
+  /// Tangent implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr tan(float arg) { return expr(std::tan(arg)); }
+
+  /// Arc sine implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr asin(float arg) { return expr(std::asin(arg)); }
+
+  /// Arc cosine implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr acos(float arg) { return expr(std::acos(arg)); }
+
+  /// Arc tangent implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr atan(float arg) { return expr(std::atan(arg)); }
+
+  /// Arc tangent implementation.
+  /// \param x first argument
+  /// \param y second argument
+  /// \return function value stored in single-precision
+  static expr atan2(float x, float y) { return expr(std::atan2(x, y)); }
+
+  /// Hyperbolic sine implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr sinh(float arg) { return expr(std::sinh(arg)); }
+
+  /// Hyperbolic cosine implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr cosh(float arg) { return expr(std::cosh(arg)); }
+
+  /// Hyperbolic tangent implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr tanh(float arg) { return expr(std::tanh(arg)); }
+
+  /// Hyperbolic area sine implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr asinh(float arg)
+  {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::asinh(arg));
+#else
+    return expr((arg == -std::numeric_limits<float>::infinity()) ? arg
+                                                                 : static_cast<float>(std::log(arg + std::sqrt(arg * arg + 1.0))));
+#endif
+  }
+
+  /// Hyperbolic area cosine implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr acosh(float arg)
+  {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::acosh(arg));
+#else
+    return expr((arg < -1.0f) ? std::numeric_limits<float>::quiet_NaN()
+                              : static_cast<float>(std::log(arg + std::sqrt(arg * arg - 1.0))));
+#endif
+  }
+
+  /// Hyperbolic area tangent implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr atanh(float arg)
+  {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::atanh(arg));
+#else
+    return expr(static_cast<float>(0.5 * std::log((1.0 + arg) / (1.0 - arg))));
+#endif
+  }
+
+  /// Error function implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr erf(float arg)
+  {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::erf(arg));
+#else
+    return expr(static_cast<float>(erf(static_cast<double>(arg))));
+#endif
+  }
+
+  /// Complementary error function implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr erfc(float arg)
+  {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::erfc(arg));
+#else
+    return expr(static_cast<float>(1.0 - erf(static_cast<double>(arg))));
+#endif
+  }
+
+  /// Gamma logarithm implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr lgamma(float arg)
+  {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::lgamma(arg));
+#else
+    if(builtin_isinf(arg))
+      return expr(std::numeric_limits<float>::infinity());
+    if(arg < 0.0f)
+      {
+        float i, f = std::modf(-arg, &i);
+        if(f == 0.0f)
+          return expr(std::numeric_limits<float>::infinity());
+        return expr(static_cast<float>(1.1447298858494001741434273513531 -
+                                       std::log(std::abs(std::sin(3.1415926535897932384626433832795 * f))) - lgamma(1.0 - arg)));
+      }
+    return expr(static_cast<float>(lgamma(static_cast<double>(arg))));
+#endif
+  }
+
+  /// Gamma implementation.
+  /// \param arg function argument
+  /// \return function value stored in single-precision
+  static expr tgamma(float arg)
+  {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::tgamma(arg));
+#else
+    if(arg == 0.0f)
+      return builtin_signbit(arg) ? expr(-std::numeric_limits<float>::infinity()) : expr(std::numeric_limits<float>::infinity());
+    if(arg < 0.0f)
+      {
+        float i, f = std::modf(-arg, &i);
+        if(f == 0.0f)
+          return expr(std::numeric_limits<float>::quiet_NaN());
+        double value =
+            3.1415926535897932384626433832795 / (std::sin(3.1415926535897932384626433832795 * f) * std::exp(lgamma(1.0 - arg)));
+        return expr(static_cast<float>((std::fmod(i, 2.0f) == 0.0f) ? -value : value));
+      }
+    if(builtin_isinf(arg))
+      return expr(arg);
+    return expr(static_cast<float>(std::exp(lgamma(static_cast<double>(arg)))));
+#endif
+  }
+
+  /// Floor implementation.
+  /// \param arg value to round
+  /// \return rounded value
+  static half floor(half arg) { return half(binary, round_half<std::round_toward_neg_infinity>(arg.data_)); }
+
+  /// Ceiling implementation.
+  /// \param arg value to round
+  /// \return rounded value
+  static half ceil(half arg) { return half(binary, round_half<std::round_toward_infinity>(arg.data_)); }
+
+  /// Truncation implementation.
+  /// \param arg value to round
+  /// \return rounded value
+  static half trunc(half arg) { return half(binary, round_half<std::round_toward_zero>(arg.data_)); }
+
+  /// Nearest integer implementation.
+  /// \param arg value to round
+  /// \return rounded value
+  static half round(half arg) { return half(binary, round_half_up(arg.data_)); }
+
+  /// Nearest integer implementation.
+  /// \param arg value to round
+  /// \return rounded value
+  static long lround(half arg) { return detail::half2int_up<long>(arg.data_); }
+
+  /// Nearest integer implementation.
+  /// \param arg value to round
+  /// \return rounded value
+  static half rint(half arg) { return half(binary, round_half<half::round_style>(arg.data_)); }
+
+  /// Nearest integer implementation.
+  /// \param arg value to round
+  /// \return rounded value
+  static long lrint(half arg) { return detail::half2int<half::round_style, long>(arg.data_); }
+
+#if HALF_ENABLE_CPP11_LONG_LONG
+  /// Nearest integer implementation.
+  /// \param arg value to round
+  /// \return rounded value
+  static long long llround(half arg) { return detail::half2int_up<long long>(arg.data_); }
+
+  /// Nearest integer implementation.
+  /// \param arg value to round
+  /// \return rounded value
+  static long long llrint(half arg) { return detail::half2int<half::round_style, long long>(arg.data_); }
+#endif
+
+  /// Decompression implementation.
+  /// \param arg number to decompress
+  /// \param exp address to store exponent at
+  /// \return normalized significand
+  static half frexp(half arg, int *exp)
+  {
+    int m = arg.data_ & 0x7FFF, e = -14;
+    if(m >= 0x7C00 || !m)
+      return *exp = 0, arg;
+    for(; m < 0x400; m <<= 1, --e)
+      ;
+    return *exp = e + (m >> 10), half(binary, (arg.data_ & 0x8000) | 0x3800 | (m & 0x3FF));
+  }
+
+  /// Decompression implementation.
+  /// \param arg number to decompress
+  /// \param iptr address to store integer part at
+  /// \return fractional part
+  static half modf(half arg, half *iptr)
+  {
+    unsigned int e = arg.data_ & 0x7FFF;
+    if(e >= 0x6400)
+      return *iptr = arg, half(binary, arg.data_ & (0x8000U | -(e > 0x7C00)));
+    if(e < 0x3C00)
+      return iptr->data_ = arg.data_ & 0x8000, arg;
+    e >>= 10;
+    unsigned int mask = (1 << (25 - e)) - 1, m = arg.data_ & mask;
+    iptr->data_ = arg.data_ & ~mask;
+    if(!m)
+      return half(binary, arg.data_ & 0x8000);
+    for(; m < 0x400; m <<= 1, --e)
+      ;
+    return half(binary, static_cast<uint16>((arg.data_ & 0x8000) | (e << 10) | (m & 0x3FF)));
+  }
+
+  /// Scaling implementation.
+  /// \param arg number to scale
+  /// \param exp power of two to scale by
+  /// \return scaled number
+  static half scalbln(half arg, long exp)
+  {
+    unsigned int m = arg.data_ & 0x7FFF;
+    if(m >= 0x7C00 || !m)
+      return arg;
+    for(; m < 0x400; m <<= 1, --exp)
+      ;
+    exp += m >> 10;
+    uint16 value = arg.data_ & 0x8000;
+    if(exp > 30)
+      {
+        if(half::round_style == std::round_toward_zero)
+          value |= 0x7BFF;
+        else if(half::round_style == std::round_toward_infinity)
+          value |= 0x7C00 - (value >> 15);
+        else if(half::round_style == std::round_toward_neg_infinity)
+          value |= 0x7BFF + (value >> 15);
+        else
+          value |= 0x7C00;
+      }
+    else if(exp > 0)
+      value |= (exp << 10) | (m & 0x3FF);
+    else if(exp > -11)
+      {
+        m = (m & 0x3FF) | 0x400;
+        if(half::round_style == std::round_to_nearest)
+          {
+            m += 1 << -exp;
+#if HALF_ROUND_TIES_TO_EVEN
+            m -= (m >> (1 - exp)) & 1;
+#endif
+          }
+        else if(half::round_style == std::round_toward_infinity)
+          m += ((value >> 15) - 1) & ((1 << (1 - exp)) - 1U);
+        else if(half::round_style == std::round_toward_neg_infinity)
+          m += -(value >> 15) & ((1 << (1 - exp)) - 1U);
+        value |= m >> (1 - exp);
+      }
+    else if(half::round_style == std::round_toward_infinity)
+      value -= (value >> 15) - 1;
+    else if(half::round_style == std::round_toward_neg_infinity)
+      value += value >> 15;
+    return half(binary, value);
+  }
+
+  /// Exponent implementation.
+  /// \param arg number to query
+  /// \return floating point exponent
+  static int ilogb(half arg)
+  {
+    int abs = arg.data_ & 0x7FFF;
+    if(!abs)
+      return FP_ILOGB0;
+    if(abs < 0x7C00)
+      {
+        int exp = (abs >> 10) - 15;
+        if(abs < 0x400)
+          for(; abs < 0x200; abs <<= 1, --exp)
+            ;
+        return exp;
+      }
+    if(abs > 0x7C00)
+      return FP_ILOGBNAN;
+    return INT_MAX;
+  }
+
+  /// Exponent implementation.
+  /// \param arg number to query
+  /// \return floating point exponent
+  static half logb(half arg)
+  {
+    int abs = arg.data_ & 0x7FFF;
+    if(!abs)
+      return half(binary, 0xFC00);
+    if(abs < 0x7C00)
+      {
+        int exp = (abs >> 10) - 15;
+        if(abs < 0x400)
+          for(; abs < 0x200; abs <<= 1, --exp)
+            ;
+        uint16 bits = (exp < 0) << 15;
+        if(exp)
+          {
+            unsigned int m = std::abs(exp) << 6, e = 18;
+            for(; m < 0x400; m <<= 1, --e)
+              ;
+            bits |= (e << 10) + m;
+          }
+        return half(binary, bits);
+      }
+    if(abs > 0x7C00)
+      return arg;
+    return half(binary, 0x7C00);
+  }
+
+  /// Enumeration implementation.
+  /// \param from number to increase/decrease
+  /// \param to direction to enumerate into
+  /// \return next representable number
+  static half nextafter(half from, half to)
+  {
+    uint16 fabs = from.data_ & 0x7FFF, tabs = to.data_ & 0x7FFF;
+    if(fabs > 0x7C00)
+      return from;
+    if(tabs > 0x7C00 || from.data_ == to.data_ || !(fabs | tabs))
+      return to;
+    if(!fabs)
+      return half(binary, (to.data_ & 0x8000) + 1);
+    bool lt = ((fabs == from.data_) ? static_cast<int>(fabs) : -static_cast<int>(fabs)) <
+              ((tabs == to.data_) ? static_cast<int>(tabs) : -static_cast<int>(tabs));
+    return half(binary, from.data_ + (((from.data_ >> 15) ^ static_cast<unsigned>(lt)) << 1) - 1);
+  }
+
+  /// Enumeration implementation.
+  /// \param from number to increase/decrease
+  /// \param to direction to enumerate into
+  /// \return next representable number
+  static half nexttoward(half from, long double to)
+  {
+    if(isnan(from))
+      return from;
+    long double lfrom = static_cast<long double>(from);
+    if(builtin_isnan(to) || lfrom == to)
+      return half(static_cast<float>(to));
+    if(!(from.data_ & 0x7FFF))
+      return half(binary, (static_cast<detail::uint16>(builtin_signbit(to)) << 15) + 1);
+    return half(binary, from.data_ + (((from.data_ >> 15) ^ static_cast<unsigned>(lfrom < to)) << 1) - 1);
+  }
+
+  /// Sign implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \return composed value
+  static half copysign(half x, half y) { return half(binary, x.data_ ^ ((x.data_ ^ y.data_) & 0x8000)); }
+
+  /// Classification implementation.
+  /// \param arg value to classify
+  /// \return classification of \a arg (FP_ZERO, FP_SUBNORMAL, FP_NORMAL, FP_INFINITE or FP_NAN)
+  static int fpclassify(half arg)
+  {
+    unsigned int abs = arg.data_ & 0x7FFF;
+    return abs ? ((abs > 0x3FF) ? ((abs >= 0x7C00) ? ((abs > 0x7C00) ? FP_NAN : FP_INFINITE) : FP_NORMAL) : FP_SUBNORMAL) : FP_ZERO;
+  }
+
+  /// Classification implementation.
+  /// \param arg value to classify
+  /// \retval true if finite number
+  /// \retval false else
+  static bool isfinite(half arg) { return (arg.data_ & 0x7C00) != 0x7C00; }
+
+  /// Classification implementation.
+  /// \param arg value to classify
+  /// \retval true if infinite number
+  /// \retval false else
+  static bool isinf(half arg) { return (arg.data_ & 0x7FFF) == 0x7C00; }
+
+  /// Classification implementation.
+  /// \param arg value to classify
+  /// \retval true if not a number
+  /// \retval false else
+  static bool isnan(half arg) { return (arg.data_ & 0x7FFF) > 0x7C00; }
+
+  /// Classification implementation.
+  /// \param arg value to classify
+  /// \retval true if normal number
+  /// \retval false else
+  static bool isnormal(half arg) { return ((arg.data_ & 0x7C00) != 0) & ((arg.data_ & 0x7C00) != 0x7C00); }
+
+  /// Sign bit implementation.
+  /// \param arg value to check
+  /// \retval true if signed
+  /// \retval false if unsigned
+  static bool signbit(half arg) { return (arg.data_ & 0x8000) != 0; }
+
+  /// Comparison implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \retval true if operands equal
+  /// \retval false else
+  static bool isequal(half x, half y) { return (x.data_ == y.data_ || !((x.data_ | y.data_) & 0x7FFF)) && !isnan(x); }
+
+  /// Comparison implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \retval true if operands not equal
+  /// \retval false else
+  static bool isnotequal(half x, half y) { return (x.data_ != y.data_ && ((x.data_ | y.data_) & 0x7FFF)) || isnan(x); }
+
+  /// Comparison implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \retval true if \a x > \a y
+  /// \retval false else
+  static bool isgreater(half x, half y)
+  {
+    int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF;
+    return xabs <= 0x7C00 && yabs <= 0x7C00 && (((xabs == x.data_) ? xabs : -xabs) > ((yabs == y.data_) ? yabs : -yabs));
+  }
+
+  /// Comparison implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \retval true if \a x >= \a y
+  /// \retval false else
+  static bool isgreaterequal(half x, half y)
+  {
+    int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF;
+    return xabs <= 0x7C00 && yabs <= 0x7C00 && (((xabs == x.data_) ? xabs : -xabs) >= ((yabs == y.data_) ? yabs : -yabs));
+  }
+
+  /// Comparison implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \retval true if \a x < \a y
+  /// \retval false else
+  static bool isless(half x, half y)
+  {
+    int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF;
+    return xabs <= 0x7C00 && yabs <= 0x7C00 && (((xabs == x.data_) ? xabs : -xabs) < ((yabs == y.data_) ? yabs : -yabs));
+  }
+
+  /// Comparison implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \retval true if \a x <= \a y
+  /// \retval false else
+  static bool islessequal(half x, half y)
+  {
+    int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF;
+    return xabs <= 0x7C00 && yabs <= 0x7C00 && (((xabs == x.data_) ? xabs : -xabs) <= ((yabs == y.data_) ? yabs : -yabs));
+  }
+
+  /// Comparison implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \retval true if either \a x < \a y or \a x > \a y
+  /// \retval false else
+  static bool islessgreater(half x, half y)
+  {
+    int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF;
+    if(xabs > 0x7C00 || yabs > 0x7C00)
+      return false;
+    int a = (xabs == x.data_) ? xabs : -xabs, b = (yabs == y.data_) ? yabs : -yabs;
+    return a < b || a > b;
+  }
+
+  /// Comparison implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \retval true if operands are unordered
+  /// \retval false else
+  static bool isunordered(half x, half y) { return isnan(x) || isnan(y); }
+
+ private:
+  static double erf(double arg)
+  {
+    if(builtin_isinf(arg))
+      return (arg < 0.0) ? -1.0 : 1.0;
+    double x2 = arg * arg, ax2 = 0.147 * x2,
+           value = std::sqrt(1.0 - std::exp(-x2 * (1.2732395447351626861510701069801 + ax2) / (1.0 + ax2)));
+    return builtin_signbit(arg) ? -value : value;
+  }
+
+  static double lgamma(double arg)
+  {
+    double v = 1.0;
+    for(; arg < 8.0; ++arg)
+      v *= arg;
+    double w = 1.0 / (arg * arg);
+    return (((((((-0.02955065359477124183006535947712 * w + 0.00641025641025641025641025641026) * w +
+                 -0.00191752691752691752691752691753) *
+                    w +
+                8.4175084175084175084175084175084e-4) *
+                   w +
+               -5.952380952380952380952380952381e-4) *
+                  w +
+              7.9365079365079365079365079365079e-4) *
+                 w +
+             -0.00277777777777777777777777777778) *
+                w +
+            0.08333333333333333333333333333333) /
+               arg +
+           0.91893853320467274178032973640562 - std::log(v) - arg + (arg - 0.5) * std::log(arg);
+  }
+};
+
+/// Wrapper for unary half-precision functions needing specialization for individual argument types.
+/// \tparam T argument type
+template <typename T>
+struct unary_specialized
+{
+  /// Negation implementation.
+  /// \param arg value to negate
+  /// \return negated value
+  static HALF_CONSTEXPR half negate(half arg) { return half(binary, arg.data_ ^ 0x8000); }
+
+  /// Absolute value implementation.
+  /// \param arg function argument
+  /// \return absolute value
+  static half fabs(half arg) { return half(binary, arg.data_ & 0x7FFF); }
+};
+template <>
+struct unary_specialized<expr>
+{
+  static HALF_CONSTEXPR expr negate(float arg) { return expr(-arg); }
+  static expr fabs(float arg) { return expr(std::fabs(arg)); }
+};
+
+/// Wrapper for binary half-precision functions needing specialization for individual argument types.
+/// \tparam T first argument type
+/// \tparam U second argument type
+template <typename T, typename U>
+struct binary_specialized
+{
+  /// Minimum implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \return minimum value
+  static expr fmin(float x, float y)
+  {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::fmin(x, y));
+#else
+    if(builtin_isnan(x))
+      return expr(y);
+    if(builtin_isnan(y))
+      return expr(x);
+    return expr(std::min(x, y));
+#endif
+  }
+
+  /// Maximum implementation.
+  /// \param x first operand
+  /// \param y second operand
+  /// \return maximum value
+  static expr fmax(float x, float y)
+  {
+#if HALF_ENABLE_CPP11_CMATH
+    return expr(std::fmax(x, y));
+#else
+    if(builtin_isnan(x))
+      return expr(y);
+    if(builtin_isnan(y))
+      return expr(x);
+    return expr(std::max(x, y));
+#endif
+  }
+};
+template <>
+struct binary_specialized<half, half>
+{
+  static half fmin(half x, half y)
+  {
+    int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF;
+    if(xabs > 0x7C00)
+      return y;
+    if(yabs > 0x7C00)
+      return x;
+    return (((xabs == x.data_) ? xabs : -xabs) > ((yabs == y.data_) ? yabs : -yabs)) ? y : x;
+  }
+  static half fmax(half x, half y)
+  {
+    int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF;
+    if(xabs > 0x7C00)
+      return y;
+    if(yabs > 0x7C00)
+      return x;
+    return (((xabs == x.data_) ? xabs : -xabs) < ((yabs == y.data_) ? yabs : -yabs)) ? y : x;
+  }
+};
+
+/// Helper class for half casts.
+/// This class template has to be specialized for all valid combinations of cast argument and destination type in
+/// order to define an appropriate static `cast` member function.
+/// \tparam T destination type
+/// \tparam U source type
+/// \tparam R rounding mode to use
+template <typename T, typename U, std::float_round_style R = (std::float_round_style)(HALF_ROUND_STYLE)>
+struct half_caster
+{
+};
+template <typename U, std::float_round_style R>
+struct half_caster<half, U, R>
+{
+#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
+  static_assert(std::is_arithmetic<U>::value, "half_cast from non-arithmetic type unsupported");
+#endif
+
+  static half cast(U arg) { return cast_impl(arg, is_float<U>()); }
+
+ private:
+  static half cast_impl(U arg, true_type) { return half(binary, float2half<R>(arg)); }
+  static half cast_impl(U arg, false_type) { return half(binary, int2half<R>(arg)); }
+};
+template <typename T, std::float_round_style R>
+struct half_caster<T, half, R>
+{
+#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
+  static_assert(std::is_arithmetic<T>::value, "half_cast to non-arithmetic type unsupported");
+#endif
+
+  static T cast(half arg) { return cast_impl(arg, is_float<T>()); }
+
+ private:
+  static T cast_impl(half arg, true_type) { return half2float<T>(arg.data_); }
+  static T cast_impl(half arg, false_type) { return half2int<R, T>(arg.data_); }
+};
+template <typename T, std::float_round_style R>
+struct half_caster<T, expr, R>
+{
+#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
+  static_assert(std::is_arithmetic<T>::value, "half_cast to non-arithmetic type unsupported");
+#endif
+
+  static T cast(expr arg) { return cast_impl(arg, is_float<T>()); }
+
+ private:
+  static T cast_impl(float arg, true_type) { return static_cast<T>(arg); }
+  static T cast_impl(half arg, false_type) { return half2int<R, T>(arg.data_); }
+};
+template <std::float_round_style R>
+struct half_caster<half, half, R>
+{
+  static half cast(half arg) { return arg; }
+};
+template <std::float_round_style R>
+struct half_caster<half, expr, R> : half_caster<half, half, R>
+{
+};
+
+/// \name Comparison operators
+/// \{
+
+/// Comparison for equality.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if operands equal
+/// \retval false else
+template <typename T, typename U>
+typename enable<bool, T, U>::type operator==(T x, U y)
+{
+  return functions::isequal(x, y);
+}
+
+/// Comparison for inequality.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if operands not equal
+/// \retval false else
+template <typename T, typename U>
+typename enable<bool, T, U>::type operator!=(T x, U y)
+{
+  return functions::isnotequal(x, y);
+}
+
+/// Comparison for less than.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x less than \a y
+/// \retval false else
+template <typename T, typename U>
+typename enable<bool, T, U>::type operator<(T x, U y)
+{
+  return functions::isless(x, y);
+}
+
+/// Comparison for greater than.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x greater than \a y
+/// \retval false else
+template <typename T, typename U>
+typename enable<bool, T, U>::type operator>(T x, U y)
+{
+  return functions::isgreater(x, y);
+}
+
+/// Comparison for less equal.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x less equal \a y
+/// \retval false else
+template <typename T, typename U>
+typename enable<bool, T, U>::type operator<=(T x, U y)
+{
+  return functions::islessequal(x, y);
+}
+
+/// Comparison for greater equal.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x greater equal \a y
+/// \retval false else
+template <typename T, typename U>
+typename enable<bool, T, U>::type operator>=(T x, U y)
+{
+  return functions::isgreaterequal(x, y);
+}
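+// Editorial note (illustrative, not part of the library): like the underlying isequal/isless helpers, these
+// operators treat NaN operands as unordered. With x holding a NaN, (x == x) and (x < x) evaluate to false,
+// while (x != x) evaluates to true.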
+
+/// \}
+/// \name Arithmetic operators
+/// \{
+
+/// Add halfs.
+/// \param x left operand
+/// \param y right operand
+/// \return sum of half expressions
+template <typename T, typename U>
+typename enable<expr, T, U>::type operator+(T x, U y)
+{
+  return functions::plus(x, y);
+}
+
+/// Subtract halfs.
+/// \param x left operand
+/// \param y right operand
+/// \return difference of half expressions
+template <typename T, typename U>
+typename enable<expr, T, U>::type operator-(T x, U y)
+{
+  return functions::minus(x, y);
+}
+
+/// Multiply halfs.
+/// \param x left operand
+/// \param y right operand
+/// \return product of half expressions
+template <typename T, typename U>
+typename enable<expr, T, U>::type operator*(T x, U y)
+{
+  return functions::multiplies(x, y);
+}
+
+/// Divide halfs.
+/// \param x left operand
+/// \param y right operand
+/// \return quotient of half expressions
+template <typename T, typename U>
+typename enable<expr, T, U>::type operator/(T x, U y)
+{
+  return functions::divides(x, y);
+}
+
+/// Identity.
+/// \param arg operand
+/// \return unchanged operand
+template <typename T>
+HALF_CONSTEXPR typename enable<T, T>::type operator+(T arg)
+{
+  return arg;
+}
+
+/// Negation.
+/// \param arg operand
+/// \return negated operand
+template <typename T>
+HALF_CONSTEXPR typename enable<T, T>::type operator-(T arg)
+{
+  return unary_specialized<T>::negate(arg);
+}
+
+/// \}
+/// \name Input and output
+/// \{
+
+/// Output operator.
+/// \param out output stream to write into
+/// \param arg half expression to write
+/// \return reference to output stream
+template <typename T, typename charT, typename traits>
+typename enable<std::basic_ostream<charT, traits> &, T>::type operator<<(std::basic_ostream<charT, traits> &out, T arg)
+{
+  return functions::write(out, arg);
+}
+
+/// Input operator.
+/// \param in input stream to read from
+/// \param arg half to read into
+/// \return reference to input stream
+template <typename charT, typename traits>
+std::basic_istream<charT, traits> &operator>>(std::basic_istream<charT, traits> &in, half &arg)
+{
+  return functions::read(in, arg);
+}
+
+/// \}
+/// \name Basic mathematical operations
+/// \{
+
+/// Absolute value.
+/// \param arg operand
+/// \return absolute value of \a arg
+//		template<typename T> typename enable<T,T>::type abs(T arg) { return unary_specialized<T>::fabs(arg); }
+inline half abs(half arg) { return unary_specialized<half>::fabs(arg); }
+inline expr abs(expr arg) { return unary_specialized<expr>::fabs(arg); }
+
+/// Absolute value.
+/// \param arg operand
+/// \return absolute value of \a arg
+//		template<typename T> typename enable<T,T>::type fabs(T arg) { return unary_specialized<T>::fabs(arg); }
+inline half fabs(half arg) { return unary_specialized<half>::fabs(arg); }
+inline expr fabs(expr arg) { return unary_specialized<expr>::fabs(arg); }
+
+/// Remainder of division.
+/// \param x first operand
+/// \param y second operand
+/// \return remainder of floating point division.
+//		template<typename T,typename U> typename enable<expr,T,U>::type fmod(T x, U y) { return functions::fmod(x, y); }
+inline expr fmod(half x, half y) { return functions::fmod(x, y); }
+inline expr fmod(half x, expr y) { return functions::fmod(x, y); }
+inline expr fmod(expr x, half y) { return functions::fmod(x, y); }
+inline expr fmod(expr x, expr y) { return functions::fmod(x, y); }
+
+/// Remainder of division.
+/// \param x first operand
+/// \param y second operand
+/// \return remainder of floating point division.
+//		template<typename T,typename U> typename enable<expr,T,U>::type remainder(T x, U y) { return functions::remainder(x,
+// y);
+//}
+inline expr remainder(half x, half y) { return functions::remainder(x, y); }
+inline expr remainder(half x, expr y) { return functions::remainder(x, y); }
+inline expr remainder(expr x, half y) { return functions::remainder(x, y); }
+inline expr remainder(expr x, expr y) { return functions::remainder(x, y); }
+
+/// Remainder of division.
+/// \param x first operand
+/// \param y second operand
+/// \param quo address to store some bits of quotient at
+/// \return remainder of floating point division.
+//		template<typename T,typename U> typename enable<expr,T,U>::type remquo(T x, U y, int *quo) { return
+// functions::remquo(x, y, quo); }
+inline expr remquo(half x, half y, int *quo) { return functions::remquo(x, y, quo); }
+inline expr remquo(half x, expr y, int *quo) { return functions::remquo(x, y, quo); }
+inline expr remquo(expr x, half y, int *quo) { return functions::remquo(x, y, quo); }
+inline expr remquo(expr x, expr y, int *quo) { return functions::remquo(x, y, quo); }
+
+/// Fused multiply add.
+/// \param x first operand
+/// \param y second operand
+/// \param z third operand
+/// \return ( \a x * \a y ) + \a z rounded as one operation.
+//		template<typename T,typename U,typename V> typename enable<expr,T,U,V>::type fma(T x, U y, V z) { return
+// functions::fma(x, y, z); }
+inline expr fma(half x, half y, half z) { return functions::fma(x, y, z); }
+inline expr fma(half x, half y, expr z) { return functions::fma(x, y, z); }
+inline expr fma(half x, expr y, half z) { return functions::fma(x, y, z); }
+inline expr fma(half x, expr y, expr z) { return functions::fma(x, y, z); }
+inline expr fma(expr x, half y, half z) { return functions::fma(x, y, z); }
+inline expr fma(expr x, half y, expr z) { return functions::fma(x, y, z); }
+inline expr fma(expr x, expr y, half z) { return functions::fma(x, y, z); }
+inline expr fma(expr x, expr y, expr z) { return functions::fma(x, y, z); }
+
+/// Maximum of half expressions.
+/// \param x first operand
+/// \param y second operand
+/// \return maximum of operands
+//		template<typename T,typename U> typename result<T,U>::type fmax(T x, U y) { return binary_specialized<T,U>::fmax(x, y);
+//}
+inline half fmax(half x, half y) { return binary_specialized<half, half>::fmax(x, y); }
+inline expr fmax(half x, expr y) { return binary_specialized<half, expr>::fmax(x, y); }
+inline expr fmax(expr x, half y) { return binary_specialized<expr, half>::fmax(x, y); }
+inline expr fmax(expr x, expr y) { return binary_specialized<expr, expr>::fmax(x, y); }
+
+/// Minimum of half expressions.
+/// \param x first operand
+/// \param y second operand
+/// \return minimum of operands
+//		template<typename T,typename U> typename result<T,U>::type fmin(T x, U y) { return binary_specialized<T,U>::fmin(x, y);
+//}
+inline half fmin(half x, half y) { return binary_specialized<half, half>::fmin(x, y); }
+inline expr fmin(half x, expr y) { return binary_specialized<half, expr>::fmin(x, y); }
+inline expr fmin(expr x, half y) { return binary_specialized<expr, half>::fmin(x, y); }
+inline expr fmin(expr x, expr y) { return binary_specialized<expr, expr>::fmin(x, y); }
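+// Editorial note (illustrative, not part of the library): fmin/fmax follow the usual fmin/fmax rule for NaNs,
+// i.e. a single NaN operand is ignored in favour of the other operand; e.g. with x holding a NaN,
+// fmax(x, half(2.0f)) and fmin(x, half(2.0f)) both return 2.0.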
+
+/// Positive difference.
+/// \param x first operand
+/// \param y second operand
+/// \return \a x - \a y or 0 if difference negative
+//		template<typename T,typename U> typename enable<expr,T,U>::type fdim(T x, U y) { return functions::fdim(x, y); }
+inline expr fdim(half x, half y) { return functions::fdim(x, y); }
+inline expr fdim(half x, expr y) { return functions::fdim(x, y); }
+inline expr fdim(expr x, half y) { return functions::fdim(x, y); }
+inline expr fdim(expr x, expr y) { return functions::fdim(x, y); }
+
+/// Get NaN value.
+/// \return quiet NaN
+inline half nanh(const char *) { return functions::nanh(); }
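+// Illustrative usage (editorial sketch, not part of the library): the string argument is ignored, so e.g.
+//   half q = nanh("");   // quiet NaN; isnan(q) is true and q == q is false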
+
+/// \}
+/// \name Exponential functions
+/// \{
+
+/// Exponential function.
+/// \param arg function argument
+/// \return e raised to \a arg
+//		template<typename T> typename enable<expr,T>::type exp(T arg) { return functions::exp(arg); }
+inline expr exp(half arg) { return functions::exp(arg); }
+inline expr exp(expr arg) { return functions::exp(arg); }
+
+/// Exponential minus one.
+/// \param arg function argument
+/// \return e raised to \a arg, minus 1
+//		template<typename T> typename enable<expr,T>::type expm1(T arg) { return functions::expm1(arg); }
+inline expr expm1(half arg) { return functions::expm1(arg); }
+inline expr expm1(expr arg) { return functions::expm1(arg); }
+
+/// Binary exponential.
+/// \param arg function argument
+/// \return 2 raised to \a arg
+//		template<typename T> typename enable<expr,T>::type exp2(T arg) { return functions::exp2(arg); }
+inline expr exp2(half arg) { return functions::exp2(arg); }
+inline expr exp2(expr arg) { return functions::exp2(arg); }
+
+/// Natural logarithm.
+/// \param arg function argument
+/// \return logarithm of \a arg to base e
+//		template<typename T> typename enable<expr,T>::type log(T arg) { return functions::log(arg); }
+inline expr log(half arg) { return functions::log(arg); }
+inline expr log(expr arg) { return functions::log(arg); }
+
+/// Common logarithm.
+/// \param arg function argument
+/// \return logarithm of \a arg to base 10
+//		template<typename T> typename enable<expr,T>::type log10(T arg) { return functions::log10(arg); }
+inline expr log10(half arg) { return functions::log10(arg); }
+inline expr log10(expr arg) { return functions::log10(arg); }
+
+/// Natural logarithm of 1 plus argument.
+/// \param arg function argument
+/// \return natural logarithm of 1 plus \a arg
+//		template<typename T> typename enable<expr,T>::type log1p(T arg) { return functions::log1p(arg); }
+inline expr log1p(half arg) { return functions::log1p(arg); }
+inline expr log1p(expr arg) { return functions::log1p(arg); }
+
+/// Binary logarithm.
+/// \param arg function argument
+/// \return logarithm of \a arg to base 2
+//		template<typename T> typename enable<expr,T>::type log2(T arg) { return functions::log2(arg); }
+inline expr log2(half arg) { return functions::log2(arg); }
+inline expr log2(expr arg) { return functions::log2(arg); }
+
+/// \}
+/// \name Power functions
+/// \{
+
+/// Square root.
+/// \param arg function argument
+/// \return square root of \a arg
+//		template<typename T> typename enable<expr,T>::type sqrt(T arg) { return functions::sqrt(arg); }
+inline expr sqrt(half arg) { return functions::sqrt(arg); }
+inline expr sqrt(expr arg) { return functions::sqrt(arg); }
+
+/// Cubic root.
+/// \param arg function argument
+/// \return cubic root of \a arg
+//		template<typename T> typename enable<expr,T>::type cbrt(T arg) { return functions::cbrt(arg); }
+inline expr cbrt(half arg) { return functions::cbrt(arg); }
+inline expr cbrt(expr arg) { return functions::cbrt(arg); }
+
+/// Hypotenuse function.
+/// \param x first argument
+/// \param y second argument
+/// \return square root of sum of squares without internal over- or underflows
+//		template<typename T,typename U> typename enable<expr,T,U>::type hypot(T x, U y) { return functions::hypot(x, y); }
+inline expr hypot(half x, half y) { return functions::hypot(x, y); }
+inline expr hypot(half x, expr y) { return functions::hypot(x, y); }
+inline expr hypot(expr x, half y) { return functions::hypot(x, y); }
+inline expr hypot(expr x, expr y) { return functions::hypot(x, y); }
+
+/// Power function.
+/// \param base first argument
+/// \param exp second argument
+/// \return \a base raised to \a exp
+//		template<typename T,typename U> typename enable<expr,T,U>::type pow(T base, U exp) { return functions::pow(base, exp);
+//}
+inline expr pow(half base, half exp) { return functions::pow(base, exp); }
+inline expr pow(half base, expr exp) { return functions::pow(base, exp); }
+inline expr pow(expr base, half exp) { return functions::pow(base, exp); }
+inline expr pow(expr base, expr exp) { return functions::pow(base, exp); }
+
+/// \}
+/// \name Trigonometric functions
+/// \{
+
+/// Sine function.
+/// \param arg function argument
+/// \return sine value of \a arg
+//		template<typename T> typename enable<expr,T>::type sin(T arg) { return functions::sin(arg); }
+inline expr sin(half arg) { return functions::sin(arg); }
+inline expr sin(expr arg) { return functions::sin(arg); }
+
+/// Cosine function.
+/// \param arg function argument
+/// \return cosine value of \a arg
+//		template<typename T> typename enable<expr,T>::type cos(T arg) { return functions::cos(arg); }
+inline expr cos(half arg) { return functions::cos(arg); }
+inline expr cos(expr arg) { return functions::cos(arg); }
+
+/// Tangent function.
+/// \param arg function argument
+/// \return tangent value of \a arg
+//		template<typename T> typename enable<expr,T>::type tan(T arg) { return functions::tan(arg); }
+inline expr tan(half arg) { return functions::tan(arg); }
+inline expr tan(expr arg) { return functions::tan(arg); }
+
+/// Arc sine.
+/// \param arg function argument
+/// \return arc sine value of \a arg
+//		template<typename T> typename enable<expr,T>::type asin(T arg) { return functions::asin(arg); }
+inline expr asin(half arg) { return functions::asin(arg); }
+inline expr asin(expr arg) { return functions::asin(arg); }
+
+/// Arc cosine function.
+/// \param arg function argument
+/// \return arc cosine value of \a arg
+//		template<typename T> typename enable<expr,T>::type acos(T arg) { return functions::acos(arg); }
+inline expr acos(half arg) { return functions::acos(arg); }
+inline expr acos(expr arg) { return functions::acos(arg); }
+
+/// Arc tangent function.
+/// \param arg function argument
+/// \return arc tangent value of \a arg
+//		template<typename T> typename enable<expr,T>::type atan(T arg) { return functions::atan(arg); }
+inline expr atan(half arg) { return functions::atan(arg); }
+inline expr atan(expr arg) { return functions::atan(arg); }
+
+/// Arc tangent function.
+/// \param x first argument
+/// \param y second argument
+/// \return arc tangent value
+//		template<typename T,typename U> typename enable<expr,T,U>::type atan2(T x, U y) { return functions::atan2(x, y); }
+inline expr atan2(half x, half y) { return functions::atan2(x, y); }
+inline expr atan2(half x, expr y) { return functions::atan2(x, y); }
+inline expr atan2(expr x, half y) { return functions::atan2(x, y); }
+inline expr atan2(expr x, expr y) { return functions::atan2(x, y); }
+
+/// \}
+/// \name Hyperbolic functions
+/// \{
+
+/// Hyperbolic sine.
+/// \param arg function argument
+/// \return hyperbolic sine value of \a arg
+//		template<typename T> typename enable<expr,T>::type sinh(T arg) { return functions::sinh(arg); }
+inline expr sinh(half arg) { return functions::sinh(arg); }
+inline expr sinh(expr arg) { return functions::sinh(arg); }
+
+/// Hyperbolic cosine.
+/// \param arg function argument
+/// \return hyperbolic cosine value of \a arg
+//		template<typename T> typename enable<expr,T>::type cosh(T arg) { return functions::cosh(arg); }
+inline expr cosh(half arg) { return functions::cosh(arg); }
+inline expr cosh(expr arg) { return functions::cosh(arg); }
+
+/// Hyperbolic tangent.
+/// \param arg function argument
+/// \return hyperbolic tangent value of \a arg
+//		template<typename T> typename enable<expr,T>::type tanh(T arg) { return functions::tanh(arg); }
+inline expr tanh(half arg) { return functions::tanh(arg); }
+inline expr tanh(expr arg) { return functions::tanh(arg); }
+
+/// Hyperbolic area sine.
+/// \param arg function argument
+/// \return area sine value of \a arg
+//		template<typename T> typename enable<expr,T>::type asinh(T arg) { return functions::asinh(arg); }
+inline expr asinh(half arg) { return functions::asinh(arg); }
+inline expr asinh(expr arg) { return functions::asinh(arg); }
+
+/// Hyperbolic area cosine.
+/// \param arg function argument
+/// \return area cosine value of \a arg
+//		template<typename T> typename enable<expr,T>::type acosh(T arg) { return functions::acosh(arg); }
+inline expr acosh(half arg) { return functions::acosh(arg); }
+inline expr acosh(expr arg) { return functions::acosh(arg); }
+
+/// Hyperbolic area tangent.
+/// \param arg function argument
+/// \return area tangent value of \a arg
+//		template<typename T> typename enable<expr,T>::type atanh(T arg) { return functions::atanh(arg); }
+inline expr atanh(half arg) { return functions::atanh(arg); }
+inline expr atanh(expr arg) { return functions::atanh(arg); }
+
+/// \}
+/// \name Error and gamma functions
+/// \{
+
+/// Error function.
+/// \param arg function argument
+/// \return error function value of \a arg
+//		template<typename T> typename enable<expr,T>::type erf(T arg) { return functions::erf(arg); }
+inline expr erf(half arg) { return functions::erf(arg); }
+inline expr erf(expr arg) { return functions::erf(arg); }
+
+/// Complementary error function.
+/// \param arg function argument
+/// \return 1 minus error function value of \a arg
+//		template<typename T> typename enable<expr,T>::type erfc(T arg) { return functions::erfc(arg); }
+inline expr erfc(half arg) { return functions::erfc(arg); }
+inline expr erfc(expr arg) { return functions::erfc(arg); }
+
+/// Natural logarithm of gamma function.
+/// \param arg function argument
+/// \return natural logarithm of the gamma function for \a arg
+//		template<typename T> typename enable<expr,T>::type lgamma(T arg) { return functions::lgamma(arg); }
+inline expr lgamma(half arg) { return functions::lgamma(arg); }
+inline expr lgamma(expr arg) { return functions::lgamma(arg); }
+
+/// Gamma function.
+/// \param arg function argument
+/// \return gamma function value of \a arg
+//		template<typename T> typename enable<expr,T>::type tgamma(T arg) { return functions::tgamma(arg); }
+inline expr tgamma(half arg) { return functions::tgamma(arg); }
+inline expr tgamma(expr arg) { return functions::tgamma(arg); }
+
+/// \}
+/// \name Rounding
+/// \{
+
+/// Nearest integer not less than half value.
+/// \param arg half to round
+/// \return nearest integer not less than \a arg
+//		template<typename T> typename enable<half,T>::type ceil(T arg) { return functions::ceil(arg); }
+inline half ceil(half arg) { return functions::ceil(arg); }
+inline half ceil(expr arg) { return functions::ceil(arg); }
+
+/// Nearest integer not greater than half value.
+/// \param arg half to round
+/// \return nearest integer not greater than \a arg
+//		template<typename T> typename enable<half,T>::type floor(T arg) { return functions::floor(arg); }
+inline half floor(half arg) { return functions::floor(arg); }
+inline half floor(expr arg) { return functions::floor(arg); }
+
+/// Nearest integer not greater in magnitude than half value.
+/// \param arg half to round
+/// \return nearest integer not greater in magnitude than \a arg
+//		template<typename T> typename enable<half,T>::type trunc(T arg) { return functions::trunc(arg); }
+inline half trunc(half arg) { return functions::trunc(arg); }
+inline half trunc(expr arg) { return functions::trunc(arg); }
+
+/// Nearest integer.
+/// \param arg half to round
+/// \return nearest integer, rounded away from zero in half-way cases
+//		template<typename T> typename enable<half,T>::type round(T arg) { return functions::round(arg); }
+inline half round(half arg) { return functions::round(arg); }
+inline half round(expr arg) { return functions::round(arg); }
+
+/// Nearest integer.
+/// \param arg half to round
+/// \return nearest integer, rounded away from zero in half-way cases
+//		template<typename T> typename enable<long,T>::type lround(T arg) { return functions::lround(arg); }
+inline long lround(half arg) { return functions::lround(arg); }
+inline long lround(expr arg) { return functions::lround(arg); }
+
+/// Nearest integer using half's internal rounding mode.
+/// \param arg half expression to round
+/// \return nearest integer using default rounding mode
+//		template<typename T> typename enable<half,T>::type nearbyint(T arg) { return functions::nearbyint(arg); }
+inline half nearbyint(half arg) { return functions::rint(arg); }
+inline half nearbyint(expr arg) { return functions::rint(arg); }
+
+/// Nearest integer using half's internal rounding mode.
+/// \param arg half expression to round
+/// \return nearest integer using default rounding mode
+//		template<typename T> typename enable<half,T>::type rint(T arg) { return functions::rint(arg); }
+inline half rint(half arg) { return functions::rint(arg); }
+inline half rint(expr arg) { return functions::rint(arg); }
+
+/// Nearest integer using half's internal rounding mode.
+/// \param arg half expression to round
+/// \return nearest integer using default rounding mode
+//		template<typename T> typename enable<long,T>::type lrint(T arg) { return functions::lrint(arg); }
+inline long lrint(half arg) { return functions::lrint(arg); }
+inline long lrint(expr arg) { return functions::lrint(arg); }
+#if HALF_ENABLE_CPP11_LONG_LONG
+/// Nearest integer.
+/// \param arg half to round
+/// \return nearest integer, rounded away from zero in half-way cases
+//		template<typename T> typename enable<long long,T>::type llround(T arg) { return functions::llround(arg); }
+inline long long llround(half arg) { return functions::llround(arg); }
+inline long long llround(expr arg) { return functions::llround(arg); }
+
+/// Nearest integer using half's internal rounding mode.
+/// \param arg half expression to round
+/// \return nearest integer using default rounding mode
+//		template<typename T> typename enable<long long,T>::type llrint(T arg) { return functions::llrint(arg); }
+inline long long llrint(half arg) { return functions::llrint(arg); }
+inline long long llrint(expr arg) { return functions::llrint(arg); }
+#endif
+
+/// \}
+/// \name Floating point manipulation
+/// \{
+
+/// Decompress floating point number.
+/// \param arg number to decompress
+/// \param exp address to store exponent at
+/// \return significand in range [0.5, 1)
+//		template<typename T> typename enable<half,T>::type frexp(T arg, int *exp) { return functions::frexp(arg, exp); }
+inline half frexp(half arg, int *exp) { return functions::frexp(arg, exp); }
+inline half frexp(expr arg, int *exp) { return functions::frexp(arg, exp); }
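+// Illustrative example (editorial sketch, not part of the library):
+//   int e;
+//   half m = frexp(half(6.0f), &e);   // m == 0.75 and e == 3, since 6.0 == 0.75 * 2^3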
+
+/// Multiply by power of two.
+/// \param arg number to modify
+/// \param exp power of two to multiply with
+/// \return \a arg multiplied by 2 raised to \a exp
+//		template<typename T> typename enable<half,T>::type ldexp(T arg, int exp) { return functions::scalbln(arg, exp); }
+inline half ldexp(half arg, int exp) { return functions::scalbln(arg, exp); }
+inline half ldexp(expr arg, int exp) { return functions::scalbln(arg, exp); }
+
+/// Extract integer and fractional parts.
+/// \param arg number to decompress
+/// \param iptr address to store integer part at
+/// \return fractional part
+//		template<typename T> typename enable<half,T>::type modf(T arg, half *iptr) { return functions::modf(arg, iptr); }
+inline half modf(half arg, half *iptr) { return functions::modf(arg, iptr); }
+inline half modf(expr arg, half *iptr) { return functions::modf(arg, iptr); }
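+// Illustrative example (editorial sketch, not part of the library):
+//   half i;
+//   half f = modf(half(2.5f), &i);   // f == 0.5 (fractional part), i == 2.0 (integral part)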
+
+/// Multiply by power of two.
+/// \param arg number to modify
+/// \param exp power of two to multiply with
+/// \return \a arg multiplied by 2 raised to \a exp
+//		template<typename T> typename enable<half,T>::type scalbn(T arg, int exp) { return functions::scalbln(arg, exp); }
+inline half scalbn(half arg, int exp) { return functions::scalbln(arg, exp); }
+inline half scalbn(expr arg, int exp) { return functions::scalbln(arg, exp); }
+
+/// Multiply by power of two.
+/// \param arg number to modify
+/// \param exp power of two to multiply with
+/// \return \a arg multiplied by 2 raised to \a exp
+//		template<typename T> typename enable<half,T>::type scalbln(T arg, long exp) { return functions::scalbln(arg, exp); }
+inline half scalbln(half arg, long exp) { return functions::scalbln(arg, exp); }
+inline half scalbln(expr arg, long exp) { return functions::scalbln(arg, exp); }
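+// Illustrative example (editorial sketch, not part of the library): ldexp, scalbn and scalbln all scale by a
+// power of two, e.g.
+//   half r = ldexp(half(1.5f), 4);   // r == 24.0, since 1.5 * 2^4 == 24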
+
+/// Extract exponent.
+/// \param arg number to query
+/// \return floating point exponent
+/// \retval FP_ILOGB0 for zero
+/// \retval FP_ILOGBNAN for NaN
+/// \retval INT_MAX for infinity
+//		template<typename T> typename enable<int,T>::type ilogb(T arg) { return functions::ilogb(arg); }
+inline int ilogb(half arg) { return functions::ilogb(arg); }
+inline int ilogb(expr arg) { return functions::ilogb(arg); }
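+// Illustrative example (editorial sketch, not part of the library):
+//   ilogb(half(8.0f));   // 3, since 8 == 2^3
+//   ilogb(half(0.0f));   // FP_ILOGB0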
+
+/// Extract exponent.
+/// \param arg number to query
+/// \return floating point exponent
+//		template<typename T> typename enable<half,T>::type logb(T arg) { return functions::logb(arg); }
+inline half logb(half arg) { return functions::logb(arg); }
+inline half logb(expr arg) { return functions::logb(arg); }
+
+/// Next representable value.
+/// \param from value to compute next representable value for
+/// \param to direction towards which to compute next value
+/// \return next representable value after \a from in direction towards \a to
+//		template<typename T,typename U> typename enable<half,T,U>::type nextafter(T from, U to) { return
+// functions::nextafter(from, to); }
+inline half nextafter(half from, half to) { return functions::nextafter(from, to); }
+inline half nextafter(half from, expr to) { return functions::nextafter(from, to); }
+inline half nextafter(expr from, half to) { return functions::nextafter(from, to); }
+inline half nextafter(expr from, expr to) { return functions::nextafter(from, to); }
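+// Illustrative example (editorial sketch, not part of the library): with 10 mantissa bits the spacing of half
+// values just above 1.0 is 2^-10, so
+//   nextafter(half(1.0f), half(2.0f));   // 1 + 2^-10, roughly 1.000977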
+
+/// Next representable value.
+/// \param from value to compute next representable value for
+/// \param to direction towards which to compute next value
+/// \return next representable value after \a from in direction towards \a to
+//		template<typename T> typename enable<half,T>::type nexttoward(T from, long double to) { return
+// functions::nexttoward(from, to); }
+inline half nexttoward(half from, long double to) { return functions::nexttoward(from, to); }
+inline half nexttoward(expr from, long double to) { return functions::nexttoward(from, to); }
+
+/// Take sign.
+/// \param x value to change sign for
+/// \param y value to take sign from
+/// \return value equal to \a x in magnitude and to \a y in sign
+//		template<typename T,typename U> typename enable<half,T,U>::type copysign(T x, U y) { return functions::copysign(x, y);
+//}
+inline half copysign(half x, half y) { return functions::copysign(x, y); }
+inline half copysign(half x, expr y) { return functions::copysign(x, y); }
+inline half copysign(expr x, half y) { return functions::copysign(x, y); }
+inline half copysign(expr x, expr y) { return functions::copysign(x, y); }
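+// Illustrative example (editorial sketch, not part of the library):
+//   copysign(half(3.0f), half(-0.5f));   // -3.0: magnitude of the first operand, sign of the second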
+
+/// \}
+/// \name Floating point classification
+/// \{
+
+/// Classify floating point value.
+/// \param arg number to classify
+/// \retval FP_ZERO for positive and negative zero
+/// \retval FP_SUBNORMAL for subnormal numbers
+/// \retval FP_INFINITE for positive and negative infinity
+/// \retval FP_NAN for NaNs
+/// \retval FP_NORMAL for all other (normal) values
+//		template<typename T> typename enable<int,T>::type fpclassify(T arg) { return functions::fpclassify(arg); }
+inline int fpclassify(half arg) { return functions::fpclassify(arg); }
+inline int fpclassify(expr arg) { return functions::fpclassify(arg); }
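+// Illustrative example (editorial sketch, not part of the library):
+//   fpclassify(half(0.0f));   // FP_ZERO
+//   fpclassify(half(1.0f));   // FP_NORMAL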
+
+/// Check if finite number.
+/// \param arg number to check
+/// \retval true if neither infinity nor NaN
+/// \retval false else
+//		template<typename T> typename enable<bool,T>::type isfinite(T arg) { return functions::isfinite(arg); }
+inline bool isfinite(half arg) { return functions::isfinite(arg); }
+inline bool isfinite(expr arg) { return functions::isfinite(arg); }
+
+/// Check for infinity.
+/// \param arg number to check
+/// \retval true for positive or negative infinity
+/// \retval false else
+//		template<typename T> typename enable<bool,T>::type isinf(T arg) { return functions::isinf(arg); }
+inline bool isinf(half arg) { return functions::isinf(arg); }
+inline bool isinf(expr arg) { return functions::isinf(arg); }
+
+/// Check for NaN.
+/// \param arg number to check
+/// \retval true for NaNs
+/// \retval false else
+//		template<typename T> typename enable<bool,T>::type isnan(T arg) { return functions::isnan(arg); }
+inline bool isnan(half arg) { return functions::isnan(arg); }
+inline bool isnan(expr arg) { return functions::isnan(arg); }
+
+/// Check if normal number.
+/// \param arg number to check
+/// \retval true if normal number
+/// \retval false if either subnormal, zero, infinity or NaN
+//		template<typename T> typename enable<bool,T>::type isnormal(T arg) { return functions::isnormal(arg); }
+inline bool isnormal(half arg) { return functions::isnormal(arg); }
+inline bool isnormal(expr arg) { return functions::isnormal(arg); }
+
+/// Check sign.
+/// \param arg number to check
+/// \retval true for negative number
+/// \retval false for positive number
+//		template<typename T> typename enable<bool,T>::type signbit(T arg) { return functions::signbit(arg); }
+inline bool signbit(half arg) { return functions::signbit(arg); }
+inline bool signbit(expr arg) { return functions::signbit(arg); }
+
+/// \}
+/// \name Comparison
+/// \{
+
+/// Comparison for greater than.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x greater than \a y
+/// \retval false else
+//		template<typename T,typename U> typename enable<bool,T,U>::type isgreater(T x, U y) { return functions::isgreater(x,
+// y);
+//}
+inline bool isgreater(half x, half y) { return functions::isgreater(x, y); }
+inline bool isgreater(half x, expr y) { return functions::isgreater(x, y); }
+inline bool isgreater(expr x, half y) { return functions::isgreater(x, y); }
+inline bool isgreater(expr x, expr y) { return functions::isgreater(x, y); }
+
+/// Comparison for greater equal.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x greater equal \a y
+/// \retval false else
+//		template<typename T,typename U> typename enable<bool,T,U>::type isgreaterequal(T x, U y) { return
+// functions::isgreaterequal(x, y); }
+inline bool isgreaterequal(half x, half y) { return functions::isgreaterequal(x, y); }
+inline bool isgreaterequal(half x, expr y) { return functions::isgreaterequal(x, y); }
+inline bool isgreaterequal(expr x, half y) { return functions::isgreaterequal(x, y); }
+inline bool isgreaterequal(expr x, expr y) { return functions::isgreaterequal(x, y); }
+
+/// Comparison for less than.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x less than \a y
+/// \retval false else
+//		template<typename T,typename U> typename enable<bool,T,U>::type isless(T x, U y) { return functions::isless(x, y); }
+inline bool isless(half x, half y) { return functions::isless(x, y); }
+inline bool isless(half x, expr y) { return functions::isless(x, y); }
+inline bool isless(expr x, half y) { return functions::isless(x, y); }
+inline bool isless(expr x, expr y) { return functions::isless(x, y); }
+
+/// Comparison for less equal.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if \a x less equal \a y
+/// \retval false else
+//		template<typename T,typename U> typename enable<bool,T,U>::type islessequal(T x, U y) { return
+// functions::islessequal(x, y); }
+inline bool islessequal(half x, half y) { return functions::islessequal(x, y); }
+inline bool islessequal(half x, expr y) { return functions::islessequal(x, y); }
+inline bool islessequal(expr x, half y) { return functions::islessequal(x, y); }
+inline bool islessequal(expr x, expr y) { return functions::islessequal(x, y); }
+
+/// Comparison for less or greater.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if either less or greater
+/// \retval false else
+//		template<typename T,typename U> typename enable<bool,T,U>::type islessgreater(T x, U y) { return
+// functions::islessgreater(x, y); }
+inline bool islessgreater(half x, half y) { return functions::islessgreater(x, y); }
+inline bool islessgreater(half x, expr y) { return functions::islessgreater(x, y); }
+inline bool islessgreater(expr x, half y) { return functions::islessgreater(x, y); }
+inline bool islessgreater(expr x, expr y) { return functions::islessgreater(x, y); }
+
+/// Check if unordered.
+/// \param x first operand
+/// \param y second operand
+/// \retval true if unordered (one or two NaN operands)
+/// \retval false else
+//		template<typename T,typename U> typename enable<bool,T,U>::type isunordered(T x, U y) { return
+// functions::isunordered(x, y); }
+inline bool isunordered(half x, half y) { return functions::isunordered(x, y); }
+inline bool isunordered(half x, expr y) { return functions::isunordered(x, y); }
+inline bool isunordered(expr x, half y) { return functions::isunordered(x, y); }
+inline bool isunordered(expr x, expr y) { return functions::isunordered(x, y); }
+
+/// \}
+/// \name Casting
+/// \{
+
+/// Cast to or from half-precision floating point number.
+/// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values are converted
+/// directly using the default rounding mode, without any roundtrip over `float` that a `static_cast` would otherwise do.
+///
+/// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types
+/// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler
+/// error and casting between [half](\ref half_float::half)s is just a no-op.
+/// \tparam T destination type (half or built-in arithmetic type)
+/// \tparam U source type (half or built-in arithmetic type)
+/// \param arg value to cast
+/// \return \a arg converted to destination type
+template <typename T, typename U>
+T half_cast(U arg)
+{
+  return half_caster<T, U>::cast(arg);
+}
+
+/// Cast to or from half-precision floating point number.
+/// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values are converted
+/// directly using the given rounding mode, without any roundtrip over `float` that a `static_cast` would otherwise do.
+///
+/// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types
+/// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler
+/// error and casting between [half](\ref half_float::half)s is just a no-op.
+/// \tparam T destination type (half or built-in arithmetic type)
+/// \tparam R rounding mode to use.
+/// \tparam U source type (half or built-in arithmetic type)
+/// \param arg value to cast
+/// \return \a arg converted to destination type
+template <typename T, std::float_round_style R, typename U>
+T half_cast(U arg)
+{
+  return half_caster<T, U, R>::cast(arg);
+}
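+
+// Illustrative usage (not part of the library itself): both overloads above avoid the float
+// roundtrip of a static_cast, and the second one lets the caller pick the rounding mode explicitly, e.g.
+//
+//   half h = half_cast<half>(2.5);                      // default rounding mode
+//   int  i = half_cast<int, std::round_to_nearest>(h);  // explicit rounding mode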
+/// \}
+}  // namespace detail
+
+using detail::operator==;
+using detail::operator!=;
+using detail::operator<;
+using detail::operator>;
+using detail::operator<=;
+using detail::operator>=;
+using detail::operator+;
+using detail::operator-;
+using detail::operator*;
+using detail::operator/;
+using detail::operator<<;
+using detail::operator>>;
+
+using detail::abs;
+using detail::acos;
+using detail::acosh;
+using detail::asin;
+using detail::asinh;
+using detail::atan;
+using detail::atan2;
+using detail::atanh;
+using detail::cbrt;
+using detail::ceil;
+using detail::cos;
+using detail::cosh;
+using detail::erf;
+using detail::erfc;
+using detail::exp;
+using detail::exp2;
+using detail::expm1;
+using detail::fabs;
+using detail::fdim;
+using detail::floor;
+using detail::fma;
+using detail::fmax;
+using detail::fmin;
+using detail::fmod;
+using detail::hypot;
+using detail::lgamma;
+using detail::log;
+using detail::log10;
+using detail::log1p;
+using detail::log2;
+using detail::lrint;
+using detail::lround;
+using detail::nanh;
+using detail::nearbyint;
+using detail::pow;
+using detail::remainder;
+using detail::remquo;
+using detail::rint;
+using detail::round;
+using detail::sin;
+using detail::sinh;
+using detail::sqrt;
+using detail::tan;
+using detail::tanh;
+using detail::tgamma;
+using detail::trunc;
+#if HALF_ENABLE_CPP11_LONG_LONG
+using detail::llrint;
+using detail::llround;
+#endif
+using detail::copysign;
+using detail::fpclassify;
+using detail::frexp;
+using detail::ilogb;
+using detail::isfinite;
+using detail::isgreater;
+using detail::isgreaterequal;
+using detail::isinf;
+using detail::isless;
+using detail::islessequal;
+using detail::islessgreater;
+using detail::isnan;
+using detail::isnormal;
+using detail::isunordered;
+using detail::ldexp;
+using detail::logb;
+using detail::modf;
+using detail::nextafter;
+using detail::nexttoward;
+using detail::scalbln;
+using detail::scalbn;
+using detail::signbit;
+
+using detail::half_cast;
+}  // namespace half_float
+
+/// Extensions to the C++ standard library.
+namespace std
+{
+/// Numeric limits for half-precision floats.
+/// Because of the underlying single-precision implementation of many operations, it inherits some properties from
+/// `std::numeric_limits<float>`.
+template <>
+class numeric_limits<half_float::half> : public numeric_limits<float>
+{
+ public:
+  /// Supports signed values.
+  static HALF_CONSTEXPR_CONST bool is_signed = true;
+
+  /// Is not exact.
+  static HALF_CONSTEXPR_CONST bool is_exact = false;
+
+  /// Doesn't provide modulo arithmetic.
+  static HALF_CONSTEXPR_CONST bool is_modulo = false;
+
+  /// IEEE conformant.
+  static HALF_CONSTEXPR_CONST bool is_iec559 = true;
+
+  /// Supports infinity.
+  static HALF_CONSTEXPR_CONST bool has_infinity = true;
+
+  /// Supports quiet NaNs.
+  static HALF_CONSTEXPR_CONST bool has_quiet_NaN = true;
+
+  /// Supports subnormal values.
+  static HALF_CONSTEXPR_CONST float_denorm_style has_denorm = denorm_present;
+
+  /// Rounding mode.
+  /// Due to the mix of internal single-precision computations (using the rounding mode of the underlying
+  /// single-precision implementation) with the rounding mode of the single-to-half conversions, the actual rounding
+  /// mode might be `std::round_indeterminate` if the default half-precision rounding mode doesn't match the
+  /// single-precision rounding mode.
+  static HALF_CONSTEXPR_CONST float_round_style round_style =
+      (std::numeric_limits<float>::round_style == half_float::half::round_style) ? half_float::half::round_style : round_indeterminate;
+
+  /// Significant digits.
+  static HALF_CONSTEXPR_CONST int digits = 11;
+
+  /// Significant decimal digits.
+  static HALF_CONSTEXPR_CONST int digits10 = 3;
+
+  /// Required decimal digits to represent all possible values.
+  static HALF_CONSTEXPR_CONST int max_digits10 = 5;
+
+  /// Number base.
+  static HALF_CONSTEXPR_CONST int radix = 2;
+
+  /// One more than smallest exponent.
+  static HALF_CONSTEXPR_CONST int min_exponent = -13;
+
+  /// Smallest normalized representable power of 10.
+  static HALF_CONSTEXPR_CONST int min_exponent10 = -4;
+
+  /// One more than largest exponent.
+  static HALF_CONSTEXPR_CONST int max_exponent = 16;
+
+  /// Largest finitely representable power of 10.
+  static HALF_CONSTEXPR_CONST int max_exponent10 = 4;
+
+  /// Smallest positive normal value.
+  static HALF_CONSTEXPR half_float::half min() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x0400); }
+
+  /// Smallest finite value.
+  static HALF_CONSTEXPR half_float::half lowest() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0xFBFF); }
+
+  /// Largest finite value.
+  static HALF_CONSTEXPR half_float::half max() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7BFF); }
+
+  /// Difference between one and next representable value.
+  static HALF_CONSTEXPR half_float::half epsilon() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x1400); }
+
+  /// Maximum rounding error.
+  static HALF_CONSTEXPR half_float::half round_error() HALF_NOTHROW
+  {
+    return half_float::half(half_float::detail::binary, (round_style == std::round_to_nearest) ? 0x3800 : 0x3C00);
+  }
+
+  /// Positive infinity.
+  static HALF_CONSTEXPR half_float::half infinity() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7C00); }
+
+  /// Quiet NaN.
+  static HALF_CONSTEXPR half_float::half quiet_NaN() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7FFF); }
+
+  /// Signalling NaN.
+  static HALF_CONSTEXPR half_float::half signaling_NaN() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7DFF); }
+
+  /// Smallest positive subnormal value.
+  static HALF_CONSTEXPR half_float::half denorm_min() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x0001); }
+};
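+
+// Illustrative queries of the specialization above (not part of the header): for the 16-bit format,
+// epsilon() is 2^-10 and digits counts the 11-bit significand (10 stored bits plus the implicit bit).
+//
+//   half_float::half eps = std::numeric_limits<half_float::half>::epsilon();
+//   int mantissa_bits    = std::numeric_limits<half_float::half>::digits;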
+
+#if HALF_ENABLE_CPP11_HASH
+/// Hash function for half-precision floats.
+/// This is only defined if C++11 `std::hash` is supported and enabled.
+template <>
+struct hash<half_float::half>  //: unary_function<half_float::half,size_t>
+{
+  /// Type of function argument.
+  typedef half_float::half argument_type;
+
+  /// Function return type.
+  typedef size_t result_type;
+
+  /// Compute hash function.
+  /// \param arg half to hash
+  /// \return hash value
+  result_type operator()(argument_type arg) const
+  {
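+    /* the mask maps the bit pattern of negative zero (0x8000) to zero, so that +0 and -0,
+       which compare equal, also produce the same hash value */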
+    return hash<half_float::detail::uint16>()(static_cast<unsigned>(arg.data_) & -(arg.data_ != 0x8000));
+  }
+};
+#endif
+}  // namespace std
+
+#undef HALF_CONSTEXPR
+#undef HALF_CONSTEXPR_CONST
+#undef HALF_NOEXCEPT
+#undef HALF_NOTHROW
+#ifdef HALF_POP_WARNINGS
+#pragma warning(pop)
+#undef HALF_POP_WARNINGS
+#endif
+
+#endif
diff --git a/src/io/hdf5_util.cc b/src/io/hdf5_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ac76de332008cf114bb70122b2c53b5189e03ef5
--- /dev/null
+++ b/src/io/hdf5_util.cc
@@ -0,0 +1,669 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  hdf5_util.cc
+ *
+ *  \brief some helper functions for HDF5 I/O routines
+ */
+
+#include "gadgetconfig.h"
+
+#include <hdf5.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../io/hdf5_util.h"
+
+#define HALF_ROUND_STYLE 1
+#include "../half/half.hpp"
+using half_float::half;
+
+hid_t Halfprec_memtype;
+hid_t Int48_memtype;
+hid_t Int128_memtype;
+
+static herr_t hdf5_conv_half(hid_t src, hid_t dst, H5T_cdata_t *cdata, size_t nelmts, size_t buf_str, size_t bkg_str, void *buf,
+                             void *background, hid_t plist)
+{
+  size_t src_size = H5Tget_size(src);
+  size_t dst_size = H5Tget_size(dst);
+
+  char *src_buf = (char *)buf;
+  char *dst_buf = (char *)buf;
+
+  int direction;
+
+  switch(cdata->command)
+    {
+      case H5T_CONV_INIT:
+        /*
+         * We are being queried to see if we handle this
+         * conversion.
+         */
+        if(H5Tequal(src, Halfprec_memtype) || H5Tequal(dst, Halfprec_memtype))
+          {
+            cdata->need_bkg = H5T_BKG_NO;
+            return 0;
+          }
+        else
+          return -1;
+        break;
+
+      case H5T_CONV_FREE:
+        break;
+
+      case H5T_CONV_CONV:
+        /*
+         * Convert each element, watch out for overlap src
+         * with dst on the left-most element of the buffer.
+         * If the destination size is larger than the source size,
+         * then we must process the elements from right to left.
+         */
+
+        if(dst_size > src_size)
+          {
+            direction = -1;
+            src_buf += (nelmts - 1) * src_size;
+            dst_buf += (nelmts - 1) * dst_size;
+          }
+        else
+          {
+            direction = 1;
+          }
+
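+        /* a source size of 2 bytes means the source is the half type (widen to float or double),
+           while a destination size of 2 bytes means we narrow from float or double to half */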
+        for(size_t i = 0; i < nelmts; i++)
+          {
+            if(src_size == 2)
+              {
+                if(dst_size == 4)
+                  {
+                    *((float *)dst_buf) = (float)(*((half *)src_buf));
+                  }
+                else if(dst_size == 8)
+                  {
+                    *((double *)dst_buf) = (float)(*((half *)src_buf));
+                  }
+              }
+            else if(dst_size == 2)
+              {
+                if(src_size == 4)
+                  {
+                    *((half *)dst_buf) = (half)(*((float *)src_buf));
+                  }
+                else if(src_size == 8)
+                  {
+                    *((half *)dst_buf) = (half)(*((double *)src_buf));
+                  }
+              }
+            src_buf += src_size * direction;
+            dst_buf += dst_size * direction;
+          }
+
+        break;
+
+      default:
+        /*
+         * Unknown command.
+         */
+        return -1;
+    }
+  return 0;
+}
+
+void my_create_HDF5_halfprec_handler(void)
+{
+  /* define the half-precision type */
+
+  Halfprec_memtype = H5Tcopy(H5T_NATIVE_FLOAT);
+
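+  /* bit layout of IEEE 754 half precision: sign bit at position 15, a 5-bit exponent starting
+     at bit 10 with bias 15, and a 10-bit mantissa starting at bit 0, i.e. 16 bits = 2 bytes */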
+  H5Tset_fields(Halfprec_memtype, 15, 10, 5, 0, 10);
+  H5Tset_ebias(Halfprec_memtype, 15);
+  H5Tset_precision(Halfprec_memtype, 16);
+  H5Tset_size(Halfprec_memtype, 2);
+
+  H5Tregister(H5T_PERS_SOFT, "half-converter", Halfprec_memtype, Halfprec_memtype, hdf5_conv_half);
+}
+
+void my_create_HDF5_special_integer_types(void)
+{
+  Int48_memtype = H5Tcopy(H5T_NATIVE_UINT32);
+  H5Tset_precision(Int48_memtype, 48);
+
+  Int128_memtype = H5Tcopy(H5T_NATIVE_UINT64);
+  H5Tset_precision(Int128_memtype, 128);
+}
+
+/*! \file hdf5_util.cc
+ *
+ *  \brief Contains the wrapper functions to the HDF5 library functions.
+ *
+ *  The wrapper functions explicitly check for error conditions and terminate
+ *  the run if such conditions occur.
+ *
+ *  Before terminating, the HDF5 error handler is disabled so that its error
+ *  message is not repeated again at program exit.
+ */
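+
+/* Illustrative (hypothetical) use of the wrappers below, writing a scalar attribute into a new group;
+ * the file name "example.hdf5" and attribute name "NumPart_Total" are made up for this sketch:
+ *
+ *   hid_t file  = my_H5Fcreate("example.hdf5", H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
+ *   hid_t group = my_H5Gcreate(file, "/Header", 0);
+ *   int ntot    = 42;
+ *   write_scalar_attribute(group, "NumPart_Total", &ntot, H5T_NATIVE_INT);
+ *   my_H5Gclose(group, "/Header");
+ *   my_H5Fclose(file, "example.hdf5");
+ */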
+
+/*!
+ * This routine wraps creating a file to give a nice error message
+ */
+hid_t my_H5Fcreate(const char *fname, unsigned int flags, hid_t fcpl_id, hid_t fapl_id)
+{
+  hid_t file_id = H5Fcreate(fname, flags, fcpl_id, fapl_id);
+
+  if(file_id < 0)
+    {
+      H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+      Terminate("Error detected in HDF5: unable to create file %s\n", fname);
+    }
+
+  return file_id;
+}
+
+/*!
+ * This routine wraps creating a group to give a nice error message
+ */
+hid_t my_H5Gcreate(hid_t loc_id, const char *groupname, size_t size_hint)
+{
+  hid_t group_id = H5Gcreate(loc_id, groupname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
+
+  if(group_id < 0)
+    {
+      H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+      Terminate("Error detected in HDF5: unable to create group %s\n", groupname);
+    }
+
+  return group_id;
+}
+
+/*!
+ * This routine wraps creating a dataset to give a nice error message
+ */
+hid_t my_H5Dcreate(hid_t loc_id, const char *datasetname, hid_t type_id, hid_t space_id, hid_t dcpl_id)
+{
+  hid_t dataset_id = H5Dcreate(loc_id, datasetname, type_id, space_id, H5P_DEFAULT, dcpl_id, H5P_DEFAULT);
+
+  if(dataset_id < 0)
+    {
+      H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+      Terminate("Error detected in HDF5: unable to create dataset %s\n", datasetname);
+    }
+
+  return dataset_id;
+}
+
+/*!
+ * This routine wraps writing a dataset to give a nice error message
+ */
+herr_t my_H5Dwrite(hid_t dataset_id, hid_t mem_type_id, hid_t mem_space_id, hid_t file_space_id, hid_t xfer_plist_id, const void *buf,
+                   const char *datasetname)
+{
+  herr_t status = H5Dwrite(dataset_id, mem_type_id, mem_space_id, file_space_id, xfer_plist_id, buf);
+
+  if(status < 0)
+    {
+      H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+      Terminate("Error detected in HDF5: unable to write dataset %s\n", datasetname);
+    }
+
+  return status;
+}
+
+/*!
+ * This routine wraps creating an attribute to give a nice error message
+ */
+hid_t my_H5Acreate(hid_t loc_id, const char *attr_name, hid_t type_id, hid_t space_id, hid_t acpl_id)
+{
+  if(H5Aexists(loc_id, attr_name))
+    H5Adelete(loc_id, attr_name);
+
+  hid_t attribute_id = H5Acreate(loc_id, attr_name, type_id, space_id, acpl_id, H5P_DEFAULT);
+
+  if(attribute_id < 0)
+    {
+      H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+      Terminate("Error detected in HDF5: unable to create attribute %s\n", attr_name);
+    }
+
+  return attribute_id;
+}
+
+/*!
+ * This routine wraps writing an attribute to give a nice error message
+ */
+herr_t my_H5Awrite(hid_t attr_id, hid_t mem_type_id, const void *buf, const char *attr_name)
+{
+  herr_t status = H5Awrite(attr_id, mem_type_id, buf);
+
+  if(status < 0)
+    {
+      H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+      Terminate("Error detected in HDF5: unable to write attribute %s\n", attr_name);
+    }
+
+  return status;
+}
+
+/*!
+ * This routine wraps creating a dataspace to give a nice error message
+ */
+hid_t my_H5Screate(H5S_class_t type)
+{
+  hid_t dataspace_id = H5Screate(type);
+  if(dataspace_id < 0)
+    {
+      H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+      switch(type)
+        {
+          case H5S_SCALAR:
+            Terminate("Error detected in HDF5: unable to create a scalar dataspace\n");
+            break;
+          case H5S_SIMPLE:
+            Terminate("Error detected in HDF5: unable to create a simple dataspace\n");
+            break;
+          default:
+            Terminate("Error detected in HDF5: unknown dataspace type\n");
+            break;
+        }
+    }
+  return dataspace_id;
+}
+
+/*!
+ * This routine wraps creating a simple dataspace to give a nice error message
+ */
+hid_t my_H5Screate_simple(int rank, const hsize_t *current_dims, const hsize_t *maximum_dims)
+{
+  hid_t dataspace_id = H5Screate_simple(rank, current_dims, maximum_dims);
+  if(dataspace_id < 0)
+    {
+      H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+      Terminate("Error detected in HDF5: unable to create a simple dataspace\n");
+    }
+  return dataspace_id;
+}
+
+/*!
+ * This routine wraps opening a file to give a nice error message
+ */
+hid_t my_H5Fopen(const char *fname, unsigned int flags, hid_t fapl_id)
+{
+  hid_t file_id = H5Fopen(fname, flags, fapl_id);
+  if(file_id < 0)
+    {
+      H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+      Terminate("Error detected in HDF5: unable to open file %s\n", fname);
+    }
+  return file_id;
+}
+
+/*!
+ * This routine wraps opening a group to give a nice error message
+ */
+hid_t my_H5Gopen(hid_t loc_id, const char *groupname)
+{
+  hid_t group = H5Gopen(loc_id, groupname, H5P_DEFAULT);
+  if(group < 0)
+    {
+      H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+      Terminate("Error detected in HDF5: unable to open group %s\n", groupname);
+    }
+  return group;
+}
+
+/*!
+ * This routine wraps opening a dataset to give a nice error message
+ */
+hid_t my_H5Dopen(hid_t file_id, const char *datasetname)
+{
+  hid_t dataset = H5Dopen(file_id, datasetname, H5P_DEFAULT);
+  if(dataset < 0)
+    {
+      H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+      Terminate("Error detected in HDF5: unable to open dataset %s\n", datasetname);
+    }
+  return dataset;
+}
+
+herr_t my_H5Dset_extent(hid_t dset_id, const hsize_t size[])
+{
+  herr_t status = H5Dset_extent(dset_id, size);
+  if(status < 0)
+    {
+      H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+      Terminate("Error detected in HDF5: unable to set extent of dataset\n");
+    }
+  return status;
+}
+
+/*!
+ * This routine wraps opening a dataset. However, in contrast to my_H5Dopen(), it does not
+ * terminate the run if the dataset does not exist. This is useful while reading an IC file,
+ * because in that case a non-existing dataset is set to zero (see also read_ic.c)
+ */
+hid_t my_H5Dopen_if_existing(hid_t file_id, const char *datasetname)
+{
+  /* save error handler and disable it */
+  H5E_auto_t errfunc;
+  void *client_data;
+  H5Eget_auto(H5E_DEFAULT, &errfunc, &client_data);
+  H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+
+  hid_t dataset = H5Dopen(file_id, datasetname, H5P_DEFAULT);
+
+  /* reset error handler */
+  H5Eset_auto(H5E_DEFAULT, errfunc, client_data);
+
+  return dataset;
+}
+
+/*!
+ * This routine wraps opening an attribute to give a nice error message
+ */
+hid_t my_H5Aopen_name(hid_t loc_id, const char *attr_name)
+{
+  hid_t attribute_id = H5Aopen_name(loc_id, attr_name);
+  if(attribute_id < 0)
+    {
+      H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+      Terminate("Error detected in HDF5: unable to open attribute %s\n", attr_name);
+    }
+  return attribute_id;
+}
+
+/*!
+ * This routine wraps reading a dataset to give a nice error message
+ */
+herr_t my_H5Dread(hid_t dataset_id, hid_t mem_type_id, hid_t mem_space_id, hid_t file_space_id, hid_t xfer_plist_id, void *buf,
+                  const char *datasetname)
+{
+  herr_t status = H5Dread(dataset_id, mem_type_id, mem_space_id, file_space_id, xfer_plist_id, buf);
+  if(status < 0)
+    {
+      H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+      Terminate("Error detected in HDF5: unable to read dataset %s\n", datasetname);
+    }
+  return status;
+}
+
+/*!
+ * This routine wraps reading an attribute to give a nice error message
+ */
+herr_t my_H5Aread(hid_t attr_id, hid_t mem_type_id, void *buf, const char *attr_name, hssize_t size)
+{
+  hid_t hdf5_space   = H5Aget_space(attr_id);
+  hssize_t attr_size = H5Sget_simple_extent_npoints(hdf5_space);
+  H5Sclose(hdf5_space);
+
+  if(attr_size != size)
+    {
+      H5E_auto_t errfunc;
+      void *client_data;
+      H5Eget_auto(H5E_DEFAULT, &errfunc, &client_data);
+      errfunc(H5P_DEFAULT, client_data);
+      H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+      Terminate("Error detected in HDF5: mismatch in size for attribute %s, expected size = %lld, actual attribute size = %lld\n",
+                attr_name, size, attr_size);
+    }
+
+  herr_t status = H5Aread(attr_id, mem_type_id, buf);
+  if(status < 0)
+    {
+      H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+      Terminate("Error detected in HDF5: unable to read attribute %s\n", attr_name);
+    }
+  return status;
+}
+
+/*!
+ * This routine wraps closing an attribute to give a nice error message
+ */
+herr_t my_H5Aclose(hid_t attr_id, const char *attr_name)
+{
+  herr_t status = H5Aclose(attr_id);
+  if(status < 0)
+    {
+      H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+      Terminate("Error detected in HDF5: unable to close attribute %s\n", attr_name);
+    }
+  return status;
+}
+
+/*!
+ * This routine wraps closing a dataset to give a nice error message
+ */
+herr_t my_H5Dclose(hid_t dataset_id, const char *datasetname)
+{
+  herr_t status = H5Dclose(dataset_id);
+  if(status < 0)
+    {
+      H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+      Terminate("Error detected in HDF5: unable to close dataset %s\n", datasetname);
+    }
+  return status;
+}
+
+/*!
+ * This routine wraps closing a group to give a nice error message
+ */
+herr_t my_H5Gclose(hid_t group_id, const char *groupname)
+{
+  herr_t status = H5Gclose(group_id);
+  if(status < 0)
+    {
+      H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+      Terminate("Error detected in HDF5: unable to close group %s\n", groupname);
+    }
+  return status;
+}
+
+herr_t my_H5Pclose(hid_t plist)
+{
+  herr_t status = H5Pclose(plist);
+  if(status < 0)
+    {
+      H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+      Terminate("Error detected in HDF5: unable to close property\n");
+    }
+  return status;
+}
+
+/*!
+ * This routine wraps closing a file to give a nice error message
+ */
+herr_t my_H5Fclose(hid_t file_id, const char *fname)
+{
+  herr_t status = H5Fclose(file_id);
+  if(status < 0)
+    {
+      H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+      Terminate("Error detected in HDF5: unable to close file %s\n", fname);
+    }
+  return status;
+}
+
+/*!
+ * This routine wraps closing a dataspace to give a nice error message
+ */
+herr_t my_H5Sclose(hid_t dataspace_id, H5S_class_t type)
+{
+  herr_t status = H5Sclose(dataspace_id);
+  if(status < 0)
+    {
+      H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+      switch(type)
+        {
+          case H5S_SCALAR:
+            Terminate("Error detected in HDF5: unable to close a scalar dataspace\n");
+            break;
+          case H5S_SIMPLE:
+            Terminate("Error detected in HDF5: unable to close a simple dataspace\n");
+            break;
+          default:
+            Terminate("Error detected in HDF5: unknown dataspace type\n");
+            break;
+        }
+    }
+  return status;
+}
+
+/*!
+ * This routine wraps copying an existing datatype to give a nice error message
+ */
+hid_t my_H5Tcopy(hid_t type_id)
+{
+  hid_t datatype_id = H5Tcopy(type_id);
+  if(datatype_id < 0)
+    {
+      H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+      Terminate("Error detected in HDF5: could not properly copy datatype\n");
+    }
+  return datatype_id;
+}
+
+/*!
+ * This routine wraps closing a datatype to give a nice error message
+ */
+herr_t my_H5Tclose(hid_t type_id)
+{
+  herr_t status = H5Tclose(type_id);
+  if(status < 0)
+    {
+      H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+      Terminate("Error detected in HDF5: could not properly close datatype\n");
+    }
+  return status;
+}
+
+/*!
+ * This routine wraps selecting a hyperslab to give a nice error message
+ */
+herr_t my_H5Sselect_hyperslab(hid_t space_id, H5S_seloper_t op, const hsize_t *start, const hsize_t *stride, const hsize_t *count,
+                              const hsize_t *block)
+{
+  herr_t status = H5Sselect_hyperslab(space_id, op, start, stride, count, block);
+  if(status < 0)
+    {
+      H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+      Terminate("Error detected in HDF5: could not properly select the chosen hyperslab\n");
+    }
+  return status;
+}
+
+/*!
+ * This routine wraps returning the size in bytes of a given datatype to give a nice error message
+ */
+size_t my_H5Tget_size(hid_t datatype_id)
+{
+  size_t size = H5Tget_size(datatype_id);
+  if(size == 0)
+    {
+      H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+      Terminate("Error detected in HDF5: unable to determine the size of the given datatype\n");
+    }
+  return size;
+}
+
+/*!
+ * This routine wraps setting the size in bytes of a given datatype to give a nice error message
+ */
+herr_t my_H5Tset_size(hid_t datatype_id, size_t size)
+{
+  herr_t status = H5Tset_size(datatype_id, size);
+  if(status < 0)
+    {
+      H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+      Terminate("Error detected in HDF5: could not properly set the size of the given datatype\n");
+    }
+  return status;
+}
+
+void read_scalar_attribute(hid_t handle, const char *attr_name, void *buf, hid_t mem_type_id)
+{
+  hid_t hdf5_attribute = my_H5Aopen_name(handle, attr_name);
+  my_H5Aread(hdf5_attribute, mem_type_id, buf, attr_name, 1);
+  my_H5Aclose(hdf5_attribute, attr_name);
+}
+
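+/*!
+ * Reads a scalar attribute, trying 'attr_name' first and falling back to 'alternative_name' if the
+ * former does not exist; returns 0 if the primary name was used and 1 if the alternative was needed.
+ */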
+int read_scalar_attribute(hid_t handle, const char *attr_name, const char *alternative_name, void *buf, hid_t mem_type_id)
+{
+  if(H5Aexists(handle, attr_name))
+    {
+      hid_t hdf5_attribute = my_H5Aopen_name(handle, attr_name);
+      my_H5Aread(hdf5_attribute, mem_type_id, buf, attr_name, 1);
+      my_H5Aclose(hdf5_attribute, attr_name);
+      return 0;
+    }
+  else
+    {
+      hid_t hdf5_attribute = my_H5Aopen_name(handle, alternative_name);
+      my_H5Aread(hdf5_attribute, mem_type_id, buf, alternative_name, 1);
+      my_H5Aclose(hdf5_attribute, alternative_name);
+      return 1;
+    }
+}
+
+void read_vector_attribute(hid_t handle, const char *attr_name, void *buf, hid_t mem_type_id, int length)
+{
+  hid_t hdf5_attribute = my_H5Aopen_name(handle, attr_name);
+  my_H5Aread(hdf5_attribute, mem_type_id, buf, attr_name, length);
+  my_H5Aclose(hdf5_attribute, attr_name);
+}
+
+void write_scalar_attribute(hid_t handle, const char *attr_name, const void *buf, hid_t mem_type_id)
+{
+  if(H5Aexists(handle, attr_name))
+    H5Adelete(handle, attr_name);
+
+  hid_t hdf5_dataspace = my_H5Screate(H5S_SCALAR);
+
+  hid_t hdf5_attribute = my_H5Acreate(handle, attr_name, mem_type_id, hdf5_dataspace, H5P_DEFAULT);
+
+  my_H5Awrite(hdf5_attribute, mem_type_id, buf, attr_name);
+
+  my_H5Aclose(hdf5_attribute, attr_name);
+  my_H5Sclose(hdf5_dataspace, H5S_SCALAR);
+}
+
+void write_vector_attribute(hid_t handle, const char *attr_name, const void *buf, hid_t mem_type_id, int length)
+{
+  if(H5Aexists(handle, attr_name))
+    H5Adelete(handle, attr_name);
+
+  hsize_t adim[1] = {(hsize_t)length};
+
+  hid_t hdf5_dataspace = my_H5Screate(H5S_SIMPLE);
+  H5Sset_extent_simple(hdf5_dataspace, 1, adim, NULL);
+
+  hid_t hdf5_attribute = my_H5Acreate(handle, attr_name, mem_type_id, hdf5_dataspace, H5P_DEFAULT);
+
+  my_H5Awrite(hdf5_attribute, mem_type_id, buf, attr_name);
+
+  my_H5Aclose(hdf5_attribute, attr_name);
+  my_H5Sclose(hdf5_dataspace, H5S_SIMPLE);
+}
+
+void write_string_attribute(hid_t handle, const char *attr_name, const char *buf)
+{
+  if(H5Aexists(handle, attr_name))
+    H5Adelete(handle, attr_name);
+
+  hid_t atype = my_H5Tcopy(H5T_C_S1);
+  my_H5Tset_size(atype, strlen(buf));
+
+  hid_t hdf5_dataspace = my_H5Screate(H5S_SCALAR);
+  hid_t hdf5_attribute = my_H5Acreate(handle, attr_name, atype, hdf5_dataspace, H5P_DEFAULT);
+
+  my_H5Awrite(hdf5_attribute, atype, buf, attr_name);
+
+  my_H5Aclose(hdf5_attribute, attr_name);
+  my_H5Sclose(hdf5_dataspace, H5S_SCALAR);
+}
diff --git a/src/io/hdf5_util.h b/src/io/hdf5_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..449cf67576673c78db21483e9ad47535535cfbfa
--- /dev/null
+++ b/src/io/hdf5_util.h
@@ -0,0 +1,67 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  hdf5_util.h
+ *
+ *  \brief declares some helper functions for HDF5 I/O routines
+ */
+
+#ifndef HDF5_UTIL_H
+#define HDF5_UTIL_H
+
+#include <hdf5.h>
+
+#define COMPRESSION_CHUNKSIZE 1000
+
+extern hid_t Halfprec_memtype;
+extern hid_t Int48_memtype;
+extern hid_t Int128_memtype;
+
+void my_create_HDF5_special_integer_types(void);
+void my_create_HDF5_halfprec_handler(void);
+hid_t my_H5Fcreate(const char *fname, unsigned flags, hid_t fcpl_id, hid_t fapl_id);
+hid_t my_H5Gcreate(hid_t loc_id, const char *groupname, size_t size_hint);
+hid_t my_H5Dcreate(hid_t loc_id, const char *datasetname, hid_t type_id, hid_t space_id, hid_t dcpl_id);
+hid_t my_H5Acreate(hid_t loc_id, const char *attr_name, hid_t type_id, hid_t space_id, hid_t acpl_id);
+hid_t my_H5Screate(H5S_class_t type);
+hid_t my_H5Screate_simple(int rank, const hsize_t *current_dims, const hsize_t *maximum_dims);
+herr_t my_H5Dwrite(hid_t dataset_id, hid_t mem_type_id, hid_t mem_space_id, hid_t file_space_id, hid_t xfer_plist_id, const void *buf,
+                   const char *datasetname);
+herr_t my_H5Awrite(hid_t attr_id, hid_t mem_type_id, const void *buf, const char *attr_name);
+hid_t my_H5Fopen(const char *fname, unsigned int flags, hid_t fapl_id);
+hid_t my_H5Dopen(hid_t file_id, const char *datasetname);
+hid_t my_H5Dopen_if_existing(hid_t file_id, const char *datasetname);
+herr_t my_H5Dset_extent(hid_t dset_id, const hsize_t size[]);
+herr_t my_H5Dread(hid_t dataset_id, hid_t mem_type_id, hid_t mem_space_id, hid_t file_space_id, hid_t xfer_plist_id, void *buf,
+                  const char *datasetname);
+hid_t my_H5Gopen(hid_t loc_id, const char *groupname);
+hid_t my_H5Aopen_name(hid_t loc_id, const char *attr_name);
+herr_t my_H5Aread(hid_t attr_id, hid_t mem_type_id, void *buf, const char *attr_name, hssize_t size);
+
+herr_t my_H5Aclose(hid_t attr_id, const char *attr_name);
+herr_t my_H5Dclose(hid_t dataset_id, const char *datasetname);
+herr_t my_H5Gclose(hid_t group_id, const char *groupname);
+herr_t my_H5Fclose(hid_t file_id, const char *fname);
+herr_t my_H5Sclose(hid_t dataspace_id, H5S_class_t type);
+herr_t my_H5Pclose(hid_t plist);
+
+hid_t my_H5Tcopy(hid_t type_id);
+herr_t my_H5Tclose(hid_t type_id);
+
+herr_t my_H5Sselect_hyperslab(hid_t space_id, H5S_seloper_t op, const hsize_t *start, const hsize_t *stride, const hsize_t *count,
+                              const hsize_t *block);
+size_t my_H5Tget_size(hid_t datatype_id);
+herr_t my_H5Tset_size(hid_t datatype_id, size_t size);
+
+void write_scalar_attribute(hid_t handle, const char *attr_name, const void *buf, hid_t mem_type_id);
+void write_vector_attribute(hid_t handle, const char *attr_name, const void *buf, hid_t mem_type_id, int length);
+void write_string_attribute(hid_t handle, const char *attr_name, const char *buf);
+
+void read_scalar_attribute(hid_t handle, const char *attr_name, void *buf, hid_t mem_type_id);
+int read_scalar_attribute(hid_t handle, const char *attr_name, const char *alternative_name, void *buf, hid_t mem_type_id);
+void read_vector_attribute(hid_t handle, const char *attr_name, void *buf, hid_t mem_type_id, int length);
+
+#endif
diff --git a/src/io/io.cc b/src/io/io.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9cfd2603794ac6d52599f957139e1d062a6deae0
--- /dev/null
+++ b/src/io/io.cc
@@ -0,0 +1,2499 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file io.cc
+ *
+ * \brief main routines for driving I/O in Gadget's three file formats for snapshots, group catalogues etc.
+ */
+
+#include "gadgetconfig.h"
+
+#include <errno.h>
+#include <hdf5.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <algorithm>
+
+#include "../cooling_sfr/cooling.h"
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../fof/fof.h"
+#include "../io/hdf5_util.h"
+#include "../io/io.h"
+#include "../io/parameters.h"
+#include "../lightcone/lightcone.h"
+#include "../logs/timer.h"
+#include "../main/main.h"
+#include "../main/simulation.h"
+#include "../mergertree/mergertree.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+
+#define HALF_ROUND_STYLE 1
+#include "../half/half.hpp"
+using half_float::half;
+
+/* local functions */
+
+IO_Def::~IO_Def()
+{
+  if(N_IO_Fields > 0)
+    Mem.myfree_movable(IO_Fields);
+}
+
+/*!
+ * \param field Specifies the field as an enumeration type iofields (io_private.h), e.g. IO_POS.
+ * \param label The label of the dataset (4 characters, for obsolete old format=2)
+ * \param datasetname The name of the hdf5 dataset (maximum 256 characters, for format=3)
+ * \param type_in_memory The type of the field in memory
+ * \param type_in_file_output The output type in the file if it is written (upon read, the type found will be converted to the
+ *                            memory type)
+ * \param read_flag This flags whether the field should be ignored upon read (use SKIP_ON_READ, else READ_IF_PRESENT)
+ * \param values_per_block The number of values per field, e.g. 1 for mass, 3 for velocities
+ * \param array The array in which the value is stored
+ * \param pointer_to_field A pointer to the field in one of the global arrays, e.g. &SphP[0].Density, or &P[0].Vel[0]
+ * \param io_func Alternatively, if the value to output/input is not a simple field, you can define a function which handles i/o
+ * \param typelist_bitmask Specifies for which particle types the field is present, e.g. 1+2+8 => field present for particle
+ *                         types 0, 1, 3 (or use ALL_TYPES, GAS_ONLY, ...)
+ */
+void IO_Def::init_field(const char *label, const char *datasetname, enum types_in_memory type_in_memory,
+                        enum types_in_file type_in_file_output, enum read_flags read_flag, int values_per_block, enum arrays array,
+                        void *pointer_to_field, void (*io_func)(IO_Def *, int, int, void *, int), int typelist_bitmask, int flagunit,
+                        double a, double h, double L, double M, double V, double c, bool compression_on)
+{
+  const int alloc_step = 5;
+
+  if(values_per_block < 1)  // if we have no values, we don't register the field
+    return;
+
+  if(Max_IO_Fields == 0)
+    {
+      IO_Fields     = (IO_Field *)Mem.mymalloc_movable(&IO_Fields, "IO_Fields", alloc_step * sizeof(IO_Field));
+      Max_IO_Fields = alloc_step;
+    }
+  else if(Max_IO_Fields == N_IO_Fields)
+    {
+      Max_IO_Fields = ((Max_IO_Fields / alloc_step) + 1) * alloc_step;
+      IO_Fields     = (IO_Field *)Mem.myrealloc_movable(IO_Fields, Max_IO_Fields * sizeof(IO_Field));
+    }
+
+  int n           = N_IO_Fields++;
+  IO_Field *field = &IO_Fields[n];
+
+  strncpy(field->label, label, LABEL_LEN);
+  field->label[LABEL_LEN] = 0;
+
+  strncpy(field->datasetname, datasetname, DATASETNAME_LEN);
+  field->datasetname[DATASETNAME_LEN] = 0;
+
+  field->type_in_memory      = type_in_memory;
+  field->type_in_file_output = type_in_file_output;
+  field->read_flag           = read_flag;
+  field->values_per_block    = values_per_block;
+  field->typelist            = typelist_bitmask;
+#ifdef ALLOW_HDF5_COMPRESSION
+  field->compression_on = compression_on;
+#else
+  field->compression_on = false;
+#endif
+
+  field->array   = array;
+  field->io_func = io_func;
+
+  if(array == A_NONE)
+    {
+      field->offset = 0;
+    }
+  else
+    {
+      field->offset = (size_t)pointer_to_field - (size_t)get_base_address_of_structure(array, 0);
+    }
+
+  field->hasunit = flagunit;
+  field->a       = a;
+  field->h       = h;
+  field->L       = L;
+  field->M       = M;
+  field->V       = V;
+  field->c       = c;
+}
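+
+/* Illustrative registration of a 3-component velocity field stored in P[].Vel; the enum values
+ * FILE_MY_IO_FLOAT and A_P as well as the unit exponents are placeholders standing in for the
+ * definitions in the corresponding headers, not a verbatim call from the code:
+ *
+ *   init_field("VEL ", "Velocities", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT,
+ *              3, A_P, &P[0].Vel[0], NULL, ALL_TYPES,
+ *              1, 0.5, 0.0, 0.0, 0.0, 1.0, 0.0, false);
+ */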
+
+/*! \brief This function determines over how many files a given snapshot or group/desc catalogue is distributed.
+ *
+ *  \param fname file name of the snapshot as given in the parameter file
+ *  \param fname_multiple base file name used when the data is split over several files (the file number is appended)
+ */
+int IO_Def::find_files(const char *fname, const char *fname_multiple)
+{
+  FILE *fd;
+  char buf[200], buf1[200];
+  int dummy, files_found = 0;
+
+  if(file_format == FILEFORMAT_HDF5)
+    {
+      sprintf(buf, "%s.%d.hdf5", fname_multiple, 0);
+      sprintf(buf1, "%s.hdf5", fname);
+    }
+  else
+    {
+      sprintf(buf, "%s.%d", fname_multiple, 0);
+      sprintf(buf1, "%s", fname);
+    }
+
+  memset(header_buf, 0, header_size);
+
+  if(ThisTask == 0)
+    {
+      if((fd = fopen(buf, "r")))
+        {
+          if(file_format == FILEFORMAT_LEGACY1 || file_format == FILEFORMAT_LEGACY2)
+            {
+              if(file_format == FILEFORMAT_LEGACY2)
+                {
+                  my_fread(&dummy, sizeof(dummy), 1, fd);
+                  my_fread(&dummy, sizeof(dummy), 1, fd);
+                  my_fread(&dummy, sizeof(dummy), 1, fd);
+                  my_fread(&dummy, sizeof(dummy), 1, fd);
+                }
+
+              my_fread(&dummy, sizeof(dummy), 1, fd);
+              my_fread(header_buf, header_size, 1, fd);
+              my_fread(&dummy, sizeof(dummy), 1, fd);
+            }
+          fclose(fd);
+
+          if(file_format == FILEFORMAT_HDF5)
+            read_header_fields(buf);
+
+          files_found = 1;
+        }
+    }
+
+  MPI_Bcast(header_buf, header_size, MPI_BYTE, 0, Communicator);
+  MPI_Bcast(&files_found, 1, MPI_INT, 0, Communicator);
+
+  if(get_filenr_from_header() > 0)
+    return get_filenr_from_header();
+
+  if(ThisTask == 0)
+    {
+      if((fd = fopen(buf1, "r")))
+        {
+          if(file_format == FILEFORMAT_LEGACY1 || file_format == FILEFORMAT_LEGACY2)
+            {
+              if(file_format == FILEFORMAT_LEGACY2)
+                {
+                  my_fread(&dummy, sizeof(dummy), 1, fd);
+                  my_fread(&dummy, sizeof(dummy), 1, fd);
+                  my_fread(&dummy, sizeof(dummy), 1, fd);
+                  my_fread(&dummy, sizeof(dummy), 1, fd);
+                }
+
+              my_fread(&dummy, sizeof(dummy), 1, fd);
+              my_fread(header_buf, header_size, 1, fd);
+              my_fread(&dummy, sizeof(dummy), 1, fd);
+            }
+          fclose(fd);
+
+          if(file_format == FILEFORMAT_HDF5)
+            read_header_fields(buf1);
+
+          set_filenr_in_header(1);
+
+          files_found = 1;
+        }
+    }
+
+  MPI_Bcast(header_buf, header_size, MPI_BYTE, 0, Communicator);
+  MPI_Bcast(&files_found, 1, MPI_INT, 0, Communicator);
+
+  if(get_filenr_from_header() > 0)
+    return get_filenr_from_header();
+
+  if(files_found != 0)
+    Terminate("Have found IC files, but number of files in header seems to be zero\n");
+
+  Terminate("\nCan't find files, neither as '%s'\nnor as '%s'\n", buf, buf1);
+
+  return 0;
+}
+
+void IO_Def::read_files_driver(const char *fname, int rep, int num_files)
+{
+  if(rep == 0)
+    {
+      ntype_in_files =
+          (long long *)Mem.mymalloc_movable(&ntype_in_files, "ntype_in_files", num_files * N_DataGroups * sizeof(long long));
+      memset(ntype_in_files, 0, num_files * N_DataGroups * sizeof(long long));
+    }
+
+  void *CommBuffer = Mem.mymalloc("CommBuffer", COMMBUFFERSIZE);
+
+  int rest_files = num_files;
+
+  while(rest_files > NTask)
+    {
+      char buf[MAXLEN_PATH];
+
+      sprintf(buf, "%s.%d", fname, ThisTask + (rest_files - NTask));
+      if(file_format == FILEFORMAT_HDF5)
+        sprintf(buf, "%s.%d.hdf5", fname, ThisTask + (rest_files - NTask));
+
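+      /* split the tasks into contiguous groups of size ngroups; in each round exactly one task per
+         group does its reading, so that at most All.MaxFilesWithConcurrentIO files are accessed at
+         the same time */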
+      int ngroups = NTask / All.MaxFilesWithConcurrentIO;
+      if((NTask % All.MaxFilesWithConcurrentIO))
+        ngroups++;
+      int groupMaster = (ThisTask / ngroups) * ngroups;
+
+      for(int gr = 0; gr < ngroups; gr++)
+        {
+          if(ThisTask == (groupMaster + gr)) /* ok, it's this processor's turn */
+            {
+              if(rep == 0)
+                share_particle_number_in_file(buf, ThisTask + (rest_files - NTask), ThisTask, ThisTask);
+              else
+                read_file(buf, ThisTask + (rest_files - NTask), ThisTask, ThisTask, CommBuffer);
+            }
+          MPI_Barrier(Communicator);
+        }
+
+      rest_files -= NTask;
+    }
+
+  if(rest_files > 0)
+    {
+      int masterTask, filenr, lastTask;
+
+      distribute_file(rest_files, &filenr, &masterTask, &lastTask);
+
+      char buf[MAXLEN_PATH];
+
+      if(num_files > 1)
+        {
+          sprintf(buf, "%s.%d", fname, filenr);
+          if(file_format == FILEFORMAT_HDF5)
+            sprintf(buf, "%s.%d.hdf5", fname, filenr);
+        }
+      else
+        {
+          sprintf(buf, "%s", fname);
+          if(file_format == FILEFORMAT_HDF5)
+            sprintf(buf, "%s.hdf5", fname);
+        }
+
+      int ngroups = rest_files / All.MaxFilesWithConcurrentIO;
+      if((rest_files % All.MaxFilesWithConcurrentIO))
+        ngroups++;
+
+      for(int gr = 0; gr < ngroups; gr++)
+        {
+          if((filenr / All.MaxFilesWithConcurrentIO) == gr) /* ok, it's this processor's turn */
+            {
+              if(rep == 0)
+                share_particle_number_in_file(buf, filenr, masterTask, lastTask);
+              else
+                read_file(buf, filenr, masterTask, lastTask, CommBuffer);
+            }
+          MPI_Barrier(Communicator);
+        }
+    }
+
+  Mem.myfree(CommBuffer);
+
+  if(rep == 0)
+    MPI_Allreduce(MPI_IN_PLACE, ntype_in_files, num_files * N_DataGroups, MPI_LONG_LONG, MPI_MAX, Communicator);
+  else
+    {
+      /* we are done */
+      Mem.myfree_movable(ntype_in_files);
+      ntype_in_files = NULL;
+    }
+}
+
+/*! \brief This function distributes the particle numbers in the file fname
+ *  to tasks 'readTask' to 'lastTask', and calculates the number of particles each task gets.
+ *
+ *  \param fname filename to be read
+ *  \param filenr number of this file within the set of files of the snapshot/catalogue
+ *  \param readTask task responsible for reading the file fname
+ *  \param lastTask last task which gets data contained in the file
+ */
+void IO_Def::share_particle_number_in_file(const char *fname, int filenr, int readTask, int lastTask)
+{
+  long long n_type[N_DataGroups], npart[N_DataGroups];
+  unsigned int blksize1, blksize2;
+
+  if(ThisTask == readTask)
+    {
+      if(file_format == FILEFORMAT_HDF5)
+        {
+          read_header_fields(fname);
+        }
+      else if(file_format == FILEFORMAT_LEGACY1 || file_format == FILEFORMAT_LEGACY2)
+        {
+          FILE *fd = 0;
+
+          if(!(fd = fopen(fname, "r")))
+            Terminate("can't open file `%s' for reading initial conditions.\n", fname);
+
+          if(file_format == FILEFORMAT_LEGACY2)
+            {
+              char label[LABEL_LEN];
+              int nextblock;
+              my_fread(&blksize1, sizeof(int), 1, fd);
+              my_fread(&label, sizeof(char), LABEL_LEN, fd);
+              my_fread(&nextblock, sizeof(int), 1, fd);
+              printf("%s Reading header => '%c%c%c%c' (%d byte)\n", info, label[0], label[1], label[2], label[3], nextblock);
+              my_fread(&blksize2, sizeof(int), 1, fd);
+            }
+
+          my_fread(&blksize1, sizeof(int), 1, fd);
+          my_fread(header_buf, header_size, 1, fd);
+          my_fread(&blksize2, sizeof(int), 1, fd);
+
+#ifdef GADGET2_HEADER
+          if(blksize1 != 256 || blksize2 != 256)
+            Terminate("incorrect GADGET2 header format, blksize1=%d blksize2=%d  header_size=%d\n", blksize1, blksize2,
+                      (int)header_size);
+#else
+          if(blksize1 != blksize2)
+            Terminate("incorrect header format, blksize1=%d blksize2=%d  header_size=%d \n%s \n", blksize1, blksize2, (int)header_size,
+                      blksize1 == 256 ? "You may need to set GADGET2_HEADER" : "");
+#endif
+          fclose(fd);
+        }
+      else
+        Terminate("illegal format");
+
+      for(int task = readTask + 1; task <= lastTask; task++)
+        MPI_Ssend(header_buf, header_size, MPI_BYTE, task, TAG_HEADER, Communicator);
+    }
+  else
+    MPI_Recv(header_buf, header_size, MPI_BYTE, readTask, TAG_HEADER, Communicator, MPI_STATUS_IGNORE);
+
+  read_file_header(fname, filenr, readTask, lastTask, n_type, npart, NULL);
+
+  if(ThisTask == readTask)
+    {
+      mpi_printf("READIC: Reading file `%s' on task=%d and distribute it to %d to %d.\n", fname, ThisTask, readTask, lastTask);
+      myflush(stdout);
+    }
+
+  for(int type = 0; type < N_DataGroups; type++)
+    {
+      ntype_in_files[filenr * N_DataGroups + type] = npart[type];
+
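+      /* distribute the particles of this type as evenly as possible over the reading group: every task
+         gets n_in_file/ntask particles, and the first (n_in_file % ntask) tasks receive one extra */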
+      long long n_in_file = npart[type];
+      int ntask           = lastTask - readTask + 1;
+      int n_for_this_task = n_in_file / ntask;
+      if((ThisTask - readTask) < (n_in_file % ntask))
+        n_for_this_task++;
+
+      read_increase_numbers(type, n_for_this_task);
+    }
+}
+
+/*! \brief This function fills the write buffer with particle data.
+ *
+ *  New output blocks can in principle be added here.
+ *
+ *  \param blocknr ID of the output block (e.g. position, velocities, ...)
+ *  \param startindex pointer containing the offset in the write buffer
+ *  \param pc number of particles to be put in the buffer
+ *  \param type particle type
+ */
+void IO_Def::fill_write_buffer(int blocknr, int *startindex, int pc, int type, void *CommBuffer)
+{
+  if(blocknr < 0 || blocknr >= N_IO_Fields)
+    Terminate("something is wrong here: blocknr=%d N_IO_Fields=%d", blocknr, N_IO_Fields);
+
+  IO_Field *field = &IO_Fields[blocknr];
+
+  int *intp        = (int *)CommBuffer;
+  long long *longp = (long long *)CommBuffer;
+  float *floatp    = (float *)CommBuffer;
+  double *doublep  = (double *)CommBuffer;
+
+  MyIDType *ip          = (MyIDType *)CommBuffer;
+  MyFloat *fp           = (MyFloat *)CommBuffer;
+  MyDouble *dp          = (MyDouble *)CommBuffer;
+  MyIntPosType *intposp = (MyIntPosType *)CommBuffer;
+
+  int pindex = *startindex;
+
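+  /* note that the counter n is only advanced at the end of the loop body for particles of the requested
+     type, while pindex walks over all particles, so elements of other types are skipped without being
+     buffered */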
+  for(int n = 0; n < pc; pindex++)
+    {
+      if(type_of_file == FILE_IS_SNAPSHOT && type < NTYPES && get_type_of_element(pindex) != type)
+        continue;
+#ifdef LIGHTCONE
+      if(type_of_file == FILE_IS_LIGHTCONE && type < NTYPES && get_type_of_element(pindex) != type)
+        continue;
+#endif
+      if(field->io_func)
+        {
+          switch(field->type_in_memory)
+            {
+              case MEM_INT:
+                field->io_func(this, pindex, field->values_per_block, intp, 0);
+                intp += field->values_per_block;
+                break;
+              case MEM_INT64:
+                field->io_func(this, pindex, field->values_per_block, longp, 0);
+                longp += field->values_per_block;
+                break;
+              case MEM_MY_ID_TYPE:
+                field->io_func(this, pindex, field->values_per_block, ip, 0);
+                ip += field->values_per_block;
+                break;
+              case MEM_MY_INTPOS_TYPE:
+                field->io_func(this, pindex, field->values_per_block, intposp, 0);
+                intposp += field->values_per_block;
+                break;
+              case MEM_FLOAT:
+                field->io_func(this, pindex, field->values_per_block, floatp, 0);
+                floatp += field->values_per_block;
+                break;
+              case MEM_DOUBLE:
+                field->io_func(this, pindex, field->values_per_block, doublep, 0);
+                doublep += field->values_per_block;
+                break;
+              case MEM_MY_FLOAT:
+                field->io_func(this, pindex, field->values_per_block, fp, 0);
+                fp += field->values_per_block;
+                break;
+              case MEM_MY_DOUBLE:
+                field->io_func(this, pindex, field->values_per_block, dp, 0);
+                dp += field->values_per_block;
+                break;
+              default:
+                Terminate("ERROR in fill_write_buffer: Type not found!\n");
+
+                break;
+            }
+        }
+      else
+        {
+          void *array_pos;
+
+          switch(field->array)
+            {
+              case A_NONE:
+                array_pos = NULL;
+                break;
+
+              default:
+                array_pos = get_base_address_of_structure(field->array, pindex);
+                break;
+            }
+
+          for(int k = 0; k < field->values_per_block; k++)
+            {
+              switch(field->type_in_memory)
+                {
+                  case MEM_INT:
+                    *intp++ = *((int *)((size_t)array_pos + field->offset + k * sizeof(int)));
+                    break;
+
+                  case MEM_INT64:
+                    *longp++ = *((long long *)((size_t)array_pos + field->offset + k * sizeof(long long)));
+                    break;
+
+                  case MEM_MY_ID_TYPE:
+                    *ip++ = *((MyIDType *)((size_t)array_pos + field->offset + k * sizeof(MyIDType)));
+                    break;
+
+                  case MEM_MY_INTPOS_TYPE:
+                    *intposp++ = *((MyIntPosType *)((size_t)array_pos + field->offset + k * sizeof(MyIntPosType)));
+                    break;
+
+                  case MEM_FLOAT:
+                    *floatp++ = *((float *)((size_t)array_pos + field->offset + k * sizeof(float)));
+                    break;
+
+                  case MEM_DOUBLE:
+                    *doublep++ = *((double *)((size_t)array_pos + field->offset + k * sizeof(double)));
+                    break;
+
+                  case MEM_MY_FLOAT:
+                    *fp++ = *((MyFloat *)((size_t)array_pos + field->offset + k * sizeof(MyFloat)));
+                    break;
+
+                  case MEM_MY_DOUBLE:
+                    *dp++ = *((MyDouble *)((size_t)array_pos + field->offset + k * sizeof(MyDouble)));
+                    break;
+
+                  default:
+                    Terminate("ERROR in fill_write_buffer: Type not found!\n");
+                    break;
+                }
+            }
+        }
+
+      n++;
+    }
+
+  *startindex = pindex;
+}
+
+/*! \brief This function reads out the io buffer that was filled with particle data.
+ *
+ * The data in the io buffer is put in the appropriate places of the particle structures.
+ *
+ * \param blocknr data block present in io buffer
+ * \param offset particle corresponding to the first element in io buffer
+ * \param pc number of elements in the io buffer
+ * \param type If blocknr=IO_POS P[n].Type is set to type
+ */
+void IO_Def::empty_read_buffer(int blocknr, int offset, int pc, int type, long long nprevious, void *CommBuffer)
+{
+  IO_Field *field = &IO_Fields[blocknr];
+
+  int *intp        = (int *)CommBuffer;
+  long long *longp = (long long *)CommBuffer;
+  float *floatp    = (float *)CommBuffer;
+  double *doublep  = (double *)CommBuffer;
+
+  MyIDType *ip          = (MyIDType *)CommBuffer;
+  MyFloat *fp           = (MyFloat *)CommBuffer;
+  MyDouble *dp          = (MyDouble *)CommBuffer;
+  MyIntPosType *intposp = (MyIntPosType *)CommBuffer;
+
+  if(field->read_flag != SKIP_ON_READ || field->type_in_memory == MEM_MY_FILEOFFSET)
+    {
+      for(int n = 0; n < pc; n++)
+        {
+          if(field->io_func)
+            {
+              switch(field->type_in_memory)
+                {
+                  case MEM_INT:
+                    field->io_func(this, offset + n, field->values_per_block, intp, 1);
+                    intp += field->values_per_block;
+                    break;
+                  case MEM_INT64:
+                    field->io_func(this, offset + n, field->values_per_block, longp, 1);
+                    longp += field->values_per_block;
+                    break;
+                  case MEM_MY_ID_TYPE:
+                    field->io_func(this, offset + n, field->values_per_block, ip, 1);
+                    ip += field->values_per_block;
+                    break;
+                  case MEM_MY_INTPOS_TYPE:
+                    field->io_func(this, offset + n, field->values_per_block, intposp, 1);
+                    intposp += field->values_per_block;
+                    break;
+                  case MEM_FLOAT:
+                    field->io_func(this, offset + n, field->values_per_block, floatp, 1);
+                    floatp += field->values_per_block;
+                    break;
+                  case MEM_DOUBLE:
+                    field->io_func(this, offset + n, field->values_per_block, doublep, 1);
+                    doublep += field->values_per_block;
+                    break;
+                  case MEM_MY_FLOAT:
+                    field->io_func(this, offset + n, field->values_per_block, fp, 1);
+                    fp += field->values_per_block;
+                    break;
+                  case MEM_MY_DOUBLE:
+                    field->io_func(this, offset + n, field->values_per_block, dp, 1);
+                    dp += field->values_per_block;
+                    break;
+                  case MEM_MY_FILEOFFSET:
+                    Terminate("undefined");
+                    break;
+                }
+            }
+          else
+            {
+              void *array_pos;
+              switch(field->array)
+                {
+                  case A_NONE:
+                    array_pos = 0;
+                    break;
+
+                  default:
+                    array_pos = get_base_address_of_structure(field->array, offset + n);
+                    break;
+                }
+
+              for(int k = 0; k < field->values_per_block; k++)
+                {
+                  switch(field->type_in_memory)
+                    {
+                      case MEM_INT:
+                        *((int *)((size_t)array_pos + field->offset + k * sizeof(int))) = *intp++;
+                        break;
+                      case MEM_INT64:
+                        *((long long *)((size_t)array_pos + field->offset + k * sizeof(long long))) = *longp++;
+                        break;
+                      case MEM_MY_ID_TYPE:
+                        *((MyIDType *)((size_t)array_pos + field->offset + k * sizeof(MyIDType))) = *ip++;
+                        break;
+                      case MEM_MY_INTPOS_TYPE:
+                        *((MyIntPosType *)((size_t)array_pos + field->offset + k * sizeof(MyIntPosType))) = *intposp++;
+                        break;
+                      case MEM_FLOAT:
+                        *((float *)((size_t)array_pos + field->offset + k * sizeof(float))) = *floatp++;
+                        break;
+                      case MEM_DOUBLE:
+                        *((double *)((size_t)array_pos + field->offset + k * sizeof(double))) = *doublep++;
+                        break;
+                      case MEM_MY_FLOAT:
+                        *((MyFloat *)((size_t)array_pos + field->offset + k * sizeof(MyFloat))) = *fp++;
+                        break;
+                      case MEM_MY_DOUBLE:
+                        *((MyDouble *)((size_t)array_pos + field->offset + k * sizeof(MyDouble))) = *dp++;
+                        break;
+                      case MEM_MY_FILEOFFSET:
+                        *((long long *)((size_t)array_pos + field->offset + k * sizeof(long long))) = nprevious++;
+                        break;
+                      default:
+                        Terminate("ERROR: Type not found!\n");
+                        break;
+                    }
+                }
+            }
+        }
+    }
+}
+
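+/*! \brief Let the master task schedule the concurrently written files.
+ *
+ *  While a dump is in progress, task 0 probes for completion messages from writing
+ *  groups that have finished their file and, as long as not all files have been
+ *  started, hands the freed slot to the next group master in the sorted seq list.
+ *  In this way at most All.MaxFilesWithConcurrentIO files are written at the same time.
+ */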
+void IO_Def::polling(int numfilesperdump)
+{
+  if(ThisTask == 0)
+    if(files_completed < numfilesperdump)
+      {
+        MPI_Status status;
+        int flag;
+
+        /* now check for a completion message  */
+        MPI_Iprobe(MPI_ANY_SOURCE, TAG_KEY, Communicator, &flag, &status);
+
+        if(flag)
+          {
+            int source = status.MPI_SOURCE;
+
+            int dummy;
+            MPI_Recv(&dummy, 1, MPI_INT, source, TAG_KEY, Communicator, MPI_STATUS_IGNORE);
+            files_completed++;
+
+            if(files_started < numfilesperdump)
+              {
+                /* send start signal */
+                MPI_Ssend(&ThisTask, 1, MPI_INT, seq[files_started++].thistask, TAG_N, Communicator);
+              }
+          }
+      }
+}
+
+/* driver routine for outputting multiple files, scheduled in optimal order under the constraint not to write more than a certain
+ * number of files simultaneously */
+void IO_Def::write_multiple_files(char *fname, int numfilesperdump, int append_flag, int chunksize)
+{
+  if(ThisTask == 0)
+    if(!(seq = (seq_data *)Mem.mymalloc("seq", NTask * sizeof(seq_data))))
+      Terminate("can't allocate seq_data");
+
+  void *CommBuffer = Mem.mymalloc("CommBuffer", COMMBUFFERSIZE);
+
+  /* assign processors to output files */
+  int filenr, masterTask, lastTask;
+  distribute_file(numfilesperdump, &filenr, &masterTask, &lastTask);
+
+  char buf[MAXLEN_PATH];
+  if(numfilesperdump > 1)
+    sprintf(buf, "%s.%d", fname, filenr);
+  else
+    sprintf(buf, "%s", fname);
+
+  seq_data seq_loc;
+  seq_loc.thistask   = ThisTask;
+  seq_loc.rankinnode = RankInThisNode;
+  seq_loc.thisnode   = ThisNode;
+
+  if(masterTask != ThisTask)
+    seq_loc.thistask = -1;
+
+  MPI_Gather(&seq_loc, sizeof(seq_data), MPI_BYTE, seq, sizeof(seq_data), MPI_BYTE, 0, Communicator);
+
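+  /* compact the gathered list so that only the group masters remain (one entry per
+     output file) and sort them to fix the order in which start signals are handed out */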
+  if(ThisTask == 0)
+    {
+      int count = NTask;
+      for(int i = 0; i < count; i++)
+        {
+          if(seq[i].thistask < 0)
+            {
+              count--;
+              seq[i] = seq[count];
+              i--;
+            }
+        }
+      if(count != numfilesperdump)
+        Terminate("count=%d != numfilesperdump=%d", count, numfilesperdump);
+
+      std::sort(seq, seq + numfilesperdump);
+
+      files_started   = 0;
+      files_completed = 0;
+
+      for(int i = 1; i < std::min<int>(All.MaxFilesWithConcurrentIO, numfilesperdump); i++)
+        {
+          files_started++;
+          MPI_Ssend(&ThisTask, 1, MPI_INT, seq[i].thistask, TAG_N, Communicator);
+        }
+
+      files_started++;
+      if(append_flag)
+        append_file(buf, masterTask, lastTask, CommBuffer, numfilesperdump, chunksize);
+      else
+        write_file(buf, masterTask, lastTask, CommBuffer, numfilesperdump, chunksize);
+      files_completed++;
+
+      if(files_started < numfilesperdump)
+        {
+          /* send start signal */
+          MPI_Ssend(&ThisTask, 1, MPI_INT, seq[files_started++].thistask, TAG_N, Communicator);
+        }
+
+      while(files_completed < numfilesperdump)
+        polling(numfilesperdump);
+    }
+  else if(masterTask == ThisTask)
+    {
+      /* wait for start signal */
+      int dummy;
+      MPI_Recv(&dummy, 1, MPI_INT, 0, TAG_N, Communicator, MPI_STATUS_IGNORE); /* wait until we are told to start */
+
+      if(append_flag)
+        append_file(buf, masterTask, lastTask, CommBuffer, numfilesperdump, chunksize);
+      else
+        write_file(buf, masterTask, lastTask, CommBuffer, numfilesperdump, chunksize);
+
+      /* send back completion notice */
+      MPI_Ssend(&ThisTask, 1, MPI_INT, 0, TAG_KEY, Communicator);
+    }
+  else
+    {
+      if(append_flag)
+        append_file(buf, masterTask, lastTask, CommBuffer, numfilesperdump, chunksize);
+      else
+        write_file(buf, masterTask, lastTask, CommBuffer, numfilesperdump, chunksize);
+    }
+
+  Mem.myfree(CommBuffer);
+
+  if(ThisTask == 0)
+    Mem.myfree(seq);
+}
+
+/*! \brief Actually write the snapshot file to the disk
+ *
+ *  This function writes a snapshot file containing the data from processors
+ *  'writeTask' to 'lastTask'. 'writeTask' is the one that actually writes.
+ *  Each snapshot file contains a header first, then particle positions,
+ *  velocities and ID's.  Then particle masses are written for those particle
+ *  types with zero entry in MassTable.  After that, first the internal
+ *  energies u, and then the density is written for the SPH particles.  If
+ *  cooling is enabled, mean molecular weight and neutral hydrogen abundance
+ *  are written for the gas particles. This is followed by the SPH smoothing
+ *  length and further blocks of information, depending on included physics
+ *  and compile-time flags.
+ *
+ *  \param fname string containing the file name
+ *  \param writeTask the rank of the task in the writing group that is responsible
+ *         for the output operations
+ *  \param lastTask the rank of the last task in a writing group
+ *  \param CommBuffer communication buffer used to collect the data from the other tasks
+ *  \param numfilesperdump total number of files written for this dump
+ *  \param chunksize if positive, HDF5 datasets are created with this chunk size
+ *
+ */
+void IO_Def::write_file(char *fname, int writeTask, int lastTask, void *CommBuffer, int numfilesperdump, int chunksize)
+{
+  int typelist[N_DataGroups];
+  long long n_type[N_DataGroups], npart[N_DataGroups];
+  char label[LABEL_LEN + 1];
+  unsigned int blksize, bytes_per_blockelement_in_file = 0;
+  FILE *fd        = 0;
+  hid_t hdf5_file = 0, hdf5_grp[N_DataGroups], hdf5_headergrp = 0, hdf5_dataspace_memory;
+  hid_t hdf5_dataspace_in_file = 0, hdf5_dataset = 0, hdf5_prop = 0;
+  hsize_t dims[2], count[2], start[2];
+  int rank = 0, pcsum = 0;
+  hid_t hdf5_paramsgrp = 0;
+  hid_t hdf5_configgrp = 0;
+
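+/* SKIP writes one 4-byte block-size marker; the legacy (Fortran-style) file formats
+   bracket every data block with such markers */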
+#define SKIP                                 \
+  {                                          \
+    my_fwrite(&blksize, sizeof(int), 1, fd); \
+  }
+
+  fill_file_header(writeTask, lastTask, n_type, npart);
+
+  /* open file and write header */
+  if(ThisTask == writeTask)
+    {
+      if(file_format == FILEFORMAT_HDF5)
+        {
+          char buf[MAXLEN_PATH];
+          sprintf(buf, "%s.hdf5", fname);
+          mpi_printf("%s file: '%s' (file 1 of %d)\n", info, fname, numfilesperdump);
+
+          rename_file_to_bak_if_it_exists(buf);
+
+          hdf5_file = my_H5Fcreate(buf, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
+
+          hdf5_headergrp = my_H5Gcreate(hdf5_file, "/Header", 0);
+          write_header_fields(hdf5_headergrp);
+
+          hdf5_paramsgrp = my_H5Gcreate(hdf5_file, "/Parameters", 0);
+          write_parameters_attributes_in_hdf5(hdf5_paramsgrp);
+
+          hdf5_configgrp = my_H5Gcreate(hdf5_file, "/Config", 0);
+          write_compile_time_options_in_hdf5(hdf5_configgrp);
+
+          for(int type = 0; type < N_DataGroups; type++)
+            {
+              if(npart[type] > 0)
+                {
+                  get_datagroup_name(type, buf);
+                  hdf5_grp[type] = my_H5Gcreate(hdf5_file, buf, 0);
+                }
+            }
+        }
+      else
+        {
+          rename_file_to_bak_if_it_exists(fname);
+
+          if(!(fd = fopen(fname, "w")))
+            Terminate("can't open file `%s' for writing.\n", fname);
+
+          mpi_printf("%s file: '%s' (file 1 of %d)\n", info, fname, numfilesperdump);
+
+          if(file_format == FILEFORMAT_LEGACY2)
+            {
+              blksize = sizeof(int) + 4 * sizeof(char);
+              SKIP;
+              my_fwrite((const void *)"HEAD", sizeof(char), 4, fd);
+              int nextblock = header_size + 2 * sizeof(int);
+              my_fwrite(&nextblock, sizeof(int), 1, fd);
+              SKIP;
+            }
+
+          blksize = header_size;
+          SKIP;
+          my_fwrite(header_buf, header_size, 1, fd);
+          SKIP;
+        }
+    }
+
+  for(int blocknr = 0; blocknr < N_IO_Fields; blocknr++)
+    {
+      polling(numfilesperdump);
+
+      if(IO_Fields[blocknr].type_in_file_output != FILE_NONE)
+        {
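+          /* blockmaxlen is the number of particles whose data for this block fits
+             into the communication buffer in one piece */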
+          unsigned int bytes_per_blockelement = get_bytes_per_memory_blockelement(blocknr, 0);
+          int blockmaxlen                     = (int)(COMMBUFFERSIZE / bytes_per_blockelement);
+          long long npart_in_block            = get_particles_in_block(blocknr, npart, typelist);
+          hid_t hdf5_memory_datatype          = get_hdf5_memorytype_of_block(blocknr);
+          char dname[MAXLEN_PATH];
+          get_dataset_name(blocknr, dname);
+
+          if(npart_in_block > 0)
+            {
+              mpi_printf("%s block %d (%s)...\n", info, blocknr, dname);
+
+              if(ThisTask == writeTask)
+                {
+                  if(file_format == FILEFORMAT_LEGACY1 || file_format == FILEFORMAT_LEGACY2)
+                    {
+                      if(file_format == FILEFORMAT_LEGACY2)
+                        {
+                          blksize = sizeof(int) + LABEL_LEN * sizeof(char);
+                          SKIP;
+                          get_Tab_IO_Label(blocknr, label);
+                          my_fwrite(label, sizeof(char), LABEL_LEN, fd);
+                          int nextblock = npart_in_block * bytes_per_blockelement + 2 * sizeof(int);
+                          my_fwrite(&nextblock, sizeof(int), 1, fd);
+                          SKIP;
+                        }
+
+                      bytes_per_blockelement_in_file =
+                          IO_Fields[blocknr].values_per_block * H5Tget_size(get_hdf5_outputtype_of_block(blocknr));
+
+                      blksize = npart_in_block * bytes_per_blockelement_in_file;
+                      SKIP;
+                    }
+                }
+
+              for(int type = 0; type < N_DataGroups; type++)
+                {
+                  if(typelist[type])
+                    {
+                      if(ThisTask == writeTask && file_format == FILEFORMAT_HDF5 && npart[type] > 0)
+                        {
+                          hid_t hdf5_file_datatype = get_hdf5_outputtype_of_block(blocknr);
+
+                          dims[0] = npart[type];
+                          dims[1] = get_values_per_blockelement(blocknr);
+                          if(dims[1] == 1)
+                            rank = 1;
+                          else
+                            rank = 2;
+
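+                          /* chunked datasets are created with an unlimited first dimension
+                             so that append_file() can extend them later; alternatively,
+                             per-field gzip compression (with byte shuffling) can be enabled */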
+                          if(chunksize > 0)
+                            {
+                              hsize_t maxdims[2]     = {H5S_UNLIMITED, dims[1]};
+                              hdf5_dataspace_in_file = my_H5Screate_simple(rank, dims, maxdims);
+
+                              /* Modify dataset creation properties, i.e. enable chunking  */
+                              hdf5_prop             = H5Pcreate(H5P_DATASET_CREATE);
+                              hsize_t chunk_dims[2] = {0, dims[1]};
+                              chunk_dims[0]         = chunksize;
+                              H5Pset_chunk(hdf5_prop, rank, chunk_dims);
+
+                              hdf5_dataset =
+                                  my_H5Dcreate(hdf5_grp[type], dname, hdf5_file_datatype, hdf5_dataspace_in_file, hdf5_prop);
+                            }
+                          else if(IO_Fields[blocknr].compression_on)
+                            {
+                              hdf5_dataspace_in_file = my_H5Screate_simple(rank, dims, NULL);
+
+                              /* Modify dataset creation properties, i.e. enable compression  */
+                              hdf5_prop             = H5Pcreate(H5P_DATASET_CREATE);
+                              hsize_t chunk_dims[2] = {COMPRESSION_CHUNKSIZE, dims[1]};
+                              if(chunk_dims[0] > dims[0])
+                                chunk_dims[0] = dims[0];
+                              H5Pset_chunk(hdf5_prop, rank, chunk_dims); /* set chunk size */
+                              H5Pset_shuffle(hdf5_prop);                 /* reshuffle bytes to get better compression ratio */
+                              H5Pset_deflate(hdf5_prop, 9);              /* gzip compression level 9 */
+                              if(H5Pall_filters_avail(hdf5_prop))
+                                hdf5_dataset =
+                                    my_H5Dcreate(hdf5_grp[type], dname, hdf5_file_datatype, hdf5_dataspace_in_file, hdf5_prop);
+                              else
+                                Terminate("HDF5: Compression not available!\n");
+                            }
+                          else
+                            {
+                              hdf5_dataspace_in_file = my_H5Screate_simple(rank, dims, NULL);
+                              hdf5_dataset =
+                                  my_H5Dcreate(hdf5_grp[type], dname, hdf5_file_datatype, hdf5_dataspace_in_file, H5P_DEFAULT);
+                            }
+
+                          write_dataset_attributes(hdf5_dataset, blocknr);
+
+                          byte_count += dims[0] * dims[1] * my_H5Tget_size(hdf5_file_datatype); /* for I/O performance measurement */
+
+                          pcsum = 0;
+                        }
+
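+                      /* each task of the writing group in turn fills CommBuffer with at most
+                         blockmaxlen particles and ships it to writeTask, which writes the
+                         pieces to the file one after the other */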
+                      for(int task = writeTask, offset = 0; task <= lastTask; task++)
+                        {
+                          int n_for_this_task;
+
+                          if(task == ThisTask)
+                            {
+                              n_for_this_task = n_type[type];
+
+                              for(int p = writeTask; p <= lastTask; p++)
+                                if(p != ThisTask)
+                                  MPI_Send(&n_for_this_task, 1, MPI_INT, p, TAG_NFORTHISTASK, Communicator);
+                            }
+                          else
+                            MPI_Recv(&n_for_this_task, 1, MPI_INT, task, TAG_NFORTHISTASK, Communicator, MPI_STATUS_IGNORE);
+
+                          while(n_for_this_task > 0)
+                            {
+                              int pc = n_for_this_task;
+
+                              if(pc > blockmaxlen)
+                                pc = blockmaxlen;
+
+                              if(ThisTask == task)
+                                fill_write_buffer(blocknr, &offset, pc, type, CommBuffer);
+
+                              if(ThisTask == writeTask && task != writeTask)
+                                MPI_Recv(CommBuffer, bytes_per_blockelement * pc, MPI_BYTE, task, TAG_PDATA, Communicator,
+                                         MPI_STATUS_IGNORE);
+
+                              if(ThisTask != writeTask && task == ThisTask)
+                                MPI_Ssend(CommBuffer, bytes_per_blockelement * pc, MPI_BYTE, writeTask, TAG_PDATA, Communicator);
+
+                              if(ThisTask == writeTask)
+                                {
+                                  if(file_format == FILEFORMAT_HDF5)
+                                    {
+                                      start[0] = pcsum;
+                                      start[1] = 0;
+
+                                      count[0] = pc;
+                                      count[1] = get_values_per_blockelement(blocknr);
+                                      pcsum += pc;
+
+                                      my_H5Sselect_hyperslab(hdf5_dataspace_in_file, H5S_SELECT_SET, start, NULL, count, NULL);
+
+                                      dims[0]               = pc;
+                                      dims[1]               = get_values_per_blockelement(blocknr);
+                                      hdf5_dataspace_memory = my_H5Screate_simple(rank, dims, NULL);
+
+                                      my_H5Dwrite(hdf5_dataset, hdf5_memory_datatype, hdf5_dataspace_memory, hdf5_dataspace_in_file,
+                                                  H5P_DEFAULT, CommBuffer, dname);
+
+                                      my_H5Sclose(hdf5_dataspace_memory, H5S_SIMPLE);
+                                    }
+                                  else
+                                    {
+                                      if(bytes_per_blockelement_in_file != bytes_per_blockelement)
+                                        {
+                                          char *CommAuxBuffer =
+                                              (char *)Mem.mymalloc("CommAuxBuffer", bytes_per_blockelement_in_file * pc);
+                                          type_cast_data((char *)CommBuffer, bytes_per_blockelement, (char *)CommAuxBuffer,
+                                                         bytes_per_blockelement_in_file, pc, blocknr);
+                                          my_fwrite(CommAuxBuffer, bytes_per_blockelement_in_file, pc, fd);
+                                          Mem.myfree(CommAuxBuffer);
+                                        }
+                                      else
+                                        my_fwrite(CommBuffer, bytes_per_blockelement, pc, fd);
+                                    }
+                                }
+
+                              n_for_this_task -= pc;
+                            }
+                        }
+
+                      if(ThisTask == writeTask && file_format == FILEFORMAT_HDF5 && npart[type] > 0)
+                        {
+                          my_H5Dclose(hdf5_dataset, dname);
+                          if(chunksize > 0 || IO_Fields[blocknr].compression_on)
+                            my_H5Pclose(hdf5_prop);
+                          my_H5Sclose(hdf5_dataspace_in_file, H5S_SIMPLE);
+                        }
+                    }
+                }
+
+              if(ThisTask == writeTask)
+                {
+                  if(file_format == FILEFORMAT_LEGACY1 || file_format == FILEFORMAT_LEGACY2)
+                    SKIP;
+                }
+            }
+        }
+    }
+
+  if(ThisTask == writeTask)
+    {
+      if(file_format == FILEFORMAT_HDF5)
+        {
+          char buf[MAXLEN_PATH];
+
+          for(int type = N_DataGroups - 1; type >= 0; type--)
+            if(npart[type] > 0)
+              {
+                get_datagroup_name(type, buf);
+                my_H5Gclose(hdf5_grp[type], buf);
+              }
+
+          my_H5Gclose(hdf5_headergrp, "/Header");
+          my_H5Gclose(hdf5_paramsgrp, "/Parameters");
+          my_H5Gclose(hdf5_configgrp, "/Config");
+
+          sprintf(buf, "%s.hdf5", fname);
+          my_H5Fclose(hdf5_file, buf);
+        }
+      else
+        fclose(fd);
+    }
+}
+
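+/*! \brief Append the data of this writing group to an already existing HDF5 file.
+ *
+ *  The numbers of particles already contained in the file are determined first and
+ *  remembered in n_previous, the header is updated, and the datasets are either
+ *  created (if a particle type was not present before) or extended so that the new
+ *  data is written behind the old entries. Only the HDF5 format supports this mode.
+ */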
+void IO_Def::append_file(char *fname, int writeTask, int lastTask, void *CommBuffer, int numfilesperdump, int chunksize)
+{
+  int typelist[N_DataGroups];
+  long long n_type[N_DataGroups], npart[N_DataGroups], n_previous[N_DataGroups];
+  hid_t hdf5_file = 0, hdf5_grp[N_DataGroups], hdf5_headergrp = 0, hdf5_dataspace_memory;
+  hid_t hdf5_dataspace_in_file = 0, hdf5_dataset = 0;
+  hsize_t dims[2], count[2], start[2];
+  int rank = 0, pcsum = 0;
+
+  if(file_format != FILEFORMAT_HDF5)
+    Terminate("appending to files only works with HDF5 format\n");
+
+  read_file_header(NULL, 0, writeTask, lastTask, n_type, npart, NULL);
+
+  for(int n = 0; n < N_DataGroups; n++)
+    n_previous[n] = n_type[n];
+
+  fill_file_header(writeTask, lastTask, n_type, npart);
+
+  /* open file and write header */
+  if(ThisTask == writeTask)
+    {
+      char buf[MAXLEN_PATH];
+      sprintf(buf, "%s.hdf5", fname);
+
+      hdf5_file = my_H5Fopen(buf, H5F_ACC_RDWR, H5P_DEFAULT);
+
+      hdf5_headergrp = my_H5Gopen(hdf5_file, "/Header");
+      write_header_fields(hdf5_headergrp);
+
+      for(int type = 0; type < N_DataGroups; type++)
+        {
+          if(npart[type] > 0)
+            {
+              get_datagroup_name(type, buf);
+              if(n_previous[type] == 0)
+                hdf5_grp[type] = my_H5Gcreate(hdf5_file, buf, 0);
+              else
+                hdf5_grp[type] = my_H5Gopen(hdf5_file, buf);
+            }
+        }
+    }
+
+  for(int blocknr = 0; blocknr < N_IO_Fields; blocknr++)
+    {
+      polling(numfilesperdump);
+
+      if(IO_Fields[blocknr].type_in_file_output != FILE_NONE)
+        {
+          unsigned int bytes_per_blockelement = get_bytes_per_memory_blockelement(blocknr, 0);
+          int blockmaxlen                     = (int)(COMMBUFFERSIZE / bytes_per_blockelement);
+          long long npart_in_block            = get_particles_in_block(blocknr, npart, typelist);
+          hid_t hdf5_memory_datatype          = get_hdf5_memorytype_of_block(blocknr);
+          char dname[MAXLEN_PATH];
+          get_dataset_name(blocknr, dname);
+
+          if(npart_in_block > 0)
+            {
+              mpi_printf("%s block %d (%s)...\n", info, blocknr, dname);
+
+              for(int type = 0; type < N_DataGroups; type++)
+                {
+                  if(typelist[type])
+                    {
+                      if(ThisTask == writeTask && file_format == FILEFORMAT_HDF5 && npart[type] > 0)
+                        {
+                          hid_t hdf5_file_datatype = get_hdf5_outputtype_of_block(blocknr);
+
+                          dims[0] = npart[type] + n_previous[type];
+                          dims[1] = get_values_per_blockelement(blocknr);
+                          if(dims[1] == 1)
+                            rank = 1;
+                          else
+                            rank = 2;
+
+                          hdf5_dataspace_in_file = my_H5Screate_simple(rank, dims, NULL);
+
+                          if(n_previous[type] == 0)
+                            {
+                              if(chunksize > 0)
+                                {
+                                  hsize_t maxdims[2]     = {H5S_UNLIMITED, dims[1]};
+                                  hdf5_dataspace_in_file = my_H5Screate_simple(rank, dims, maxdims);
+
+                                  /* Modify dataset creation properties, i.e. enable chunking  */
+                                  hid_t prop            = H5Pcreate(H5P_DATASET_CREATE);
+                                  hsize_t chunk_dims[2] = {0, dims[1]};
+                                  chunk_dims[0]         = chunksize;
+                                  H5Pset_chunk(prop, rank, chunk_dims);
+
+                                  hdf5_dataset = my_H5Dcreate(hdf5_grp[type], dname, hdf5_file_datatype, hdf5_dataspace_in_file, prop);
+                                }
+                              else
+                                {
+                                  hdf5_dataset =
+                                      my_H5Dcreate(hdf5_grp[type], dname, hdf5_file_datatype, hdf5_dataspace_in_file, H5P_DEFAULT);
+                                  write_dataset_attributes(hdf5_dataset, blocknr);
+                                }
+                            }
+                          else
+                            {
+                              hdf5_dataset = my_H5Dopen_if_existing(hdf5_grp[type], dname);
+                              my_H5Dset_extent(hdf5_dataset, dims);
+                            }
+
+                          byte_count += dims[0] * dims[1] * my_H5Tget_size(hdf5_file_datatype); /* for I/O performance measurement */
+
+                          pcsum = 0;
+                        }
+
+                      for(int task = writeTask, offset = 0; task <= lastTask; task++)
+                        {
+                          int n_for_this_task;
+
+                          if(task == ThisTask)
+                            {
+                              n_for_this_task = n_type[type];
+
+                              for(int p = writeTask; p <= lastTask; p++)
+                                if(p != ThisTask)
+                                  MPI_Send(&n_for_this_task, 1, MPI_INT, p, TAG_NFORTHISTASK, Communicator);
+                            }
+                          else
+                            MPI_Recv(&n_for_this_task, 1, MPI_INT, task, TAG_NFORTHISTASK, Communicator, MPI_STATUS_IGNORE);
+
+                          while(n_for_this_task > 0)
+                            {
+                              int pc = n_for_this_task;
+
+                              if(pc > blockmaxlen)
+                                pc = blockmaxlen;
+
+                              if(ThisTask == task)
+                                fill_write_buffer(blocknr, &offset, pc, type, CommBuffer);
+
+                              if(ThisTask == writeTask && task != writeTask)
+                                MPI_Recv(CommBuffer, bytes_per_blockelement * pc, MPI_BYTE, task, TAG_PDATA, Communicator,
+                                         MPI_STATUS_IGNORE);
+
+                              if(ThisTask != writeTask && task == ThisTask)
+                                MPI_Ssend(CommBuffer, bytes_per_blockelement * pc, MPI_BYTE, writeTask, TAG_PDATA, Communicator);
+
+                              if(ThisTask == writeTask)
+                                {
+                                  start[0] = pcsum + n_previous[type];
+                                  start[1] = 0;
+
+                                  count[0] = pc;
+                                  count[1] = get_values_per_blockelement(blocknr);
+                                  pcsum += pc;
+
+                                  my_H5Sselect_hyperslab(hdf5_dataspace_in_file, H5S_SELECT_SET, start, NULL, count, NULL);
+
+                                  dims[0]               = pc;
+                                  dims[1]               = get_values_per_blockelement(blocknr);
+                                  hdf5_dataspace_memory = my_H5Screate_simple(rank, dims, NULL);
+
+                                  my_H5Dwrite(hdf5_dataset, hdf5_memory_datatype, hdf5_dataspace_memory, hdf5_dataspace_in_file,
+                                              H5P_DEFAULT, CommBuffer, dname);
+
+                                  my_H5Sclose(hdf5_dataspace_memory, H5S_SIMPLE);
+                                }
+
+                              n_for_this_task -= pc;
+                            }
+                        }
+
+                      if(ThisTask == writeTask && npart[type] > 0)
+                        {
+                          my_H5Dclose(hdf5_dataset, dname);
+                          my_H5Sclose(hdf5_dataspace_in_file, H5S_SIMPLE);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+  if(ThisTask == writeTask)
+    {
+      char buf[MAXLEN_PATH];
+
+      for(int type = N_DataGroups - 1; type >= 0; type--)
+        if(npart[type] > 0)
+          {
+            get_datagroup_name(type, buf);
+            my_H5Gclose(hdf5_grp[type], buf);
+          }
+
+      my_H5Gclose(hdf5_headergrp, "/Header");
+
+      sprintf(buf, "%s.hdf5", fname);
+      my_H5Fclose(hdf5_file, buf);
+    }
+}
+
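+/*! \brief Convert a buffer element-wise between in-memory and in-file representations.
+ *
+ *  Integer-type fields are widened or narrowed between 4-byte and 8-byte variants,
+ *  floating-point fields between half, float and double precision, with the conversion
+ *  chosen purely from the per-element byte counts of source and target.
+ */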
+void IO_Def::type_cast_data(char *src, int src_bytes_per_element, char *target, int target_bytes_per_element, int len, int blocknr)
+{
+  switch(IO_Fields[blocknr].type_in_memory)
+    {
+      case MEM_INT:
+      case MEM_INT64:
+      case MEM_MY_ID_TYPE:
+      case MEM_MY_INTPOS_TYPE:
+      case MEM_MY_FILEOFFSET:
+        if(target_bytes_per_element > src_bytes_per_element)
+          {
+            if(target_bytes_per_element != 2 * src_bytes_per_element)
+              Terminate("something is odd here: target_bytes_per_element=%d != 2 * src_bytes_per_element=%d", target_bytes_per_element,
+                        src_bytes_per_element);
+
+            int fac       = src_bytes_per_element / sizeof(int);
+            int *sp       = (int *)src;
+            long long *tp = (long long *)target;
+            for(int i = 0; i < len * fac; i++)
+              *tp++ = *sp++;
+          }
+        else
+          {
+            if(src_bytes_per_element != 2 * target_bytes_per_element)
+              Terminate("something is odd here: src_bytes_per_element=%d != 2 * target_bytes_per_element=%d", src_bytes_per_element,
+                        target_bytes_per_element);
+            int fac       = src_bytes_per_element / sizeof(long long);
+            long long *sp = (long long *)src;
+            int *tp       = (int *)target;
+            for(int i = 0; i < len * fac; i++)
+              *tp++ = *sp++;
+          }
+        break;
+
+      case MEM_FLOAT:
+      case MEM_DOUBLE:
+      case MEM_MY_FLOAT:
+      case MEM_MY_DOUBLE:
+        if((target_bytes_per_element % 8) == 0 &&
+           (src_bytes_per_element % 4) == 0)  // target_bytes_per_element multiple of 8, src_bytes_per_element multiple of 4
+          {
+            int fac    = src_bytes_per_element / sizeof(float);
+            float *sp  = (float *)src;
+            double *tp = (double *)target;
+            for(int i = 0; i < len * fac; i++)
+              *tp++ = *sp++;
+          }
+        else if((target_bytes_per_element % 4) == 0 && (src_bytes_per_element % 8) == 0)
+          {
+            int fac    = src_bytes_per_element / sizeof(double);
+            double *sp = (double *)src;
+            float *tp  = (float *)target;
+            for(int i = 0; i < len * fac; i++)
+              *tp++ = *sp++;
+          }
+        else if((target_bytes_per_element % 8) == 0 && (src_bytes_per_element % 2) == 0) /* e.g. half precision in file, double in memory */
+          {
+            int fac    = src_bytes_per_element / sizeof(half);
+            half *sp   = (half *)src;
+            double *tp = (double *)target;
+            for(int i = 0; i < len * fac; i++)
+              *tp++ = *sp++;
+          }
+        else if((target_bytes_per_element % 2) == 0 && (src_bytes_per_element % 8) == 0)
+          {
+            int fac    = src_bytes_per_element / sizeof(double);
+            double *sp = (double *)src;
+            half *tp   = (half *)target;
+            for(int i = 0; i < len * fac; i++)
+              *tp++ = *sp++;
+          }
+        else
+          {
+            Terminate("Strange conversion requested: target_bytes_per_element=%d  src_bytes_per_element=%d\n",
+                      target_bytes_per_element, src_bytes_per_element);
+          }
+        break;
+    }
+}
+
+/*! \brief This function reads a file
+ *
+ *  This routine reads a single file. The data it contains is
+ *  distributed to tasks 'readTask' to 'lastTask'.
+ *
+ *  \param fname filename to be read
+ *  \param filenr index of this file within the set of files of the dump
+ *  \param readTask task responsible for reading the file fname
+ *  \param lastTask last task which gets data contained in the file
+ *  \param CommBuffer communication buffer used to distribute the data
+ */
+void IO_Def::read_file(const char *fname, int filenr, int readTask, int lastTask, void *CommBuffer)
+{
+  long long n_type[N_DataGroups], npart[N_DataGroups];
+  int typelist[N_DataGroups];
+  unsigned int blksize1, blksize2, bytes_per_blockelement_in_file = 0;
+  hid_t hdf5_file = 0, hdf5_grp[N_DataGroups], hdf5_dataspace_in_file;
+  hid_t hdf5_dataspace_in_memory, hdf5_dataset;
+  FILE *fd = 0;
+
+  /* open file and read header */
+
+  if(ThisTask == readTask)
+    {
+      if(file_format == FILEFORMAT_HDF5)
+        {
+          read_header_fields(fname);
+        }
+      else if(file_format == FILEFORMAT_LEGACY1 || file_format == FILEFORMAT_LEGACY2)
+        {
+          if(!(fd = fopen(fname, "r")))
+            Terminate("can't open file `%s' for reading initial conditions.\n", fname);
+
+          if(file_format == FILEFORMAT_LEGACY2)
+            {
+              int nextblock;
+              char label[LABEL_LEN];
+              my_fread(&blksize1, sizeof(int), 1, fd);
+              my_fread(&label, sizeof(char), LABEL_LEN, fd);
+              my_fread(&nextblock, sizeof(int), 1, fd);
+              my_fread(&blksize2, sizeof(int), 1, fd);
+            }
+
+          my_fread(&blksize1, sizeof(int), 1, fd);
+          my_fread(header_buf, header_size, 1, fd);
+          my_fread(&blksize2, sizeof(int), 1, fd);
+        }
+      else
+        Terminate("unknown ICFormat");
+
+      for(int task = readTask + 1; task <= lastTask; task++)
+        MPI_Ssend(header_buf, header_size, MPI_BYTE, task, TAG_HEADER, Communicator);
+    }
+  else
+    MPI_Recv(header_buf, header_size, MPI_BYTE, readTask, TAG_HEADER, Communicator, MPI_STATUS_IGNORE);
+
+  int nstart;
+  read_file_header(fname, filenr, readTask, lastTask, n_type, npart, &nstart);
+
+  if(ThisTask == readTask)
+    {
+      if(file_format == FILEFORMAT_HDF5)
+        {
+          hdf5_file = my_H5Fopen(fname, H5F_ACC_RDONLY, H5P_DEFAULT);
+
+          for(int type = 0; type < N_DataGroups; type++)
+            {
+              if(npart[type] > 0)
+                {
+                  char buf[MAXLEN_PATH];
+                  get_datagroup_name(type, buf);
+                  hdf5_grp[type] = my_H5Gopen(hdf5_file, buf);
+                }
+            }
+        }
+    }
+
+  for(int blocknr = 0; blocknr < N_IO_Fields; blocknr++)
+    {
+      if((IO_Fields[blocknr].read_flag != SKIP_ON_READ &&
+          !(file_format == FILEFORMAT_LEGACY1 && All.RestartFlag == RST_BEGIN && type_of_file == FILE_IS_SNAPSHOT &&
+            blocknr > 4) /* this second condition allows short legacy ICs to be read in */
+          ) ||
+         IO_Fields[blocknr].type_in_memory == MEM_MY_FILEOFFSET)
+        {
+          unsigned int bytes_per_blockelement = get_bytes_per_memory_blockelement(blocknr, 1);
+          int blockmaxlen                     = (int)(COMMBUFFERSIZE / bytes_per_blockelement);
+          long long npart_in_block            = get_particles_in_block(blocknr, npart, &typelist[0]);
+          hid_t hdf5_memory_datatype          = get_hdf5_memorytype_of_block(blocknr);
+          char dname[MAXLEN_PATH];
+          get_dataset_name(blocknr, dname);
+
+          if(npart_in_block > 0)
+            {
+              if(filenr == 0)
+                mpi_printf("READIC: reading block %d (%s)...\n", blocknr, dname);
+
+              if(ThisTask == readTask && IO_Fields[blocknr].type_in_memory != MEM_MY_FILEOFFSET)
+                {
+                  if(file_format == FILEFORMAT_LEGACY2)
+                    {
+                      char expected_label[LABEL_LEN + 1];
+                      get_Tab_IO_Label(blocknr, expected_label);
+                      find_block(expected_label, fd);
+                    }
+
+                  if(file_format == FILEFORMAT_LEGACY1 || file_format == FILEFORMAT_LEGACY2)
+                    {
+                      my_fread(&blksize1, sizeof(int), 1, fd);
+                      bytes_per_blockelement_in_file = blksize1 / npart_in_block;
+                    }
+                }
+
+              int offset = 0;
+
+              for(int type = 0; type < N_DataGroups; type++)
+                {
+                  if(type_of_file != FILE_IS_SNAPSHOT)
+                    offset = 0;
+
+                  int n_in_file = npart[type];
+                  int pcsum     = 0;
+
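+                  /* nprevious is the global running particle index: all particles of earlier
+                     types in all files plus the particles of this type in earlier files;
+                     it is used to fill fields of type MEM_MY_FILEOFFSET */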
+                  long long nprevious = 0;
+                  for(int t = 0; t < type; t++)
+                    for(int f = 0; f < get_filenr_from_header(); f++)
+                      nprevious += ntype_in_files[f * N_DataGroups + t];
+
+                  for(int nr = 0; nr < filenr; nr++)
+                    nprevious += ntype_in_files[nr * N_DataGroups + type];
+
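+                  /* the particles of this type are split as evenly as possible over the reading
+                     group, with the remainder going to the lowest-ranked tasks; if the type does
+                     not occur in this block, only the particle offset is advanced */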
+                  if(typelist[type] == 0)
+                    {
+                      int ntask           = lastTask - readTask + 1;
+                      int n_for_this_task = n_in_file / ntask;
+                      if((ThisTask - readTask) < (n_in_file % ntask))
+                        n_for_this_task++;
+
+                      offset += n_for_this_task;
+                    }
+                  else
+                    {
+                      for(int task = readTask; task <= lastTask; task++)
+                        {
+                          int ntask           = lastTask - readTask + 1;
+                          int n_for_this_task = n_in_file / ntask;
+                          if((task - readTask) < (n_in_file % ntask))
+                            n_for_this_task++;
+
+                          do
+                            {
+                              int pc = n_for_this_task;
+
+                              if(pc > blockmaxlen)
+                                pc = blockmaxlen;
+
+                              if(ThisTask == readTask && IO_Fields[blocknr].type_in_memory != MEM_MY_FILEOFFSET)
+                                {
+                                  if(file_format == FILEFORMAT_LEGACY1 || file_format == FILEFORMAT_LEGACY2)
+                                    {
+                                      if(bytes_per_blockelement_in_file != bytes_per_blockelement)
+                                        {
+                                          char *CommAuxBuffer =
+                                              (char *)Mem.mymalloc("CommAuxBuffer", bytes_per_blockelement_in_file * pc);
+                                          my_fread(CommAuxBuffer, bytes_per_blockelement_in_file, pc, fd);
+                                          type_cast_data((char *)CommAuxBuffer, bytes_per_blockelement_in_file, (char *)CommBuffer,
+                                                         bytes_per_blockelement, pc, blocknr);
+                                          Mem.myfree(CommAuxBuffer);
+                                        }
+                                      else
+                                        my_fread(CommBuffer, bytes_per_blockelement, pc, fd);
+                                    }
+
+                                  if(file_format == FILEFORMAT_HDF5 && pc > 0)
+                                    {
+                                      hdf5_dataset = my_H5Dopen_if_existing(hdf5_grp[type], dname);
+                                      int rank;
+                                      hsize_t dims[2], count[2], start[2];
+
+                                      dims[0] = npart[type];
+                                      dims[1] = get_values_per_blockelement(blocknr);
+                                      if(dims[1] == 1)
+                                        rank = 1;
+                                      else
+                                        rank = 2;
+
+                                      hdf5_dataspace_in_file = my_H5Screate_simple(rank, dims, NULL);
+
+                                      dims[0]                  = pc;
+                                      hdf5_dataspace_in_memory = my_H5Screate_simple(rank, dims, NULL);
+
+                                      start[0] = pcsum;
+                                      start[1] = 0;
+
+                                      count[0] = pc;
+                                      count[1] = get_values_per_blockelement(blocknr);
+                                      pcsum += pc;
+
+                                      my_H5Sselect_hyperslab(hdf5_dataspace_in_file, H5S_SELECT_SET, start, NULL, count, NULL);
+
+                                      // Test if dataset was present
+                                      if(hdf5_dataset < 0)
+                                        {
+                                          // no, pad with zeros
+                                          if((ThisTask == readTask) && (task == ThisTask))
+                                            printf("\tDataset %s not present for particle type %d, using zero.\n", dname, type);
+                                          memset(CommBuffer, 0, dims[0] * dims[1] * my_H5Tget_size(hdf5_memory_datatype));
+                                        }
+                                      else
+                                        {
+                                          hid_t hdf5_file_datatype = H5Dget_type(hdf5_dataset);
+                                          byte_count += dims[0] * dims[1] *
+                                                        my_H5Tget_size(hdf5_file_datatype); /* for I/O performance measurement */
+                                          H5Tclose(hdf5_file_datatype);
+
+                                          my_H5Dread(hdf5_dataset, hdf5_memory_datatype, hdf5_dataspace_in_memory,
+                                                     hdf5_dataspace_in_file, H5P_DEFAULT, CommBuffer, dname);
+                                          my_H5Dclose(hdf5_dataset, dname);
+                                        }
+                                      my_H5Sclose(hdf5_dataspace_in_memory, H5S_SIMPLE);
+                                      my_H5Sclose(hdf5_dataspace_in_file, H5S_SIMPLE);
+                                    }
+                                }
+
+                              if(ThisTask == readTask && task != readTask && pc > 0)
+                                MPI_Ssend(CommBuffer, bytes_per_blockelement * pc, MPI_BYTE, task, TAG_PDATA, Communicator);
+
+                              if(ThisTask != readTask && task == ThisTask && pc > 0)
+                                MPI_Recv(CommBuffer, bytes_per_blockelement * pc, MPI_BYTE, readTask, TAG_PDATA, Communicator,
+                                         MPI_STATUS_IGNORE);
+
+                              if(ThisTask == task)
+                                {
+                                  if(blocknr == 0 && IO_Fields[blocknr].array == A_P)
+                                    {
+                                      for(int n = 0; n < pc; n++)
+                                        set_type_of_element(nstart + offset + n, type); /* initialize type */
+                                    }
+#ifdef MERGERTREE
+                                  if(blocknr == 0 && IO_Fields[blocknr].array == A_MTRP)
+                                    {
+                                      for(int n = 0; n < pc; n++)
+                                        {
+                                          set_type_of_element(nstart + offset + n, type); /* initialize type */
+                                          //         MtrP[nstart + offset + n].Type = type;  /* initialize type */
+                                        }
+                                    }
+#endif
+                                  empty_read_buffer(blocknr, nstart + offset, pc, type, nprevious, CommBuffer);
+
+                                  offset += pc;
+                                }
+
+                              n_for_this_task -= pc;
+                              nprevious += pc;
+                            }
+                          while(n_for_this_task > 0);
+                        }
+                    }
+                }
+              if(ThisTask == readTask && IO_Fields[blocknr].type_in_memory != MEM_MY_FILEOFFSET)
+                {
+                  if(file_format == FILEFORMAT_LEGACY1 || file_format == FILEFORMAT_LEGACY2)
+                    {
+                      my_fread(&blksize2, sizeof(int), 1, fd);
+                      if(blksize1 != blksize2)
+                        {
+                          char buf[MAXLEN_PATH];
+                          sprintf(buf, "incorrect block-sizes detected!\n Task=%d   blocknr=%d  blksize1=%d  blksize2=%d\n", ThisTask,
+                                  blocknr, blksize1, blksize2);
+                          if(blocknr == 2) /* block number 2 is always IDs */
+                            strcat(buf, "Possible mismatch of 32bit and 64bit ID's in IC file and GADGET compilation !\n");
+                          Terminate(buf);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+  for(int type = 0; type < N_DataGroups; type++)
+    {
+      long long n_in_file = npart[type];
+      int ntask           = lastTask - readTask + 1;
+      int n_for_this_task = n_in_file / ntask;
+      if((ThisTask - readTask) < (n_in_file % ntask))
+        n_for_this_task++;
+
+      read_increase_numbers(type, n_for_this_task);
+    }
+
+  if(ThisTask == readTask)
+    {
+      if(file_format == FILEFORMAT_LEGACY1 || file_format == FILEFORMAT_LEGACY2)
+        fclose(fd);
+
+      if(file_format == FILEFORMAT_HDF5)
+        {
+          for(int type = N_DataGroups - 1; type >= 0; type--)
+            if(npart[type] > 0)
+              {
+                char buf[MAXLEN_PATH];
+                get_datagroup_name(type, buf);
+                my_H5Gclose(hdf5_grp[type], buf);
+              }
+          my_H5Fclose(hdf5_file, fname);
+        }
+    }
+}
+
+/*! \brief This function assigns a certain number of tasks to each file.
+ *
+ *  These tasks then hold the contents of that file after the ICs have been read.
+ *  The number of tasks per file is made as homogeneous as possible.
+ *  The number of files may at most be equal to the number of tasks.
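+ *  For example, with NTask=10 and nfiles=4 the tasks are split into the
+ *  groups 0-2, 3-5, 6-7 and 8-9.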
+ *
+ *  \param nfiles Number of files of which the snapshot is distributed
+ *  \param filenr contains the file number to which this task belongs
+ *  \param master the rank of the task responsible for reading the file
+ *  \param last the rank of the last task belonging to the same file as this task
+ */
+void IO_Def::distribute_file(int nfiles, int *filenr, int *master, int *last)
+{
+  int tasks_per_file = NTask / nfiles;
+  int tasks_left     = NTask % nfiles;
+
+  if(tasks_left == 0)
+    {
+      int group = ThisTask / tasks_per_file;
+      *master   = group * tasks_per_file;
+      *last     = (group + 1) * tasks_per_file - 1;
+      *filenr   = group;
+      return;
+    }
+
+  double tpf = ((double)NTask) / nfiles;
+
+  *last = -1;
+  for(int i = 0; i < nfiles; i++)
+    {
+      *master = *last + 1;
+      *last   = (i + 1) * tpf;
+      if(*last >= NTask)
+        *last = *last - 1;
+      if(*last < *master)
+        Terminate("last < master");
+      *filenr = i;
+
+      if(i == nfiles - 1)
+        *last = NTask - 1;
+
+      if(ThisTask >= *master && ThisTask <= *last)
+        return;
+    }
+}
+
+/*! \brief This function tells the size in bytes of one data entry in each of the blocks
+ *  defined for the output file.
+ *
+ *  \param blocknr ID of the output block (i.e. position, velocities...)
+ *  \param mode used to distinguish whether the function is called in input
+ *         mode (mode > 0) or in output mode (mode = 0). The size of one data
+ *         entry may vary depending on the mode
+ *  \return size of the data entry in bytes
+ */
+int IO_Def::get_bytes_per_memory_blockelement(int blocknr, int mode)
+{
+  if(blocknr < 0 || blocknr >= N_IO_Fields)
+    Terminate("something is wrong here: blocknr=%d N_IO_Fields=%d", blocknr, N_IO_Fields);
+
+  int bytes_per_blockelement = 0;
+
+  IO_Field *field = &IO_Fields[blocknr];
+
+  switch(field->type_in_memory)
+    {
+      case MEM_INT:
+        bytes_per_blockelement = field->values_per_block * sizeof(int);
+        break;
+      case MEM_INT64:
+        bytes_per_blockelement = field->values_per_block * sizeof(long long);
+        break;
+      case MEM_MY_ID_TYPE:
+        bytes_per_blockelement = field->values_per_block * sizeof(MyIDType);
+        break;
+      case MEM_MY_INTPOS_TYPE:
+        bytes_per_blockelement = field->values_per_block * sizeof(MyIntPosType);
+        break;
+      case MEM_FLOAT:
+        bytes_per_blockelement = field->values_per_block * sizeof(float);
+        break;
+      case MEM_DOUBLE:
+        bytes_per_blockelement = field->values_per_block * sizeof(double);
+        break;
+      case MEM_MY_FLOAT:
+        bytes_per_blockelement = field->values_per_block * sizeof(MyFloat);
+        break;
+      case MEM_MY_DOUBLE:
+        bytes_per_blockelement = field->values_per_block * sizeof(MyDouble);
+        break;
+      case MEM_MY_FILEOFFSET:
+        bytes_per_blockelement = field->values_per_block * sizeof(long long);
+        break;
+    }
+
+  return bytes_per_blockelement;
+}
+
+hid_t IO_Def::get_hdf5_outputtype_of_block(int blocknr)
+{
+  hid_t hdf5_datatype = 0;
+
+  switch(IO_Fields[blocknr].type_in_file_output)
+    {
+      case FILE_INT:
+        hdf5_datatype = H5T_NATIVE_INT;
+        break;
+      case FILE_INT64:
+        hdf5_datatype = H5T_NATIVE_INT64;
+        break;
+      case FILE_MY_IO_FLOAT:
+#ifdef OUTPUT_IN_DOUBLEPRECISION
+        hdf5_datatype = H5T_NATIVE_DOUBLE;
+#else
+        hdf5_datatype = H5T_NATIVE_FLOAT;
+#endif
+        break;
+      case FILE_MY_ID_TYPE:
+#if defined(IDS_32BIT)
+        hdf5_datatype = H5T_NATIVE_UINT32;
+#elif defined(IDS_48BIT)
+        hdf5_datatype = Int48_memtype;
+#else
+        hdf5_datatype = H5T_NATIVE_UINT64;
+#endif
+        break;
+      case FILE_MY_INTPOS_TYPE:
+#if defined(POSITIONS_IN_32BIT)
+        hdf5_datatype = H5T_NATIVE_UINT32;
+#elif defined(POSITIONS_IN_64BIT)
+        hdf5_datatype = H5T_NATIVE_UINT64;
+#else
+        hdf5_datatype = Int128_memtype;
+#endif
+        break;
+      case FILE_DOUBLE:
+        hdf5_datatype = H5T_NATIVE_DOUBLE;
+        break;
+      case FILE_FLOAT:
+        hdf5_datatype = H5T_NATIVE_FLOAT;
+        break;
+      case FILE_HALF:
+        hdf5_datatype = Halfprec_memtype;
+        break;
+      case FILE_NONE:
+        Terminate("undefined type");
+        break;
+    }
+
+  return hdf5_datatype;
+}
+
+hid_t IO_Def::get_hdf5_memorytype_of_block(int blocknr)
+{
+  hid_t hdf5_datatype = 0;
+
+  switch(IO_Fields[blocknr].type_in_memory)
+    {
+      case MEM_INT:
+        hdf5_datatype = H5T_NATIVE_INT;
+        break;
+      case MEM_INT64:
+        hdf5_datatype = H5T_NATIVE_INT64;
+        break;
+      case MEM_MY_ID_TYPE:
+#ifdef IDS_32BIT
+        hdf5_datatype = H5T_NATIVE_UINT32;
+#else
+        hdf5_datatype = H5T_NATIVE_UINT64;
+#endif
+        break;
+      case MEM_MY_INTPOS_TYPE:
+#ifdef POSITIONS_IN_32BIT
+        hdf5_datatype = H5T_NATIVE_UINT32;
+#elif defined(POSITIONS_IN_64BIT)
+        hdf5_datatype = H5T_NATIVE_UINT64;
+#else
+        hdf5_datatype = Int128_memtype;
+#endif
+        break;
+      case MEM_FLOAT:
+        hdf5_datatype = H5T_NATIVE_FLOAT;
+        break;
+      case MEM_DOUBLE:
+        hdf5_datatype = H5T_NATIVE_DOUBLE;
+        break;
+      case MEM_MY_FLOAT:
+        hdf5_datatype = H5T_NATIVE_MYFLOAT;
+        break;
+      case MEM_MY_DOUBLE:
+        hdf5_datatype = H5T_NATIVE_MYDOUBLE;
+        break;
+      case MEM_MY_FILEOFFSET:
+        hdf5_datatype = H5T_NATIVE_INT64;
+        break;
+    }
+
+  return hdf5_datatype;
+}
+
+/*! \brief This function determines the number of elements composing one data entry
+ *  in each of the blocks defined for the output file.
+ *
+ *  Used only if output in HDF5 format is enabled
+ *
+ *  \param blocknr ID of the output block (i.e. position, velocities...)
+ *  \return number of elements of one data entry
+ */
+int IO_Def::get_values_per_blockelement(int blocknr)
+{
+  if(blocknr < 0 || blocknr >= N_IO_Fields)
+    Terminate("something is wrong here: blocknr=%d N_IO_Fields=%d", blocknr, N_IO_Fields);
+
+  return IO_Fields[blocknr].values_per_block;
+}
+
+/*! \brief Get particle number in an output block
+ *
+ *  This function determines how many particles there are in a given block,
+ *  based on the information in the header-structure.  It also flags particle
+ *  types that are present in the block in the typelist array.
+ *
+ *  \param blocknr ID of the output block (i.e. position, velocities...)
+ *  \param npart_file number of particles of each type in the file, as given by the header
+ *  \param typelist array that flags which particle types contribute to the block
+ *  \return the total number of particles in the block
+ */
+long long IO_Def::get_particles_in_block(int blocknr, long long *npart_file, int *typelist)
+{
+  long long npart = 0;
+
+  for(int i = 0; i < N_DataGroups; i++)
+    typelist[i] = 0;
+
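+  /* the special codes below select fixed data groups; for ordinary blocks,
+     IO_Fields[blocknr].typelist is interpreted as a bitmask over the particle types */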
+  switch(IO_Fields[blocknr].typelist)
+    {
+      case MASS_BLOCK:
+        for(int i = 0; i < NTYPES; i++)
+          {
+            typelist[i] = (All.MassTable[i] == 0 && npart_file[i] > 0);
+            npart += npart_file[i] * typelist[i];
+          }
+        return npart;
+        break;
+
+      case AGE_BLOCK:
+        for(int i = 0; i < NTYPES; i++)
+          {
+            typelist[i] = (npart_file[i] > 0);
+
+            if((file_format == FILEFORMAT_HDF5 && (i == 0 || i == 1 || i == 5)) || (file_format != FILEFORMAT_HDF5 && i != 4))
+              typelist[i] = 0;
+
+            npart += npart_file[i] * typelist[i];
+          }
+        return npart;
+        break;
+
+      case Z_BLOCK:
+        for(int i = 0; i < NTYPES; i++)
+          {
+            typelist[i] = (npart_file[i] > 0);
+            if((file_format == FILEFORMAT_HDF5 && (i == 1 || i == 5)) || (file_format != FILEFORMAT_HDF5 && (i != 0 && i != 4)))
+              typelist[i] = 0;
+
+            npart += npart_file[i] * typelist[i];
+          }
+        return npart;
+        break;
+
+      case GROUPS:
+        npart       = npart_file[0];
+        typelist[0] = 1;
+        return npart;
+        break;
+
+      case SUBGROUPS:
+        npart       = npart_file[1];
+        typelist[1] = 1;
+        return npart;
+        break;
+
+      case ID_BLOCK:
+        npart       = npart_file[2];
+        typelist[2] = 1;
+        return npart;
+        break;
+
+      case CURRSUBS:
+      case PREVSUBS:
+      case TREELINK:
+      case TREELENGTH:
+      case MASSMAPS:
+      case GALSNAPS:
+        npart       = npart_file[0];
+        typelist[0] = 1;
+        return npart;
+        break;
+
+      case TREEHALOS:
+        npart       = npart_file[1];
+        typelist[1] = 1;
+        return npart;
+        break;
+
+      case TREETIMES:
+        npart       = npart_file[2];
+        typelist[2] = 1;
+        return npart;
+        break;
+
+      case TREETABLE:
+        npart            = npart_file[NTYPES];
+        typelist[NTYPES] = 1;
+        return npart;
+        break;
+
+      case HEALPIXTAB:
+        npart                = npart_file[NTYPES + 1];
+        typelist[NTYPES + 1] = 1;
+        return npart;
+        break;
+
+      default:
+        for(int i = 0; i < N_DataGroups; i++)
+          {
+            if((IO_Fields[blocknr].typelist & (1 << i)) && npart_file[i] > 0)
+              {
+                typelist[i] = 1;
+                npart += npart_file[i];
+              }
+            else
+              typelist[i] = 0;
+          }
+        return npart;
+        break;
+    }
+
+  return 0;
+}
+
+/*! \brief This function associates a short 4-character block name with each block number.
+ *
+ *   This is stored in front of each block for snapshot FileFormat=2.
+ *
+ *  \param blocknr ID of the output block (i.e. position, velocities...)
+ *  \param label buffer into which the 4-character block label is copied
+ */
+void IO_Def::get_Tab_IO_Label(int blocknr, char *label)
+{
+  if(blocknr < 0 || blocknr >= N_IO_Fields)
+    Terminate("something is wrong here: blocknr=%d N_IO_Fields=%d", blocknr, N_IO_Fields);
+
+  strcpy(label, IO_Fields[blocknr].label);
+}
+
+/*! \brief This function associates a dataset name with each block number.
+ *
+ *   This is needed to name the dataset if the output is written in HDF5 format.
+ *
+ *  \param blocknr ID of the output block (i.e. position, velocities...)
+ *  \param buf buffer into which the dataset name is copied
+ */
+void IO_Def::get_dataset_name(int blocknr, char *buf)
+{
+  if(blocknr < 0 || blocknr >= N_IO_Fields)
+    Terminate("something is wrong here: blocknr=%d N_IO_Fields=%d", blocknr, N_IO_Fields);
+
+  strcpy(buf, IO_Fields[blocknr].datasetname);
+}
+
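+/*! \brief Write unit and scaling metadata as attributes of an HDF5 dataset.
+ *
+ *  For blocks that carry unit information, the attributes record the a- and
+ *  h-scaling exponents (set to zero for non-comoving integrations), the length,
+ *  mass and velocity scaling exponents, and the conversion factor to CGS units.
+ *  Blocks without unit information are skipped.
+ *
+ *  \param hdf5_dataset handle of the HDF5 dataset the attributes are attached to
+ *  \param blocknr ID of the output block (i.e. position, velocities...)
+ */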
+void IO_Def::write_dataset_attributes(hid_t hdf5_dataset, int blocknr)
+{
+  if(blocknr < 0 || blocknr >= N_IO_Fields)
+    Terminate("something is wrong here: blocknr=%d N_IO_Fields=%d", blocknr, N_IO_Fields);
+
+  if(IO_Fields[blocknr].hasunit == 0)
+    return;
+
+  if(All.ComovingIntegrationOn)
+    {
+      write_scalar_attribute(hdf5_dataset, "a_scaling", &IO_Fields[blocknr].a, H5T_NATIVE_DOUBLE);
+      write_scalar_attribute(hdf5_dataset, "h_scaling", &IO_Fields[blocknr].h, H5T_NATIVE_DOUBLE);
+    }
+  else
+    {
+      double zero = 0;
+      write_scalar_attribute(hdf5_dataset, "a_scaling", &zero, H5T_NATIVE_DOUBLE);
+      write_scalar_attribute(hdf5_dataset, "h_scaling", &zero, H5T_NATIVE_DOUBLE);
+    }
+
+  write_scalar_attribute(hdf5_dataset, "length_scaling", &IO_Fields[blocknr].L, H5T_NATIVE_DOUBLE);
+  write_scalar_attribute(hdf5_dataset, "mass_scaling", &IO_Fields[blocknr].M, H5T_NATIVE_DOUBLE);
+  write_scalar_attribute(hdf5_dataset, "velocity_scaling", &IO_Fields[blocknr].V, H5T_NATIVE_DOUBLE);
+
+  write_scalar_attribute(hdf5_dataset, "to_cgs", &IO_Fields[blocknr].c, H5T_NATIVE_DOUBLE);
+}
+
+/*! \brief Write the parameters read from the parameter file into the HDF5 snapshot file
+ *
+ *  This function stores the run parameters as attributes belonging
+ *  to the parameter group of the HDF5 file.
+ *
+ *  \param handle contains a reference to the parameter group
+ */
+void IO_Def::write_parameters_attributes_in_hdf5(hid_t handle)
+{
+  for(int i = 0; i < All.NParameters; i++)
+    {
+      switch(All.ParametersType[i])
+        {
+          case PARAM_DOUBLE:
+            write_scalar_attribute(handle, All.ParametersTag[i], All.ParametersValue[i], H5T_NATIVE_DOUBLE);
+            break;
+          case PARAM_STRING:
+            write_string_attribute(handle, All.ParametersTag[i], (const char *)All.ParametersValue[i]);
+            break;
+          case PARAM_INT:
+            write_scalar_attribute(handle, All.ParametersTag[i], All.ParametersValue[i], H5T_NATIVE_INT);
+            break;
+        }
+    }
+}
+
+/*---------------------- Routine find a block in a snapfile -------------------*/
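+/*! \brief Locate a named block in a format-2 snapshot file.
+ *
+ *  In format-2 files, every block is preceded by a small header record
+ *  (bracketed by 4-byte size fields) that contains a 4-character label and
+ *  the size of the following block. Starting from the beginning of the file,
+ *  this routine skips over blocks until the one with the requested label is
+ *  found, leaving the file position at the start of that block. If the label
+ *  does not occur in the file, the run is terminated.
+ *
+ *  \param label the 4-character label of the block to search for
+ *  \param fd file handle of the snapshot file
+ */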
+void IO_Def::find_block(char *label, FILE *fd)
+{
+  unsigned int blocksize = 0, blksize;
+  char blocklabel[5]     = {"    "};
+
+#define FBSKIP                              \
+  {                                         \
+    my_fread(&blksize, sizeof(int), 1, fd); \
+  }
+
+  rewind(fd);
+
+  while(!feof(fd) && blocksize == 0)
+    {
+      FBSKIP;
+
+      if(blksize != 8)
+        {
+          Terminate("Incorrect Format (blksize=%u)!\n", blksize);
+        }
+      else
+        {
+          my_fread(blocklabel, LABEL_LEN * sizeof(char), 1, fd);
+          my_fread(&blocksize, sizeof(int), 1, fd);
+
+          FBSKIP;
+
+          if(strncmp(label, blocklabel, LABEL_LEN) != 0)
+            {
+              fseek(fd, blocksize, 1);
+              blocksize = 0;
+            }
+        }
+    }
+  if(feof(fd))
+    Terminate("Block '%c%c%c%c' not found !\n", label[0], label[1], label[2], label[3]);
+}
+
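+/*! \brief Read a contiguous range of entries of one type, spread over several files.
+ *
+ *  The requested range of 'count' entries starting at global offset 'offset'
+ *  (counted over the concatenated data of all files) is broken up into the
+ *  pieces held by the individual files, which are then read one after the
+ *  other with read_single_file_segment(). If not all requested entries could
+ *  be found, the run is terminated.
+ *
+ *  \param fname base name of the file(s) to read from
+ *  \param type index of the particle type / data group
+ *  \param offset first entry of the requested range
+ *  \param count number of entries to read
+ *  \param num_files number of files the data is distributed over
+ */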
+void IO_Def::read_segment(const char *fname, int type, long long offset, long long count, int num_files)
+{
+  long long nleft   = count;
+  long long offleft = offset;
+  long long nskip   = 0;
+
+  for(int filenr = 0; filenr < num_files && nleft > 0; filenr++)
+    {
+      long long nloc = ntype_in_files[filenr * N_DataGroups + type];
+
+      if(nloc > offleft)  // we may have something in this file
+        {
+          long long nread;
+
+          if(nloc - offleft > nleft)  // there are more particles in the file than we need
+            nread = nleft;
+          else
+            nread = nloc - offleft;
+
+          /* now read partial list */
+          read_single_file_segment(fname, filenr, type, offleft, nread, nskip, num_files);
+
+          nleft -= nread;
+          nskip += nread;
+          offleft += nread;
+        }
+
+      offleft -= nloc;
+    }
+
+  if(nleft > 0)
+    {
+      for(int filenr = 0; filenr < num_files; filenr++)
+        {
+          long long nloc = ntype_in_files[filenr * N_DataGroups + type];
+          printf("filenr=%d:  nloc=%lld\n", filenr, nloc);
+        }
+      Terminate("Not all desired entries read: nleft=%lld  type=%d\n", nleft, type);
+    }
+}
+
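+/*! \brief Read a range of entries of one type from a single file.
+ *
+ *  Depending on the file format, the data is read either from an HDF5 file
+ *  (selecting the requested hyperslab of each dataset) or from a legacy
+ *  format 1/2 file (seeking to the requested part of each block). Fields
+ *  marked with SKIP_ON_READ are skipped, and HDF5 datasets that are absent
+ *  are replaced with zeros. The data is then unpacked into the code's
+ *  particle structures with empty_read_buffer().
+ *
+ *  \param basename base name of the file (file number and/or ".hdf5" are appended as needed)
+ *  \param filenr number of the file to read
+ *  \param type index of the particle type / data group
+ *  \param offset first entry to read within this file
+ *  \param count number of entries to read
+ *  \param storage_offset offset at which the read data is stored in memory
+ *  \param num_files total number of files of this snapshot/catalogue
+ */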
+void IO_Def::read_single_file_segment(const char *basename, int filenr, int type, long long offset, unsigned long long count,
+                                      long long storage_offset, int num_files)
+{
+  int bytes_per_blockelement_in_file = 0;
+  hid_t hdf5_file = 0, hdf5_grp = 0, hdf5_dataspace_in_file;
+  hid_t hdf5_dataspace_in_memory, hdf5_dataset;
+  FILE *fd = 0;
+  char fname[MAXLEN_PATH];
+
+  if(num_files > 1)
+    {
+      if(file_format == FILEFORMAT_HDF5)
+        sprintf(fname, "%s.%d.hdf5", basename, filenr);
+      else
+        sprintf(fname, "%s.%d", basename, filenr);
+    }
+  else
+    {
+      if(file_format == FILEFORMAT_HDF5)
+        sprintf(fname, "%s.hdf5", basename);
+      else
+        sprintf(fname, "%s", basename);
+    }
+
+  /* open file  */
+  if(file_format == FILEFORMAT_HDF5)
+    {
+      hdf5_file = my_H5Fopen(fname, H5F_ACC_RDONLY, H5P_DEFAULT);
+      char buf[MAXLEN_PATH];
+      get_datagroup_name(type, buf);
+      hdf5_grp = my_H5Gopen(hdf5_file, buf);
+    }
+  else if(file_format == FILEFORMAT_LEGACY1 || file_format == FILEFORMAT_LEGACY2)
+    {
+      if(!(fd = fopen(fname, "r")))
+        Terminate("can't open file `%s' for reading initial conditions.\n", fname);
+
+      unsigned int blksize1, blksize2;
+
+      if(file_format == FILEFORMAT_LEGACY2)
+        {
+          int nextblock;
+          char label[LABEL_LEN];
+          my_fread(&blksize1, sizeof(int), 1, fd);
+          my_fread(&label, sizeof(char), LABEL_LEN, fd);
+          my_fread(&nextblock, sizeof(int), 1, fd);
+          my_fread(&blksize2, sizeof(int), 1, fd);
+        }
+
+      my_fread(&blksize1, sizeof(int), 1, fd);
+      my_fread(header_buf, header_size, 1, fd);
+      my_fread(&blksize2, sizeof(int), 1, fd);
+    }
+  else
+    Terminate("unknown ICFormat");
+
+  long long npart[N_DataGroups];
+  for(int i = 0; i < N_DataGroups; i++)
+    npart[i] = ntype_in_files[filenr * N_DataGroups + i];
+
+  for(int blocknr = 0; blocknr < N_IO_Fields; blocknr++)
+    {
+      if(IO_Fields[blocknr].type_in_memory != MEM_MY_FILEOFFSET)
+        {
+          unsigned int blksize1, blksize2;
+          int typelist[N_DataGroups];
+          int bytes_per_blockelement = get_bytes_per_memory_blockelement(blocknr, 1);
+          long long npart_in_block   = get_particles_in_block(blocknr, npart, &typelist[0]);
+          hid_t hdf5_memory_datatype = get_hdf5_memorytype_of_block(blocknr);
+          char dname[MAXLEN_PATH];
+          get_dataset_name(blocknr, dname);
+
+          if(npart_in_block > 0 && typelist[type] > 0 && IO_Fields[blocknr].read_flag != SKIP_ON_READ)
+            {
+              if(file_format == FILEFORMAT_LEGACY2)
+                {
+                  char expected_label[LABEL_LEN + 1];
+                  get_Tab_IO_Label(blocknr, expected_label);
+                  find_block(expected_label, fd);
+                }
+
+              if(file_format == FILEFORMAT_LEGACY1 || file_format == FILEFORMAT_LEGACY2)
+                {
+                  my_fread(&blksize1, sizeof(int), 1, fd);
+                  bytes_per_blockelement_in_file = blksize1 / npart_in_block;
+                }
+
+              void *CommBuffer = (char *)Mem.mymalloc("CommBuffer", bytes_per_blockelement * count);
+
+              if(file_format == FILEFORMAT_LEGACY1 || file_format == FILEFORMAT_LEGACY2)
+                {
+                  fseek(fd, bytes_per_blockelement_in_file * offset, SEEK_CUR);
+
+                  if(bytes_per_blockelement_in_file != bytes_per_blockelement)
+                    {
+                      char *CommAuxBuffer = (char *)Mem.mymalloc("CommAuxBuffer", bytes_per_blockelement_in_file * count);
+                      my_fread(CommAuxBuffer, bytes_per_blockelement_in_file, count, fd);
+                      type_cast_data((char *)CommAuxBuffer, bytes_per_blockelement_in_file, (char *)CommBuffer, bytes_per_blockelement,
+                                     count, blocknr);
+                      Mem.myfree(CommAuxBuffer);
+                    }
+                  else
+                    my_fread(CommBuffer, bytes_per_blockelement, count, fd);
+
+                  fseek(fd, bytes_per_blockelement_in_file * (npart_in_block - offset - count), SEEK_CUR);
+
+                  my_fread(&blksize2, sizeof(int), 1, fd);
+                  if(blksize1 != blksize2)
+                    {
+                      char buf[MAXLEN_PATH];
+                      sprintf(buf, "incorrect block-sizes detected!\n Task=%d   blocknr=%d  blksize1=%d  blksize2=%d\n", ThisTask,
+                              blocknr, blksize1, blksize2);
+                      if(blocknr == 2) /* block number 2 is always IDs */
+                        strcat(buf, "Possible mismatch of 32bit and 64bit ID's in IC file and GADGET compilation !\n");
+                      Terminate(buf);
+                    }
+                }
+
+              if(file_format == FILEFORMAT_HDF5)
+                {
+                  hdf5_dataset = my_H5Dopen_if_existing(hdf5_grp, dname);
+                  int rank;
+                  hsize_t dims[2], nelem[2], start[2];
+
+                  dims[0] = npart[type];
+                  dims[1] = get_values_per_blockelement(blocknr);
+                  if(dims[1] == 1)
+                    rank = 1;
+                  else
+                    rank = 2;
+
+                  hdf5_dataspace_in_file = my_H5Screate_simple(rank, dims, NULL);
+
+                  dims[0]                  = count;
+                  hdf5_dataspace_in_memory = my_H5Screate_simple(rank, dims, NULL);
+
+                  start[0] = offset;
+                  start[1] = 0;
+
+                  nelem[0] = count;
+                  nelem[1] = get_values_per_blockelement(blocknr);
+
+                  my_H5Sselect_hyperslab(hdf5_dataspace_in_file, H5S_SELECT_SET, start, NULL, nelem, NULL);
+
+                  // Test if dataset was present
+                  if(hdf5_dataset < 0)
+                    {
+                      // no, pad with zeros
+                      printf("\tDataset %s not present for particle type %d, using zero.\n", dname, type);
+                      memset(CommBuffer, 0, dims[0] * dims[1] * my_H5Tget_size(hdf5_memory_datatype));
+                    }
+                  else
+                    {
+                      hid_t hdf5_file_datatype = H5Dget_type(hdf5_dataset);
+                      byte_count += dims[0] * dims[1] * my_H5Tget_size(hdf5_file_datatype); /* for I/O performance measurement */
+                      H5Tclose(hdf5_file_datatype);
+
+                      my_H5Dread(hdf5_dataset, hdf5_memory_datatype, hdf5_dataspace_in_memory, hdf5_dataspace_in_file, H5P_DEFAULT,
+                                 CommBuffer, dname);
+                      my_H5Dclose(hdf5_dataset, dname);
+                    }
+                  my_H5Sclose(hdf5_dataspace_in_memory, H5S_SIMPLE);
+                  my_H5Sclose(hdf5_dataspace_in_file, H5S_SIMPLE);
+                }
+
+              empty_read_buffer(blocknr, storage_offset, count, type, 0, CommBuffer);
+
+              Mem.myfree(CommBuffer);
+            }
+          else
+            {
+              if(file_format == FILEFORMAT_LEGACY1 && npart_in_block > 0)
+                {
+                  my_fread(&blksize1, sizeof(int), 1, fd);
+                  bytes_per_blockelement_in_file = blksize1 / npart_in_block;
+                  fseek(fd, bytes_per_blockelement_in_file * npart_in_block, SEEK_CUR);
+                  my_fread(&blksize2, sizeof(int), 1, fd);
+                  if(blksize1 != blksize2)
+                    {
+                      char buf[MAXLEN_PATH];
+                      sprintf(buf, "incorrect block-sizes detected!\n Task=%d   blocknr=%d  blksize1=%d  blksize2=%d\n", ThisTask,
+                              blocknr, blksize1, blksize2);
+                      if(blocknr == 2) /* block number 2 is always IDs */
+                        strcat(buf, "Possible mismatch of 32bit and 64bit ID's in IC file and GADGET compilation !\n");
+                      Terminate(buf);
+                    }
+                }
+            }
+        }
+    }
+
+  if(file_format == FILEFORMAT_LEGACY1 || file_format == FILEFORMAT_LEGACY2)
+    fclose(fd);
+
+  if(file_format == FILEFORMAT_HDF5)
+    {
+      char buf[MAXLEN_PATH];
+      get_datagroup_name(type, buf);
+      my_H5Gclose(hdf5_grp, buf);
+      my_H5Fclose(hdf5_file, fname);
+    }
+
+  read_increase_numbers(type, count);
+}
+
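+/*! \brief Rename an existing file to a backup copy prefixed with "bak-".
+ *
+ *  If a file of the given name already exists, it is renamed to "bak-<name>"
+ *  in the same directory, but only if such a backup file does not exist yet.
+ *
+ *  \param fname name of the file to check
+ */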
+void IO_Def::rename_file_to_bak_if_it_exists(char *fname)
+{
+  char fin[MAXLEN_PATH], buf[2 * MAXLEN_PATH];
+
+  strcpy(fin, fname);
+
+  char *p = strrchr(fin, '/');
+  if(p)
+    {
+      *p = 0;
+      sprintf(buf, "%s/bak-%s", fin, p + 1);
+    }
+  else
+    sprintf(buf, "bak-%s", fname);
+
+  if(FILE *fcheck = fopen(fname, "r"))  // check if file already exists, if yes, try to rename the existing file
+    {
+      fclose(fcheck);
+
+      FILE *fbak = fopen(buf, "r");
+      if(!fbak)  // only do the rename of the old file if the back-up file doesn't exist yet
+        {
+          mpi_printf("%s rename '%s' to '%s'\n", info, fname, buf);
+          rename(fname, buf);
+        }
+      else
+        fclose(fbak);  // a back-up already exists, just close the probe handle again
+    }
+}
+
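+/*! \brief Allocate and fill the table with the particle numbers per type in each file.
+ *
+ *  The header of every file belonging to the snapshot/catalogue is read, and
+ *  the particle counts per type are stored in the ntype_in_files table, which
+ *  is allocated here.
+ *
+ *  \param fname base name of the file(s)
+ *  \param num_files number of files the data is distributed over
+ */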
+void IO_Def::alloc_and_read_ntype_in_files(const char *fname, int num_files)
+{
+  ntype_in_files = (long long *)Mem.mymalloc_movable(&ntype_in_files, "ntype_in_files", num_files * N_DataGroups * sizeof(long long));
+
+  for(int filenr = 0; filenr < num_files; filenr++)
+    {
+      char buf[3 * MAXLEN_PATH];
+
+      if(num_files > 1)
+        {
+          sprintf(buf, "%s.%d", fname, filenr);
+          if(file_format == 3)
+            sprintf(buf, "%s.%d.hdf5", fname, filenr);
+        }
+      else
+        {
+          sprintf(buf, "%s", fname);
+          if(file_format == 3)
+            sprintf(buf, "%s.hdf5", fname);
+        }
+
+      if(file_format == 3)
+        {
+          read_header_fields(buf);
+        }
+      else if(file_format == 1 || file_format == 2)
+        {
+          FILE *fd = 0;
+
+          if(!(fd = fopen(buf, "r")))
+            Terminate("can't open file `%s' for reading initial conditions.\n", buf);
+
+          int blksize1, blksize2;
+
+          if(file_format == 2)
+            {
+              char label[4];
+              int nextblock;
+              my_fread(&blksize1, sizeof(int), 1, fd);
+              my_fread(&label, sizeof(char), 4, fd);
+              my_fread(&nextblock, sizeof(int), 1, fd);
+              printf("READIC: Reading header => '%c%c%c%c' (%d byte)\n", label[0], label[1], label[2], label[3], nextblock);
+              my_fread(&blksize2, sizeof(int), 1, fd);
+            }
+
+          my_fread(&blksize1, sizeof(int), 1, fd);
+          my_fread(header_buf, header_size, 1, fd);
+          my_fread(&blksize2, sizeof(int), 1, fd);
+
+          if(blksize1 != blksize2)
+            Terminate("incorrect header format, blksize1=%d blksize2=%d  header_size=%d\n", blksize1, blksize2, (int)header_size);
+
+          fclose(fd);
+        }
+
+      long long n_type[N_DataGroups], npart[N_DataGroups];
+
+      read_file_header(fname, filenr, 0, 0, n_type, npart, NULL);
+
+      for(int type = 0; type < N_DataGroups; type++)
+        ntype_in_files[filenr * N_DataGroups + type] = npart[type];
+    }
+}
diff --git a/src/io/io.h b/src/io/io.h
new file mode 100644
index 0000000000000000000000000000000000000000..48525a009b67537ca7353465d66b519d7bb80f52
--- /dev/null
+++ b/src/io/io.h
@@ -0,0 +1,284 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file io.h
+ *
+ * \brief declarations of I/O enumerations and a base class for I/O in Gadget
+ */
+
+#ifndef IO_H
+#define IO_H
+
+#include <hdf5.h>
+
+#ifdef LIGHTCONE_PARTICLES
+#include <chealpix.h>
+#endif
+
+#include "../data/simparticles.h"
+#include "../fof/fof.h"
+#include "../io/io_streamcount.h"
+#include "../mpi_utils/setcomm.h"
+
+#define LABEL_LEN 4
+#define DATASETNAME_LEN 256
+
+enum arrays
+{
+  A_NONE,
+  A_SPHP,
+  A_P,
+  A_PS,
+  A_G,
+  A_S,
+  A_ID,
+  A_DESC,
+  A_PROG,
+  A_MTRP,
+  A_H,
+  A_TT,
+  A_CT,
+  A_TL,
+  A_LC,
+  A_MM,
+  A_IDS,
+  A_TID,
+};
+
+enum file_contents
+{
+  FILE_IS_SNAPSHOT,
+  FILE_IS_GROUPCAT,
+  FILE_IS_DESCCAT,
+  FILE_IS_PROGCAT,
+  FILE_IS_TREECAT,
+  FILE_IS_TREELINK,
+  FILE_IS_LIGHTCONE,
+  FILE_IS_MASSMAP,
+  FILE_IS_GALSNAP
+};
+
+enum types_in_file
+{
+  FILE_NONE,
+  FILE_INT,
+  FILE_INT64,
+  FILE_MY_IO_FLOAT,
+  FILE_HALF,
+  FILE_MY_ID_TYPE,
+  FILE_MY_INTPOS_TYPE,
+  FILE_DOUBLE,
+  FILE_FLOAT,
+};
+
+enum types_in_memory
+{
+  MEM_INT,
+  MEM_INT64,
+  MEM_MY_ID_TYPE,
+  MEM_MY_INTPOS_TYPE,
+  MEM_FLOAT,
+  MEM_DOUBLE,
+  MEM_MY_FLOAT,
+  MEM_MY_DOUBLE,
+  MEM_MY_FILEOFFSET,
+};
+
+#ifdef FOF_ALLOW_HUGE_GROUPLENGTH
+const types_in_memory mem_len_type = MEM_INT64;
+const types_in_file file_len_type  = FILE_INT64;
+#else
+const types_in_memory mem_len_type = MEM_INT;
+const types_in_file file_len_type  = FILE_INT;
+#endif
+
+enum e_typelist
+{
+  GAS_ONLY      = 1,
+  STARS_ONLY    = 16,
+  GAS_AND_STARS = 17,
+  ALL_TYPES     = ((1 << NTYPES) - 1),
+  MASS_BLOCK    = -1,
+  AGE_BLOCK     = -2,
+  Z_BLOCK       = -3,
+  GROUPS        = -4,
+  SUBGROUPS     = -5,
+  ID_BLOCK      = -6,
+  PREVSUBS      = -7,
+  CURRSUBS      = -8,
+  TREELENGTH    = -9,
+  TREEHALOS     = -10,
+  TREELINK      = -11,
+  MASSMAPS      = -12,
+  TREETABLE     = -13,
+  TREETIMES     = -14,
+  GALSNAPS      = -16,
+  HEALPIXTAB    = -17,
+};
+
+enum read_flags
+{
+  READ_IF_PRESENT,
+  SKIP_ON_READ
+};
+
+class IO_Def : public io_streamcount, public setcomm
+{
+ public:
+  IO_Def(MPI_Comm comm, int format) : setcomm(comm)
+  {
+    determine_compute_nodes();
+    file_format = format;
+  }
+
+  virtual ~IO_Def();
+
+  int N_DataGroups  = 0;
+  int N_IO_Fields   = 0;
+  int Max_IO_Fields = 0;
+
+  /* functions that need to be provided by the specific module */
+
+  virtual void fill_file_header(int writeTask, int lastTask, long long *nloc_part, long long *npart) = 0;
+  virtual void read_file_header(const char *fname, int filenr, int readTask, int lastTask, long long *nloc_part, long long *npart,
+                                int *nstart)                                                         = 0;
+  virtual void get_datagroup_name(int grnr, char *gname)                                             = 0;
+  virtual void write_header_fields(hid_t)                                                            = 0;
+  virtual void read_header_fields(const char *fname)                                                 = 0;
+  virtual void read_increase_numbers(int type, int n_for_this_task)                                  = 0;
+  virtual int get_filenr_from_header(void)                                                           = 0;
+  virtual void set_filenr_in_header(int)                                                             = 0;
+  virtual void *get_base_address_of_structure(enum arrays array, int index)                          = 0;
+  virtual int get_type_of_element(int index)                                                         = 0;
+  virtual void set_type_of_element(int index, int type)                                              = 0;
+
+  void init_field(const char *label, const char *datasetname, enum types_in_memory type_in_memory,
+                  enum types_in_file type_in_file_output, enum read_flags read_flag, int values_per_block, enum arrays array,
+                  void *pointer_to_field, void (*io_func)(IO_Def *, int, int, void *, int), int typelist_bitmask, int hasunits,
+                  double a, double h, double L, double M, double V, double c, bool compression_on = false);
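+  /* A hypothetical registration sketch (the label, dataset name, I/O callback and
+   * unit exponents below are illustrative only, they are not taken from the code):
+   *
+   *   init_field("VEL ", "Velocities", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT,
+   *              3, A_NONE, NULL, io_func_velocities, ALL_TYPES, 1,
+   *              0.5, 0.0, 0.0, 0.0, 1.0, 1.0);
+   */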
+
+  int find_files(const char *fname, const char *fname_multiple);
+  void read_files_driver(const char *fname, int rep, int numfiles);
+  void write_multiple_files(char *fname, int numfilesperdump, int append_flag = 0, int chunk_size = 0);
+  void write_compile_time_options_in_hdf5(hid_t handle);
+  void read_segment(const char *fname, int type, long long offset, long long count, int numfiles);
+  void read_single_file_segment(const char *fname, int filenr, int type, long long offset, unsigned long long count,
+                                long long storage_offset, int numfiles);
+
+  void alloc_and_read_ntype_in_files(const char *fname, int num_files);
+
+  size_t header_size;
+  void *header_buf;
+
+  long long *ntype_in_files;
+  char info[100];
+
+#if defined(MERGERTREE) || defined(LGALAXIES)
+  typedef fof<simparticles>::treehalo_t treehalo_type;
+#endif
+
+  /*
+   * variables for new input/output functionality
+   */
+
+  enum file_contents type_of_file;
+
+ private:
+  struct IO_Field
+  {
+    enum types_in_memory type_in_memory;
+    enum types_in_file type_in_file_output;
+    enum read_flags read_flag;
+    int values_per_block;
+    int write_block;
+    int read_block;
+    char label[LABEL_LEN + 1];
+    char datasetname[DATASETNAME_LEN + 1];
+    void (*io_func)(IO_Def *, int, int, void *, int);
+    int typelist;
+    bool compression_on;
+    enum arrays array;
+    size_t offset;
+
+    char hasunit;
+    double a;
+    double h;
+    double L;
+    double M;
+    double V;
+    double c;
+  };
+  IO_Field *IO_Fields;
+
+  void write_file(char *fname, int writeTask, int lastTask, void *CommBuffer, int numfilesperdump, int chunksize);
+  void read_file(const char *fname, int filenr, int readTask, int lastTask, void *CommBuffer);
+  void append_file(char *fname, int writeTask, int lastTask, void *CommBuffer, int numfilesperdump, int chunksize);
+
+  int get_values_per_blockelement(int blocknr);
+  void get_dataset_name(int blocknr, char *buf);
+  long long get_particles_in_block(int blocknr, long long *npartinfile, int *typelist);
+  int get_bytes_per_memory_blockelement(int blocknr, int mode);
+  hid_t get_hdf5_outputtype_of_block(int blocknr);
+  hid_t get_hdf5_memorytype_of_block(int blocknr);
+  void get_Tab_IO_Label(int blocknr, char *label);
+  void type_cast_data(char *src, int src_bytes_per_element, char *target, int target_bytes_per_element, int len, int blocknr);
+  void distribute_file(int nfiles, int *filenr, int *master, int *last);
+  void share_particle_number_in_file(const char *fname, int filenr, int readTask, int lastTask);
+  void find_block(char *label, FILE *fd);
+  void fill_write_buffer(int blocknr, int *startindex, int pc, int type, void *CommBuffer);
+  void empty_read_buffer(int blocknr, int offset, int pc, int type, long long nprevious, void *CommBuffer);
+
+  void write_dataset_attributes(hid_t hdf5_dataset, int blocknr);
+  void write_parameters_attributes_in_hdf5(hid_t handle);
+  void rename_file_to_bak_if_it_exists(char *fname);
+
+  void polling(int numfilesperdump);
+
+  int files_started;
+  int files_completed;
+  int file_format;
+
+  struct seq_data
+  {
+    int thistask;
+    int rankinnode;
+    int thisnode;
+    bool operator<(const seq_data &b) const
+    {
+      if(rankinnode < b.rankinnode)
+        return true;
+      if(rankinnode > b.rankinnode)
+        return false;
+      if(thisnode < b.thisnode)
+        return true;
+      if(thisnode > b.thisnode)
+        return false;
+      return thistask < b.thistask;
+    }
+  };
+  seq_data *seq;
+};
+
+#ifdef GADGET2_HEADER
+#define NTYPES_HEADER 6
+#else
+#define NTYPES_HEADER NTYPES
+#endif
+
+#define FLAG_ZELDOVICH_ICS 1
+#define FLAG_SECOND_ORDER_ICS 2
+
+#endif
diff --git a/src/io/io_streamcount.h b/src/io/io_streamcount.h
new file mode 100644
index 0000000000000000000000000000000000000000..4448cff9760cb4bc809874ec0337e96760b1c576
--- /dev/null
+++ b/src/io/io_streamcount.h
@@ -0,0 +1,98 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  io_streamcount.h
+ *
+ *  \brief class to measure I/O performance
+ */
+
+#ifndef IO_STREAMCOUNT_H
+#define IO_STREAMCOUNT_H
+
+#include <errno.h>
+
+class io_streamcount
+{
+ public:
+  long long byte_count = 0;
+
+  /*! \brief  A wrapper for the fwrite() function
+   *
+   *  This catches I/O errors occurring for fwrite(). If one occurs,
+   *  we stop. If stream is null, no attempt at writing is made.
+   *
+   *  \param ptr pointer to the beginning of data to write
+   *  \param size size in bytes of a single data element
+   *  \param nmemb number of elements to be written
+   *  \param stream pointer to the output stream
+   *  \return number of elements written to stream
+   */
+  size_t my_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream)
+  {
+    size_t nwritten;
+
+    if(!stream)
+      return 0;
+
+    if(size * nmemb > 0)
+      {
+        if((nwritten = fwrite(ptr, size, nmemb, stream)) != nmemb)
+          {
+            Terminate("I/O error (fwrite) has occurred: %s\n", strerror(errno));
+          }
+      }
+    else
+      nwritten = 0;
+
+    byte_count += size * nmemb;
+
+    return nwritten;
+  }
+
+  /*! \brief  A wrapper for the fread() function
+   *
+   *  This catches I/O errors occurring for fread(). If one occurs,
+   *  we stop. If stream is null, no attempt at reading is made.
+   *
+   *  \param ptr pointer to the beginning of memory location where to store data
+   *  \param size size in bytes of a single data element
+   *  \param nmemb number of elements to be read
+   *  \param stream pointer to the input stream
+   *  \return number of elements read from stream
+   */
+  size_t my_fread(void *ptr, size_t size, size_t nmemb, FILE *stream)
+  {
+    size_t nread;
+
+    if(!stream)
+      return 0;
+
+    if(size * nmemb > 0)
+      {
+        if((nread = fread(ptr, size, nmemb, stream)) != nmemb)
+          {
+            if(feof(stream))
+              {
+                Terminate("I/O error (fread) has occurred: end of file\n");
+              }
+            else
+              Terminate("I/O error (fread) has occurred: %s\n", strerror(errno));
+          }
+      }
+    else
+      nread = 0;
+
+    byte_count += size * nmemb;
+
+    return nread;
+  }
+
+  void reset_io_byte_count(void) { byte_count = 0; }
+
+  long long get_io_byte_count(void) { return byte_count; }
+};
+
+#endif
diff --git a/src/io/parameters.cc b/src/io/parameters.cc
new file mode 100644
index 0000000000000000000000000000000000000000..35f667b8e856a174753a13cb3ff1d46ea20b2b82
--- /dev/null
+++ b/src/io/parameters.cc
@@ -0,0 +1,234 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file parameters.cc
+ *
+ *  \brief parses the parameter file
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../io/io.h"
+#include "../io/parameters.h"
+#include "../main/main.h"
+#include "../mpi_utils/shared_mem_handler.h"
+#include "../system/system.h"
+
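+/*! \brief Register a parameter of the parameter file.
+ *
+ *  The tag name, the address of the buffer that receives the value, the type
+ *  (PARAM_DOUBLE, PARAM_STRING or PARAM_INT) and a flag stating whether the
+ *  parameter may be changed on a restart (PARAM_CHANGEABLE or PARAM_FIXED)
+ *  are stored in the corresponding tables.
+ *
+ *  \param name tag of the parameter as it appears in the parameter file
+ *  \param buf address of the variable that receives the value
+ *  \param type one of PARAM_DOUBLE, PARAM_STRING, PARAM_INT
+ *  \param flag PARAM_CHANGEABLE if the parameter may be modified on a restart, PARAM_FIXED otherwise
+ */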
+void parameters::add_param(const char *name, void *buf, int type, int flag)
+{
+  if(NParameters > MAX_PARAMETERS)
+    Terminate("exceeded MAX_PARAMETERS=%d", MAX_PARAMETERS);
+
+  if(strlen(name) > MAXLEN_PARAM_TAG - 1)
+    Terminate("parameter '%s' too long", name);
+
+  strcpy(ParametersTag[NParameters], name);
+  ParametersValue[NParameters]      = buf;
+  ParametersType[NParameters]       = type;
+  ParametersChangeable[NParameters] = flag;
+  NParameters++;
+}
+
+/*! \brief This function parses the parameter file.
+ *
+ *  Each parameter is defined by a keyword (`tag'), and can be either
+ *  of type double, int, or character string. Three arrays containing the name,
+ *  type and address of the parameter are filled first. The routine then parses
+ *  the parameter file and fills the referenced variables. The routine makes sure that
+ *  each parameter appears exactly once in the parameter file; otherwise
+ *  error messages are produced that complain about missing, duplicated, or unknown parameters.
+ *  Basic checks are performed on the supplied parameters in the end.
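+ *
+ *  Each non-comment line of the parameter file has the form "Tag  Value",
+ *  for example (the values shown here are purely illustrative):
+ *
+ *      TimeMax      1.0
+ *      OutputDir    ./output
+ *
+ *  Lines whose first token starts with '%' or '#' are ignored as comments.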
+ *
+ *  \param fname The file name of the parameter file
+ */
+int parameters::read_parameter_file(const char *fname)
+{
+  FILE *fd, *fdout;
+  char buf[MAXLEN_PARAM_TAG + MAXLEN_PARAM_VALUE + 200];
+  int param_handled[MAX_PARAMETERS];
+  int errorFlag = 0;
+
+  for(int i = 0; i < MAX_PARAMETERS; i++)
+    {
+      param_handled[i]     = 0;
+      ParameterSequence[i] = -1;
+    }
+
+  if(sizeof(long long) != 8)
+    Terminate("\nType `long long' is not 64 bit on this platform. Stopping.\n\n");
+
+  if(sizeof(int) != 4)
+    Terminate("\nType `int' is not 32 bit on this platform. Stopping.\n\n");
+
+  if(sizeof(float) != 4)
+    Terminate("\nType `float' is not 32 bit on this platform. Stopping.\n\n");
+
+  if(sizeof(double) != 8)
+    Terminate("\nType `double' is not 64 bit on this platform. Stopping.\n\n");
+
+  if(ThisTask == 0) /* read parameter file on process 0 */
+    {
+      if((fd = fopen(fname, "r")))
+        {
+          sprintf(buf, "%s%s", fname, "-usedvalues");
+          if(!(fdout = fopen(buf, "w")))
+            {
+              printf("error opening file '%s' \n", buf);
+              errorFlag = 1;
+            }
+          else
+            {
+              printf("Obtaining parameters from file '%s':\n\n", fname);
+              int cnt = 0;
+              while(!feof(fd))
+                {
+                  char buf1[MAXLEN_PARAM_TAG + 200], buf2[MAXLEN_PARAM_VALUE + 200], buf3[MAXLEN_PARAM_TAG + MAXLEN_PARAM_VALUE + 400];
+
+                  *buf = 0;
+                  fgets(buf, MAXLEN_PARAM_TAG + MAXLEN_PARAM_VALUE + 200, fd);
+                  if(sscanf(buf, "%s%s%s", buf1, buf2, buf3) < 2)
+                    continue;
+
+                  if(buf1[0] == '%' || buf1[0] == '#')
+                    continue;
+
+                  int j = -1;
+                  for(int i = 0; i < NParameters; i++)
+                    if(strcmp(buf1, ParametersTag[i]) == 0)
+                      {
+                        if(param_handled[i] == 0)
+                          {
+                            j                        = i;
+                            param_handled[i]         = 1;
+                            ParameterSequence[cnt++] = i;
+                            break;
+                          }
+                        else
+                          {
+                            j = -2;
+                            break;
+                          }
+                      }
+
+                  if(j >= 0)
+                    {
+                      switch(ParametersType[j])
+                        {
+                          case PARAM_DOUBLE:
+                            *((double *)ParametersValue[j]) = atof(buf2);
+                            sprintf(buf3, "%%-%ds%%g\n", MAXLEN_PARAM_TAG);
+                            fprintf(fdout, buf3, buf1, *((double *)ParametersValue[j]));
+                            fprintf(stdout, "        ");
+                            fprintf(stdout, buf3, buf1, *((double *)ParametersValue[j]));
+                            break;
+                          case PARAM_STRING:
+                            if(strcmp(buf2, "OUTPUT_DIR") == 0)
+                              {
+                                if(getenv("OUTPUT_DIR"))
+                                  strcpy(buf2, getenv("OUTPUT_DIR"));
+                                else
+                                  Terminate("no environment variable OUTPUT_DIR found");
+                              }
+                            strcpy((char *)ParametersValue[j], buf2);
+                            sprintf(buf3, "%%-%ds%%s\n", MAXLEN_PARAM_TAG);
+                            fprintf(fdout, buf3, buf1, buf2);
+                            fprintf(stdout, "        ");
+                            fprintf(stdout, buf3, buf1, buf2);
+                            break;
+                          case PARAM_INT:
+                            *((int *)ParametersValue[j]) = atoi(buf2);
+                            sprintf(buf3, "%%-%ds%%d\n", MAXLEN_PARAM_TAG);
+                            fprintf(fdout, buf3, buf1, *((int *)ParametersValue[j]));
+                            fprintf(stdout, "        ");
+                            fprintf(stdout, buf3, buf1, *((int *)ParametersValue[j]));
+                            break;
+                        }
+                    }
+                  else if(j == -2)
+                    {
+                      fprintf(stdout, "Error in file %s:   Tag '%s' multiply defined.\n", fname, buf1);
+                      errorFlag = 1;
+                    }
+                  else
+                    {
+                      fprintf(stdout, "Error in file %s:   Tag '%s' not allowed\n", fname, buf1);
+                      errorFlag = 1;
+                    }
+                }
+              fclose(fd);
+              fclose(fdout);
+              printf("\n");
+            }
+        }
+      else
+        {
+          printf("Parameter file %s not found.\n", fname);
+          errorFlag = 1;
+        }
+
+      for(int i = 0; i < NParameters; i++)
+        {
+          if(param_handled[i] != 1)
+            {
+              printf("Error: missing value for tag '%s' in parameter file '%s'.\n", ParametersTag[i], fname);
+              errorFlag = 1;
+            }
+        }
+    }
+
+  return errorFlag;
+}
+
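+/*! \brief Write the parameter values that were actually used into a file.
+ *
+ *  The parameters are written in the order in which they were read from the
+ *  parameter file. The output directory is created if it does not exist yet.
+ *
+ *  \param dirname directory into which the file is written
+ *  \param fname name of the output file (appended to dirname)
+ */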
+void parameters::write_used_parameters(const char *dirname, const char *fname)
+{
+  if(ThisTask == 0)
+    {
+      mkdir(dirname, 02755);
+      char buf[MAXLEN_PATH_EXTRA];
+      sprintf(buf, "%s%s", dirname, fname);
+      FILE *fdout = fopen(buf, "w");
+      if(!fdout)
+        Terminate("Can't open file '%s'", buf);
+
+      for(int i = 0; i < NParameters; i++)
+        {
+          int j = ParameterSequence[i];
+
+          if(j >= 0)
+            {
+              char buf3[MAXLEN_PARAM_TAG + MAXLEN_PARAM_VALUE + 400];
+
+              switch(ParametersType[j])
+                {
+                  case PARAM_DOUBLE:
+                    sprintf(buf3, "%%-%ds%%g\n", MAXLEN_PARAM_TAG);
+                    fprintf(fdout, buf3, ParametersTag[j], *((double *)ParametersValue[j]));
+                    break;
+                  case PARAM_STRING:
+                    sprintf(buf3, "%%-%ds%%s\n", MAXLEN_PARAM_TAG);
+                    fprintf(fdout, buf3, ParametersTag[j], (char *)ParametersValue[j]);
+                    break;
+                  case PARAM_INT:
+                    sprintf(buf3, "%%-%ds%%d\n", MAXLEN_PARAM_TAG);
+                    fprintf(fdout, buf3, ParametersTag[j], *((int *)ParametersValue[j]));
+                    break;
+                }
+            }
+        }
+
+      fclose(fdout);
+    }
+}
diff --git a/src/io/parameters.h b/src/io/parameters.h
new file mode 100644
index 0000000000000000000000000000000000000000..734c6aaa06c6d59d24f30972b11385526c220916
--- /dev/null
+++ b/src/io/parameters.h
@@ -0,0 +1,51 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file parameters.h
+ *
+ *  \brief declares a class for dealing with the parameter file
+ */
+
+#ifndef PARAMETERS_H
+#define PARAMETERS_H
+
+#include "../data/dtypes.h"
+#include "../mpi_utils/setcomm.h"
+
+#define PARAM_DOUBLE 1
+#define PARAM_STRING 2
+#define PARAM_INT 3
+
+#define PARAM_FIXED 0
+#define PARAM_CHANGEABLE 1
+
+#define MAXLEN_PARAM_TAG 50    /**< maximum length of the tag of a parameter in the parameter file */
+#define MAXLEN_PARAM_VALUE 200 /**< maximum length of the value of a parameter in the parameter file */
+#define MAX_PARAMETERS 300     /**< maximum number of parameters in the parameter file */
+
+class parameters : public setcomm
+{
+ public:
+  // constructors
+  parameters() : setcomm("delayed init") {}
+  parameters(MPI_Comm comm) : setcomm(comm) {}
+
+  int read_parameter_file(const char *fname);
+
+  void add_param(const char *name, void *buf, int type, int flag);
+
+  void write_used_parameters(const char *dirname, const char *fname);
+
+  int NParameters = 0;
+
+  char ParametersTag[MAX_PARAMETERS][MAXLEN_PARAM_TAG];
+  void *ParametersValue[MAX_PARAMETERS];
+  char ParametersType[MAX_PARAMETERS];
+  char ParametersChangeable[MAX_PARAMETERS];
+  int ParameterSequence[MAX_PARAMETERS];
+};
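+
+/* A hypothetical usage sketch (tag, buffer and file names are illustrative only):
+ *
+ *   parameters par(MPI_COMM_WORLD);
+ *   double TimeMax;
+ *   par.add_param("TimeMax", &TimeMax, PARAM_DOUBLE, PARAM_CHANGEABLE);
+ *   par.read_parameter_file("param.txt");
+ *   par.write_used_parameters("./output/", "parameters-usedvalues");
+ */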
+
+#endif /* PARAMETERS_H */
diff --git a/src/io/restart.cc b/src/io/restart.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cb2bccf0ba0f2a30b4da6ba9713e5578fb55c7e9
--- /dev/null
+++ b/src/io/restart.cc
@@ -0,0 +1,637 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file restart.cc
+ *
+ * \brief handles the reading/writing of restart files
+ */
+
+#include "gadgetconfig.h"
+
+#include <gsl/gsl_rng.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../io/io.h"
+#include "../io/restart.h"
+#include "../lightcone/lightcone.h"
+#include "../logs/logs.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../ngbtree/ngbtree.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+
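+/*! \brief This function writes a set of restart files for the current simulation state.
+ */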
+void restart::write(sim *Sim_ptr)
+{
+  Sim = Sim_ptr;
+  do_restart(MODUS_WRITE);
+}
+
+/*! \brief This function loads the last restart file.
+ *
+ * Some parameters in the parameter file may be changed between restarts.
+ * This function ensures that only the parameters allowed to change are updated;
+ * for all others the old value from the restart file is kept.
+ * If the end time of the simulation has changed, readjust_timebase() is called at the end.
+ */
+void restart::load(sim *Sim_ptr)
+{
+  Sim                           = Sim_ptr;
+  global_data_all_processes all = All; /* save global variables. (will be read from restart file) */
+
+  do_restart(MODUS_READ); /* ... read restart file. Note: This also resets
+                          all variables in the struct `All'.
+                          However, during the run, some variables in the parameter
+                          file are allowed to be changed, if desired. These need to
+                          be copied in the way done below.
+                        */
+
+  /* now update those parameters that were changed in the parameterfile, and where a change is allowed */
+
+  for(int i = 0; i < All.NParameters; i++)
+    {
+      if(All.ParametersChangeable[i] == PARAM_CHANGEABLE)
+        {
+          size_t off = (char *)All.ParametersValue[i] - (char *)&All;
+
+          if(off > sizeof(All))
+            Terminate("Invalid parameter pointer: '%s'  i=%d off=%lld\n", All.ParametersTag[i], i, (long long)off);
+
+          switch(All.ParametersType[i])
+            {
+              case PARAM_DOUBLE:
+                {
+                  double *old_dbl = (double *)((char *)&All + off);
+                  double *new_dbl = (double *)((char *)&all + off);
+
+                  if(*new_dbl != *old_dbl)
+                    {
+                      mpi_printf("RESTART: %s modified from %g to %g while restarting at Time=%g\n", All.ParametersTag[i], *old_dbl,
+                                 *new_dbl, All.Time);
+                      *old_dbl = *new_dbl;
+                    }
+                }
+                break;
+              case PARAM_STRING:
+                {
+                  char *old_p = (char *)&All + off;
+                  char *new_p = (char *)&all + off;
+                  if(strncmp(new_p, old_p, MAXLEN_PARAM_VALUE))
+                    {
+                      mpi_printf("RESTART: %s modified from '%s' to '%s' while restarting at Time=%g\n", All.ParametersTag[i], old_p,
+                                 new_p, All.Time);
+                      strncpy(old_p, new_p, MAXLEN_PARAM_VALUE);
+                    }
+                }
+                break;
+              case PARAM_INT:
+                {
+                  int *old_int = (int *)((char *)&All + off);
+                  int *new_int = (int *)((char *)&all + off);
+
+                  if(*new_int != *old_int)
+                    {
+                      mpi_printf("RESTART: %s modified from %d to %d while restarting at Time=%g\n", All.ParametersTag[i], *old_int,
+                                 *new_int, All.Time);
+                      *old_int = *new_int;
+                    }
+                }
+                break;
+            }
+        }
+    }
+
+  /* change in the output list table is always allowed */
+  All.OutputListLength = all.OutputListLength;
+  memcpy(All.OutputListTimes, all.OutputListTimes, sizeof(double) * All.OutputListLength);
+  memcpy(All.OutputListFlag, all.OutputListFlag, sizeof(char) * All.OutputListLength);
+
+  /* if the final time is changed, we process this with a special function */
+  if(All.TimeMax != all.TimeMax)
+    readjust_timebase(All.TimeMax, all.TimeMax);
+}
+
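+/*! \brief Rename a pre-existing restart file of this task to a "bak-" copy.
+ *
+ *  Before new restart files are written, any restart file left over from a
+ *  previous dump is renamed, and it is checked whether this happened
+ *  consistently on all tasks.
+ *
+ *  \param task number of the task whose restart file is backed up
+ */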
+void restart::backup_restartfiles(int task)
+{
+  char buf[MAXLEN_PATH_EXTRA];
+  char buf_bak[MAXLEN_PATH_EXTRA];
+  FILE *fcheck         = NULL;
+  int bak_files_status = 0;
+
+  mpi_printf("RESTART: Backing up restart files...\n");
+
+  sprintf(buf, "%s/restartfiles/%s.%d", All.OutputDir, "restart", task);
+  sprintf(buf_bak, "%s/restartfiles/bak-%s.%d", All.OutputDir, "restart", task);
+
+  if((fcheck = fopen(buf, "r")))
+    {
+      fclose(fcheck);
+
+      rename(buf, buf_bak);
+
+      bak_files_status = 1;
+    }
+
+  int bak_files_status_sum;
+  MPI_Allreduce(&bak_files_status, &bak_files_status_sum, 1, MPI_INT, MPI_SUM, Communicator);
+
+  if(bak_files_status_sum != NTask && bak_files_status_sum != 0)
+    mpi_printf("RESTART: some (%d) restart files were renamed to bak, but some (%d) weren't - something is very possibly wrong!\n",
+               bak_files_status_sum, NTask - bak_files_status_sum);
+  else if(bak_files_status_sum == NTask)
+    mpi_printf("RESTART: done renaming pre-existing restart files to bak files.\n");
+  else if(bak_files_status_sum == 0)
+    mpi_printf("RESTART: no pre-existing restart files for renaming were found.\n");
+}
+
+/*! \brief This function reads or writes the restart files.
+ *
+ * Each processor writes its own restart file, with the
+ * I/O being done in parallel. To avoid congestion of the disks
+ * you can tell the program to restrict the number of files
+ * that are written simultaneously to at most MaxFilesWithConcurrentIO.
+ *
+ * \param modus if modus==MODUS_READ the routine reads,
+ * if modus==MODUS_WRITE it writes a restart file.
+ */
+void restart::do_restart(int modus)
+{
+#ifdef DO_NOT_PRODUCE_BIG_OUTPUT
+  if(modus == MODUS_WRITE)
+    {
+      mpi_printf("RESTART: Omitting writing restart files.\n");
+      return;
+    }
+#endif
+
+  TIMER_START(CPU_RESTART);
+
+  double t0 = Logs.second();
+  reset_io_byte_count();
+
+  if(modus == MODUS_READ)
+    mpi_printf("RESTART: Loading restart files...\n");
+  else if(modus == MODUS_WRITE)
+    mpi_printf("RESTART: Writing restart files.\n");
+
+  /* create directory for restartfiles */
+  if(ThisTask == 0 && modus == MODUS_WRITE)
+    {
+      char buf[MAXLEN_PATH_EXTRA];
+      sprintf(buf, "%s/restartfiles", All.OutputDir);
+      mkdir(buf, 02755);
+    }
+  MPI_Barrier(Communicator);
+
+  if(All.MaxFilesWithConcurrentIO > NTask)
+    {
+      mpi_printf("NOTICE: MaxFilesWithConcurrentIO has been reduced to the number of processors\n");
+      All.MaxFilesWithConcurrentIO = NTask;
+    }
+
+  if(All.MaxFilesWithConcurrentIO < 1)
+    {
+      mpi_printf("NOTICE: MaxFilesWithConcurrentIO has been set to be equal to the number of processors\n");
+      All.MaxFilesWithConcurrentIO = NTask;
+    }
+
+  files_concurrent = All.MaxFilesWithConcurrentIO;
+
+  files_groups = NTask / All.MaxFilesWithConcurrentIO;
+
+  if(NTask % All.MaxFilesWithConcurrentIO)
+    files_groups++;
+
+  if(modus == MODUS_WRITE)
+    backup_restartfiles(ThisTask);
+
+  /* now carry out the file I/O, controlled by a scheduler that achieves optimum I/O bandwidth under the constraint of a maximum
+   * number of concurrently accessed files */
+  work_files(modus);
+
+  /* check whether the restarts are all at the same time */
+  if(modus == MODUS_READ) /* read */
+    {
+      global_data_all_processes all_task0;
+
+      if(ThisTask == 0)
+        all_task0 = All;
+
+      MPI_Bcast(&all_task0, sizeof(global_data_all_processes), MPI_BYTE, 0, Communicator);
+
+      if(all_task0.Time != All.Time)
+        Terminate("The restart file on task=%d is not consistent with the one on task=0\n", ThisTask);
+    }
+
+  long long byte_count = get_io_byte_count(), byte_count_all;
+  sumup_longs(1, &byte_count, &byte_count_all, Communicator);
+
+  double t1 = Logs.second();
+
+  mpi_printf("RESTART: done. load/save took %g sec, total size %g MB, corresponds to effective I/O rate of %g MB/sec\n",
+             Logs.timediff(t0, t1), byte_count_all / (1024.0 * 1024.0), byte_count_all / (1024.0 * 1024.0) / Logs.timediff(t0, t1));
+
+  TIMER_STOP(CPU_RESTART);
+}
+
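+/*! \brief Check for completed restart file I/O and start the next waiting task.
+ *
+ *  Task 0 probes (without blocking) for a completion message from another
+ *  task. If one has arrived, the next task that is still waiting for
+ *  permission is sent its start signal, so that at most
+ *  MaxFilesWithConcurrentIO files are processed at the same time.
+ *
+ *  \param modus MODUS_READ or MODUS_WRITE, only used for the log messages
+ */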
+void restart::polling(int modus)
+{
+  if(ThisTask == 0)
+    if(files_completed < NTask)
+      {
+        MPI_Status status;
+        int flag;
+
+        /* now check for a completion message  */
+        MPI_Iprobe(MPI_ANY_SOURCE, TAG_KEY, Communicator, &flag, &status);
+
+        if(flag)
+          {
+            int source = status.MPI_SOURCE;
+
+            int dummy;
+            MPI_Recv(&dummy, 1, MPI_INT, source, TAG_KEY, Communicator, MPI_STATUS_IGNORE);
+            files_completed++;
+
+            if(files_started < NTask)
+              {
+                if((files_started % files_concurrent) == 0)
+                  {
+                    if(modus == MODUS_READ)
+                      mpi_printf("RESTART: Loading restart files group #%d out of %d...\n", (files_started / files_concurrent) + 1,
+                                 files_groups);
+                    else if(modus == MODUS_WRITE)
+                      mpi_printf("RESTART: Writing restart files group #%d out of %d...\n", (files_started / files_concurrent) + 1,
+                                 files_groups);
+                  }
+
+                /* send start signal */
+                MPI_Ssend(&ThisTask, 1, MPI_INT, seq[files_started++].thistask, TAG_N, Communicator);
+              }
+          }
+      }
+}
+
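+/*! \brief Schedule the parallel reading/writing of the restart files.
+ *
+ *  Task 0 gathers a node-aware ordering of all tasks, gives the first
+ *  MaxFilesWithConcurrentIO of them (including itself) permission to process
+ *  their restart files, and hands out further start signals as completion
+ *  messages come in. All other tasks wait for their start signal, process
+ *  their own restart file with contents_restart_file(), and report back.
+ *
+ *  \param modus MODUS_READ or MODUS_WRITE
+ */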
+void restart::work_files(int modus)
+{
+  if(ThisTask == 0)
+    if(!(seq = (seq_data *)malloc(NTask * sizeof(seq_data))))
+      Terminate("can't allocate seq_data");
+
+  seq_data seq_loc;
+  seq_loc.thistask   = ThisTask;
+  seq_loc.rankinnode = RankInThisNode;
+  seq_loc.thisnode   = ThisNode;
+
+  MPI_Gather(&seq_loc, sizeof(seq_data), MPI_BYTE, seq, sizeof(seq_data), MPI_BYTE, 0, Communicator);
+
+  if(ThisTask == 0)
+    {
+      std::sort(seq, seq + NTask);
+
+      files_started   = 0;
+      files_completed = 0;
+
+      if((files_started % files_concurrent) == 0)
+        {
+          if(modus == MODUS_READ)
+            mpi_printf("RESTART: Loading restart files group #%d out of %d...\n", (files_started / files_concurrent) + 1,
+                       files_groups);
+          else if(modus == MODUS_WRITE)
+            mpi_printf("RESTART: Writing restart files group #%d out of %d...\n", (files_started / files_concurrent) + 1,
+                       files_groups);
+        }
+
+      for(int i = 1; i < All.MaxFilesWithConcurrentIO; i++)
+        {
+          files_started++;
+          MPI_Ssend(&ThisTask, 1, MPI_INT, seq[i].thistask, TAG_N, Communicator);
+        }
+
+      files_started++;
+      contents_restart_file(modus);
+      files_completed++;
+
+      if(files_started < NTask)
+        {
+          if((files_started % files_concurrent) == 0)
+            {
+              if(modus == MODUS_READ)
+                mpi_printf("RESTART: Loading restart files group #%d out of %d...\n", (files_started / files_concurrent) + 1,
+                           files_groups);
+              else if(modus == MODUS_WRITE)
+                mpi_printf("RESTART: Writing restart files group #%d out of %d...\n", (files_started / files_concurrent) + 1,
+                           files_groups);
+            }
+
+          /* send start signal */
+          MPI_Ssend(&ThisTask, 1, MPI_INT, seq[files_started++].thistask, TAG_N, Communicator);
+        }
+
+      while(files_completed < NTask)
+        polling(modus);
+
+      free(seq);
+    }
+  else
+    {
+      /* wait for start signal */
+      int dummy;
+      MPI_Recv(&dummy, 1, MPI_INT, 0, TAG_N, Communicator, MPI_STATUS_IGNORE); /* wait until we are told to start */
+
+      contents_restart_file(modus);
+
+      /* send back completion notice */
+      MPI_Ssend(&ThisTask, 1, MPI_INT, 0, TAG_KEY, Communicator);
+    }
+}
+
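+/*! \brief Read or write the actual contents of this task's restart file.
+ *
+ *  This covers the global data, the particle and SPH particle data, optional
+ *  merger tree, lightcone and massmap data, the state of the random number
+ *  generator, the CPU timing logs, the time integration bookkeeping, and the
+ *  neighbour tree and domain decomposition data.
+ *
+ *  \param modus MODUS_READ or MODUS_WRITE
+ */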
+void restart::contents_restart_file(int modus)
+{
+  char buf[MAXLEN_PATH_EXTRA];
+  sprintf(buf, "%s/restartfiles/%s.%d", All.OutputDir, "restart", ThisTask);
+
+  if(modus == MODUS_READ)
+    {
+      if(!(fd = fopen(buf, "r")))
+        {
+          Terminate("RESTART: Restart file '%s' not found.\n", buf);
+        }
+    }
+  else if(modus == MODUS_WRITE)
+    {
+      if(!(fd = fopen(buf, "w")))
+        {
+          Terminate("RESTART: Restart file '%s' cannot be opened.\n", buf);
+        }
+    }
+  else
+    Terminate("unknown modus\n");
+
+  /* common data  */
+  byten(All.get_data_ptr(), All.get_data_size(), modus);
+
+  /* converter data to integer coordinates*/
+  intposconvert *converter = &Sim->Sp;
+  byten(converter, sizeof(intposconvert), modus);
+
+  in(&Sim->Sp.MaxPart, modus);
+  in(&Sim->Sp.MaxPartSph, modus);
+  byten(&Sim->Sp.TotNumPart, sizeof(Sim->Sp.TotNumPart), modus);
+  byten(&Sim->Sp.TotNumGas, sizeof(Sim->Sp.TotNumGas), modus);
+
+  if(modus == MODUS_READ) /* read */
+    Sim->Sp.allocate_memory();
+
+  in(&Sim->Sp.NumPart, modus);
+
+  /* Particle data  */
+  byten(&Sim->Sp.P[0], Sim->Sp.NumPart * sizeof(particle_data), modus);
+
+  in(&Sim->Sp.NumGas, modus);
+
+  if(Sim->Sp.NumGas > 0)
+    {
+      /* Sph-Particle data  */
+      byten(&Sim->Sp.SphP[0], Sim->Sp.NumGas * sizeof(sph_particle_data), modus);
+    }
+
+#if defined(MERGERTREE) && defined(SUBFIND)
+  byten(&Sim->MergerTree.PrevTotNsubhalos, sizeof(long long), modus);
+  byten(&Sim->MergerTree.PrevNsubhalos, sizeof(int), modus);
+#endif
+
+  /* lightcone particle data  */
+#ifdef LIGHTCONE_PARTICLES
+  /* converter data to integer coordinates*/
+  intposconvert *converter_lp = &Sim->Lp;
+  byten(converter_lp, sizeof(intposconvert), modus);
+
+  in(&Sim->Lp.MaxPart, modus);
+  byten(&Sim->Lp.TotNumPart, sizeof(Sim->Lp.TotNumPart), modus);
+
+  if(modus == MODUS_READ) /* read */
+    Sim->Lp.allocate_memory();
+
+  in(&Sim->Lp.NumPart, modus);
+  byten(&Sim->Lp.P[0], Sim->Lp.NumPart * sizeof(lightcone_particle_data), modus);
+
+  in(&Sim->LightCone.NumLastCheck, modus);
+#endif
+
+  /* lightcone massmap data  */
+#ifdef LIGHTCONE_MASSMAPS
+  in(&Sim->Mp.MaxPart, modus);
+
+  if(modus == MODUS_READ)
+    Sim->Mp.allocate_memory();
+
+  in(&Sim->Mp.NumPart, modus);
+  byten(&Sim->Mp.P[0], Sim->Mp.NumPart * sizeof(lightcone_massmap_data), modus);
+
+  /* allocate and clear local piece of mass map if needed */
+  if(modus == MODUS_READ)
+    {
+      Sim->LightCone.Mp->Npix = nside2npix(All.LightConeMassMapsNside);
+      subdivide_evenly(Sim->LightCone.Mp->Npix, NTask, ThisTask, &Sim->LightCone.Mp->FirstPix, &Sim->LightCone.Mp->NpixLoc);
+
+      Sim->LightCone.MassMap =
+          (double *)Mem.mymalloc_movable_clear(&Sim->LightCone.MassMap, "MassMap", Sim->LightCone.Mp->NpixLoc * sizeof(double));
+    }
+
+  byten(Sim->LightCone.MassMap, Sim->LightCone.Mp->NpixLoc * sizeof(double), modus);
+#endif
+
+  /* write state of random number generator */
+  byten(gsl_rng_state(random_generator), gsl_rng_size(random_generator), modus);
+
+  byten(Logs.CPU_Step, logs::CPU_LAST * sizeof(double), modus);
+  byten(Logs.CPU_Step_Stored, logs::CPU_LAST * sizeof(double), modus);
+  byten(Logs.CPU_Sum, logs::CPU_LAST * sizeof(double), modus);
+
+  /* now store variables for time integration bookkeeping */
+  byten(Sim->Sp.TimeBinSynchronized, TIMEBINS * sizeof(int), modus);
+
+  in(&Sim->Sp.TimeBinsHydro.NActiveParticles, modus);
+  in(&Sim->Sp.TimeBinsGravity.NActiveParticles, modus);
+  byten(&Sim->Sp.TimeBinsHydro.GlobalNActiveParticles, sizeof(long long), modus);
+  byten(&Sim->Sp.TimeBinsGravity.GlobalNActiveParticles, sizeof(long long), modus);
+  byten(Sim->Sp.TimeBinsHydro.ActiveParticleList, Sim->Sp.TimeBinsHydro.NActiveParticles * sizeof(int), modus);
+  byten(Sim->Sp.TimeBinsGravity.ActiveParticleList, Sim->Sp.TimeBinsGravity.NActiveParticles * sizeof(int), modus);
+  byten(Sim->Sp.TimeBinsHydro.NextInTimeBin, Sim->Sp.NumGas * sizeof(int), modus);
+  byten(Sim->Sp.TimeBinsGravity.NextInTimeBin, Sim->Sp.NumPart * sizeof(int), modus);
+  byten(Sim->Sp.TimeBinsHydro.PrevInTimeBin, Sim->Sp.NumGas * sizeof(int), modus);
+  byten(Sim->Sp.TimeBinsGravity.PrevInTimeBin, Sim->Sp.NumPart * sizeof(int), modus);
+  byten(Sim->Sp.TimeBinsHydro.TimeBinCount, TIMEBINS * sizeof(int), modus);
+  byten(Sim->Sp.TimeBinsGravity.TimeBinCount, TIMEBINS * sizeof(int), modus);
+  byten(Sim->Sp.TimeBinsHydro.FirstInTimeBin, TIMEBINS * sizeof(int), modus);
+  byten(Sim->Sp.TimeBinsGravity.FirstInTimeBin, TIMEBINS * sizeof(int), modus);
+  byten(Sim->Sp.TimeBinsHydro.LastInTimeBin, TIMEBINS * sizeof(int), modus);
+  byten(Sim->Sp.TimeBinsGravity.LastInTimeBin, TIMEBINS * sizeof(int), modus);
+
+#ifdef STARFORMATION
+  byten(Sim->Sp.TimeBinSfr, TIMEBINS * sizeof(double), modus);
+#endif
+
+  /* now store relevant data for tree */
+
+  in(&Sim->Domain.NTopleaves, modus);
+  in(&Sim->Domain.NTopnodes, modus);
+
+  in(&Sim->NgbTree.MaxPart, modus);
+  in(&Sim->NgbTree.MaxNodes, modus);
+  in(&Sim->NgbTree.NumNodes, modus);
+  in(&Sim->NgbTree.NumPartImported, modus);
+  in(&Sim->NgbTree.FirstNonTopLevelNode, modus);
+
+  if(modus == MODUS_READ)
+    {
+      Sim->Domain.domain_allocate(Sim->Domain.NTopnodes);
+
+      /* passing a negative number to the allocate call prevents NgbTree.MaxPart and NgbTree.MaxNodes from being recomputed here */
+      Sim->NgbTree.treeallocate(-1, &Sim->Sp, &Sim->Domain);
+
+      if(Sim->NgbTree.MaxPart != 0)
+        {
+          Sim->NgbTree.Points   = (ngbpoint_data *)Mem.mymalloc_movable(&Sim->NgbTree.Points, "Points",
+                                                                      Sim->NgbTree.NumPartImported * sizeof(ngbpoint_data));
+          Sim->NgbTree.Nextnode = (int *)Mem.mymalloc_movable(
+              &Sim->NgbTree.Nextnode, "Nextnode",
+              (Sim->NgbTree.MaxPart + Sim->Domain.NTopleaves + Sim->NgbTree.NumPartImported) * sizeof(int));
+          Sim->NgbTree.Father = (int *)Mem.mymalloc_movable(&Sim->NgbTree.Father, "Father",
+                                                            (Sim->NgbTree.MaxPart + Sim->NgbTree.NumPartImported) * sizeof(int));
+        }
+    }
+
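+  /* Note: the neighbour tree keeps its top-level nodes (TopNodes) and the remaining locally attached
+     nodes (Nodes) behind separate pointers, both indexed with an offset of MaxPart; this is why the
+     node data is written/read below in two pieces with the corresponding pointer offsets. */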
+  if(Sim->Sp.TotNumGas > 0)
+    {
+      byten(Sim->NgbTree.Nodes + Sim->NgbTree.MaxPart + Sim->Domain.NTopnodes,
+            (Sim->NgbTree.NumNodes - Sim->Domain.NTopnodes) * sizeof(ngbnode), modus);
+      byten(Sim->NgbTree.TopNodes + Sim->NgbTree.MaxPart, Sim->Domain.NTopnodes * sizeof(ngbnode), modus);
+      byten(Sim->NgbTree.NodeIndex, Sim->Domain.NTopleaves * sizeof(int), modus);
+      byten(Sim->NgbTree.NodeSibling, Sim->Domain.NTopleaves * sizeof(int), modus);
+      byten(Sim->NgbTree.NodeLevel, Sim->Domain.NTopleaves * sizeof(unsigned char), modus);
+      byten(Sim->NgbTree.Nextnode, (Sim->NgbTree.MaxPart + Sim->Domain.NTopleaves) * sizeof(int), modus);
+      byten(Sim->NgbTree.Father, Sim->NgbTree.MaxPart * sizeof(int), modus);
+    }
+
+  byten(Sim->Domain.TopNodes, Sim->Domain.NTopnodes * Sim->Domain.domain_sizeof_topnode_data(), modus);
+  byten(Sim->Domain.TaskOfLeaf, Sim->Domain.NTopleaves * sizeof(int), modus);
+  byten(Sim->Domain.ListOfTopleaves, Sim->Domain.NTopleaves * sizeof(int), modus);
+  byten(Sim->Domain.FirstTopleafOfTask, NTask * sizeof(int), modus);
+  byten(Sim->Domain.NumTopleafOfTask, NTask * sizeof(int), modus);
+
+  fclose(fd);
+}
+
+/*! \brief Adjusts the timeline if the TimeMax variable is
+ * increased upon a restart.
+ *
+ * The approach taken here is to reduce the resolution of the
+ * integer timeline by factors of 2 until the new final time
+ * can be reached within TIMEBASE.
+ *
+ * \param TimeMax_old old final time
+ * \param TimeMax_new new final time (must be larger than the old one)
+ */
+void restart::readjust_timebase(double TimeMax_old, double TimeMax_new)
+{
+  mpi_printf("\nRESTART: All.TimeMax has been changed in the parameterfile from %g to %g. Need to adjust integer timeline.\n",
+             TimeMax_old, TimeMax_new);
+
+  if(TimeMax_new < TimeMax_old)
+    Terminate("\nIt is not allowed to reduce All.TimeMax\n");
+
+  long long ti_end;
+
+  if(All.ComovingIntegrationOn)
+    ti_end = (long long)(log(TimeMax_new / All.TimeBegin) / All.Timebase_interval);
+  else
+    ti_end = (long long)((TimeMax_new - All.TimeBegin) / All.Timebase_interval);
+
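+  /* each pass through the loop below halves the resolution of the integer timeline: the physical time
+     corresponding to one integer tick doubles, all stored integer times are divided by two, and every
+     particle on a finite time bin moves down by one bin.  For example, if ti_end initially equals
+     5 * TIMEBASE, three halvings (5 -> 2.5 -> 1.25 -> 0.625 in units of TIMEBASE) are needed before the
+     new final time fits into the integer timeline. */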
+  while(ti_end > TIMEBASE)
+    {
+      All.Timebase_interval *= 2.0;
+
+      ti_end /= 2;
+      All.Ti_Current /= 2;
+
+#if defined(PMGRID) && !defined(TREEPM_NOTIMESPLIT)
+      All.PM_Ti_begstep /= 2;
+      All.PM_Ti_endstep /= 2;
+#endif
+
+      for(int i = 0; i < Sim->Sp.NumPart; i++)
+        {
+          if(Sim->Sp.P[i].TimeBinGrav > 0)
+            {
+              Sim->Sp.P[i].Ti_Current = Sim->Sp.P[i].Ti_Current / 2;
+
+              Sim->Sp.P[i].TimeBinGrav--;
+
+              if(Sim->Sp.P[i].TimeBinGrav <= 0)
+                Terminate("Error in readjust_timebase(). Minimum Timebin for particle %d reached.\n", i);
+            }
+
+          if(Sim->Sp.P[i].getType() == 0)
+            {
+              if(Sim->Sp.P[i].getTimeBinHydro() > 0)
+                {
+                  Sim->Sp.P[i].setTimeBinHydro(Sim->Sp.P[i].getTimeBinHydro() - 1);
+
+                  if(Sim->Sp.P[i].getTimeBinHydro() <= 0)
+                    Terminate("Error in readjust_timebase(). Minimum Timebin (hydro) for sph particle %d reached.\n", i);
+                }
+            }
+        }
+    }
+
+  All.TimeMax = TimeMax_new;
+}
+
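+/*! \brief reads/writes n bytes to/from the restart file
+ *
+ * The transfer is split into chunks of at most BLKSIZE bytes, with polling() called in between chunks,
+ * presumably so that the coordination of the concurrently processed restart files can proceed even
+ * during very large individual transfers.
+ */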
+void restart::byten(void *x, size_t n, int modus)
+{
+  char *p = (char *)x;
+
+  while(n > BLKSIZE)
+    {
+      byten_doit(p, BLKSIZE, modus);
+      p += BLKSIZE;
+      n -= BLKSIZE;
+      polling(modus);
+    }
+
+  if(n > 0)
+    byten_doit(p, n, modus);
+}
+
+/*! \brief reads/writes n bytes to a restart file
+ */
+void restart::byten_doit(void *x, size_t n, int modus)
+{
+  if(modus == MODUS_READ)
+    my_fread(x, n, 1, fd);
+  else
+    my_fwrite(x, n, 1, fd);
+}
+
+/*! \brief reads/writes one integer to a restart file
+ *
+ * \param x pointer to the integer
+ * \param modus if modus>0  the restart()-routine reads,
+ * if modus==0 it writes a restart file.
+ */
+void restart::in(int *x, int modus) { byten(x, sizeof(int), modus); }
diff --git a/src/io/restart.h b/src/io/restart.h
new file mode 100644
index 0000000000000000000000000000000000000000..4824a6de258624b51c7c5fb550b0f2a53ae96ed3
--- /dev/null
+++ b/src/io/restart.h
@@ -0,0 +1,73 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file restart.h
+ *
+ * \brief declares class used for reading/writing of restart files
+ */
+
+#ifndef RESTART_H
+#define RESTART_H
+
+#define MODUS_WRITE 0
+#define MODUS_READ 1
+
+#define BLKSIZE (1024 * 1024)
+
+#include "../io/io_streamcount.h"
+#include "../main/simulation.h"
+
+class restart : public io_streamcount, public setcomm
+{
+ public:
+  restart(MPI_Comm comm) : setcomm(comm) /* constructor */ { determine_compute_nodes(); }
+
+  void load(sim *Sim_ptr);
+  void write(sim *Sim_ptr);
+
+ private:
+  sim *Sim;
+
+  FILE *fd;
+
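+  /* helper record describing a task's placement (global rank, rank within its node, and node index);
+     the ordering defined below sorts primarily by the in-node rank, which presumably spreads the
+     restart files that are processed concurrently across different compute nodes */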
+  struct seq_data
+  {
+    int thistask;
+    int rankinnode;
+    int thisnode;
+    bool operator<(const seq_data &other) const
+    {
+      if(rankinnode < other.rankinnode)
+        return true;
+      if(rankinnode > other.rankinnode)
+        return false;
+      if(thisnode < other.thisnode)
+        return true;
+      if(thisnode > other.thisnode)
+        return false;
+      return thistask < other.thistask;
+    }
+  };
+  seq_data *seq;
+
+  int files_started;
+  int files_completed;
+  int files_concurrent;
+  int files_groups;
+
+  void do_restart(int modus);
+
+  void readjust_timebase(double TimeMax_old, double TimeMax_new);
+  void work_files(int modus);
+  void contents_restart_file(int modus);
+  void backup_restartfiles(int task);
+  void polling(int modus);
+  void in(int *x, int modus);
+  void byten(void *x, size_t n, int modus);
+  void byten_doit(void *x, size_t n, int modus);
+};
+
+#endif /* RESTART_H */
diff --git a/src/io/snap_io.cc b/src/io/snap_io.cc
new file mode 100644
index 0000000000000000000000000000000000000000..733ec231f58e056c2da0ce453d2a19f0cb3c5292
--- /dev/null
+++ b/src/io/snap_io.cc
@@ -0,0 +1,1059 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file snap_io.cc
+ *
+ *  \brief routines for I/O of snapshot files
+ */
+
+#include "gadgetconfig.h"
+
+#include <errno.h>
+#include <hdf5.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+
+#include "../cooling_sfr/cooling.h"
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../fof/fof.h"
+#include "../gitversion/version.h"
+#include "../gravtree/gravtree.h"
+#include "../io/hdf5_util.h"
+#include "../io/io.h"
+#include "../io/snap_io.h"
+#include "../lightcone/lightcone.h"
+#include "../logs/logs.h"
+#include "../logs/timer.h"
+#include "../main/main.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/peano.h"
+#include "../src/pm/pm.h"
+#include "../system/system.h"
+
+/*!
+ * \brief Function for registering the I/O fields of snapshot files.
+ *
+ * For the meaning of the init_field() arguments, see the documentation of init_field().
+ * Don't forget to add the new IO_FLAG to io_private.h
+ */
+void snap_io::init_basic(simparticles *Sp_ptr)
+{
+  Sp = Sp_ptr;
+
+  this->N_IO_Fields  = 0;
+  this->N_DataGroups = NTYPES + 1;  // the last data group is a tree table used only for storing/reading tree-reordered particle data
+  this->header_size  = sizeof(header);
+  this->header_buf   = &header;
+  this->type_of_file = FILE_IS_SNAPSHOT;
+  sprintf(this->info, "SNAPSHOT: writing snapshot");
+
+#ifdef OUTPUT_COORDINATES_AS_INTEGERS
+  init_field("IPOS", "IntCoordinates", MEM_MY_INTPOS_TYPE, FILE_MY_INTPOS_TYPE, READ_IF_PRESENT, 3, A_P, NULL, io_func_intpos,
+             ALL_TYPES, 1, 1., -1., 1., 0., 0., All.UnitLength_in_cm * Sp->FacIntToCoord, true);
+#else
+  init_field("POS ", "Coordinates", MEM_MY_DOUBLE, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 3, A_P, NULL, io_func_pos, ALL_TYPES, 1, 1., -1.,
+             1., 0., 0., All.UnitLength_in_cm, true);
+#endif
+
+#ifdef OUTPUT_VELOCITIES_IN_HALF_PRECISION
+  init_field("VEL ", "Velocities", MEM_MY_FLOAT, FILE_HALF, READ_IF_PRESENT, 3, A_NONE, NULL, io_func_vel, ALL_TYPES, 1, 0.5, 0., 0.,
+             0., 1., All.UnitVelocity_in_cm_per_s);
+#else
+  init_field("VEL ", "Velocities", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 3, A_NONE, NULL, io_func_vel, ALL_TYPES, 1, 0.5,
+             0., 0., 0., 1., All.UnitVelocity_in_cm_per_s);
+#endif
+
+#ifdef OUTPUT_ACCELERATION
+#ifdef OUTPUT_ACCELERATIONS_IN_HALF_PRECISION
+  All.accel_normalize_fac = 10.0 * All.Hubble * (100.0 * 1.0e5 / All.UnitVelocity_in_cm_per_s);
+
+  init_field("ACCE", "Acceleration", MEM_MY_FLOAT, FILE_HALF, SKIP_ON_READ, 3, A_NONE, 0, io_func_accel, ALL_TYPES, 1, -2.0, 1, -1, 0,
+             2, All.accel_normalize_fac * All.UnitVelocity_in_cm_per_s * All.UnitVelocity_in_cm_per_s / All.UnitLength_in_cm);
+#else
+  All.accel_normalize_fac = 1.0;
+
+  init_field("ACCE", "Acceleration", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, SKIP_ON_READ, 3, A_NONE, 0, io_func_accel, ALL_TYPES, 1, -2.0, 1,
+             -1, 0, 2, All.UnitVelocity_in_cm_per_s * All.UnitVelocity_in_cm_per_s / All.UnitLength_in_cm);
+#endif
+
+  /* hydro acceleration */
+  init_field("HACC", "HydroAcceleration", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 3, A_SPHP, &Sp->SphP[0].HydroAccel, 0,
+             GAS_ONLY, 0, 0, 0, 0, 0, 0, 0);
+#endif
+
+  init_field("ID  ", "ParticleIDs", MEM_MY_ID_TYPE, FILE_MY_ID_TYPE, READ_IF_PRESENT, 1, A_P, NULL, io_func_id, ALL_TYPES, 0, 0, 0, 0,
+             0, 0, 0, true);
+
+#ifndef LEAN
+  init_field("MASS", "Masses", MEM_MY_DOUBLE, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_P, NULL, io_func_mass, MASS_BLOCK, 1, 0., -1.,
+             0., 1., 0., All.UnitMass_in_g);
+#endif
+
+#ifdef SECOND_ORDER_LPT_ICS
+  init_field("LPTM", "SecondOrderICMasses", MEM_FLOAT, FILE_NONE, READ_IF_PRESENT, 1, A_P, &Sp->P[0].OldAcc, NULL, ALL_TYPES, 0, 0, 0,
+             0, 0, 0, 0);
+#endif
+
+  init_field("U   ", "InternalEnergy", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_NONE, NULL, io_func_u, GAS_ONLY, 1, 0.,
+             0., 0., 0., 2., All.UnitVelocity_in_cm_per_s * All.UnitVelocity_in_cm_per_s);
+
+  init_field("RHO ", "Density", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_SPHP, &Sp->SphP[0].Density, NULL, GAS_ONLY, 1,
+             -3., 2., -3., 1., 0., All.UnitDensity_in_cgs);
+
+  init_field("HSML", "SmoothingLength", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_SPHP, &Sp->SphP[0].Hsml, NULL, GAS_ONLY,
+             1, 1., -1., 1., 0., 0., All.UnitLength_in_cm);
+
+#ifdef STARFORMATION
+
+  init_field("SFR ", "StarFormationRate", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, All.RestartFlag == RST_FOF ? READ_IF_PRESENT : SKIP_ON_READ,
+             1, A_NONE, 0, io_func_sfr, GAS_ONLY, 1, 0., 0., -1., 1., 1., SOLAR_MASS / SEC_PER_YEAR);
+
+  init_field("AGE ", "StellarFormationTime", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_P, &Sp->P[0].StellarAge, NULL,
+             AGE_BLOCK, /* stellar formation time */
+             0, 0, 0, 0, 0, 0, 0);
+
+  init_field("Z   ", "Metallicity", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_NONE, 0, io_func_metallicity,
+             Z_BLOCK, /* gas and star metallicity */
+             0, 0, 0, 0, 0, 0, 0);
+#endif
+
+#if defined(PRESSURE_ENTROPY_SPH) && defined(OUTPUT_PRESSURE_SPH_DENSITY)
+  init_field("PRHO", "PressureSphDensity", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_SPHP, &Sp->SphP[0].PressureSphDensity,
+             NULL, GAS_ONLY, /* Pressure density */
+             1, -3., 2., -3., 1., 0., All.UnitDensity_in_cgs);
+#endif
+
+#ifdef OUTPUT_PRESSURE
+  init_field("PRES", "Pressure", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, SKIP_ON_READ, 1, A_NONE, 0, io_func_pressure,
+             GAS_ONLY, /* particle pressure */
+             1, -3., 2., -3., 1., 2., All.UnitDensity_in_cgs * All.UnitVelocity_in_cm_per_s * All.UnitVelocity_in_cm_per_s);
+#endif
+
+#if defined(TIMEDEP_ART_VISC) && defined(OUTPUT_VISCOSITY_PARAMETER)
+  init_field("ALP ", "ArtificialViscosityParameter", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_SPHP, &Sp->SphP[0].Alpha,
+             NULL, GAS_ONLY, 1, -3., 2., -3., 1., 0., 1);
+#endif
+
+#ifdef OUTPUT_ENTROPY
+  init_field("ENTR", "Entropy", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_SPHP, &Sp->SphP[0].Entropy, 0,
+             GAS_ONLY, /* particle entropy */
+             0, 0, 0, 0, 0, 0, 0);
+#endif
+
+#ifdef COOLING
+  init_field("NE  ", "ElectronAbundance", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_SPHP, &Sp->SphP[0].Ne, 0,
+             GAS_ONLY, /* electron abundance */
+             0, 0, 0, 0, 0, 0, 0);
+#endif
+
+#ifdef OUTPUT_POTENTIAL
+  init_field("POT ", "Potential", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_P, &Sp->P[0].Potential, 0,
+             ALL_TYPES, /* potential */
+             1, -1., 0., 0., 0., 2., All.UnitVelocity_in_cm_per_s * All.UnitVelocity_in_cm_per_s);
+#endif
+
+#ifdef OUTPUT_CHANGEOFENTROPY
+  init_field("ENDT", "RateOfChangeOfEntropy", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_SPHP, &Sp->SphP[0].DtEntropy, 0,
+             GAS_ONLY, /* particle entropy change */
+             0, 0, 0, 0, 0, 0, 0);
+#endif
+
+#ifdef OUTPUT_TIMESTEP
+  init_field("TSTP", "TimeStep", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, SKIP_ON_READ, 1, A_NONE, 0, io_func_timestep,
+             ALL_TYPES, /* time step */
+             0, 0, 0, 0, 0, 0, 0);
+
+  init_field("TSTH", "TimeStepHydro", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, SKIP_ON_READ, 1, A_NONE, 0, io_func_timestephydro,
+             GAS_ONLY, /* hydro time step */
+             0, 0, 0, 0, 0, 0, 0);
+#endif
+
+#ifdef OUTPUT_DIVVEL
+  init_field("DIVV", "VelocityDivergence", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_SPHP, &Sp->SphP[0].DivVel, 0,
+             GAS_ONLY, /* hydro velocity divergence */
+             0, 0, 0, 0, 0, 0, 0);
+#endif
+
+#ifdef OUTPUT_CURLVEL
+  init_field("ROTV", "VelocityCurl", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_SPHP, &Sp->SphP[0].CurlVel, 0,
+             GAS_ONLY, /* absolute value of rot v */
+             0, 0, 0, 0, 0, 0, 0);
+#endif
+
+#ifdef OUTPUT_VELOCITY_GRADIENT
+#ifndef IMPROVED_VELOCITY_GRADIENTS
+#error "The option OUTPUT_VELOCITY_GRADIENT requires IMPROVED_VELOCITY_GRADIENTS"
+#endif
+  init_field("GRAV", "VelocityGradient", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 9, A_SPHP, &Sp->SphP[0].dvel[0][0], 0,
+             GAS_ONLY, 0, 0, 0, 0, 0, 0, 0);
+#endif
+
+#ifdef OUTPUT_COOLHEAT
+  init_field("COHE", "CoolingHeatingEnergy", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_SPHP, &Sp->SphP[0].CoolHeat, 0,
+             GAS_ONLY, 0, 0, 0, 0, 0, 0, 0);
+#endif
+
+#if defined(SUBFIND) && defined(SUBFIND_STORE_LOCAL_DENSITY)
+
+  init_field("SFDE", "SubfindDensity", MEM_MY_FLOAT, All.RestartFlag != RST_CREATEICS ? FILE_MY_IO_FLOAT : FILE_NONE, SKIP_ON_READ, 1,
+             A_PS, &Sp->PS[0].SubfindDensity, 0, ALL_TYPES, /* subfind density */
+             1, -3., 2., -3., 1., 0., All.UnitDensity_in_cgs);
+
+  init_field("SFHS", "SubfindHsml", MEM_MY_FLOAT, All.RestartFlag != RST_CREATEICS ? FILE_MY_IO_FLOAT : FILE_NONE, SKIP_ON_READ, 1,
+             A_PS, &Sp->PS[0].SubfindHsml, 0, ALL_TYPES, /* subfind hsml */
+             1, 1., -1., 1., 0., 0., All.UnitLength_in_cm);
+
+  init_field("SFVD", "SubfindVelDisp", MEM_MY_FLOAT, All.RestartFlag != RST_CREATEICS ? FILE_MY_IO_FLOAT : FILE_NONE, SKIP_ON_READ, 1,
+             A_PS, &Sp->PS[0].SubfindVelDisp, 0, ALL_TYPES, /* subfind velocity dispersion */
+             1, 0., 0., 0., 0., 1., All.UnitVelocity_in_cm_per_s);
+#endif
+
+#if defined(GADGET2_HEADER) && defined(REARRANGE_OPTION) && defined(MERGERTREE)
+  Terminate("GADGET2_HEADER does not work together with REARRANGE_OPTION\n");
+#endif
+}
+
+#if defined(REARRANGE_OPTION) && defined(MERGERTREE)
+void snap_io::init_extra(simparticles *Sp_ptr, mergertree *MergerTree_ptr)
+{
+  Sp         = Sp_ptr;
+  MergerTree = MergerTree_ptr;
+
+  init_field("MTRI", "TreeID", MEM_INT64, FILE_INT64, READ_IF_PRESENT, 1, A_P, &Sp->P[0].TreeID, NULL, ALL_TYPES, 0, 0, 0, 0, 0, 0, 0);
+
+  init_field("MTRL", "ParticleCount", MEM_INT, FILE_INT, READ_IF_PRESENT, 1, A_TT, &MergerTree->TreeTable[0].HaloCount, NULL,
+             TREETABLE, 0, 0, 0, 0, 0, 0, 0);
+  init_field("MTRS", "ParticleFirst", MEM_INT64, FILE_INT64, READ_IF_PRESENT, 1, A_TT, &MergerTree->TreeTable[0].FirstHalo, NULL,
+             TREETABLE, 0, 0, 0, 0, 0, 0, 0);
+  init_field("MTRI", "TreeID", MEM_INT64, FILE_INT64, READ_IF_PRESENT, 1, A_TT, &MergerTree->TreeTable[0].TreeID, NULL, TREETABLE, 0,
+             0, 0, 0, 0, 0, 0);
+}
+#endif
+
+#if defined(LGALAXIES)
+void snap_io::init_extra_lgalaxies(void)
+{
+#ifdef REARRANGE_OPTION
+  init_field("MTRI", "TreeID", MEM_INT64, FILE_INT64, READ_IF_PRESENT, 1, A_P, &Sp->P[0].TreeID, NULL, ALL_TYPES, 0, 0, 0, 0, 0, 0, 0);
+#endif
+
+  init_field("MTRL", "ParticleCount", MEM_INT, FILE_INT, READ_IF_PRESENT, 1, A_CT, &Sp->PartTreeTable[0].ParticleCount, NULL,
+             TREETABLE, 0, 0, 0, 0, 0, 0, 0);
+  init_field("MTRS", "ParticleFirst", MEM_INT64, FILE_INT64, READ_IF_PRESENT, 1, A_CT, &Sp->PartTreeTable[0].ParticleFirst, NULL,
+             TREETABLE, 0, 0, 0, 0, 0, 0, 0);
+  init_field("MTRI", "TreeID", MEM_INT64, FILE_INT64, READ_IF_PRESENT, 1, A_CT, &Sp->PartTreeTable[0].TreeID, NULL, TREETABLE, 0, 0, 0,
+             0, 0, 0, 0);
+}
+
+void snap_io::free_basic_treeinfo(void) { Mem.myfree(ntype_in_files); }
+
+int snap_io::acquire_basic_treeinfo(int num, mysnaptype loc_snap_type)
+{
+  snap_type = loc_snap_type;
+
+  if(snap_type != MOST_BOUND_PARTICLE_SNAPHOT_REORDERED)
+    Terminate("bummmer");
+
+  char fname[3 * MAXLEN_PATH], fname_multiple[3 * MAXLEN_PATH];
+
+  sprintf(fname_multiple, "%s/snapdir_%03d/%s-prevmostboundonly-treeorder_%03d", All.OutputDir, num, All.SnapshotFileBase, num);
+  sprintf(fname, "%s%s-prevmostboundonly-treeorder_%03d", All.OutputDir, All.SnapshotFileBase, num);
+
+  int num_files = find_files(fname, fname_multiple);
+
+  if(num_files > 1)
+    strcpy(fname, fname_multiple);
+
+  alloc_and_read_ntype_in_files(fname, num_files);
+
+  return num_files;
+}
+
+long long snap_io::load_orphans(int num, long long treenr, int num_files)
+{
+  char fname[3 * MAXLEN_PATH];
+
+  if(num_files > 1)
+    sprintf(fname, "%s/snapdir_%03d/%s-prevmostboundonly-treeorder_%03d", All.OutputDir, num, All.SnapshotFileBase, num);
+  else
+    sprintf(fname, "%s%s-prevmostboundonly-treeorder_%03d", All.OutputDir, All.SnapshotFileBase, num);
+
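+  /* first read a single record of the tree table (data group NTYPES) for tree 'treenr'; this record
+     provides the number and the starting index of the orphan particles belonging to that tree */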
+  read_segment(fname, NTYPES, treenr, 1, num_files);
+
+  long long count = Sp->PartTreeTable[0].ParticleCount;
+  long long first = Sp->PartTreeTable[0].ParticleFirst;
+
+  Sp->P       = (particle_data *)Mem.mymalloc_movable(&Sp->P, "Sp->P", count * sizeof(particle_data));
+  Sp->NumPart = 0;
+
+#ifndef OUTPUT_COORDINATES_AS_INTEGERS
+  Ptmp = (ptmp_data *)Mem.mymalloc_movable(&Ptmp, "Ptmp", count * sizeof(ptmp_data));
+#endif
+
+  read_segment(fname, 1, first, count, num_files);
+
+#ifndef OUTPUT_COORDINATES_AS_INTEGERS
+  snap_init_domain_mapping();
+
+  for(int i = 0; i < count; i++)
+    Sp->pos_to_intpos(Ptmp[i].Pos, Sp->P[i].IntPos); /* converts floating point representation to integers */
+
+  Mem.myfree(Ptmp);
+#endif
+
+  if(count > 0 && Sp->NumPart != count)
+    Terminate("Sp->NumPart=%d != count=%d", (int)Sp->NumPart, (int)count);
+
+  return count;
+}
+
+void snap_io::free_orphans(void) { Mem.myfree(Sp->P); }
+
+#endif
+
+void snap_io::read_snapshot(int num, mysnaptype loc_snap_type)
+{
+  snap_type = loc_snap_type;
+
+  char buf[MAXLEN_PATH_EXTRA];
+
+  if(snap_type == MOST_BOUND_PARTICLE_SNAPHOT)
+    {
+      if(All.NumFilesPerSnapshot > 1)
+        sprintf(buf, "%s/snapdir_%03d/%s-prevmostboundonly_%03d", All.OutputDir, num, All.SnapshotFileBase, num);
+      else
+        sprintf(buf, "%s%s-prevmostboundonly_%03d", All.OutputDir, All.SnapshotFileBase, num);
+    }
+  else
+    {
+      if(All.NumFilesPerSnapshot > 1)
+        sprintf(buf, "%s/snapdir_%03d/%s_%03d", All.OutputDir, num, All.SnapshotFileBase, num);
+      else
+        sprintf(buf, "%s%s_%03d", All.OutputDir, All.SnapshotFileBase, num);
+    }
+
+  read_ic(buf);
+}
+
+/*! \brief This function reads initial conditions that are in one of the default file formats
+ * of Gadget.
+ *
+ * Snapshot files can be used as input files.  However, when a
+ * snapshot file is used as input, not all the information in the header is
+ * used: THE STARTING TIME NEEDS TO BE SET IN THE PARAMETER FILE.
+ * Alternatively, the code can be started with All.RestartFlag 2; then snapshots
+ * produced by the code can be used as initial condition files without having to
+ * change the parameter file.  For gas particles, only the internal energy is
+ * read; the density and mean molecular weight will be recomputed by the code.
+ * When InitGasTemp>0 is given, the gas temperature will be initialized to this
+ * value assuming a mean molecular weight corresponding either to complete
+ * neutrality or to full ionization.
+ *
+ * \param fname file name of the ICs
+ */
+void snap_io::read_ic(const char *fname)
+{
+  if(All.ICFormat < 1 || All.ICFormat > 4)
+    Terminate("ICFormat = %d not supported.\n", All.ICFormat);
+
+  TIMER_START(CPU_SNAPSHOT);
+
+  double t0 = Logs.second();
+
+  Sp->TotNumPart = 0;
+
+  int num_files = find_files(fname, fname);
+
+  reset_io_byte_count();
+
+  /* the files are processed in two passes: in the first pass, only the
+   * particle numbers ending up on each processor are assembled, followed by memory allocation;
+   * in the second pass, the data is actually read in.
+   */
+  for(int rep = 0; rep < 2; rep++)
+    {
+      Sp->NumPart = 0;
+      Sp->NumGas  = 0;
+
+      read_files_driver(fname, rep, num_files);
+
+      /* now do the memory allocation */
+      if(rep == 0)
+        {
+          int max_load, max_sphload;
+          MPI_Allreduce(&Sp->NumPart, &max_load, 1, MPI_INT, MPI_MAX, Communicator);
+          MPI_Allreduce(&Sp->NumGas, &max_sphload, 1, MPI_INT, MPI_MAX, Communicator);
+
+#ifdef TILING
+          max_load *= TILING * TILING * TILING;
+          max_sphload *= TILING * TILING * TILING;
+#endif
+
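+          /* dimension the particle storage with some headroom (controlled by ALLOC_TOLERANCE) beyond
+             the largest per-task load, so that there is room for particles to move between tasks later on */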
+          Sp->MaxPart    = max_load / (1.0 - 2 * ALLOC_TOLERANCE);
+          Sp->MaxPartSph = max_sphload / (1.0 - 2 * ALLOC_TOLERANCE);
+
+          Sp->allocate_memory(); /* allocate particle storage */
+
+#ifndef OUTPUT_COORDINATES_AS_INTEGERS
+          Ptmp = (ptmp_data *)Mem.mymalloc_movable(&Ptmp, "Ptmp", Sp->NumPart * sizeof(ptmp_data));
+#endif
+        }
+    }
+
+  MPI_Barrier(Communicator);
+
+  long long byte_count = get_io_byte_count(), byte_count_all;
+  sumup_longs(1, &byte_count, &byte_count_all, Communicator);
+
+  double t1 = Logs.second();
+
+  mpi_printf("READIC: reading done. Took %g sec, total size %g MB, corresponds to effective I/O rate of %g MB/sec\n",
+             Logs.timediff(t0, t1), byte_count_all / (1024.0 * 1024.0), byte_count_all / (1024.0 * 1024.0) / Logs.timediff(t0, t1));
+
+  mpi_printf("\nREADIC: Total number of particles :  %lld\n\n", Sp->TotNumPart);
+
+  snap_init_domain_mapping();
+
+#ifndef OUTPUT_COORDINATES_AS_INTEGERS
+  Mem.myfree(Ptmp);
+#endif
+
+#ifdef TILING
+
+  MyIntPosType halfboxlen = ((MyIntPosType)1) << ((BITS_FOR_POSITIONS - 1));
+
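+  /* len is the integer-coordinate box length divided by TILING (i.e. the side length of one tile);
+     it is computed as two halves so that the full box length 2^BITS_FOR_POSITIONS, which would
+     overflow MyIntPosType, never has to be formed explicitly */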
+  MyIntPosType len = (halfboxlen / TILING) + (halfboxlen / TILING);
+
+  Sp->TotNumGas *= TILING * TILING * TILING;
+  Sp->TotNumPart *= TILING * TILING * TILING;
+
+  int add_numgas = Sp->NumGas * (TILING * TILING * TILING - 1);
+  /* create a gap behind the existing gas particles where we will insert the new gas particles */
+  memmove(static_cast<void *>(Sp->P + Sp->NumGas + add_numgas), static_cast<void *>(Sp->P + Sp->NumGas),
+          (Sp->NumPart - Sp->NumGas) * sizeof(simparticles::pdata));
+
+  int off = 0;
+
+  for(int i = 0; i < TILING; i++)
+    for(int j = 0; j < TILING; j++)
+      for(int k = 0; k < TILING; k++)
+        if(i != 0 || j != 0 || k != 0)
+          for(int n = 0; n < Sp->NumGas; n++)
+            {
+              Sp->P[Sp->NumGas + off] = Sp->P[n];
+
+              Sp->P[Sp->NumGas + off].IntPos[0] += i * len;
+              Sp->P[Sp->NumGas + off].IntPos[1] += j * len;
+              Sp->P[Sp->NumGas + off].IntPos[2] += k * len;
+
+              off++;
+            }
+
+  if(off != add_numgas)
+    Terminate("this should not happen");
+
+  Sp->NumGas += add_numgas;
+  Sp->NumPart += add_numgas;
+
+  int add_numpart = (Sp->NumPart - Sp->NumGas) * (TILING * TILING * TILING - 1);
+
+  off = 0;
+
+  for(int i = 0; i < TILING; i++)
+    for(int j = 0; j < TILING; j++)
+      for(int k = 0; k < TILING; k++)
+        if(i != 0 || j != 0 || k != 0)
+          for(int n = Sp->NumGas; n < Sp->NumPart; n++)
+            {
+              Sp->P[Sp->NumPart + off] = Sp->P[n];
+
+              Sp->P[Sp->NumPart + off].IntPos[0] += i * len;
+              Sp->P[Sp->NumPart + off].IntPos[1] += j * len;
+              Sp->P[Sp->NumPart + off].IntPos[2] += k * len;
+
+              off++;
+            }
+
+  if(off != add_numpart)
+    Terminate("this should not happen");
+
+  Sp->NumPart += add_numpart;
+#endif
+
+  All.FlagICsContainedEntropy = 0;
+
+#ifdef GADGET2_HEADER
+  if(header.flag_entropy_instead_u)
+    All.FlagICsContainedEntropy = 1;
+#endif
+
+  TIMER_STOP(CPU_SNAPSHOT);
+}
+
+/*! This routine initializes the global domain mapping between integer coordinates and real space coordinates.
+ *  If periodic is on, the extent is the box size. Otherwise we look at the maximum extent of the particles.
+ */
+void snap_io::snap_init_domain_mapping(void)
+{
+#ifdef PERIODIC
+
+  Sp->RegionLen     = All.BoxSize;
+  Sp->FacCoordToInt = pow(2.0, BITS_FOR_POSITIONS) / Sp->RegionLen;
+  Sp->FacIntToCoord = Sp->RegionLen / pow(2.0, BITS_FOR_POSITIONS);
+
+#else
+
+  double posmin[3], posmax[3];
+  for(int k = 0; k < 3; k++)
+    {
+      posmin[k] = MAX_REAL_NUMBER;
+      posmax[k] = -MAX_REAL_NUMBER;
+    }
+
+  for(int i = 0; i < Sp->NumPart; i++)
+    for(int k = 0; k < 3; k++)
+      {
+        if(Ptmp[i].Pos[k] < posmin[k])
+          posmin[k] = Ptmp[i].Pos[k];
+
+        if(Ptmp[i].Pos[k] > posmax[k])
+          posmax[k] = Ptmp[i].Pos[k];
+      }
+
+  double xyz[6] = {posmin[0], posmin[1], posmin[2], -posmax[0], -posmax[1], -posmax[2]};
+  double xyz_glob[6];
+
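+  /* a single MPI_MIN reduction delivers both extrema: xyz_glob[0..2] is the global minimum, while
+     the negated entries xyz_glob[3..5] give the global maximum after flipping the sign back */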
+  MPI_Allreduce(xyz, xyz_glob, 6, MPI_DOUBLE, MPI_MIN, Communicator);
+
+  mpi_printf("READIC: Region covered with particles: (%g %g %g) -> (%g %g %g)\n", xyz_glob[0], xyz_glob[1], xyz_glob[2], -xyz_glob[3],
+             -xyz_glob[4], -xyz_glob[5]);
+
+  Sp->RegionLen = 0;
+  for(int j = 0; j < 3; j++)
+    if(-xyz_glob[j + 3] - xyz_glob[j] > Sp->RegionLen)
+      Sp->RegionLen = -xyz_glob[j + 3] - xyz_glob[j];
+
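+  /* the mapped region is enlarged well beyond the current particle extent, presumably so that
+     particles drifting outwards during the run remain representable in integer coordinates */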
+  Sp->RegionLen *= 4.0;
+
+  mpi_printf("READIC: Initial root node size: %g\n", Sp->RegionLen);
+
+  for(int j = 0; j < 3; j++)
+    {
+      Sp->RegionCenter[j] = 0.5 * (xyz_glob[j] - xyz_glob[j + 3]);
+      Sp->RegionCorner[j] = Sp->RegionCenter[j] - 0.5 * Sp->RegionLen;
+    }
+
+  Sp->FacCoordToInt = pow(2.0, BITS_FOR_POSITIONS) / Sp->RegionLen;
+  Sp->FacIntToCoord = Sp->RegionLen / pow(2.0, BITS_FOR_POSITIONS);
+
+#endif
+
+#ifndef OUTPUT_COORDINATES_AS_INTEGERS
+  for(int i = 0; i < Sp->NumPart; i++)
+    Sp->pos_to_intpos(Ptmp[i].Pos, Sp->P[i].IntPos); /* converts floating point representation to integers */
+#endif
+}
+
+int snap_io::get_type_of_element(int index)
+{
+  if(snap_type == NORMAL_SNAPSHOT)
+    {
+      return Sp->P[index].getType();
+    }
+  else if(snap_type == MOST_BOUND_PARTICLE_SNAPHOT)
+    {
+      if(Sp->P[index].ID.is_previously_most_bound())
+        return Sp->P[index].getType();
+      else
+        return -1;  // marks that the particle has a type that is not written out
+    }
+  else if(snap_type == MOST_BOUND_PARTICLE_SNAPHOT_REORDERED)
+    {
+      return Sp->P[index].getType();
+    }
+  else
+    Terminate("can't be");
+}
+
+void snap_io::set_type_of_element(int index, int type)
+{
+  if(type < NTYPES)
+    Sp->P[index].setType(type);
+}
+
+void *snap_io::get_base_address_of_structure(enum arrays array, int index)
+{
+  switch(array)
+    {
+      case A_SPHP:
+        return (void *)(Sp->SphP + index);
+      case A_P:
+        return (void *)(Sp->P + index);
+      case A_PS:
+        return (void *)(Sp->PS + index);
+#ifdef LIGHTCONE_PARTICLES
+      case A_LC:
+        Terminate("a");  // return (void *) (Lp->P + index);
+#endif
+#ifdef LIGHTCONE_MASSMAPS
+      case A_MM:
+        Terminate("b");  // return (void *) (Lp->P + index);
+#endif
+#if defined(REARRANGE_OPTION) && defined(MERGERTREE)
+      case A_TT:
+        return (void *)(MergerTree->TreeTable + index);
+#endif
+#if defined(LGALAXIES)
+      case A_CT:
+        return (void *)(Sp->PartTreeTable + index);
+#endif
+      default:
+        Terminate("we don't expect to get here");
+    }
+
+  return NULL;
+}
+
+/*! \brief Save snapshot to disk
+ *
+ * This function writes a snapshot of the particle distribution to one or
+ * several files. If NumFilesPerSnapshot>1, the snapshot is distributed
+ * into several files, which are written simultaneously. Each file contains
+ * data from a group of processors of size roughly NTask/NumFilesPerSnapshot.
+ * \param num the snapshot number
+ * \param loc_snap_type type of the snapshot dump (normal, most-bound-particles only, or tree-reordered most-bound-particles)
+ */
+void snap_io::write_snapshot(int num, mysnaptype loc_snap_type)
+{
+  snap_type = loc_snap_type;
+
+#ifdef DO_NOT_PRODUCE_BIG_OUTPUT
+  if(snap_type != MOST_BOUND_PARTICLE_SNAPHOT)
+    {
+      mpi_printf("\nSNAPSHOT: We skip writing snapshot file #%d @ time %g\n", num, All.Time);
+      return;
+    }
+#endif
+
+  TIMER_START(CPU_SNAPSHOT);
+
+  mpi_printf("\nSNAPSHOT: writing snapshot file #%d @ time %g ... \n", num, All.Time);
+
+  double t0 = Logs.second();
+  reset_io_byte_count();
+
+  All.set_cosmo_factors_for_current_time();
+
+  int n_type[NTYPES];
+
+  /* determine global and local particle numbers */
+  for(int n = 0; n < NTYPES; n++)
+    n_type[n] = 0;
+
+  for(int n = 0; n < Sp->NumPart; n++)
+    {
+      int type = get_type_of_element(n);
+      if(type >= 0)
+        n_type[type]++;
+    }
+
+  sumup_large_ints(NTYPES, n_type, ntot_type_all, Communicator);
+
+  if(All.NumFilesPerSnapshot > 1)
+    {
+      if(ThisTask == 0)
+        {
+          char buf[MAXLEN_PATH_EXTRA];
+          sprintf(buf, "%s/snapdir_%03d", All.OutputDir, num);
+          mkdir(buf, 02755);
+        }
+      MPI_Barrier(Communicator);
+    }
+
+  char buf[MAXLEN_PATH_EXTRA];
+  if(All.NumFilesPerSnapshot > 1)
+    sprintf(buf, "%s/snapdir_%03d/%s_%03d", All.OutputDir, num, All.SnapshotFileBase, num);
+  else
+    sprintf(buf, "%s%s_%03d", All.OutputDir, All.SnapshotFileBase, num);
+
+  if(snap_type == MOST_BOUND_PARTICLE_SNAPHOT)
+    {
+      if(All.NumFilesPerSnapshot > 1)
+        sprintf(buf, "%s/snapdir_%03d/%s-prevmostboundonly_%03d", All.OutputDir, num, All.SnapshotFileBase, num);
+      else
+        sprintf(buf, "%s%s-prevmostboundonly_%03d", All.OutputDir, All.SnapshotFileBase, num);
+    }
+  else if(snap_type == MOST_BOUND_PARTICLE_SNAPHOT_REORDERED)
+    {
+      if(All.NumFilesPerSnapshot > 1)
+        sprintf(buf, "%s/snapdir_%03d/%s-prevmostboundonly-treeorder_%03d", All.OutputDir, num, All.SnapshotFileBase, num);
+      else
+        sprintf(buf, "%s%s-prevmostboundonly-treeorder_%03d", All.OutputDir, All.SnapshotFileBase, num);
+    }
+
+  /* now write the files */
+  write_multiple_files(buf, All.NumFilesPerSnapshot);
+
+  long long byte_count = get_io_byte_count(), byte_count_all;
+  sumup_longs(1, &byte_count, &byte_count_all, Communicator);
+
+  double t1 = Logs.second();
+
+  mpi_printf("SNAPSHOT: done with writing snapshot.  Took %g sec, total size %g MB, corresponds to effective I/O rate of %g MB/sec\n",
+             Logs.timediff(t0, t1), byte_count_all / (1024.0 * 1024.0), byte_count_all / (1024.0 * 1024.0) / Logs.timediff(t0, t1));
+
+  All.Ti_lastoutput = All.Ti_Current;
+
+  TIMER_STOP(CPU_SNAPSHOT);
+}
+
+void snap_io::fill_file_header(int writeTask, int lastTask, long long *n_type, long long *ntot_type)
+{
+  /* determine global and local particle numbers */
+  for(int n = 0; n < NTYPES + 1; n++)
+    n_type[n] = 0;
+
+  for(int n = 0; n < Sp->NumPart; n++)
+    {
+      int type = get_type_of_element(n);
+      if(type >= 0)
+        n_type[type]++;
+    }
+
+#if defined(REARRANGE_OPTION) && defined(MERGERTREE)
+  if(snap_type == MOST_BOUND_PARTICLE_SNAPHOT_REORDERED)
+    n_type[NTYPES] = MergerTree->Ntrees;
+#endif
+
+  /* determine particle numbers of each type in file */
+  if(ThisTask == writeTask)
+    {
+      for(int n = 0; n < NTYPES + 1; n++)
+        ntot_type[n] = n_type[n];
+
+      for(int task = writeTask + 1; task <= lastTask; task++)
+        {
+          long long nn[NTYPES + 1];
+          MPI_Recv(&nn[0], NTYPES + 1, MPI_LONG_LONG, task, TAG_LOCALN, Communicator, MPI_STATUS_IGNORE);
+          for(int n = 0; n < NTYPES + 1; n++)
+            ntot_type[n] += nn[n];
+        }
+
+      for(int task = writeTask + 1; task <= lastTask; task++)
+        MPI_Send(&ntot_type[0], NTYPES + 1, MPI_LONG_LONG, task, TAG_N, Communicator);
+    }
+  else
+    {
+      MPI_Send(&n_type[0], NTYPES + 1, MPI_LONG_LONG, writeTask, TAG_LOCALN, Communicator);
+      MPI_Recv(&ntot_type[0], NTYPES + 1, MPI_LONG_LONG, writeTask, TAG_N, Communicator, MPI_STATUS_IGNORE);
+    }
+
+  /* fill file header */
+
+  for(int n = 0; n < NTYPES; n++)
+    {
+      header.npart[n]      = ntot_type[n];
+      header.npartTotal[n] = ntot_type_all[n];
+    }
+
+#ifdef MERGERTREE
+  if(snap_type == MOST_BOUND_PARTICLE_SNAPHOT_REORDERED)
+    {
+      header.Ntrees = ntot_type[NTYPES];
+#if defined(REARRANGE_OPTION) && defined(MERGERTREE)
+      header.TotNtrees = MergerTree->TotNtrees;
+#endif
+    }
+  else
+    {
+      header.Ntrees    = 0;
+      header.TotNtrees = 0;
+    }
+#endif
+
+  for(int n = 0; n < NTYPES; n++)
+    header.mass[n] = All.MassTable[n];
+
+  header.time = All.Time;
+
+  if(All.ComovingIntegrationOn)
+    header.redshift = 1.0 / All.Time - 1;
+  else
+    header.redshift = 0;
+
+  header.num_files = All.NumFilesPerSnapshot;
+  header.BoxSize   = All.BoxSize;
+
+#ifdef GADGET2_HEADER
+  for(int n = 0; n < NTYPES; n++)
+    header.npartTotalLowWord[n] = ntot_type_all[n];
+
+  header.flag_sfr      = 0;
+  header.flag_feedback = 0;
+  header.flag_cooling  = 0;
+
+#ifdef COOLING
+  header.flag_cooling = 1;
+#endif
+
+#ifdef STARFORMATION
+  header.flag_sfr      = 1;
+  header.flag_feedback = 1;
+#endif
+  header.Omega0      = All.Omega0;
+  header.OmegaLambda = All.OmegaLambda;
+
+#ifdef OUTPUT_IN_DOUBLEPRECISION
+  header.flag_doubleprecision = 1;
+#else
+  header.flag_doubleprecision = 0;
+#endif
+#endif
+}
+
+void snap_io::read_file_header(const char *fname, int filenr, int readTask, int lastTask, long long *n_type, long long *ntot_type,
+                               int *nstart)
+{
+  n_type[NTYPES]    = 0;
+  ntot_type[NTYPES] = 0;
+
+#ifdef GADGET2_HEADER
+  for(int i = 0; i < NTYPES_HEADER; i++)
+    if(header.npartTotalLowWord[i] > 0)
+      header.npartTotal[i] = header.npartTotalLowWord[i];  // + (((long long)header.npartTotalHighWord[i]) << 32);
+#endif
+
+  if(Sp->TotNumPart == 0)
+    {
+      if(header.num_files <= 1)
+        for(int i = 0; i < NTYPES; i++)
+          header.npartTotal[i] = header.npart[i];
+
+      Sp->TotNumGas  = header.npartTotal[0];
+      Sp->TotNumPart = 0;
+
+      for(int i = 0; i < NTYPES; i++)
+        Sp->TotNumPart += header.npartTotal[i];
+
+#ifdef GADGET2_HEADER
+#if defined(SECOND_ORDER_LPT_ICS)
+      if(header.flag_ic_info == FLAG_SECOND_ORDER_ICS)
+        {
+          mpi_printf("READIC:  Second order ICs detected. Will complete them first before starting run.\n");
+          All.LptScalingfactor = header.lpt_scalingfactor;
+        }
+      else
+        Terminate("READIC:  No second order ICs detected even though you activated SECOND_ORDER_LPT_ICS.\n");
+#else
+      if(header.flag_ic_info == FLAG_SECOND_ORDER_ICS)
+        Terminate("Detected second order ICs but SECOND_ORDER_LPT_ICS is not enabled.\n");
+#endif
+#endif
+
+#ifdef GENERATE_GAS_IN_ICS
+      if(All.RestartFlag == RST_BEGIN)
+        {
+#ifdef SPLIT_PARTICLE_TYPE
+          for(int i = 0; i < NTYPES; i++)
+            if((1 << i) & (SPLIT_PARTICLE_TYPE))
+              {
+                Sp->TotNumGas += header.npartTotal[i];
+                Sp->TotNumPart += header.npartTotal[i];
+              }
+#else
+          Sp->TotNumGas += header.npartTotal[1];
+          Sp->TotNumPart += header.npartTotal[1];
+#endif
+        }
+#endif
+
+      for(int i = 0; i < NTYPES; i++)
+        All.MassTable[i] = header.mass[i];
+
+      if(All.RestartFlag == RST_BEGIN || All.RestartFlag == RST_RESUME || All.RestartFlag == RST_CREATEICS)
+        All.Time = All.TimeBegin;
+      else
+        All.Time = All.TimeBegin = header.time;
+
+      All.set_cosmo_factors_for_current_time();
+    }
+
+  if(ThisTask == readTask)
+    {
+      if(filenr == 0 && nstart == NULL)
+        {
+          mpi_printf(
+              "\nREADIC: filenr=%d, '%s' contains:\n"
+              "READIC: Type 0 (gas):   %8lld  (tot=%15lld) masstab= %g\n",
+              filenr, fname, (long long)header.npart[0], (long long)header.npartTotal[0], All.MassTable[0]);
+
+          for(int type = 1; type < NTYPES; type++)
+            {
+              mpi_printf("READIC: Type %d:         %8lld  (tot=%15lld) masstab= %g\n", type, (long long)header.npart[type],
+                         (long long)header.npartTotal[type], All.MassTable[type]);
+            }
+          mpi_printf("\n");
+        }
+    }
+
+  /* to collect the gas particles all at the beginning (in case several
+     snapshot files are read on the current CPU) we move the collisionless
+     particles such that a gap of the right size is created */
+
+  long long nall = 0;
+  for(int type = 0; type < NTYPES; type++)
+    {
+      ntot_type[type] = header.npart[type];
+
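+      /* distribute the particles of this type in the file as evenly as possible over the reading
+         tasks; the first (n_in_file % ntask) tasks receive one extra particle */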
+      long long n_in_file = header.npart[type];
+      int ntask           = lastTask - readTask + 1;
+      int n_for_this_task = n_in_file / ntask;
+      if((ThisTask - readTask) < (n_in_file % ntask))
+        n_for_this_task++;
+
+      n_type[type] = n_for_this_task;
+
+      nall += n_for_this_task;
+    }
+
+#ifdef MERGERTREE
+  if(snap_type == MOST_BOUND_PARTICLE_SNAPHOT_REORDERED)
+    ntot_type[NTYPES] = header.Ntrees;
+#endif
+
+  if(nstart)
+    {
+      memmove(static_cast<void *>(&Sp->P[Sp->NumGas + nall]), static_cast<void *>(&Sp->P[Sp->NumGas]),
+              (Sp->NumPart - Sp->NumGas) * sizeof(particle_data));
+#ifndef OUTPUT_COORDINATES_AS_INTEGERS
+      memmove(&Ptmp[Sp->NumGas + nall], &Ptmp[Sp->NumGas], (Sp->NumPart - Sp->NumGas) * sizeof(ptmp_data));
+#endif
+      *nstart = Sp->NumGas;
+    }
+}
+
+/*! \brief Write the fields contained in the header group of the HDF5 snapshot file
+ *
+ *  This function stores the fields of the structure io_header as attributes belonging
+ *  to the header group of the HDF5 file.
+ *
+ *  \param handle contains a reference to the header group
+ */
+void snap_io::write_header_fields(hid_t handle)
+{
+#ifdef GADGET2_HEADER
+  write_vector_attribute(handle, "NumPart_ThisFile", header.npart, H5T_NATIVE_UINT, NTYPES);
+#else
+  write_vector_attribute(handle, "NumPart_ThisFile", header.npart, H5T_NATIVE_UINT64, NTYPES);
+#endif
+  write_vector_attribute(handle, "NumPart_Total", header.npartTotal, H5T_NATIVE_UINT64, NTYPES);
+  write_vector_attribute(handle, "MassTable", header.mass, H5T_NATIVE_DOUBLE, NTYPES);
+  write_scalar_attribute(handle, "Time", &header.time, H5T_NATIVE_DOUBLE);
+  write_scalar_attribute(handle, "Redshift", &header.redshift, H5T_NATIVE_DOUBLE);
+  write_scalar_attribute(handle, "BoxSize", &header.BoxSize, H5T_NATIVE_DOUBLE);
+  write_scalar_attribute(handle, "NumFilesPerSnapshot", &header.num_files, H5T_NATIVE_INT);
+  write_string_attribute(handle, "Git_commit", GIT_COMMIT);
+  write_string_attribute(handle, "Git_date", GIT_DATE);
+
+#if defined(REARRANGE_OPTION) && defined(MERGERTREE)
+  if(snap_type == MOST_BOUND_PARTICLE_SNAPHOT_REORDERED)
+    {
+      write_scalar_attribute(handle, "Ntrees_ThisFile", &header.Ntrees, H5T_NATIVE_UINT64);
+      write_scalar_attribute(handle, "Ntrees_Total", &header.TotNtrees, H5T_NATIVE_UINT64);
+    }
+#endif
+}
+
+/*! \brief This function reads the snapshot header in case of hdf5 files (i.e. format 3)
+ *
+ * \param fname file name of the snapshot as given in the parameter file
+ */
+void snap_io::read_header_fields(const char *fname)
+{
+  for(int i = 0; i < NTYPES; i++)
+    {
+      header.npart[i]      = 0;
+      header.npartTotal[i] = 0;
+      header.mass[i]       = 0;
+    }
+
+  int ntypes = NTYPES;
+
+  hid_t hdf5_file = my_H5Fopen(fname, H5F_ACC_RDONLY, H5P_DEFAULT);
+  hid_t handle    = my_H5Gopen(hdf5_file, "/Header");
+
+  /* check if the file in question actually has this number of types */
+  hid_t hdf5_attribute = my_H5Aopen_name(handle, "NumPart_ThisFile");
+  hid_t space          = H5Aget_space(hdf5_attribute);
+  hsize_t dims, len;
+  H5Sget_simple_extent_dims(space, &dims, &len);
+  H5Sclose(space);
+  if(len != (size_t)ntypes)
+    Terminate("Length of NumPart_ThisFile attribute (%d) does not match NTYPES(ICS) (%d)", (int)len, ntypes);
+  my_H5Aclose(hdf5_attribute, "NumPart_ThisFile");
+
+  /* now read the header fields */
+
+#ifdef GADGET2_HEADER
+  read_vector_attribute(handle, "NumPart_ThisFile", header.npart, H5T_NATIVE_UINT, ntypes);
+#else
+  read_vector_attribute(handle, "NumPart_ThisFile", header.npart, H5T_NATIVE_UINT64, ntypes);
+#endif
+
+  read_vector_attribute(handle, "NumPart_Total", header.npartTotal, H5T_NATIVE_UINT64, ntypes);
+
+#ifdef MERGERTREE
+  if(snap_type == MOST_BOUND_PARTICLE_SNAPHOT_REORDERED)
+    {
+      read_scalar_attribute(handle, "Ntrees_ThisFile", &header.Ntrees, H5T_NATIVE_UINT64);
+      read_scalar_attribute(handle, "Ntrees_Total", &header.TotNtrees, H5T_NATIVE_UINT64);
+    }
+#endif
+
+  read_vector_attribute(handle, "MassTable", header.mass, H5T_NATIVE_DOUBLE, ntypes);
+  read_scalar_attribute(handle, "Time", &header.time, H5T_NATIVE_DOUBLE);
+  read_scalar_attribute(handle, "Redshift", &header.redshift, H5T_NATIVE_DOUBLE);
+  read_scalar_attribute(handle, "BoxSize", &header.BoxSize, H5T_NATIVE_DOUBLE);
+  read_scalar_attribute(handle, "NumFilesPerSnapshot", &header.num_files, H5T_NATIVE_INT);
+
+  my_H5Gclose(handle, "/Header");
+  my_H5Fclose(hdf5_file, fname);
+}
+
+int snap_io::get_filenr_from_header(void) { return header.num_files; }
+
+void snap_io::set_filenr_in_header(int numfiles) { header.num_files = numfiles; }
+
+void snap_io::read_increase_numbers(int type, int n_for_this_task)
+{
+  if(type < NTYPES)
+    Sp->NumPart += n_for_this_task;
+
+  if(type == 0)
+    Sp->NumGas += n_for_this_task;
+}
+
+void snap_io::get_datagroup_name(int type, char *buf)
+{
+  if(type < NTYPES)
+    sprintf(buf, "/PartType%d", type);
+  else if(type == NTYPES)
+    sprintf(buf, "/TreeTable");
+  else
+    Terminate("wrong group");
+}
diff --git a/src/io/snap_io.h b/src/io/snap_io.h
new file mode 100644
index 0000000000000000000000000000000000000000..4c210ea973f02b40a8e0f1b60ca35ad4f30fba19
--- /dev/null
+++ b/src/io/snap_io.h
@@ -0,0 +1,399 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file snap_io.h
+ *
+ *  \brief declares class used for I/O of snapshot files
+ */
+
+#ifndef SNAP_READ_WRITE_H
+#define SNAP_READ_WRITE_H
+
+#include "gadgetconfig.h"
+
+#include "../data/intposconvert.h"
+#include "../data/simparticles.h"
+#include "../io/io.h"
+#include "../lgalaxies/lgalaxies.h"
+#include "../mergertree/mergertree.h"
+
+#ifdef LGALAXIES
+class lgalaxies;
+#endif
+
+class snap_io : public IO_Def
+{
+ public:
+  void init_basic(simparticles *Sp_ptr);
+
+  snap_io(simparticles *Sp_ptr, MPI_Comm comm, int format) : IO_Def(comm, format) { init_basic(Sp_ptr); }
+
+#if defined(REARRANGE_OPTION) && defined(MERGERTREE)
+  void init_extra(simparticles *Sp_ptr, mergertree *MergerTree_ptr);
+  snap_io(simparticles *Sp_ptr, mergertree *MergerTree_ptr, MPI_Comm comm, int format) : IO_Def(comm, format)
+  {
+    init_basic(Sp_ptr);
+    init_extra(Sp_ptr, MergerTree_ptr);
+  }
+#endif
+
+#if defined(LGALAXIES)
+  void init_extra_lgalaxies(void);
+  snap_io(simparticles *Sp_ptr, MPI_Comm comm, int format, bool lgalaxies) : IO_Def(comm, format)
+  {
+    init_basic(Sp_ptr);
+    init_extra_lgalaxies();
+  }
+#endif
+
+  void write_snapshot(int num, mysnaptype snap_type);
+  void read_snapshot(int num, mysnaptype snap_type);
+
+  int acquire_basic_treeinfo(int num, mysnaptype loc_snap_type);
+  void free_basic_treeinfo(void);
+  long long load_orphans(int num, long long treenr, int num_files);
+  void free_orphans(void);
+
+  void read_ic(const char *fname);
+
+  /* supplied virtual functions */
+  void fill_file_header(int writeTask, int lastTask, long long *nloc_part, long long *npart);
+  void read_file_header(const char *fname, int filenr, int readTask, int lastTask, long long *nloc_part, long long *npart,
+                        int *nstart);
+  void get_datagroup_name(int grnr, char *gname);
+  void write_header_fields(hid_t);
+  void read_header_fields(const char *fname);
+  void read_increase_numbers(int type, int n_for_this_task);
+  int get_filenr_from_header(void);
+  void set_filenr_in_header(int);
+  void *get_base_address_of_structure(enum arrays array, int index);
+  int get_type_of_element(int index);
+  void set_type_of_element(int index, int type);
+
+  /** Header for the standard file format.
+   */
+#ifdef GADGET2_HEADER
+
+  struct io_header
+  {
+    int npart[NTYPES_HEADER];                      /**< number of particles of each type in this file */
+    double mass[NTYPES_HEADER];                    /**< mass of particles of each type. If 0, then the masses are explicitly
+                                                          stored in the mass-block of the snapshot file, otherwise they are omitted */
+    double time;                                   /**< time of snapshot file */
+    double redshift;                               /**< redshift of snapshot file */
+    int flag_sfr;                                  /**< flags whether the simulation was including star formation */
+    int flag_feedback;                             /**< flags whether feedback was included (obsolete) */
+    unsigned int npartTotalLowWord[NTYPES_HEADER]; /**< total number of particles of each type in this snapshot. This can be
+                                       different from npart if one is dealing with a multi-file snapshot. */
+    int flag_cooling;                              /**< flags whether cooling was included  */
+    int num_files;                                 /**< number of files in multi-file snapshot */
+    double BoxSize;                                /**< box-size of simulation in case periodic boundaries were used */
+    double Omega0;                                 /**< matter density in units of critical density */
+    double OmegaLambda;                            /**< cosmological constant parameter */
+    long long Ntrees;                              // this replaces the storage space for HubbleParam
+    long long TotNtrees;                           // this replaces the storage space for Hubble
+    //  double HubbleParam;                             /**< little 'h' to scale units systems */
+    //   double Hubble;                                  /**< Hubble constant in internal units */
+    unsigned int npartTotalHighWord[NTYPES_HEADER]; /**< High word of the total number of particles of each type */
+    int flag_entropy_instead_u;                     /**< flags that IC-file contains entropy instead of u */
+    int flag_doubleprecision;                       /**< flags that snapshot contains double-precision instead of single precision */
+    int flag_ic_info;        /*!< flag to inform whether IC files were generated with the ordinary Zeldovich approximation,
+                                    or whether they contain 2nd order Lagrangian perturbation theory initial conditions.
+                                    For snapshot files, the value informs whether the simulation was evolved from
+                                    Zeldovich or 2LPT ICs. Encoding is as follows:
+                                      FLAG_ZELDOVICH_ICS     (1)   - IC file based on Zeldovich
+                                      FLAG_SECOND_ORDER_ICS  (2)   - Special IC-file containing 2LPT masses
+                                     All other values, including 0, are interpreted as "don't know" for backwards compatibility.
+                                */
+    float lpt_scalingfactor; /*!< scaling factor for 2lpt initial conditions */
+
+    long long npartTotal[NTYPES_HEADER]; /**< fills to 256 bytes, and for compatibility with Gadget2/3 */
+  };
+
+#else
+  /* new simplified header format */
+  struct io_header
+  {
+    long long npart[NTYPES_HEADER];      /**< number of particles of each type in this file */
+    long long npartTotal[NTYPES_HEADER]; /**< total number of particles of each type in this snapshot. This can be
+                                           different from npart if one is dealing with a multi-file snapshot. */
+    double mass[NTYPES_HEADER];          /**< mass of particles of each type. If 0, then the masses are explicitly
+                                                stored in the mass-block of the snapshot file, otherwise they are omitted */
+    double time;                         /**< time of snapshot file */
+    double redshift;                     /**< redshift of snapshot file */
+    double BoxSize;                      /**< box-size of simulation in case periodic boundaries were used */
+    int num_files;                       /**< number of files in multi-file snapshot */
+
+    long long Ntrees;
+    long long TotNtrees;
+  };
+
+#endif
+
+  io_header header; /**< holds header for snapshot files */
+
+ private:
+  simparticles *Sp;
+#ifdef MERGERTREE
+  mergertree *MergerTree;
+#endif
+#ifdef LGALAXIES
+  lgalaxies *LGalaxies;
+#endif
+
+  mysnaptype snap_type;
+
+  long long ntot_type_all[NTYPES]; /**< contains the global number of particles of each type in the snapshot file */
+
+#ifndef OUTPUT_COORDINATES_AS_INTEGERS
+  struct ptmp_data
+  {
+    MyDouble Pos[3];
+  };
+  ptmp_data *Ptmp;
+#endif
+
+  void snap_init_domain_mapping(void);
+  void read_increase_particle_numbers(int type, int n_for_this_task);
+
+  /*
+   * special input/output functions for certain fields
+   */
+#ifndef OUTPUT_COORDINATES_AS_INTEGERS
+  static void io_func_pos(IO_Def *ptr, int particle, int components, void *buffer, int mode)
+  {
+    /* note: we know that components==3 here */
+    snap_io *thisobj = (snap_io *)ptr;
+
+    if(mode == 0)
+      {
+        MyDouble *out_buffer = (MyDouble *)buffer;
+
+        /* converts the integer coordinates to floating point */
+        thisobj->Sp->intpos_to_pos(thisobj->Sp->P[particle].IntPos, out_buffer);
+      }
+    else
+      {
+        MyDouble *in_buffer = (MyDouble *)buffer;
+
+        /* note: for non-periodic boxes, the conversion to integer coordinates is only defined once the full set of
+         * initial positions has been read (the region extent must be known first). We therefore store the
+         * coordinates first in a temporary array */
+
+        for(int k = 0; k < 3; k++)
+          thisobj->Ptmp[particle].Pos[k] = in_buffer[k];
+
+#ifdef SQUASH_TEST
+        thisobj->Ptmp[particle].Pos[1] *= 1.0 / 4;
+        thisobj->Ptmp[particle].Pos[2] *= 1.0 / 16;
+#endif
+      }
+  }
+#endif
+
+  static void io_func_intpos(IO_Def *ptr, int particle, int components, void *buffer, int mode)
+  {
+    /* note: we know that components==3 here */
+    snap_io *thisobj = (snap_io *)ptr;
+
+    if(mode == 0)
+      {
+        MyIntPosType *out_buffer = (MyIntPosType *)buffer;
+
+        /* converts the internal integer coordinates to output integer coordinates by subtracting a possible randomization shift */
+        thisobj->Sp->intpos_to_intpos(thisobj->Sp->P[particle].IntPos, out_buffer);
+      }
+    else
+      {
+        MyIntPosType *in_buffer = (MyIntPosType *)buffer;
+
+        for(int k = 0; k < 3; k++)
+          thisobj->Sp->P[particle].IntPos[k] = in_buffer[k];
+      }
+  }
+
+  static void io_func_vel(IO_Def *ptr, int particle, int components, void *buffer, int mode)
+  {
+    snap_io *thisobj = (snap_io *)ptr;
+
+    if(mode == 0)
+      {
+        MyFloat *out_buffer = (MyFloat *)buffer;
+        for(int k = 0; k < 3; k++)
+          {
+            out_buffer[k] = thisobj->Sp->P[particle].Vel[k];
+
+            /* we are using p = a^2 * xdot internally as velocity unit. Convert to legacy Gadget velocity units */
+            out_buffer[k] *= sqrt(All.cf_a3inv);
+          }
+      }
+    else
+      {
+        MyFloat *in_buffer = (MyFloat *)buffer;
+        for(int k = 0; k < components; k++)
+          {
+            thisobj->Sp->P[particle].Vel[k] = in_buffer[k];
+          }
+      }
+  }
+
+  static void io_func_id(IO_Def *ptr, int particle, int components, void *buffer, int mode)
+  {
+    snap_io *thisobj = (snap_io *)ptr;
+
+    if(mode == 0)
+      {
+        MyIDType *out_buffer = (MyIDType *)buffer;
+        out_buffer[0]        = thisobj->Sp->P[particle].ID.get();
+      }
+    else
+      {
+        MyIDType *in_buffer = (MyIDType *)buffer;
+        thisobj->Sp->P[particle].ID.set(in_buffer[0]);
+      }
+  }
+
+  static void io_func_mass(IO_Def *ptr, int particle, int components, void *buffer, int mode)
+  {
+    snap_io *thisobj = (snap_io *)ptr;
+
+    if(mode == 0)
+      {
+        MyDouble *out_buffer = (MyDouble *)buffer;
+        out_buffer[0]        = thisobj->Sp->P[particle].getMass();
+      }
+    else
+      {
+        MyDouble *in_buffer = (MyDouble *)buffer;
+        thisobj->Sp->P[particle].setMass(in_buffer[0]);
+      }
+  }
+
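+  /* I/O function for the InternalEnergy block: on output, the thermal energy per unit mass is
+   * computed from the stored entropy; on input, the value read is parked in SphP[].Entropy and
+   * is presumably converted to a proper entropy later during initialization (the usual Gadget
+   * convention), depending on whether the ICs contained entropy or internal energy. */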
+  static void io_func_u(IO_Def *ptr, int particle, int components, void *buffer, int mode)
+  {
+    snap_io *thisobj = (snap_io *)ptr;
+
+    if(mode == 0)
+      {
+        MyFloat *out_buffer = (MyFloat *)buffer;
+        out_buffer[0]       = thisobj->Sp->get_utherm_from_entropy(particle);
+      }
+    else
+      {
+        MyFloat *in_buffer                  = (MyFloat *)buffer;
+        thisobj->Sp->SphP[particle].Entropy = in_buffer[0];
+      }
+  }
+
+#ifdef STARFORMATION
+  static void io_func_sfr(IO_Def *ptr, int particle, int components, void *buffer, int mode)
+  {
+    snap_io *thisobj = (snap_io *)ptr;
+
+    if(mode == 0)
+      {
+        MyFloat *out_buffer = (MyFloat *)buffer;
+        out_buffer[0]       = thisobj->Sp->SphP[particle].Sfr;
+      }
+    else
+      {
+        MyFloat *in_buffer              = (MyFloat *)buffer;
+        thisobj->Sp->SphP[particle].Sfr = in_buffer[0];
+      }
+  }
+
+  static void io_func_metallicity(IO_Def *ptr, int particle, int components, void *buffer, int mode)
+  {
+    snap_io *thisobj = (snap_io *)ptr;
+
+    if(mode == 0)
+      {
+        MyFloat *out_buffer = (MyFloat *)buffer;
+        if(thisobj->Sp->P[particle].getType() == 0)
+          {
+            out_buffer[0] = thisobj->Sp->SphP[particle].Metallicity;
+          }
+        else
+          {
+            out_buffer[0] = thisobj->Sp->P[particle].Metallicity;
+          }
+      }
+    else
+      {
+        MyFloat *in_buffer                   = (MyFloat *)buffer;
+        thisobj->Sp->P[particle].Metallicity = in_buffer[0];
+      }
+  }
+#endif
+
+#ifdef OUTPUT_ACCELERATION
+  static void io_func_accel(IO_Def *ptr, int particle, int components, void *buffer, int mode)
+  {
+    snap_io *thisobj = (snap_io *)ptr;
+
+    if(mode == 0)  // writing
+      {
+        MyFloat *out_buffer = (MyFloat *)buffer;
+        if(All.RestartFlag != RST_CONVERTSNAP)
+          for(int k = 0; k < 3; k++)
+            out_buffer[k] = All.cf_a2inv * thisobj->Sp->P[particle].GravAccel[k];
+        else
+          for(int k = 0; k < 3; k++)
+            out_buffer[k] = thisobj->Sp->P[particle].GravAccel[k];
+#if defined(PMGRID) && defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+        if(All.RestartFlag != RST_CONVERTSNAP)
+          for(int k = 0; k < 3; k++)
+            out_buffer[k] += All.cf_a2inv * thisobj->Sp->P[particle].GravPM[k];
+        else
+          for(int k = 0; k < 3; k++)
+            out_buffer[k] += thisobj->Sp->P[particle].GravPM[k];
+#endif
+
+        for(int k = 0; k < 3; k++)
+          out_buffer[k] /= All.accel_normalize_fac;
+      }
+    else  // reading
+      {
+        MyFloat *in_buffer = (MyFloat *)buffer;
+        for(int k = 0; k < components; k++)
+          thisobj->Sp->P[particle].GravAccel[k] = All.accel_normalize_fac * in_buffer[k];
+      }
+  }
+
+#endif
+
+#ifdef OUTPUT_PRESSURE
+  static void io_func_pressure(IO_Def *ptr, int particle, int components, void *buffer, int mode)
+  {
+    snap_io *thisobj    = (snap_io *)ptr;
+    MyFloat *out_buffer = (MyFloat *)buffer;
+
+    out_buffer[0] = thisobj->Sp->SphP[particle].get_pressure();
+  }
+#endif
+
+#ifdef OUTPUT_TIMESTEP
+  static void io_func_timestep(IO_Def *ptr, int particle, int components, void *buffer, int mode)
+  {
+    snap_io *thisobj    = (snap_io *)ptr;
+    MyFloat *out_buffer = (MyFloat *)buffer;
+
+    out_buffer[0] = (thisobj->Sp->P[particle].TimeBinGrav ? (((integertime)1) << thisobj->Sp->P[particle].TimeBinGrav) : 0) *
+                    All.Timebase_interval;
+  }
+
+  static void io_func_timestephydro(IO_Def *ptr, int particle, int components, void *buffer, int mode)
+  {
+    snap_io *thisobj    = (snap_io *)ptr;
+    MyFloat *out_buffer = (MyFloat *)buffer;
+
+    out_buffer[0] =
+        (thisobj->Sp->P[particle].getTimeBinHydro() ? (((integertime)1) << thisobj->Sp->P[particle].getTimeBinHydro()) : 0) *
+        All.Timebase_interval;
+  }
+#endif
+};
+
+#endif /* SNAP_READ_WRITE_H */
diff --git a/src/io/test_io_bandwidth.cc b/src/io/test_io_bandwidth.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e940a8b536ba76005c6e8ca8d1a74a840362a0c8
--- /dev/null
+++ b/src/io/test_io_bandwidth.cc
@@ -0,0 +1,229 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file test_io_bandwidth.cc
+ *
+ * \brief test routines to identify the optimum setting of MaxFilesWithConcurrentIO on a given machine
+ */
+
+#include "gadgetconfig.h"
+
+#include <algorithm>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../io/io.h"
+#include "../io/test_io_bandwidth.h"
+#include "../lightcone/lightcone.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../ngbtree/ngbtree.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+
+void test_io_bandwidth::measure_io_bandwidth(void)
+{
+  /* create directory for test data */
+  if(ThisTask == 0)
+    {
+      char buf[MAXLEN_PATH_EXTRA];
+      sprintf(buf, "%s/testdata", All.OutputDir);
+      mkdir(buf, 02755);
+    }
+  MPI_Barrier(Communicator);
+
+  All.MaxFilesWithConcurrentIO = NTask;
+
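+  /* scan candidate settings by successively halving the number of files that may be accessed concurrently
+   * (NTask, NTask/2, ..., down to 2); the effective bandwidth reported for each setting can then be used to
+   * pick a good value of MaxFilesWithConcurrentIO for this machine */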
+  while(All.MaxFilesWithConcurrentIO > 1)
+    {
+      write_test_data();
+
+      All.MaxFilesWithConcurrentIO /= 2;
+    }
+
+  mpi_printf("\n\nTEST: Completed.\n");
+
+  fflush(stdout);
+}
+
+void test_io_bandwidth::write_test_data(void)
+{
+  double t0 = Logs.second();
+  reset_io_byte_count();
+
+  mpi_printf("TEST: Writing test data...\n");
+
+  /* now carry out the I/O for the files, controlled by a scheduler that aims for optimum I/O bandwidth under the constraint of a
+   * maximum number of files being accessed concurrently */
+  work_files(MODUS_WRITE);
+
+  long long byte_count = get_io_byte_count(), byte_count_all;
+  sumup_longs(1, &byte_count, &byte_count_all, Communicator);
+
+  double t1 = Logs.second();
+
+  mpi_printf(
+      "TEST: done.  MaxFilesWithConcurrentIO=%6d   load/save took %g sec, total size %g MB, corresponds to effective I/O rate of %g "
+      "MB/sec\n",
+      All.MaxFilesWithConcurrentIO, Logs.timediff(t0, t1), byte_count_all / (1024.0 * 1024.0),
+      byte_count_all / (1024.0 * 1024.0) / Logs.timediff(t0, t1));
+
+  /* now delete test data */
+  char buf[MAXLEN_PATH_EXTRA];
+  sprintf(buf, "%s/testdata/%s.%d", All.OutputDir, "testdata", ThisTask);
+  unlink(buf);
+  MPI_Barrier(Communicator);
+}
+
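+/* only task 0 acts here: it probes for completion messages from other ranks and, for each one received,
+ * sends a start signal to the next waiting rank, so that at most MaxFilesWithConcurrentIO ranks are doing
+ * I/O at any one time */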
+void test_io_bandwidth::polling(int modus)
+{
+  if(ThisTask == 0)
+    if(files_completed < NTask)
+      {
+        MPI_Status status;
+        int flag;
+
+        /* now check for a completion message  */
+        MPI_Iprobe(MPI_ANY_SOURCE, TAG_KEY, Communicator, &flag, &status);
+
+        if(flag)
+          {
+            int source = status.MPI_SOURCE;
+
+            int dummy;
+            MPI_Recv(&dummy, 1, MPI_INT, source, TAG_KEY, Communicator, MPI_STATUS_IGNORE);
+            files_completed++;
+
+            if(files_started < NTask)
+              {
+                /* send start signal */
+                MPI_Ssend(&ThisTask, 1, MPI_INT, seq[files_started++].thistask, TAG_N, Communicator);
+              }
+          }
+      }
+}
+
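+/* orchestrates the I/O: task 0 gathers a node-aware ordering of all ranks (sorted by rank-within-node so
+ * that simultaneous writers tend to be spread across nodes), starts the first MaxFilesWithConcurrentIO of
+ * them (including itself), and then releases one further rank for every completion message it receives */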
+void test_io_bandwidth::work_files(int modus)
+{
+  if(ThisTask == 0)
+    if(!(seq = (seq_data *)malloc(NTask * sizeof(seq_data))))
+      Terminate("can't allocate seq_data");
+
+  seq_data seq_loc;
+  seq_loc.thistask   = ThisTask;
+  seq_loc.rankinnode = RankInThisNode;
+  seq_loc.thisnode   = ThisNode;
+
+  MPI_Gather(&seq_loc, sizeof(seq_data), MPI_BYTE, seq, sizeof(seq_data), MPI_BYTE, 0, Communicator);
+
+  if(ThisTask == 0)
+    {
+      std::sort(seq, seq + NTask);
+
+      files_started   = 0;
+      files_completed = 0;
+
+      for(int i = 1; i < All.MaxFilesWithConcurrentIO; i++)
+        {
+          files_started++;
+          MPI_Ssend(&ThisTask, 1, MPI_INT, seq[i].thistask, TAG_N, Communicator);
+        }
+
+      files_started++;
+      contents_restart_file(modus);
+      files_completed++;
+
+      if(files_started < NTask)
+        {
+          /* send start signal */
+          MPI_Ssend(&ThisTask, 1, MPI_INT, seq[files_started++].thistask, TAG_N, Communicator);
+        }
+
+      while(files_completed < NTask)
+        polling(modus);
+
+      free(seq);
+    }
+  else
+    {
+      /* wait for start signal */
+      int dummy;
+      MPI_Recv(&dummy, 1, MPI_INT, 0, TAG_N, Communicator, MPI_STATUS_IGNORE); /* wait until we are told to start */
+
+      contents_restart_file(modus);
+
+      /* send back completion notice */
+      MPI_Ssend(&ThisTask, 1, MPI_INT, 0, TAG_KEY, Communicator);
+    }
+}
+
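+/* writes (or reads) the per-task test file: a buffer of BUF_IN_MB megabytes is allocated and streamed
+ * to/from disk through byten() */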
+void test_io_bandwidth::contents_restart_file(int modus)
+{
+  char buf[MAXLEN_PATH_EXTRA];
+  sprintf(buf, "%s/testdata/%s.%d", All.OutputDir, "testdata", ThisTask);
+
+  if(modus == MODUS_READ)
+    {
+      if(!(fd = fopen(buf, "r")))
+        {
+          Terminate("TEST: File '%s' not found.\n", buf);
+        }
+    }
+  else if(modus == MODUS_WRITE)
+    {
+      if(!(fd = fopen(buf, "w")))
+        {
+          Terminate("TEST: File '%s' cannot be opened.\n", buf);
+        }
+    }
+  else
+    Terminate("unknown modus\n");
+
+  size_t len = BUF_IN_MB * 1024LL * 1024LL;
+
+  char *p = (char *)Mem.mymalloc("p", len);
+
+  byten(p, len, modus);
+
+  Mem.myfree(p);
+
+  fclose(fd);
+}
+
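+/* transfers the data in chunks of BLKSIZE bytes and calls polling() in between, so that task 0 can keep
+ * servicing the scheduler's start/completion messages while doing its own I/O */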
+void test_io_bandwidth::byten(void *x, size_t n, int modus)
+{
+  char *p = (char *)x;
+
+  while(n > BLKSIZE)
+    {
+      byten_doit(p, BLKSIZE, modus);
+      p += BLKSIZE;
+      n -= BLKSIZE;
+      polling(modus);
+    }
+
+  if(n > 0)
+    byten_doit(p, n, modus);
+}
+
+/*! \brief reads/writes n bytes to the test file
+ */
+void test_io_bandwidth::byten_doit(void *x, size_t n, int modus)
+{
+  if(modus == MODUS_READ)
+    my_fread(x, n, 1, fd);
+  else
+    my_fwrite(x, n, 1, fd);
+}
diff --git a/src/io/test_io_bandwidth.h b/src/io/test_io_bandwidth.h
new file mode 100644
index 0000000000000000000000000000000000000000..8b9eee7f7833b096b4d8180ad3446bc5955ef6f8
--- /dev/null
+++ b/src/io/test_io_bandwidth.h
@@ -0,0 +1,68 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file test_io_bandwidth.h
+ *
+ * \brief declares a class for I/O performance measurements
+ */
+
+#ifndef TEST_IO_BANDWIDTH_H
+#define TEST_IO_BANDWIDTH_H
+
+#include <mpi.h>
+
+#define MODUS_WRITE 0
+#define MODUS_READ 1
+
+#define BUF_IN_MB 10
+
+#define BLKSIZE (1024 * 1024)
+
+#include "../io/io_streamcount.h"
+
+class test_io_bandwidth : public io_streamcount, public virtual setcomm
+{
+ public:
+  test_io_bandwidth(MPI_Comm comm) : setcomm(comm) {}
+
+  void measure_io_bandwidth(void);
+
+ private:
+  FILE *fd;
+
+  struct seq_data
+  {
+    int thistask;
+    int rankinnode;
+    int thisnode;
+
+    bool operator<(const seq_data &other) const
+    {
+      if(rankinnode < other.rankinnode)
+        return true;
+      if(rankinnode > other.rankinnode)
+        return false;
+      if(thisnode < other.thisnode)
+        return true;
+      if(thisnode > other.thisnode)
+        return false;
+      return thistask < other.thistask;
+    }
+  };
+  seq_data *seq;
+
+  int files_started;
+  int files_completed;
+
+  void work_files(int modus);
+  void contents_restart_file(int modus);
+  void polling(int modus);
+  void write_test_data(void);
+  void byten(void *x, size_t n, int modus);
+  void byten_doit(void *x, size_t n, int modus);
+};
+
+#endif
diff --git a/src/lightcone/lightcone.cc b/src/lightcone/lightcone.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9cc3183341836d8e850dd95de28f319a05a60ef3
--- /dev/null
+++ b/src/lightcone/lightcone.cc
@@ -0,0 +1,1043 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  lightcone.cc
+ *
+ *  \brief contains code to collect particles on the lightcone
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef LIGHTCONE
+
+#include <gsl/gsl_rng.h>
+#include <hdf5.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../io/hdf5_util.h"
+#include "../lightcone/lightcone.h"
+#include "../lightcone/lightcone_massmap_io.h"
+#include "../main/main.h"
+#include "../main/simulation.h"
+#include "../sort/cxxsort.h"
+#include "../system/system.h"
+
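+/* sets up the mapping between floating-point and integer coordinates for the lightcone particle storage;
+ * the mapped region must enclose the largest comoving distance occurring on any lightcone or mass-map
+ * boundary, extended by 'linklength' as a safety margin */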
+void lightcone::lightcone_init_intposconverter(double linklength)
+{
+  double maxlength = 0;
+
+#ifdef LIGHTCONE_PARTICLES
+  if(ConeGlobComDistStart > maxlength)
+    maxlength = ConeGlobComDistStart;
+#endif
+
+#ifdef LIGHTCONE_MASSMAPS
+  for(int i = 0; i < NumMassMapBoundaries; i++)
+    if(MassMapBoundariesComDist[i] > maxlength)
+      maxlength = MassMapBoundariesComDist[i];
+#endif
+
+  maxlength += linklength;
+
+#ifdef LIGHTCONE_PARTICLES
+  Lp->RegionLen     = 2.0 * maxlength;
+  Lp->FacCoordToInt = pow(2.0, BITS_FOR_POSITIONS) / Lp->RegionLen;
+  Lp->FacIntToCoord = Lp->RegionLen / pow(2.0, BITS_FOR_POSITIONS);
+#endif
+}
+
+#ifdef LIGHTCONE_MASSMAPS
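+/* buffers a lightcone crossing for the mass maps: the position is assigned to a HEALPix pixel (ring
+ * ordering) and to the task responsible for that pixel; returns 1 if the local buffer had to be enlarged */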
+int lightcone::lightcone_add_position_massmaps(particle_data *P, double *pos, double ascale)
+{
+  int buffer_full_flag = 0;
+
+  long ipnest;
+  vec2pix_ring(All.LightConeMassMapsNside, pos, &ipnest);
+
+  int task;
+  subdivide_evenly_get_bin(Mp->Npix, NTask, ipnest, &task);
+
+  if(Mp->NumPart >= Mp->MaxPart)
+    {
+      int new_maxpart = std::max<int>(Mp->NumPart + 1, (1 + ALLOC_TOLERANCE) * (1 + ALLOC_TOLERANCE) * Mp->MaxPart);
+      if(Mp->NumPart >= new_maxpart)
+        Terminate("Mp->NumPart >= new_maxpart");
+
+      Mp->reallocate_memory_maxpart(new_maxpart);
+
+      buffer_full_flag = 1;
+    }
+
+  int p = Mp->NumPart++;
+
+  Mp->P[p].Ascale = ascale;
+  Mp->P[p].setMass(P->getMass());
+
+  Mp->P[p].PixIndex = ipnest;
+  Mp->P[p].Task     = task;
+
+  if(task < 0 || task >= NTask || ipnest < 0 || ipnest >= Mp->Npix)
+    Terminate("strange assignment:  task=%d  NTask=%d  pos=(%g|%g|%g)  ipnest=%d\n", task, NTask, pos[0], pos[1], pos[2], (int)ipnest);
+
+  return buffer_full_flag;
+}
+#endif
+
+#ifdef LIGHTCONE_PARTICLES
+void lightcone::lightcone_add_position_particles(particle_data *P, double *pos, double ascale)
+{
+  if(Lp->NumPart >= Lp->MaxPart)
+    {
+      int new_maxpart = (1 + ALLOC_TOLERANCE) * Lp->MaxPart;
+      if(Lp->NumPart >= new_maxpart)
+        Terminate("Lp->NumPart >= new_maxpart.  Lp->NumPart=%d Lp->MaxPart=%d  new_maxpart=%d", Lp->NumPart, Lp->MaxPart, new_maxpart);
+
+      Lp->reallocate_memory_maxpart(new_maxpart);
+    }
+
+  int q = Lp->NumPart++;
+
+  MyIntPosType intpos[3];
+
+  Lp->pos_to_signedintpos(pos, (MySignedIntPosType *)intpos);
+
+  for(int i = 0; i < 3; i++)
+    Lp->P[q].IntPos[i] = intpos[i];
+
+  Lp->P[q].Ascale = ascale;
+
+  /* we change to physical velocity here */
+  for(int i = 0; i < 3; i++)
+    Lp->P[q].Vel[i] = P->Vel[i] / ascale;
+
+#ifdef LIGHTCONE_OUTPUT_ACCELERATIONS
+  for(int i = 0; i < 3; i++)
+    Lp->P[q].GravAccel[i] = P->GravAccel[i];
+#endif
+
+  Lp->P[q].setType(P->getType());
+  Lp->P[q].setMass(P->getMass());
+  Lp->P[q].ID.set(P->ID.get());
+#if defined(LIGHTCONE_PARTICLES_GROUPS) && defined(FOF)
+  Lp->P[q].setSofteningClass(P->getSofteningClass());
+#endif
+
+  if(P->ID.is_previously_most_bound())
+    Lp->P[q].ID.mark_as_formerly_most_bound();
+
+#if defined(LIGHTCONE_PARTICLES_GROUPS) && defined(FOF)
+  Lp->P[q].setFlagSaveDistance();
+#endif
+}
+#endif
+
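+/* tests whether a particle crosses the backward lightcone while drifting from time0 to time1. All periodic
+ * box replicas that can intersect the comoving distance interval [R1, R0] are checked; if the particle is
+ * inside the lightcone at the beginning of the drift and outside at its end, the crossing point is
+ * interpolated and added to all matching lightcones and/or mass maps. The return value signals whether the
+ * mass-map buffer had to be enlarged. */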
+int lightcone::lightcone_test_for_particle_addition(particle_data *P, integertime time0, integertime time1, double dt_drift)
+{
+  int buffer_full_flag = 0;
+
+  int flag = 0;
+
+#ifdef LIGHTCONE_PARTICLES
+  if(time0 < ConeGlobTime_end && time1 > ConeGlobTime_start) /* we have a potential overlap */
+    flag = 1;
+#endif
+
+#ifdef LIGHTCONE_MASSMAPS
+  if(time0 < MassMapBoundariesTime[NumMassMapBoundaries - 1] && time1 > MassMapBoundariesTime[0])
+    flag = 1;
+#endif
+
+  if(flag == 0)
+    return buffer_full_flag;
+
+  static integertime last_time0 = -1, last_time1 = -1;
+  static double last_R0 = -1, last_R1 = -1;
+
+  if(time0 != last_time0 || time1 != last_time1)
+    {
+      last_R0 = Driftfac.get_comoving_distance(time0);
+      last_R1 = Driftfac.get_comoving_distance(time1);
+
+      last_time0 = time0;
+      last_time1 = time1;
+    }
+
+  double R0 = last_R0;
+  double R1 = last_R1;
+
+  double R0_squared = R0 * R0;
+  double R1_squared = R1 * R1;
+
+  double R1prime = R1 - (R0 - R1);
+
+  double pos[3];
+  Sp->intpos_to_pos(P->IntPos, pos);
+
+#ifdef LIGHTCONE_PARTICLES
+  bool previously = P->ID.is_previously_most_bound();
+#endif
+
+  NumLastCheck = 0;
+
+  for(int n = 0; n < NumBoxes; n++)
+    {
+      if(R0 < BoxList[n].Rmin)
+        continue;
+
+      if(R1prime > BoxList[n].Rmax)
+        break;
+
+      NumLastCheck++;
+
+      int i = BoxList[n].i;
+      int j = BoxList[n].j;
+      int k = BoxList[n].k;
+
+      double PosA[3];
+
+      PosA[0] = pos[0] + i * All.BoxSize;
+      PosA[1] = pos[1] + j * All.BoxSize;
+      PosA[2] = pos[2] + k * All.BoxSize;
+
+      double rA2 = PosA[0] * PosA[0] + PosA[1] * PosA[1] + PosA[2] * PosA[2];
+
+      if(rA2 < R0_squared)
+        {
+          double PosB[3];
+          double diffBminusA[3];
+
+          for(int q = 0; q < 3; q++)
+            {
+              diffBminusA[q] = P->Vel[q] * dt_drift * Sp->FacIntToCoord;
+              PosB[q]        = PosA[q] + diffBminusA[q];
+            }
+
+          double rB2 = PosB[0] * PosB[0] + PosB[1] * PosB[1] + PosB[2] * PosB[2];
+
+          if(rB2 > R1_squared)
+            {
+              /* ok, particle crossed the lightcone. Interpolate the coordinate of the crossing */
+
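+              /* solve |PosA + fac * diffBminusA| = R0 + fac * (R1 - R0) for the interpolation factor fac in [0,1];
+               * squaring both sides gives the quadratic a*fac^2 + b*fac + c = 0 with the coefficients computed below */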
+              double dr2 = diffBminusA[0] * diffBminusA[0] + diffBminusA[1] * diffBminusA[1] + diffBminusA[2] * diffBminusA[2];
+
+              double a = pow(R1 - R0, 2) - dr2;
+              double b = 2 * R0 * (R1 - R0) - 2 * (PosA[0] * diffBminusA[0] + PosA[1] * diffBminusA[1] + PosA[2] * diffBminusA[2]);
+              double c = R0 * R0 - rA2;
+
+              double det = b * b - 4 * a * c;
+
+              if(det < 0)
+                Terminate(
+                    "det=%g R0=%g  R1=%g  rA=%g rB=%g  dr=%g  dt_drift=%g  dx=(%g|%g|%g) vel=(%g|%g|%g)  "
+                    "posA=(%g|%g|%g)  posB=(%g|%g|%g)\n",
+                    det, R0, R1, sqrt(rA2), sqrt(rB2), sqrt(dr2), dt_drift, P->Vel[0] * dt_drift * Sp->FacIntToCoord,
+                    P->Vel[1] * dt_drift * Sp->FacIntToCoord, P->Vel[2] * dt_drift * Sp->FacIntToCoord, P->Vel[0], P->Vel[1],
+                    P->Vel[2], PosA[0], PosA[1], PosA[2], PosB[0], PosB[1], PosB[2]);
+
+              double fac = (-b - sqrt(det)) / (2 * a);
+
+              vector<double> Pos;
+
+              for(int q = 0; q < 3; q++)
+                Pos[q] = PosA[q] + fac * diffBminusA[q];
+
+              double ascale = All.TimeBegin * exp((time0 + (time1 - time0) * fac) * All.Timebase_interval);
+
+              /* now we can add particle at position Pos[] to the lightcone, provided it fits into the angular mask */
+
+              if(fac < 0 || fac > 1)
+                {
+                  warn(
+                      "ascale=%g  fac=%g  fac-1%g R0=%g  R1=%g  rA=%g rB=%g  dr=%g  dt_drift=%g  dx=(%g|%g|%g) vel=(%g|%g|%g)  "
+                      "posA=(%g|%g|%g)  posB=(%g|%g|%g)\n",
+                      ascale, fac, fac - 1, R0, R1, sqrt(rA2), sqrt(rB2), sqrt(dr2), dt_drift,
+                      P->Vel[0] * dt_drift * Sp->FacIntToCoord, P->Vel[1] * dt_drift * Sp->FacIntToCoord,
+                      P->Vel[2] * dt_drift * Sp->FacIntToCoord, P->Vel[0], P->Vel[1], P->Vel[2], PosA[0], PosA[1], PosA[2], PosB[0],
+                      PosB[1], PosB[2]);
+                }
+              else
+                {
+#ifdef LIGHTCONE_PARTICLES
+                  if(ascale >= ConeGlobAstart && ascale < ConeGlobAend)
+                    for(int cone = 0; cone < Nlightcones; cone++)
+                      if(lightcone_is_cone_member_basic(ascale, Pos, previously, cone))
+                        {
+                          /* we only add the particle once if it is at least contained in one of the cones */
+                          lightcone_add_position_particles(P, Pos.da, ascale);
+                          break;
+                        }
+#endif
+
+#ifdef LIGHTCONE_MASSMAPS
+                  if(ascale >= MassMapBoundariesAscale[0] && ascale < MassMapBoundariesAscale[NumMassMapBoundaries - 1])
+                    buffer_full_flag |= lightcone_add_position_massmaps(P, Pos.da, ascale);
+#endif
+                }
+            }
+        }
+    }
+
+  return buffer_full_flag;
+}
+
+#ifdef LIGHTCONE_PARTICLES
+
+bool lightcone::lightcone_is_cone_member(int i, int cone)
+{
+#if defined(LIGHTCONE_PARTICLES_GROUPS) && defined(FOF)
+  if(!Lp->P[i].getFlagSaveDistance())
+    return false;
+#endif
+
+  vector<double> pos;
+
+  if(i >= Lp->NumPart)
+    Terminate("i=%d Lp->NumPart=%d\n", i, Lp->NumPart);
+
+  Lp->signedintpos_to_pos((MySignedIntPosType *)Lp->P[i].IntPos, pos.da);
+
+  return lightcone_is_cone_member_basic(Lp->P[i].Ascale, pos, Lp->P[i].ID.is_previously_most_bound(), cone);
+}
+
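+/* decides whether a position at scale factor 'ascale' belongs to lightcone 'cone', based on the cone's
+ * scale-factor interval and its geometry type (full sky, octant, pencil beam, disk, or square map) */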
+bool lightcone::lightcone_is_cone_member_basic(double ascale, vector<double> &pos, bool previously, int cone)
+{
+  if(ascale < Cones[cone].Astart || ascale > Cones[cone].Aend)
+    return false;
+
+  if(Cones[cone].OnlyMostBoundFlag == 1 && previously == false)
+    return false;
+
+  if(Cones[cone].LcType == LC_TYPE_FULLSKY)
+    {
+      return true;
+    }
+  else if(Cones[cone].LcType == LC_TYPE_OCTANT)
+    {
+      double rad   = pos.norm();
+      double phi   = atan2(pos[1], pos[0]);
+      double theta = acos(pos[2] / rad);
+
+      if(phi < 0)
+        phi += 2 * M_PI;
+
+      int octnr = phi / (0.5 * M_PI);
+
+      if(octnr >= 4)
+        octnr = 3;
+
+      if(octnr < 0)
+        octnr = 0;
+
+      if(theta > 0.5 * M_PI)
+        octnr += 4;
+
+      if(octnr == Cones[cone].OctantNr)
+        return true;
+      else
+        return false;
+    }
+  else if(Cones[cone].LcType == LC_TYPE_PENCIL)
+    {
+      double rad = pos.norm();
+
+      // note: PencilDir is already normalized
+
+      double angle = acos((pos * Cones[cone].PencilDir) / rad);
+
+      if(angle < Cones[cone].PencilAngleRad)
+        return true;
+      else
+        return false;
+    }
+  else if(Cones[cone].LcType == LC_TYPE_DISK)
+    {
+      double dist = pos * Cones[cone].DiskNormal;
+
+      if(fabs(dist) < 0.5 * Cones[cone].DiskThickness)
+        return true;
+      else
+        return false;
+    }
+  else if(Cones[cone].LcType == LC_TYPE_SQUAREMAP)
+    {
+      double x = pos * Cones[cone].SquareMapXdir;
+      double y = pos * Cones[cone].SquareMapYdir;
+      double z = pos * Cones[cone].SquareMapZdir;
+
+      /* note: the angle has already been converted from degrees to rad */
+      if(z > 0 && fabs(x) < Cones[cone].SquareMapAngleRad * z && fabs(y) < Cones[cone].SquareMapAngleRad * z)
+        return true;
+      else
+        return false;
+    }
+  return false;
+}
+
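+/* reads the lightcone geometry definitions on task 0 in two passes over the specification file (the first
+ * pass only counts the entries to size the Cones[] array, the second parses them) and broadcasts the result
+ * to all other ranks. Each entry starts with the lightcone type followed by the most-bound flag, the scale
+ * factor range, and type-specific geometry parameters; e.g. a line like "0 0 0.25 1.0" would define a
+ * full-sky cone for all particles between a=0.25 and a=1.0 (illustrative values). */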
+void lightcone::lightcone_init_geometry(char *fname)
+{
+  if(!All.ComovingIntegrationOn)
+    Terminate("LIGHTCONE_PARTICLES: makes only sense for cosmological simulations with ComovingIntegrationOn enabled\n");
+
+  if(ThisTask == 0)
+    {
+      for(int iter = 0; iter < 2; iter++)
+        {
+          Nlightcones = 0;
+          FILE *fd;
+
+          if(!(fd = fopen(fname, "r")))
+            Terminate("LIGHTCONE_PARTICLES: cannot read lightcone geometries in file `%s'\n", fname);
+
+          if(iter == 0)
+            {
+              while(1)
+                {
+                  int lc_type;
+                  if(fscanf(fd, "%d", &lc_type) != 1)
+                    break;
+
+                  double dummy;
+
+                  switch(lc_type)
+                    {
+                      case LC_TYPE_FULLSKY:
+                        if(fscanf(fd, "%lg %lg %lg", &dummy, &dummy, &dummy) != 3)
+                          Terminate("LIGHTCONE_PARTICLES: incorrect data for lightcone type %d in file '%s'", lc_type, fname);
+                        break;
+
+                      case LC_TYPE_OCTANT:
+                        if(fscanf(fd, "%lg %lg %lg %lg", &dummy, &dummy, &dummy, &dummy) != 4)
+                          Terminate("LIGHTCONE_PARTICLES: incorrect data for lightcone type %d in file '%s'", lc_type, fname);
+                        break;
+
+                      case LC_TYPE_PENCIL:
+                        if(fscanf(fd, "%lg %lg %lg %lg %lg %lg %lg", &dummy, &dummy, &dummy, &dummy, &dummy, &dummy, &dummy) != 7)
+                          Terminate("LIGHTCONE_PARTICLES: incorrect data for lightcone type %d in file '%s'", lc_type, fname);
+                        break;
+
+                      case LC_TYPE_DISK:
+                        if(fscanf(fd, "%lg %lg %lg %lg %lg %lg %lg", &dummy, &dummy, &dummy, &dummy, &dummy, &dummy, &dummy) != 7)
+                          Terminate("LIGHTCONE_PARTICLES: incorrect data for lightcone type %d in file '%s'", lc_type, fname);
+                        break;
+
+                      case LC_TYPE_SQUAREMAP:
+                        if(fscanf(fd, "%lg %lg %lg %lg %lg %lg %lg %lg %lg %lg", &dummy, &dummy, &dummy, &dummy, &dummy, &dummy,
+                                  &dummy, &dummy, &dummy, &dummy) != 10)
+                          Terminate("LIGHTCONE_PARTICLES: incorrect data for squaremap type %d in file '%s'", lc_type, fname);
+                        break;
+
+                      default:
+                        Terminate("LIGHTCONE_PARTICLES: unknown lightcone type %d in file '%s'", lc_type, fname);
+                        break;
+                    }
+
+                  Nlightcones++;
+                }
+
+              Cones = (cone_data *)Mem.mymalloc("Cones", Nlightcones * sizeof(cone_data));
+
+              mpi_printf("LIGHTCONE_PARTICLES: read definitions with %d entries in file `%s'.\n", Nlightcones, fname);
+            }
+          else if(iter == 1)
+            {
+              while(1)
+                {
+                  int lc_type;
+                  if(fscanf(fd, "%d", &lc_type) != 1)
+                    break;
+
+                  Cones[Nlightcones].LcType = lc_type;
+
+                  switch(lc_type)
+                    {
+                      case LC_TYPE_FULLSKY:
+                        fscanf(fd, "%d %lg %lg", &Cones[Nlightcones].OnlyMostBoundFlag, &Cones[Nlightcones].Astart,
+                               &Cones[Nlightcones].Aend);
+                        sprintf(Cones[Nlightcones].Tag, "Full-sky");
+                        break;
+
+                      case LC_TYPE_OCTANT:
+                        fscanf(fd, "%d %lg %lg", &Cones[Nlightcones].OnlyMostBoundFlag, &Cones[Nlightcones].Astart,
+                               &Cones[Nlightcones].Aend);
+                        fscanf(fd, "%d", &Cones[Nlightcones].OctantNr);
+                        sprintf(Cones[Nlightcones].Tag, "Octant");
+                        break;
+
+                      case LC_TYPE_PENCIL:
+                        fscanf(fd, "%d %lg %lg", &Cones[Nlightcones].OnlyMostBoundFlag, &Cones[Nlightcones].Astart,
+                               &Cones[Nlightcones].Aend);
+                        fscanf(fd, "%lg %lg %lg", &Cones[Nlightcones].PencilDir[0], &Cones[Nlightcones].PencilDir[1],
+                               &Cones[Nlightcones].PencilDir[2]);
+                        fscanf(fd, "%lg", &Cones[Nlightcones].PencilAngle);
+
+                        /* normalize the normal vector in case it is not normalized yet */
+                        Cones[Nlightcones].PencilDir *= 1.0 / Cones[Nlightcones].PencilDir.norm();
+
+                        /* convert to rad */
+                        Cones[Nlightcones].PencilAngleRad = Cones[Nlightcones].PencilAngle * M_PI / 180.0;
+
+                        sprintf(Cones[Nlightcones].Tag, "Pencil-Beam");
+                        break;
+
+                      case LC_TYPE_DISK:
+                        fscanf(fd, "%d %lg %lg", &Cones[Nlightcones].OnlyMostBoundFlag, &Cones[Nlightcones].Astart,
+                               &Cones[Nlightcones].Aend);
+                        fscanf(fd, "%lg %lg %lg", &Cones[Nlightcones].DiskNormal[0], &Cones[Nlightcones].DiskNormal[1],
+                               &Cones[Nlightcones].DiskNormal[2]);
+                        fscanf(fd, "%lg", &Cones[Nlightcones].DiskThickness);
+
+                        /* normalize the normal vector in case it is not normalized yet */
+                        Cones[Nlightcones].DiskNormal *= 1.0 / Cones[Nlightcones].DiskNormal.norm();
+                        sprintf(Cones[Nlightcones].Tag, "Disk (for image)");
+                        break;
+
+                      case LC_TYPE_SQUAREMAP:
+                        fscanf(fd, "%d %lg %lg", &Cones[Nlightcones].OnlyMostBoundFlag, &Cones[Nlightcones].Astart,
+                               &Cones[Nlightcones].Aend);
+                        fscanf(fd, "%lg %lg %lg", &Cones[Nlightcones].SquareMapZdir[0], &Cones[Nlightcones].SquareMapZdir[1],
+                               &Cones[Nlightcones].SquareMapZdir[2]);
+                        fscanf(fd, "%lg %lg %lg", &Cones[Nlightcones].SquareMapXdir[0], &Cones[Nlightcones].SquareMapXdir[1],
+                               &Cones[Nlightcones].SquareMapXdir[2]);
+                        fscanf(fd, "%lg", &Cones[Nlightcones].SquareMapAngle);
+
+                        /* establish coordinate system */
+
+                        Cones[Nlightcones].SquareMapZdir *= 1.0 / Cones[Nlightcones].SquareMapZdir.norm();
+
+                        // cross product
+                        Cones[Nlightcones].SquareMapYdir = Cones[Nlightcones].SquareMapZdir ^ Cones[Nlightcones].SquareMapXdir;
+
+                        Cones[Nlightcones].SquareMapYdir *= 1.0 / Cones[Nlightcones].SquareMapYdir.norm();
+
+                        // cross product
+                        Cones[Nlightcones].SquareMapXdir = Cones[Nlightcones].SquareMapYdir ^ Cones[Nlightcones].SquareMapZdir;
+
+                        Cones[Nlightcones].SquareMapAngleRad = Cones[Nlightcones].SquareMapAngle * M_PI / 180.0;
+
+                        mpi_printf("LIGHTCONE_SQUAREMAP: cone=%2d   x-axis  =   %15g %15g %15g\n", Nlightcones,
+                                   Cones[Nlightcones].SquareMapXdir[0], Cones[Nlightcones].SquareMapXdir[1],
+                                   Cones[Nlightcones].SquareMapXdir[2]);
+                        mpi_printf("LIGHTCONE_SQUAREMAP: cone=%2d   y-axis  =   %15g %15g %15g\n", Nlightcones,
+                                   Cones[Nlightcones].SquareMapYdir[0], Cones[Nlightcones].SquareMapYdir[1],
+                                   Cones[Nlightcones].SquareMapYdir[2]);
+                        mpi_printf("LIGHTCONE_SQUAREMAP: cone=%2d   z-axis  =   %15g %15g %15g\n", Nlightcones,
+                                   Cones[Nlightcones].SquareMapZdir[0], Cones[Nlightcones].SquareMapZdir[1],
+                                   Cones[Nlightcones].SquareMapZdir[2]);
+                        sprintf(Cones[Nlightcones].Tag, "Square-map");
+                        break;
+                      default:
+                        Terminate("odd");
+                    }
+
+                  Nlightcones++;
+                }
+            }
+          fclose(fd);
+        }
+
+      for(int i = 0; i < Nlightcones; i++)
+        mpi_printf("LIGHTCONE_PARTICLES: lightcone #%2d:  %18s %20s  Astart=%10g  Aend=%10g\n", i, Cones[i].Tag,
+                   Cones[i].OnlyMostBoundFlag ? "(only most bound)" : "(all particles)", Cones[i].Astart, Cones[i].Aend);
+
+      mpi_printf("\n");
+    }
+
+  /* now tell also all other ranks about the lightcones */
+
+  MPI_Bcast(&Nlightcones, 1, MPI_INT, 0, Communicator);
+
+  if(ThisTask != 0)
+    Cones = (cone_data *)Mem.mymalloc("Cones", Nlightcones * sizeof(cone_data));
+
+  MPI_Bcast(Cones, Nlightcones * sizeof(cone_data), MPI_BYTE, 0, Communicator);
+}
+
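+/* converts the scale-factor limits of all lightcones to integer times and comoving distances and builds the
+ * list of periodic box replicas (BoxList) that can overlap any of the lightcones; returns 1 if more replicas
+ * would be needed than LIGHTCONE_MAX_BOXREPLICAS allows */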
+int lightcone::lightcone_init_times(void)
+{
+  for(int i = 0; i < Nlightcones; i++)
+    {
+      Cones[i].Time_start = log(Cones[i].Astart / All.TimeBegin) / All.Timebase_interval;
+      Cones[i].Time_end   = log(Cones[i].Aend / All.TimeBegin) / All.Timebase_interval;
+
+      Cones[i].ComDistStart = Driftfac.get_comoving_distance(Cones[i].Time_start);
+      Cones[i].ComDistEnd   = Driftfac.get_comoving_distance(Cones[i].Time_end);
+    }
+
+  double astart = 1.0e10;
+  double aend   = 0;
+
+  for(int i = 0; i < Nlightcones; i++)
+    {
+      if(Cones[i].Astart < astart)
+        astart = Cones[i].Astart;
+
+      if(Cones[i].Aend > aend)
+        aend = Cones[i].Aend;
+    }
+
+  ConeGlobAstart = astart;
+  ConeGlobAend   = aend;
+
+  ConeGlobTime_start = log(ConeGlobAstart / All.TimeBegin) / All.Timebase_interval;
+  ConeGlobTime_end   = log(ConeGlobAend / All.TimeBegin) / All.Timebase_interval;
+
+  ConeGlobComDistStart = Driftfac.get_comoving_distance(ConeGlobTime_start);
+  ConeGlobComDistEnd   = Driftfac.get_comoving_distance(ConeGlobTime_end);
+
+  int n = ceil(ConeGlobComDistStart / All.BoxSize + 1);
+
+  for(int rep = 0; rep < 2; rep++)
+    {
+      if(rep == 1)
+        BoxList = (boxlist *)Mem.mymalloc_movable(&BoxList, "BoxList", NumBoxes * sizeof(boxlist));
+
+      NumBoxes = 0;
+
+      for(int i = -n; i <= n; i++)
+        for(int j = -n; j <= n; j++)
+          for(int k = -n; k <= n; k++)
+            {
+              double corner[3];
+
+              corner[0] = i * All.BoxSize;
+              corner[1] = j * All.BoxSize;
+              corner[2] = k * All.BoxSize;
+
+              double Rmin, Rmax;
+
+              if(lightcone_box_at_corner_overlaps_at_least_with_one_cone(corner, Rmin, Rmax))
+                {
+                  if(rep == 1)
+                    {
+                      BoxList[NumBoxes].i    = i;
+                      BoxList[NumBoxes].j    = j;
+                      BoxList[NumBoxes].k    = k;
+                      BoxList[NumBoxes].Rmin = Rmin;
+                      BoxList[NumBoxes].Rmax = Rmax;
+                    }
+
+                  NumBoxes++;
+                }
+            }
+    }
+
+  mycxxsort(BoxList, BoxList + NumBoxes, lightcone_compare_BoxList_Rmax);
+
+  lightcone_clear_boxlist(All.Time);
+
+  NumLastCheck = 0;
+
+  double fac = (4 * M_PI / 3.0) * pow(ConeGlobComDistStart, 3) / pow(All.BoxSize, 3);
+
+  mpi_printf(
+      "LIGHTCONE_PARTICLES:  scale_factor: %10g to %10g    comoving distance: %10g to %10g   covered volume in units of box "
+      "volume=%g\n",
+      ConeGlobAstart, ConeGlobAend, ConeGlobComDistStart, ConeGlobComDistEnd, fac);
+
+  mpi_printf("LIGHTCONE_PARTICLES:  number of box replicas to check for this lightcone geometry settings = %d\n", NumBoxes);
+
+  if(NumBoxes > LIGHTCONE_MAX_BOXREPLICAS)
+    {
+      mpi_printf(
+          "\nLIGHTCONE_PARTICLES: Your lightcone extends to such high redshift that the box needs to be replicated a huge number of "
+          "times to cover it,\n"
+          "more than the prescribed limit of LIGHTCONE_MAX_BOXREPLICAS=%d. We better don't do such an inefficient run, unless you "
+          "override this constant.\n",
+          LIGHTCONE_MAX_BOXREPLICAS);
+      return 1;
+    }
+
+  return 0;
+}
+
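+/* removes box replicas whose minimum comoving distance already exceeds the comoving distance of the
+ * lightcone at scale factor 'ascale', since these can no longer be crossed at later times */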
+void lightcone::lightcone_clear_boxlist(double ascale)
+{
+  double time_start = log(ascale / All.TimeBegin) / All.Timebase_interval;
+
+  double dist = Driftfac.get_comoving_distance(time_start);
+
+  int count = 0;
+
+  for(int i = 0; i < NumBoxes; i++)
+    {
+      if(dist < BoxList[i].Rmin)
+        {
+          BoxList[i] = BoxList[--NumBoxes];
+          i--;
+          count++;
+        }
+    }
+
+  if(count)
+    {
+      mpi_printf("LIGHTCONE: Eliminated %d entries from BoxList\n", count);
+      mycxxsort(BoxList, BoxList + NumBoxes, lightcone_compare_BoxList_Rmax);
+    }
+}
+
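+/* computes the minimum and maximum comoving distance of the eight corners of the box replica anchored at
+ * 'corner' and checks whether this distance range overlaps with at least one of the defined lightcones
+ * (with an additional plane-distance test for disk-type cones) */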
+bool lightcone::lightcone_box_at_corner_overlaps_at_least_with_one_cone(double *corner, double &Rmin, double &Rmax)
+{
+  Rmin = MAX_DOUBLE_NUMBER;
+  Rmax = 0;
+
+  for(int ii = 0; ii <= 1; ii++)
+    for(int jj = 0; jj <= 1; jj++)
+      for(int kk = 0; kk <= 1; kk++)
+        {
+          double crn[3];
+          crn[0] = corner[0] + ii * All.BoxSize;
+          crn[1] = corner[1] + jj * All.BoxSize;
+          crn[2] = corner[2] + kk * All.BoxSize;
+
+          double r = sqrt(crn[0] * crn[0] + crn[1] * crn[1] + crn[2] * crn[2]);
+          if(Rmin > r)
+            Rmin = r;
+          if(Rmax < r)
+            Rmax = r;
+        }
+
+  for(int cone = 0; cone < Nlightcones; cone++)
+    {
+      if(Rmin < Cones[cone].ComDistStart && Rmax > Cones[cone].ComDistEnd)
+        {
+          if(Cones[cone].LcType == LC_TYPE_FULLSKY)
+            {
+              return true;
+            }
+          else if(Cones[cone].LcType == LC_TYPE_OCTANT)
+            {
+              return true; /* still need to make this more selective */
+            }
+          else if(Cones[cone].LcType == LC_TYPE_PENCIL)
+            {
+              return true; /* still need to make this more selective */
+            }
+          else if(Cones[cone].LcType == LC_TYPE_DISK)
+            {
+              double mindist    = MAX_DOUBLE_NUMBER;
+              double first_dist = 0;
+
+              for(int ii = 0; ii <= 1; ii++)
+                for(int jj = 0; jj <= 1; jj++)
+                  for(int kk = 0; kk <= 1; kk++)
+                    {
+                      double crn[3];
+                      crn[0] = corner[0] + ii * All.BoxSize;
+                      crn[1] = corner[1] + jj * All.BoxSize;
+                      crn[2] = corner[2] + kk * All.BoxSize;
+
+                      double dist =
+                          crn[0] * Cones[cone].DiskNormal[0] + crn[1] * Cones[cone].DiskNormal[1] + crn[2] * Cones[cone].DiskNormal[2];
+
+                      if(ii == 0 && jj == 0 && kk == 0)  // upon first iteration
+                        first_dist = dist;
+                      else
+                        {
+                          if(first_dist * dist < 0)  // points on opposite side imply overlap
+                            return true;
+                        }
+
+                      if(fabs(dist) < mindist)
+                        mindist = fabs(dist);
+
+                      if(mindist < 0.5 * Cones[cone].DiskThickness)
+                        return true;
+                    }
+            }
+        }
+    }
+
+  return false;
+}
+
+#endif
+
+#ifdef LIGHTCONE_MASSMAPS
+
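+/* sets up the mass-map machinery: the HEALPix pixels are distributed evenly over the tasks, and the list of
+ * scale-factor boundaries of the mass-map shells (spaced by a fixed comoving thickness out to
+ * LightConeMassMapMaxRedshift) is constructed on task 0 in two passes and then broadcast to all ranks */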
+void lightcone::lightcone_init_massmaps(void)
+{
+  if(!All.ComovingIntegrationOn)
+    Terminate("LIGHTCONE_MASSMAPS: Makes only sense for cosmological simulations with ComovingIntegrationOn enabled\n");
+
+  Mp->Npix = nside2npix(All.LightConeMassMapsNside);
+
+  subdivide_evenly(Mp->Npix, NTask, ThisTask, &Mp->FirstPix, &Mp->NpixLoc);
+
+  if(ThisTask == 0)
+    {
+      for(int iter = 0; iter < 2; iter++)
+        {
+          NumMassMapBoundaries = 0;
+
+          if(iter == 0)
+            {
+              double zend         = 0;
+              double com_dist_end = 0;
+              NumMassMapBoundaries++;
+
+              while(zend <= All.LightConeMassMapMaxRedshift)
+                {
+                  com_dist_end += All.LightConeMassMapThickness;
+
+                  double aend = Driftfac.get_scalefactor_for_comoving_distance(com_dist_end);
+
+                  zend = 1 / aend - 1;
+
+                  NumMassMapBoundaries++;
+                }
+
+              MassMapBoundariesAscale = (double *)Mem.mymalloc_movable(&MassMapBoundariesAscale, "MassMapBoundariesAscale",
+                                                                       NumMassMapBoundaries * sizeof(double));
+
+              mpi_printf("LIGHTCONE_MASSMAPS: %d entries\n", NumMassMapBoundaries);
+
+              if(NumMassMapBoundaries < 2)
+                Terminate("Less than two boundaries detected");
+            }
+          else if(iter == 1)
+            {
+              double zend                                   = 0;
+              double com_dist_end                           = 0;
+              MassMapBoundariesAscale[NumMassMapBoundaries] = 1.0;
+              NumMassMapBoundaries++;
+
+              while(zend <= All.LightConeMassMapMaxRedshift)
+                {
+                  com_dist_end += All.LightConeMassMapThickness;
+
+                  double aend = Driftfac.get_scalefactor_for_comoving_distance(com_dist_end);
+
+                  MassMapBoundariesAscale[NumMassMapBoundaries] = aend;
+
+                  zend = 1 / aend - 1;
+
+                  NumMassMapBoundaries++;
+                }
+            }
+        }
+
+      mycxxsort(MassMapBoundariesAscale, MassMapBoundariesAscale + NumMassMapBoundaries, compare_doubles);
+    }
+
+  /* now tell also all other ranks about the lightcones */
+
+  MPI_Bcast(&NumMassMapBoundaries, 1, MPI_INT, 0, Communicator);
+
+  if(ThisTask != 0)
+    MassMapBoundariesAscale =
+        (double *)Mem.mymalloc_movable(&MassMapBoundariesAscale, "MassMapBoundariesAscale", NumMassMapBoundaries * sizeof(double));
+
+  MPI_Bcast(MassMapBoundariesAscale, NumMassMapBoundaries * sizeof(double), MPI_BYTE, 0, Communicator);
+
+  MassMapBoundariesTime =
+      (integertime *)Mem.mymalloc_movable(&MassMapBoundariesTime, "MassMapBoundariesTime", NumMassMapBoundaries * sizeof(integertime));
+  MassMapBoundariesComDist =
+      (double *)Mem.mymalloc_movable(&MassMapBoundariesComDist, "MassMapBoundariesComDist", NumMassMapBoundaries * sizeof(double));
+}
+
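+/* converts the mass-map shell boundaries to integer times and comoving distances, reports the shell volumes
+ * in units of the box volume, and returns 1 if the volume enclosed by the outermost boundary exceeds
+ * LIGHTCONE_MAX_BOXREPLICAS box volumes */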
+int lightcone::lightcone_massmap_report_boundaries(void)
+{
+  double fac_max               = 0;
+  const double allowed_fac_max = LIGHTCONE_MAX_BOXREPLICAS;
+
+  for(int i = 0; i < NumMassMapBoundaries; i++)
+    {
+      MassMapBoundariesTime[i]    = log(MassMapBoundariesAscale[i] / All.TimeBegin) / All.Timebase_interval;
+      MassMapBoundariesComDist[i] = Driftfac.get_comoving_distance(MassMapBoundariesTime[i]);
+    }
+
+  for(int i = 0; i < NumMassMapBoundaries - 1; i++)
+    {
+      MassMapBoundariesTime[i]    = log(MassMapBoundariesAscale[i] / All.TimeBegin) / All.Timebase_interval;
+      MassMapBoundariesComDist[i] = Driftfac.get_comoving_distance(MassMapBoundariesTime[i]);
+
+      double fac   = (4 * M_PI / 3.0) * pow(MassMapBoundariesComDist[i], 3) / pow(All.BoxSize, 3);
+      double shell = fac - (4 * M_PI / 3.0) * pow(MassMapBoundariesComDist[i + 1], 3) / pow(All.BoxSize, 3);
+
+      mpi_printf(
+          "LIGHTCONE_MASSMAPS:   entry=%3d   scale_factor=%10.6f   redshift=%10.6f   comoving distance=%12.3f   shell-volume in "
+          "units of box "
+          "volume=%g\n",
+          i, MassMapBoundariesAscale[i], 1 / MassMapBoundariesAscale[i] - 1, MassMapBoundariesComDist[i], shell);
+
+      if(fac > fac_max)
+        fac_max = fac;
+    }
+
+  if(fac_max > allowed_fac_max)
+    {
+      mpi_printf(
+          "\nLIGHTCONE_MASSMAPS: Your lightcone mass maps extend to such high redshift that the box needs to be replicated a huge "
+          "number of times to cover it (volume ratio %g). We better don't do such an inefficient run.\n"
+          "Setting LIGHTCONE_MAX_BOXREPLICAS=%g to a higher value (at least %g) can override this, however.\n",
+          fac_max, (double)LIGHTCONE_MAX_BOXREPLICAS, fac_max);
+      return 1;
+    }
+
+  return 0;
+}
+
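+/* bins all currently buffered mass-map particles and, if dumping is allowed, writes out (and resets) every
+ * mass-map shell whose upper scale-factor boundary has already been passed by the simulation */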
+void lightcone::lightcone_massmap_flush(int dump_allowed_flag)
+{
+  lightcone_massmap_binning();
+
+  if(dump_allowed_flag)
+    {
+      while(All.CurrentMassMapBoundary < NumMassMapBoundaries - 1 &&
+            (All.Time >= MassMapBoundariesAscale[All.CurrentMassMapBoundary + 1] || All.Ti_Current >= TIMEBASE))
+        {
+          lightcone_massmap_io Lcone(Mp, this, Communicator, All.SnapFormat); /* get an I/O object */
+          Lcone.lightcone_massmap_save(All.CurrentMassMapBoundary++);
+
+          lightcone_massmap_binning();
+        }
+    }
+}
+
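+/* exchanges the buffered mass-map particles so that each ends up on the task responsible for its pixel, adds
+ * those falling into the currently open shell onto the local MassMap, and discards entries that lie below
+ * the lower boundary of the current shell (counted as 'expunge') */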
+void lightcone::lightcone_massmap_binning(void)
+{
+  double t0 = Logs.second();
+
+  int *Send_count  = (int *)Mem.mymalloc_movable(&Send_count, "Send_count", sizeof(int) * NTask);
+  int *Send_offset = (int *)Mem.mymalloc_movable(&Send_offset, "Send_offset", sizeof(int) * NTask);
+  int *Recv_count  = (int *)Mem.mymalloc_movable(&Recv_count, "Recv_count", sizeof(int) * NTask);
+  int *Recv_offset = (int *)Mem.mymalloc_movable(&Recv_offset, "Recv_offset", sizeof(int) * NTask);
+
+  /* count how many we have of each task */
+  for(int i = 0; i < NTask; i++)
+    Send_count[i] = 0;
+
+  for(int i = 0; i < Mp->NumPart; i++)
+    {
+      int target = Mp->P[i].Task;
+
+      if(target < 0 || target >= NTask)
+        Terminate("i=%d: target=%d target < 0 || target >= NTask", i, target);
+
+      if(target != ThisTask)
+        Send_count[target]++;
+    }
+
+  MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
+
+  Recv_offset[0] = Send_offset[0] = 0;
+  int nexport = 0, nimport = 0;
+
+  for(int i = 0; i < NTask; i++)
+    {
+      nimport += Recv_count[i];
+      nexport += Send_count[i];
+
+      if(i > 0)
+        {
+          Send_offset[i] = Send_offset[i - 1] + Send_count[i - 1];
+          Recv_offset[i] = Recv_offset[i - 1] + Recv_count[i - 1];
+        }
+    }
+
+  lightcone_massmap_data *send_P =
+      (lightcone_massmap_data *)Mem.mymalloc_movable(&send_P, "send_P", nexport * sizeof(lightcone_massmap_data));
+
+  for(int i = 0; i < NTask; i++)
+    Send_count[i] = 0;
+
+  for(int i = 0; i < Mp->NumPart; i++)
+    {
+      int target = Mp->P[i].Task;
+
+      if(target != ThisTask)
+        {
+          send_P[Send_offset[target] + Send_count[target]] = Mp->P[i];
+          Send_count[target]++;
+
+          Mp->P[i] = Mp->P[Mp->NumPart - 1];
+          Mp->NumPart--;
+          i--;
+        }
+    }
+
+  if(Mp->NumPart + nimport > Mp->MaxPart)
+    Mp->reallocate_memory_maxpart(Mp->NumPart + nimport);
+
+  for(int ngrp = 1; ngrp < (1 << PTask); ngrp++)
+    {
+      int recvTask = ThisTask ^ ngrp;
+
+      if(recvTask < NTask)
+        if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+          MPI_Sendrecv(&send_P[Send_offset[recvTask]], Send_count[recvTask] * sizeof(lightcone_massmap_data), MPI_BYTE, recvTask,
+                       TAG_DENS_A, &Mp->P[Mp->NumPart + Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(lightcone_massmap_data),
+                       MPI_BYTE, recvTask, TAG_DENS_A, Communicator, MPI_STATUS_IGNORE);
+    }
+
+  Mp->NumPart += nimport;
+
+  Mem.myfree_movable(send_P);
+
+  Mem.myfree(Recv_offset);
+  Mem.myfree(Recv_count);
+  Mem.myfree(Send_offset);
+  Mem.myfree(Send_count);
+
+  mpi_printf("LIGHTCONE_MASSMAPS: before binning  %9d   (maxpart=%9d, memory for buffer  %g MB)\n", Mp->NumPart, Mp->MaxPart,
+             (double)Mp->MaxPart * sizeof(lightcone_massmap_data) * TO_MBYTE_FAC);
+
+  int expunge = 0;
+
+  /* now bin on the map */
+  for(int i = 0; i < Mp->NumPart; i++)
+    {
+      if(Mp->P[i].Task != ThisTask)
+        Terminate("can't be");
+
+      if(Mp->P[i].Ascale >= MassMapBoundariesAscale[All.CurrentMassMapBoundary] &&
+         Mp->P[i].Ascale < MassMapBoundariesAscale[All.CurrentMassMapBoundary + 1])
+        {
+          int pix = Mp->P[i].PixIndex - Mp->FirstPix;
+          if(pix < 0 || pix >= Mp->NpixLoc)
+            Terminate("wrong pixel");
+
+          MassMap[pix] += Mp->P[i].getMass();
+
+          Mp->P[i] = Mp->P[Mp->NumPart - 1];
+          Mp->NumPart--;
+          i--;
+        }
+      else if(Mp->P[i].Ascale < MassMapBoundariesAscale[All.CurrentMassMapBoundary])
+        {
+          Mp->P[i] = Mp->P[Mp->NumPart - 1];
+          Mp->NumPart--;
+          i--;
+
+          expunge++;
+        }
+    }
+
+  if(Mp->MaxPart > LIGHTCONE_MASSMAP_ALLOC_FAC * (Sp->TotNumPart / NTask) &&
+     Mp->NumPart < LIGHTCONE_MAX_FILLFACTOR * LIGHTCONE_MASSMAP_ALLOC_FAC * (Sp->TotNumPart / NTask))
+    Mp->reallocate_memory_maxpart(LIGHTCONE_MASSMAP_ALLOC_FAC * (Sp->TotNumPart / NTask));
+
+  double t1 = Logs.second();
+
+  mpi_printf("LIGHTCONE_MASSMAPS:  after binning  %9d   (maxpart=%9d, memory for buffer  %g MB) took=%g sec expunge=%d\n", Mp->NumPart,
+             Mp->MaxPart, (double)Mp->MaxPart * sizeof(lightcone_massmap_data) * TO_MBYTE_FAC, Logs.timediff(t0, t1), expunge);
+}
+
+#endif
+
+#endif
diff --git a/src/lightcone/lightcone.h b/src/lightcone/lightcone.h
new file mode 100644
index 0000000000000000000000000000000000000000..96bd9615096583d0222cac171908da3b162258c5
--- /dev/null
+++ b/src/lightcone/lightcone.h
@@ -0,0 +1,190 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  lightcone.h
+ *
+ *  \brief declares a class for accumulating particles on the lightcone
+ */
+
+#ifndef LIGHTCONE_H
+#define LIGHTCONE_H
+
+#include "gadgetconfig.h"
+
+#ifdef LIGHTCONE
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/lcparticles.h"
+#include "../data/mmparticles.h"
+#include "../data/simparticles.h"
+#include "../data/symtensors.h"
+#include "../mpi_utils/setcomm.h"
+#include "../time_integration/driftfac.h"
+
+#ifdef LIGHTCONE_MASSMAPS
+#include <chealpix.h>
+#endif
+
+#ifndef LIGHTCONE_MAX_BOXREPLICAS
+#define LIGHTCONE_MAX_BOXREPLICAS 1000
+#endif
+
+#ifndef LIGHTCONE_ORDER_NSIDE
+#define LIGHTCONE_ORDER_NSIDE 256
+#endif
+
+#define LC_TYPE_FULLSKY 0
+#define LC_TYPE_OCTANT 1
+#define LC_TYPE_PENCIL 2
+#define LC_TYPE_DISK 3
+#define LC_TYPE_SQUAREMAP 4
+
+class lightcone : public parameters
+{
+ public:
+  simparticles *Sp;
+
+#if defined(LIGHTCONE_PARTICLES)
+  lcparticles *Lp;
+#endif
+#if defined(LIGHTCONE_MASSMAPS)
+  mmparticles *Mp;
+#endif
+
+ public:
+#if defined(LIGHTCONE_PARTICLES) && defined(LIGHTCONE_MASSMAPS) /* both particles and massmaps */
+  lightcone(MPI_Comm comm, simparticles *Sp_ptr, lcparticles *Lp_ptr, mmparticles *Mp_ptr) : parameters(comm)
+  {
+    Sp = Sp_ptr;
+    Lp = Lp_ptr;
+    Mp = Mp_ptr;
+
+    Sp->LightCone = this;
+  }
+#else
+#if defined(LIGHTCONE_PARTICLES)
+  lightcone(MPI_Comm comm, simparticles *Sp_ptr, lcparticles *Lp_ptr) : parameters(comm) /* only particles */
+  {
+    Sp = Sp_ptr;
+    Lp = Lp_ptr;
+
+    Sp->LightCone = this;
+  }
+#else
+#if defined(LIGHTCONE_MASSMAPS)
+  lightcone(MPI_Comm comm, simparticles *Sp_ptr, mmparticles *Mp_ptr) : parameters(comm) /* only massmaps */
+  {
+    Sp = Sp_ptr;
+    Mp = Mp_ptr;
+
+    Sp->LightCone = this;
+  }
+#endif
+#endif
+#endif
+
+  void lightcone_init_intposconverter(double linklength);
+
+  int lightcone_test_for_particle_addition(particle_data *P, integertime time0, integertime time1, double dt_drift);
+
+  void register_parameters(void);
+
+  void makeimage(int argc, char **argv);
+
+#ifdef LIGHTCONE_PARTICLES
+  int Nlightcones;
+
+  struct cone_data
+  {
+    double Astart;
+    double Aend;
+
+    integertime Time_start;
+    integertime Time_end;
+
+    double ComDistStart;
+    double ComDistEnd;
+
+    int LcType;
+    int OnlyMostBoundFlag;
+
+    int OctantNr;
+
+    vector<double> PencilDir;
+    double PencilAngle;
+    double PencilAngleRad;
+
+    vector<double> DiskNormal;
+    double DiskThickness;
+
+    vector<double> SquareMapXdir;
+    vector<double> SquareMapYdir;
+    vector<double> SquareMapZdir;
+
+    double SquareMapAngle;
+    double SquareMapAngleRad;
+
+    char Tag[100];
+  };
+  cone_data *Cones;
+
+  double ConeGlobAstart;
+  double ConeGlobAend;
+
+  double ConeGlobTime_start;
+  double ConeGlobTime_end;
+
+  double ConeGlobComDistStart;
+  double ConeGlobComDistEnd;
+
+#endif
+
+  struct boxlist
+  {
+    int i, j, k; /* displacement of principal box */
+    double Rmin; /* minimum comoving distance of this box */
+    double Rmax; /* maximum comoving distance of this box */
+  };
+  boxlist *BoxList;
+  int NumBoxes;
+  int NumLastCheck;
+
+  void lightcone_init_geometry(char *fname);
+  void lightcone_add_position_particles(particle_data *P, double *pos, double ascale);
+  int lightcone_init_times(void);
+  bool lightcone_is_cone_member(int i, int cone);
+  bool lightcone_is_cone_member_basic(double ascale, vector<double> &pos, bool previously, int cone);
+
+  bool lightcone_box_at_corner_overlaps_at_least_with_one_cone(double *corner, double &rmin, double &rmax);
+  void lightcone_clear_boxlist(double ascale);
+
+  static bool lightcone_compare_BoxList_Rmax(const boxlist &a, const boxlist &b)
+  {
+    return a.Rmax > b.Rmax; /* sort in descending order */
+  }
+
+#ifdef LIGHTCONE_MASSMAPS
+
+  double *MassMap;
+
+  int NumMassMapBoundaries;
+  double *MassMapBoundariesAscale;
+  integertime *MassMapBoundariesTime;
+  double *MassMapBoundariesComDist;
+
+  void lightcone_init_massmaps(void);
+  void lightcone_massmap_binning(void);
+  void lightcone_massmap_flush(int dump_allowed_flag);
+  int lightcone_add_position_massmaps(particle_data *P, double *pos, double ascale);
+  int lightcone_massmap_report_boundaries(void);
+
+  static bool compare_doubles(const double &a, const double &b) { return a < b; }
+#endif
+};
+
+#endif
+#endif
diff --git a/src/lightcone/lightcone_massmap_io.cc b/src/lightcone/lightcone_massmap_io.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1c044ebac1b6f967bbea092e28205e4573a44632
--- /dev/null
+++ b/src/lightcone/lightcone_massmap_io.cc
@@ -0,0 +1,209 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file lightcone_massmap_io.cc
+ *
+ * \brief routines for I/O of lightcone mass projections
+ */
+
+#include "gadgetconfig.h"
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_MASSMAPS)
+
+#include <gsl/gsl_rng.h>
+#include <hdf5.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../io/hdf5_util.h"
+#include "../io/io.h"
+#include "../lightcone/lightcone.h"
+#include "../lightcone/lightcone_massmap_io.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../system/system.h"
+
+/*!
+ * \brief Constructor that registers the fields written for the mass maps.
+ *
+ * For the meaning of the init_field() arguments, see the documentation of init_field().
+ * Don't forget to add the new IO_FLAG to io_private.h
+ */
+
+lightcone_massmap_io::lightcone_massmap_io(mmparticles *Mp_ptr, lightcone *LightCone_ptr, MPI_Comm comm, int format)
+    : IO_Def(comm, format)
+{
+  Mp        = Mp_ptr;
+  LightCone = LightCone_ptr;
+
+  this->N_IO_Fields  = 0;
+  this->N_DataGroups = 1;
+  this->header_size  = sizeof(header);
+  this->header_buf   = &header;
+  this->type_of_file = FILE_IS_MASSMAP;
+  sprintf(this->info, "LIGHTCONE: writing mass map data");
+
+  init_field("MAMP", "Mass", MEM_DOUBLE, FILE_MY_IO_FLOAT, SKIP_ON_READ, 1, A_MM, &LightCone->MassMap[0], NULL, MASSMAPS, 1, 0., -1.,
+             0., 1., 0., All.UnitMass_in_g, true);
+}
+
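+/* writes the accumulated mass map for shell 'num' to one or several files (split according to
+ * NumFilesPerSnapshot) and clears the map afterwards */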
+void lightcone_massmap_io::lightcone_massmap_save(int num)
+{
+  char buf[2 * MAXLEN_PATH];
+
+  selected_bnd = num;
+
+  long long NumLP_tot = LightCone->Mp->NpixLoc;
+  MPI_Allreduce(MPI_IN_PLACE, &NumLP_tot, 1, MPI_LONG_LONG, MPI_SUM, Communicator);
+  mpi_printf("\nLIGHTCONE: writing lightcone massmap files #%d ... (Npix_tot = %lld,  ascale %g to %g)\n", num, NumLP_tot,
+             LightCone->MassMapBoundariesAscale[num], LightCone->MassMapBoundariesAscale[num + 1]);
+
+  if(All.NumFilesPerSnapshot > 1)
+    {
+      if(ThisTask == 0)
+        {
+          char buf[2 * MAXLEN_PATH];
+          sprintf(buf, "%s/mapsdir_%03d", All.OutputDir, num);
+          mkdir(buf, 02755);
+        }
+      MPI_Barrier(Communicator);
+    }
+
+  if(All.NumFilesPerSnapshot > 1)
+    sprintf(buf, "%s/mapsdir_%03d/%s_%03d", All.OutputDir, num, "maps", num);
+  else
+    sprintf(buf, "%s/%s_%03d", All.OutputDir, "maps", num);
+
+  write_multiple_files(buf, All.NumFilesPerSnapshot);
+
+  mpi_printf("LIGHTCONE: done with writing mass map.\n");
+
+  /* clear the massmap again */
+  memset(LightCone->MassMap, 0, LightCone->Mp->NpixLoc * sizeof(double));
+}
+
+void lightcone_massmap_io::fill_file_header(int writeTask, int lastTask, long long *n_type, long long *ntot_type)
+{
+  /* determine global and local pixel numbers */
+
+  n_type[0] = LightCone->Mp->NpixLoc;
+
+  /* determine particle numbers of each type in file */
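+  /* the tasks writeTask..lastTask share one output file: writeTask collects the local pixel
+     count of every task in the group and then sends the file-wide total back, so that each
+     task knows how many pixels end up in this file */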
+  if(ThisTask == writeTask)
+    {
+      ntot_type[0] = n_type[0];
+
+      for(int task = writeTask + 1; task <= lastTask; task++)
+        {
+          long long nn;
+          MPI_Recv(&nn, 1, MPI_LONG_LONG, task, TAG_LOCALN, Communicator, MPI_STATUS_IGNORE);
+          ntot_type[0] += nn;
+        }
+
+      for(int task = writeTask + 1; task <= lastTask; task++)
+        MPI_Send(&ntot_type[0], 1, MPI_LONG_LONG, task, TAG_N, Communicator);
+    }
+  else
+    {
+      MPI_Send(&n_type[0], 1, MPI_LONG_LONG, writeTask, TAG_LOCALN, Communicator);
+      MPI_Recv(&ntot_type[0], 1, MPI_LONG_LONG, writeTask, TAG_N, Communicator, MPI_STATUS_IGNORE);
+    }
+
+  /* fill file header */
+  header.nside      = All.LightConeMassMapsNside; /* healpix nside */
+  header.npix_local = ntot_type[0];
+  header.npix_total = LightCone->Mp->Npix;
+  header.num_files  = All.NumFilesPerSnapshot;
+
+  header.AscaleStart  = LightCone->MassMapBoundariesAscale[selected_bnd];
+  header.AscaleEnd    = LightCone->MassMapBoundariesAscale[selected_bnd + 1];
+  header.ComDistStart = LightCone->MassMapBoundariesComDist[selected_bnd];
+  header.ComDistEnd   = LightCone->MassMapBoundariesComDist[selected_bnd + 1];
+}
+
+void lightcone_massmap_io::write_header_fields(hid_t handle)
+{
+  write_scalar_attribute(handle, "Nside", &header.nside, H5T_NATIVE_INT);
+
+  write_scalar_attribute(handle, "NpixLocal", &header.npix_local, H5T_NATIVE_INT);
+
+  write_scalar_attribute(handle, "NpixTotal", &header.npix_total, H5T_NATIVE_INT);
+
+  write_scalar_attribute(handle, "NumFiles", &header.num_files, H5T_NATIVE_INT);
+
+  write_scalar_attribute(handle, "AscaleStart", &header.AscaleStart, H5T_NATIVE_DOUBLE);
+
+  write_scalar_attribute(handle, "AscaleEnd", &header.AscaleEnd, H5T_NATIVE_DOUBLE);
+
+  write_scalar_attribute(handle, "ComDistStart", &header.ComDistStart, H5T_NATIVE_DOUBLE);
+
+  write_scalar_attribute(handle, "ComDistEnd", &header.ComDistEnd, H5T_NATIVE_DOUBLE);
+}
+
+void lightcone_massmap_io::set_filenr_in_header(int numfiles) { header.num_files = numfiles; }
+
+void lightcone_massmap_io::get_datagroup_name(int type, char *buf)
+{
+  if(type == 0)
+    sprintf(buf, "/Maps");
+  else
+    Terminate("should not get here");
+}
+
+void *lightcone_massmap_io::get_base_address_of_structure(enum arrays array, int index)
+{
+  switch(array)
+    {
+      case A_MM:
+        return (void *)(LightCone->MassMap + index);
+
+      default:
+        Terminate("we don't expect to get here");
+    }
+
+  return NULL;
+}
+
+void lightcone_massmap_io::read_file_header(const char *fname, int filenr, int readTask, int lastTask, long long *nloc_part,
+                                            long long *npart, int *nstart)
+{
+  /* empty */
+}
+
+void lightcone_massmap_io::read_header_fields(const char *fname)
+{ /* empty */
+}
+
+void lightcone_massmap_io::read_increase_numbers(int type, int n_for_this_task)
+{ /* empty */
+}
+
+int lightcone_massmap_io::get_filenr_from_header(void)
+{
+  /* empty */
+  return 0;
+}
+
+int lightcone_massmap_io::get_type_of_element(int index)
+{
+  /* empty */
+  return 0;
+}
+
+void lightcone_massmap_io::set_type_of_element(int index, int type)
+{ /* empty */
+}
+
+#endif
diff --git a/src/lightcone/lightcone_massmap_io.h b/src/lightcone/lightcone_massmap_io.h
new file mode 100644
index 0000000000000000000000000000000000000000..b0cecc1702295834e3513eafa0b556b2645d1aee
--- /dev/null
+++ b/src/lightcone/lightcone_massmap_io.h
@@ -0,0 +1,72 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file lightcone_massmap_io.h
+ *
+ * \brief declares class used for I/O of lightcone mass projections
+ */
+
+#ifndef LIGHTCONE_MASSMAP_IO_H
+#define LIGHTCONE_MASSMAP_IO_H
+
+#include "gadgetconfig.h"
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_MASSMAPS)
+
+#include "../data/intposconvert.h"
+#include "../data/mmparticles.h"
+#include "../io/io.h"
+#include "../lightcone/lightcone.h"
+
+class lightcone_massmap_io : public IO_Def
+{
+ public:
+  lightcone_massmap_io(mmparticles *Mp_ptr, lightcone *LightCone_ptr, MPI_Comm comm, int format);
+
+  void lightcone_massmap_save(int num);
+
+  /* supplied virtual functions */
+  void fill_file_header(int writeTask, int lastTask, long long *nloc_part, long long *npart);
+  void read_file_header(const char *fname, int filenr, int readTask, int lastTask, long long *nloc_part, long long *npart,
+                        int *nstart);
+  void get_datagroup_name(int grnr, char *gname);
+  void write_header_fields(hid_t);
+  void read_header_fields(const char *fname);
+  void read_increase_numbers(int type, int n_for_this_task);
+  int get_filenr_from_header(void);
+  void set_filenr_in_header(int);
+  void *get_base_address_of_structure(enum arrays array, int index);
+  int get_type_of_element(int index);
+  void set_type_of_element(int index, int type);
+
+  /** Header for the standard file format.
+   */
+  struct io_header
+  {
+    int nside;      /* healpix nside */
+    int npix_local; /* number of pixels stored in this file */
+    int npix_total; /* total number of pixels of the full map */
+    int num_files;  /* number of files this map is split into */
+    double AscaleStart;  /* scale factor at the start of the mass-map shell */
+    double AscaleEnd;    /* scale factor at the end of the mass-map shell */
+    double ComDistStart; /* comoving distance corresponding to AscaleStart */
+    double ComDistEnd;   /* comoving distance corresponding to AscaleEnd */
+  };
+  io_header header; /**< holds header for snapshot files */
+
+ private:
+  mmparticles *Mp;
+  lightcone *LightCone;
+
+  int selected_bnd;
+
+  /*
+   * special input/output functions for certain fields
+   */
+};
+
+#endif
+#endif
diff --git a/src/lightcone/lightcone_particle_io.cc b/src/lightcone/lightcone_particle_io.cc
new file mode 100644
index 0000000000000000000000000000000000000000..75fdae9b3863ce613ee58f5353bac562d61f0a3a
--- /dev/null
+++ b/src/lightcone/lightcone_particle_io.cc
@@ -0,0 +1,498 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file lightcone_particle_io.cc
+ *
+ * \brief routines for I/O of lightcone particles
+ */
+
+#include "gadgetconfig.h"
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES)
+
+#include <gsl/gsl_rng.h>
+#include <hdf5.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../io/hdf5_util.h"
+#include "../io/io.h"
+#include "../lightcone/lightcone.h"
+#include "../lightcone/lightcone_particle_io.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../system/system.h"
+
+/*!
+ * \brief Function for field registering.
+ *
+ * For a description of the init_field() arguments, read the documentation of init_field().
+ * Don't forget to add the new IO_FLAG to io_private.h
+ */
+
+lightcone_particle_io::lightcone_particle_io(lcparticles *Lp_ptr, lightcone *LightCone_ptr, mergertree *MergerTree_ptr, MPI_Comm comm,
+                                             int format)
+    : IO_Def(comm, format)
+{
+  Lp         = Lp_ptr;
+  LightCone  = LightCone_ptr;
+  MergerTree = MergerTree_ptr;
+
+  this->N_IO_Fields  = 0;
+  this->N_DataGroups = NTYPES + 2;
+  /* the +1 data group is a tree table used only for storing reordered lightcone data,
+     the +2 data group is a coarse healpix table used only for storing normal lightcone data */
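+  /* in the HDF5 files these show up as the groups "/TreeTable" and "/HealPixHashTable",
+     next to the usual "/PartType<N>" groups (cf. get_datagroup_name()) */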
+
+  this->header_size  = sizeof(header);
+  this->header_buf   = &header;
+  this->type_of_file = FILE_IS_LIGHTCONE;
+  sprintf(this->info, "LIGHTCONE: writing particle lightcone data");
+
+  init_field("POS ", "Coordinates", MEM_MY_DOUBLE, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 3, A_LC, NULL, io_func_pos, ALL_TYPES, 1, 1.,
+             -1., 1., 0., 0., All.UnitLength_in_cm, true);
+
+#ifdef OUTPUT_VELOCITIES_IN_HALF_PRECISION
+  init_field("VEL ", "Velocities", MEM_MY_FLOAT, FILE_HALF, READ_IF_PRESENT, 3, A_LC, &Lp->P[0].Vel[0], NULL, ALL_TYPES, 1, 0.5, 0.,
+             0., 0., 1., All.UnitVelocity_in_cm_per_s);
+#else
+  init_field("VEL ", "Velocities", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 3, A_LC, &Lp->P[0].Vel[0], NULL, ALL_TYPES, 1, 0.5,
+             0., 0., 0., 1., All.UnitVelocity_in_cm_per_s);
+#endif
+
+#ifdef LIGHTCONE_OUTPUT_ACCELERATIONS
+#ifdef OUTPUT_ACCELERATIONS_IN_HALF_PRECISION
+  All.accel_normalize_fac = 10.0 * All.Hubble * (100.0 * 1.0e5 / All.UnitVelocity_in_cm_per_s);
+  init_field("ACCE", "Acceleration", MEM_MY_FLOAT, FILE_HALF, READ_IF_PRESENT, 3, A_LC, NULL, io_func_accel, ALL_TYPES, 1, -2.0, 1, -1,
+             0, 2, All.accel_normalize_fac * All.UnitVelocity_in_cm_per_s * All.UnitVelocity_in_cm_per_s / All.UnitLength_in_cm);
+#else
+  init_field("ACCE", "Acceleration", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 3, A_LC, &Lp->P[0].GravAccel[0], NULL, ALL_TYPES,
+             1, -2.0, 1, -1, 0, 2, All.UnitVelocity_in_cm_per_s * All.UnitVelocity_in_cm_per_s / All.UnitLength_in_cm);
+#endif
+#endif
+
+  init_field("ID  ", "ParticleIDs", MEM_MY_ID_TYPE, FILE_MY_ID_TYPE, READ_IF_PRESENT, 1, A_LC, NULL, io_func_id, ALL_TYPES, 0, 0, 0, 0,
+             0, 0, 0, true);
+
+#ifndef LEAN
+  init_field("MASS", "Masses", MEM_MY_DOUBLE, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_LC, NULL, io_func_mass, MASS_BLOCK, 1, 0., -1.,
+             0., 1., 0., All.UnitMass_in_g);
+#endif
+
+#ifndef LEAN
+  init_field("ASCL", "Ascale", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, READ_IF_PRESENT, 1, A_LC, &Lp->P[0].Ascale, NULL, ALL_TYPES, 0, 0, 0, 0,
+             0, 0, 0);
+#endif
+
+#ifdef REARRANGE_OPTION
+  init_field("MTRI", "TreeID", MEM_INT64, FILE_INT64, SKIP_ON_READ, 1, A_LC, &Lp->P[0].TreeID, NULL, ALL_TYPES, 0, 0, 0, 0, 0, 0, 0);
+#endif
+
+#if defined(SUBFIND) && defined(SUBFIND_STORE_LOCAL_DENSITY)
+  init_field("SFDE", "SubfindDensity", MEM_MY_FLOAT, All.RestartFlag != RST_CREATEICS ? FILE_MY_IO_FLOAT : FILE_NONE, SKIP_ON_READ, 1,
+             A_PS, &Lp->PS[0].SubfindDensity, 0, ALL_TYPES, /* subfind density */
+             1, -3., 2., -3., 1., 0., All.UnitDensity_in_cgs);
+
+  init_field("SFHS", "SubfindHsml", MEM_MY_FLOAT, All.RestartFlag != RST_CREATEICS ? FILE_MY_IO_FLOAT : FILE_NONE, SKIP_ON_READ, 1,
+             A_PS, &Lp->PS[0].SubfindHsml, 0, ALL_TYPES, /* subfind hsml */
+             1, 1., -1., 1., 0., 0., All.UnitLength_in_cm);
+
+  init_field("SFVD", "SubfindVelDisp", MEM_MY_FLOAT, All.RestartFlag != RST_CREATEICS ? FILE_MY_IO_FLOAT : FILE_NONE, SKIP_ON_READ, 1,
+             A_PS, &Lp->PS[0].SubfindVelDisp, 0, ALL_TYPES, /* subfind velocity dispersion */
+             1, 0., 0., 0., 0., 1., All.UnitVelocity_in_cm_per_s);
+#endif
+
+  init_field("MTRL", "ParticleCount", MEM_INT, FILE_INT, SKIP_ON_READ, 1, A_TT, &MergerTree->TreeTable[0].HaloCount, NULL, TREETABLE,
+             0, 0, 0, 0, 0, 0, 0, true);
+  init_field("MTRS", "ParticleFirst", MEM_INT64, FILE_INT64, SKIP_ON_READ, 1, A_TT, &MergerTree->TreeTable[0].FirstHalo, NULL,
+             TREETABLE, 0, 0, 0, 0, 0, 0, 0, true);
+  init_field("MTRI", "TreeID", MEM_INT64, FILE_INT64, SKIP_ON_READ, 1, A_TT, &MergerTree->TreeTable[0].TreeID, NULL, TREETABLE, 0, 0,
+             0, 0, 0, 0, 0, true);
+
+  init_field("HPHT", "ParticleCount", MEM_INT, FILE_INT, SKIP_ON_READ, 1, A_MM, &Lp->HealPixTab_PartCount[0], NULL, HEALPIXTAB, 0, 0,
+             0, 0, 0, 0, 0, true);
+}
+
+void lightcone_particle_io::lightcone_read(int num, int conenr)
+{
+  double t0 = Logs.second();
+
+  Lp->TotNumPart = 0;
+
+  char fname[2 * MAXLEN_PATH];
+
+  if(All.NumFilesPerSnapshot > 1)
+    sprintf(fname, "%s/lightcone_%02d/conedir_%04d/%s_%04d", All.OutputDir, conenr, num, "conesnap", num);
+  else
+    sprintf(fname, "%s/lightcone_%02d/%s_%04d", All.OutputDir, conenr, "conesnap", num);
+
+  int num_files = find_files(fname, fname);
+
+  reset_io_byte_count();
+
+  for(int rep = 0; rep < 2; rep++)
+    {
+      Lp->NumPart = 0;
+
+      read_files_driver(fname, rep, num_files);
+
+      /* now do the memory allocation */
+      if(rep == 0)
+        {
+          Lp->MaxPart = Lp->NumPart;
+          Lp->allocate_memory(); /* allocate lightcone particle storage */
+        }
+    }
+
+  MPI_Barrier(Communicator);
+
+  long long byte_count = get_io_byte_count(), byte_count_all;
+  sumup_longs(1, &byte_count, &byte_count_all, Communicator);
+
+  double t1 = Logs.second();
+
+  mpi_printf("LIGHTCONE-READ: reading done. Took %g sec, total size %g MB, corresponds to effective I/O rate of %g MB/sec\n",
+             Logs.timediff(t0, t1), byte_count_all / (1024.0 * 1024.0), byte_count_all / (1024.0 * 1024.0) / Logs.timediff(t0, t1));
+
+  mpi_printf("\nLIGHTCONE-READ: Total number of particles :  %lld\n\n", Lp->TotNumPart);
+}
+
+void lightcone_particle_io::lightcone_save(int num, int conenr, bool reordered_flag)
+{
+  char buf[3 * MAXLEN_PATH];
+
+  cone         = conenr; /* note: cone is here a variable of the class, NOT a local variable */
+  reorder_flag = reordered_flag;
+
+  /* determine local and global particle numbers for current lightcone */
+
+  int n_type[NTYPES];
+
+  /* determine global particle numbers for file header */
+  for(int n = 0; n < NTYPES; n++)
+    n_type[n] = 0;
+
+  for(int n = 0; n < Lp->NumPart; n++)
+    {
+      if(reorder_flag || LightCone->lightcone_is_cone_member(n, cone))
+        n_type[Lp->P[n].getType()]++;
+    }
+
+  sumup_large_ints(NTYPES, n_type, ntot_type_all, Communicator);
+
+  if(!reorder_flag)
+    {
+      /* prepare healpix look-up table */
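+      /* the coarse map has nside2npix(LIGHTCONE_ORDER_NSIDE) pixels, distributed evenly over the
+         tasks; for every locally owned pixel we count the lightcone particles whose coarse pixel
+         index (ipnest) falls into it, yielding the hash table stored alongside the particle data */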
+      Lp->Npix = nside2npix(LIGHTCONE_ORDER_NSIDE);
+
+      subdivide_evenly(Lp->Npix, NTask, ThisTask, &Lp->FirstPix, &Lp->NpixLoc);
+
+      Lp->HealPixTab_PartCount = (int *)Mem.mymalloc("HealPixTab_PartCount", Lp->NpixLoc * sizeof(int));
+
+      int *tmp_PartCount = (int *)Mem.mymalloc_clear("tmp_PartCount", Lp->Npix * sizeof(int));
+      for(int n = 0; n < Lp->NumPart; n++)
+        {
+          if(LightCone->lightcone_is_cone_member(n, cone))
+            tmp_PartCount[Lp->P[n].ipnest]++;
+        }
+
+      MPI_Allreduce(MPI_IN_PLACE, tmp_PartCount, Lp->Npix, MPI_INT, MPI_SUM, Communicator);
+
+      memcpy(Lp->HealPixTab_PartCount, tmp_PartCount + Lp->FirstPix, Lp->NpixLoc * sizeof(int));
+
+      Mem.myfree(tmp_PartCount);
+    }
+  else
+    Lp->Npix = Lp->NpixLoc = 0;
+
+  mpi_printf("\nLIGHTCONE: writing cone=%d\n", conenr);
+
+  char lname[MAXLEN_PATH];
+  if(reordered_flag)
+    sprintf(lname, "lightcone_treeorder");
+  else
+    sprintf(lname, "lightcone");
+
+  if(All.NumFilesPerSnapshot > 1)
+    {
+      if(ThisTask == 0)
+        {
+          char buf[3 * MAXLEN_PATH];
+          sprintf(buf, "%s/%s_%02d", All.OutputDir, lname, cone);
+          mkdir(buf, 02755);
+        }
+      MPI_Barrier(Communicator);
+    }
+
+  if(ThisTask == 0)
+    {
+      char buf[3 * MAXLEN_PATH];
+      sprintf(buf, "%s/%s_%02d/conedir_%04d", All.OutputDir, lname, cone, num);
+      mkdir(buf, 02755);
+    }
+  MPI_Barrier(Communicator);
+
+  if(All.NumFilesPerSnapshot > 1)
+    sprintf(buf, "%s/%s_%02d/conedir_%04d/%s_%04d", All.OutputDir, lname, cone, num, "conesnap", num);
+  else
+    sprintf(buf, "%s/%s_%02d/%s_%04d", All.OutputDir, lname, cone, "conesnap", num);
+
+  write_multiple_files(buf, All.NumFilesPerSnapshot);
+
+  if(!reorder_flag)
+    {
+      Mem.myfree(Lp->HealPixTab_PartCount);
+      Lp->HealPixTab_PartCount = NULL;
+    }
+}
+
+void lightcone_particle_io::fill_file_header(int writeTask, int lastTask, long long *n_type, long long *ntot_type)
+{
+  /* determine global and local particle numbers */
+  for(int n = 0; n < NTYPES + 2; n++)
+    n_type[n] = 0;
+
+  for(int n = 0; n < Lp->NumPart; n++)
+    if(reorder_flag || LightCone->lightcone_is_cone_member(n, cone))
+      if(Lp->P[n].getType() < NTYPES)
+        n_type[Lp->P[n].getType()]++;
+
+  n_type[NTYPES + 0] = MergerTree->Ntrees;
+  n_type[NTYPES + 1] = Lp->NpixLoc;
+
+  /* determine particle numbers of each type in file */
+  if(ThisTask == writeTask)
+    {
+      for(int n = 0; n < NTYPES + 2; n++)
+        ntot_type[n] = n_type[n];
+
+      for(int task = writeTask + 1; task <= lastTask; task++)
+        {
+          long long nn[NTYPES + 2];
+          MPI_Recv(&nn[0], NTYPES + 2, MPI_LONG_LONG, task, TAG_LOCALN, Communicator, MPI_STATUS_IGNORE);
+          for(int n = 0; n < NTYPES + 2; n++)
+            ntot_type[n] += nn[n];
+        }
+
+      for(int task = writeTask + 1; task <= lastTask; task++)
+        MPI_Send(&ntot_type[0], NTYPES + 2, MPI_LONG_LONG, task, TAG_N, Communicator);
+    }
+  else
+    {
+      MPI_Send(&n_type[0], NTYPES + 2, MPI_LONG_LONG, writeTask, TAG_LOCALN, Communicator);
+      MPI_Recv(&ntot_type[0], NTYPES + 2, MPI_LONG_LONG, writeTask, TAG_N, Communicator, MPI_STATUS_IGNORE);
+    }
+
+  /* fill file header */
+
+  for(int n = 0; n < NTYPES; n++)
+    {
+      header.npart[n]      = ntot_type[n];
+      header.npartTotal[n] = ntot_type_all[n];
+    }
+
+  if(reorder_flag)
+    {
+      header.Ntrees    = ntot_type[NTYPES];
+      header.TotNtrees = MergerTree->TotNtrees;
+
+      header.Npix    = 0;
+      header.TotNpix = 0;
+    }
+  else
+    {
+      header.Ntrees    = 0;
+      header.TotNtrees = 0;
+
+      header.Npix    = ntot_type[NTYPES + 1];
+      header.TotNpix = Lp->Npix;
+    }
+
+  header.num_files = All.NumFilesPerSnapshot;
+}
+
+void lightcone_particle_io::write_header_fields(hid_t handle)
+{
+  write_vector_attribute(handle, "NumPart_ThisFile", header.npart, H5T_NATIVE_UINT64, NTYPES);
+  write_vector_attribute(handle, "NumPart_Total", header.npartTotal, H5T_NATIVE_UINT64, NTYPES);
+  write_scalar_attribute(handle, "NumFiles", &header.num_files, H5T_NATIVE_INT);
+
+  if(header.TotNtrees > 0)
+    {
+      write_scalar_attribute(handle, "Ntrees_ThisFile", &header.Ntrees, H5T_NATIVE_UINT64);
+      write_scalar_attribute(handle, "Ntrees_Total", &header.TotNtrees, H5T_NATIVE_UINT64);
+    }
+
+  if(header.TotNpix > 0)
+    {
+      write_scalar_attribute(handle, "Npix_ThisFile", &header.Npix, H5T_NATIVE_UINT32);
+      write_scalar_attribute(handle, "Npix_Total", &header.TotNpix, H5T_NATIVE_UINT32);
+    }
+}
+
+void lightcone_particle_io::set_filenr_in_header(int numfiles) { header.num_files = numfiles; }
+
+void lightcone_particle_io::get_datagroup_name(int type, char *buf)
+{
+  if(type < NTYPES)
+    sprintf(buf, "/PartType%d", type);
+  else if(type == NTYPES)
+    sprintf(buf, "/TreeTable");
+  else if(type == NTYPES + 1)
+    sprintf(buf, "/HealPixHashTable");
+  else
+    Terminate("wrong group");
+}
+
+void *lightcone_particle_io::get_base_address_of_structure(enum arrays array, int index)
+{
+  switch(array)
+    {
+      case A_LC:
+        return (void *)(Lp->P + index);
+
+      case A_PS:
+        return (void *)(Lp->PS + index);
+
+      case A_TT:
+        return (void *)(MergerTree->TreeTable + index);
+
+      case A_MM:
+        return (void *)(Lp->HealPixTab_PartCount + index);
+
+      default:
+        Terminate("we don't expect to get here");
+    }
+
+  return NULL;
+}
+
+void lightcone_particle_io::read_file_header(const char *fname, int filenr, int readTask, int lastTask, long long *n_type,
+                                             long long *ntot_type, int *nstart)
+{
+  n_type[NTYPES]        = 0;
+  ntot_type[NTYPES]     = 0;
+  n_type[NTYPES + 1]    = 0;
+  ntot_type[NTYPES + 1] = 0;
+
+  if(Lp->TotNumPart == 0)
+    {
+      if(header.num_files <= 1)
+        for(int i = 0; i < NTYPES; i++)
+          header.npartTotal[i] = header.npart[i];
+
+      Lp->TotNumPart = 0;
+
+      for(int i = 0; i < NTYPES; i++)
+        Lp->TotNumPart += header.npartTotal[i];
+    }
+
+  if(ThisTask == readTask)
+    {
+      if(filenr == 0 && nstart == NULL)
+        {
+          for(int type = 0; type < NTYPES; type++)
+            {
+              mpi_printf("READ-LIGHTCONE: Type %d:         %8lld  (tot=%15lld)\n", type, (long long)header.npart[type],
+                         (long long)header.npartTotal[type]);
+            }
+          mpi_printf("\n");
+        }
+    }
+
+  /* to collect the gas particles all at the beginning (in case several
+     snapshot files are read on the current CPU) we move the collisionless
+     particles such that a gap of the right size is created */
+
+  long long nall = 0;
+  for(int type = 0; type < NTYPES; type++)
+    {
+      ntot_type[type] = header.npart[type];
+
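+      /* split the particles of this file evenly over the reading tasks; the first
+         (n_in_file % ntask) tasks receive one extra particle, e.g. 10 particles read
+         by 3 tasks are split as 4, 3, 3 (illustrative numbers) */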
+      long long n_in_file = header.npart[type];
+      int ntask           = lastTask - readTask + 1;
+      int n_for_this_task = n_in_file / ntask;
+      if((ThisTask - readTask) < (n_in_file % ntask))
+        n_for_this_task++;
+
+      n_type[type] = n_for_this_task;
+
+      nall += n_for_this_task;
+    }
+
+  if(nstart)
+    {
+      memmove(&Lp->P[nall], &Lp->P[0], Lp->NumPart * sizeof(lightcone_particle_data));
+
+      *nstart = 0;
+    }
+}
+
+void lightcone_particle_io::read_header_fields(const char *fname)
+{
+  for(int i = 0; i < NTYPES; i++)
+    {
+      header.npart[i]      = 0;
+      header.npartTotal[i] = 0;
+    }
+
+  header.Ntrees    = 0;
+  header.TotNtrees = 0;
+
+  int ntypes = NTYPES;
+
+  hid_t hdf5_file = my_H5Fopen(fname, H5F_ACC_RDONLY, H5P_DEFAULT);
+  hid_t handle    = my_H5Gopen(hdf5_file, "/Header");
+
+  /* check if the file in question actually has this number of types */
+  hid_t hdf5_attribute = my_H5Aopen_name(handle, "NumPart_ThisFile");
+  hid_t space          = H5Aget_space(hdf5_attribute);
+  hsize_t dims, len;
+  H5Sget_simple_extent_dims(space, &dims, &len);
+  H5Sclose(space);
+  if(len != (size_t)ntypes)
+    Terminate("Length of NumPart_ThisFile attribute (%d) does not match NTYPES(ICS) (%d)", (int)len, ntypes);
+  my_H5Aclose(hdf5_attribute, "NumPart_ThisFile");
+
+  /* now read the header fields */
+  read_vector_attribute(handle, "NumPart_ThisFile", header.npart, H5T_NATIVE_UINT64, ntypes);
+  read_vector_attribute(handle, "NumPart_Total", header.npartTotal, H5T_NATIVE_UINT64, ntypes);
+  read_scalar_attribute(handle, "NumFiles", &header.num_files, H5T_NATIVE_INT);
+
+  my_H5Gclose(handle, "/Header");
+  my_H5Fclose(hdf5_file, fname);
+}
+
+void lightcone_particle_io::read_increase_numbers(int type, int n_for_this_task) { Lp->NumPart += n_for_this_task; }
+
+int lightcone_particle_io::get_filenr_from_header(void) { return header.num_files; }
+
+int lightcone_particle_io::get_type_of_element(int index)
+{
+  if(index < 0 || index >= Lp->NumPart)
+    Terminate("index = %d  Lp->NumPart=%d", index, Lp->NumPart);
+
+  if(reorder_flag || LightCone->lightcone_is_cone_member(index, cone))
+    return Lp->P[index].getType();
+  else
+    return -1; /* this will skip this particle */
+}
+
+void lightcone_particle_io::set_type_of_element(int index, int type)
+{
+  if(type < NTYPES)
+    Lp->P[index].setType(type);
+}
+
+#endif
diff --git a/src/lightcone/lightcone_particle_io.h b/src/lightcone/lightcone_particle_io.h
new file mode 100644
index 0000000000000000000000000000000000000000..da81b7949dadf34d82ee55938ee02ef2211407f1
--- /dev/null
+++ b/src/lightcone/lightcone_particle_io.h
@@ -0,0 +1,164 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file lightcone_particle_io.h
+ *
+ * \brief declares class for I/O of lightcone particles
+ */
+
+#ifndef LIGHTCONE_IO_H
+#define LIGHTCONE_IO_H
+
+#include "gadgetconfig.h"
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES)
+
+#include "../data/intposconvert.h"
+#include "../data/lcparticles.h"
+#include "../io/io.h"
+#include "../lightcone/lightcone.h"
+#include "../mergertree/mergertree.h"
+
+class lightcone_particle_io : public IO_Def
+{
+ public:
+  lightcone_particle_io(lcparticles *Lp_ptr, lightcone *LightCone_ptr, mergertree *MergerTree_ptr, MPI_Comm comm, int format);
+
+  void lightcone_save(int num, int conenr, bool reordered_flag);
+  void lightcone_read(int num, int conenr);
+
+  /* supplied virtual functions */
+  void fill_file_header(int writeTask, int lastTask, long long *nloc_part, long long *npart);
+  void read_file_header(const char *fname, int filenr, int readTask, int lastTask, long long *nloc_part, long long *npart,
+                        int *nstart);
+  void get_datagroup_name(int grnr, char *gname);
+  void write_header_fields(hid_t);
+  void read_header_fields(const char *fname);
+  void read_increase_numbers(int type, int n_for_this_task);
+  int get_filenr_from_header(void);
+  void set_filenr_in_header(int);
+  void *get_base_address_of_structure(enum arrays array, int index);
+  int get_type_of_element(int index);
+  void set_type_of_element(int index, int type);
+
+  /** Header for the standard file format.
+   */
+  struct io_header
+  {
+    long long npart[NTYPES]; /**< number of particles of each type in this file */
+    long long npartTotal[NTYPES]; /**< total number of particles of each type in this lightcone output */
+
+    long long Ntrees;    /**< number of merger-tree table entries in this file (tree-ordered output only) */
+    long long TotNtrees; /**< total number of merger trees (tree-ordered output only) */
+
+    int Npix;    /**< number of coarse healpix hash-table entries in this file (normal lightcone output only) */
+    int TotNpix; /**< total number of coarse healpix pixels (normal lightcone output only) */
+
+    int num_files; /**< number of files this output is split into */
+  };
+  io_header header; /**< holds header for snapshot files */
+
+ private:
+  lcparticles *Lp;
+  lightcone *LightCone;
+  mergertree *MergerTree;
+
+  int cone;
+  bool reorder_flag;
+  long long ntot_type_all[NTYPES];
+
+  /*
+   * special input/output functions for certain fields
+   */
+
+  static void io_func_pos(IO_Def *ptr, int particle, int components, void *buffer, int mode)
+  {
+    /* note: we know that components==3 here */
+    lightcone_particle_io *thisobj = (lightcone_particle_io *)ptr;
+
+    if(mode == 0)
+      {
+        MyDouble *out_buffer = (MyDouble *)buffer;
+
+        double xyz[3];
+        thisobj->Lp->signedintpos_to_pos((MySignedIntPosType *)thisobj->Lp->P[particle].IntPos, xyz);
+
+        for(int k = 0; k < 3; k++)
+          out_buffer[k] = xyz[k];
+      }
+    else
+      {
+        MyDouble *in_buffer = (MyDouble *)buffer;
+
+        /* note: for non-periodic positions, the conversion to integer coordinates is undefined only after the initial read.
+         * We therefore store the coordinates first in a temporary array */
+
+        double xyz[3];
+
+        for(int k = 0; k < 3; k++)
+          xyz[k] = in_buffer[k];
+
+        /* converts floating point representation to integers */
+        thisobj->Lp->pos_to_signedintpos(xyz, (MySignedIntPosType *)thisobj->Lp->P[particle].IntPos);
+      }
+  }
+
+#if defined(LIGHTCONE_OUTPUT_ACCELERATIONS) && defined(OUTPUT_ACCELERATIONS_IN_HALF_PRECISION)
+  static void io_func_accel(IO_Def *ptr, int particle, int components, void *buffer, int mode)
+  {
+    lightcone_particle_io *thisobj = (lightcone_particle_io *)ptr;
+
+    if(mode == 0)  // writing
+      {
+        MyFloat *out_buffer = (MyFloat *)buffer;
+        for(int k = 0; k < 3; k++)
+          out_buffer[k] = thisobj->Lp->P[particle].GravAccel[k] / All.accel_normalize_fac;
+      }
+    else  // reading
+      {
+        MyFloat *in_buffer = (MyFloat *)buffer;
+        for(int k = 0; k < components; k++)
+          thisobj->Lp->P[particle].GravAccel[k] = All.accel_normalize_fac * in_buffer[k];
+      }
+  }
+#endif
+
+  static void io_func_id(IO_Def *ptr, int particle, int components, void *buffer, int mode)
+  {
+    lightcone_particle_io *thisobj = (lightcone_particle_io *)ptr;
+
+    if(mode == 0)
+      {
+        MyIDType *out_buffer = (MyIDType *)buffer;
+        out_buffer[0]        = thisobj->Lp->P[particle].ID.get();
+      }
+    else
+      {
+        MyIDType *in_buffer = (MyIDType *)buffer;
+        thisobj->Lp->P[particle].ID.set(in_buffer[0]);
+      }
+  }
+
+  static void io_func_mass(IO_Def *ptr, int particle, int components, void *buffer, int mode)
+  {
+    lightcone_particle_io *thisobj = (lightcone_particle_io *)ptr;
+
+    if(mode == 0)
+      {
+        MyDouble *out_buffer = (MyDouble *)buffer;
+        out_buffer[0]        = thisobj->Lp->P[particle].getMass();
+      }
+    else
+      {
+        MyDouble *in_buffer = (MyDouble *)buffer;
+        thisobj->Lp->P[particle].setMass(in_buffer[0]);
+      }
+  }
+};
+
+#endif
+
+#endif /* LIGHTCONE_IO_H */
diff --git a/src/logs/logs.cc b/src/logs/logs.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3729977e4c1763aaa68ae7403ace8f8913c0c27f
--- /dev/null
+++ b/src/logs/logs.cc
@@ -0,0 +1,708 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file logs.cc
+ *
+ *  \brief routines for log-file handling
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../logs/logs.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+
+/*! \brief Open files for logging.
+ *
+ *   This function opens various log-files that report on the status and
+ *   performance of the simulation. Upon restart, the code will append to
+ *   these files.
+ */
+void logs::open_logfiles(void)
+{
+  char mode[2], buf[MAXLEN_PATH_EXTRA];
+
+  if(All.RestartFlag == RST_BEGIN)
+    strcpy(mode, "w");
+  else
+    strcpy(mode, "a");
+
+  if(ThisTask != 0) /* only the root processor writes to the log files */
+    return;
+
+  sprintf(buf, "%s%s", All.OutputDir, "cpu.txt");
+  if(!(FdCPU = fopen(buf, mode)))
+    Terminate("error in opening file '%s'\n", buf);
+
+  sprintf(buf, "%s%s", All.OutputDir, "cpu.csv");
+  if(!(FdCPUCSV = fopen(buf, mode)))
+    Terminate("error in opening file '%s'\n", buf);
+
+  sprintf(buf, "%s%s", All.OutputDir, "info.txt");
+  if(!(FdInfo = fopen(buf, mode)))
+    Terminate("error in opening file '%s'\n", buf);
+
+  sprintf(buf, "%s%s", All.OutputDir, "energy.txt");
+  if(!(FdEnergy = fopen(buf, mode)))
+    Terminate("error in opening file '%s'\n", buf);
+
+  sprintf(buf, "%s%s", All.OutputDir, "timings.txt");
+  if(!(FdTimings = fopen(buf, mode)))
+    Terminate("error in opening file '%s'\n", buf);
+
+  sprintf(buf, "%s%s", All.OutputDir, "density.txt");
+  if(!(FdDensity = fopen(buf, mode)))
+    Terminate("error in opening file '%s'\n", buf);
+
+  sprintf(buf, "%s%s", All.OutputDir, "hydro.txt");
+  if(!(FdHydro = fopen(buf, mode)))
+    Terminate("error in opening file '%s'\n", buf);
+
+  sprintf(buf, "%s%s", All.OutputDir, "balance.txt");
+  if(!(FdBalance = fopen(buf, mode)))
+    Terminate("error in opening file '%s'\n", buf);
+
+  sprintf(buf, "%s%s", All.OutputDir, "timebins.txt");
+  if(!(FdTimebin = fopen(buf, mode)))
+    Terminate("error in opening file '%s'\n", buf);
+
+  sprintf(buf, "%s%s", All.OutputDir, "domain.txt");
+  if(!(FdDomain = fopen(buf, mode)))
+    Terminate("error in opening file '%s'\n", buf);
+
+#ifdef MEASURE_TOTAL_MOMENTUM
+  sprintf(buf, "%s%s", All.OutputDir, "momentum.txt");
+  if(!(FdMomentum = fopen(buf, mode)))
+    Terminate("error in opening file '%s'\n", buf);
+#endif
+
+#ifdef FORCETEST
+  sprintf(buf, "%s%s", All.OutputDir, "forcetest.txt");
+  if(!(FdForceTest = fopen(buf, mode)))
+    Terminate("error in opening file '%s'\n", buf);
+
+  fclose(FdForceTest);
+#endif
+
+#ifdef DEBUG_MD5
+  sprintf(buf, "%s%s", All.OutputDir, "debug_md5.txt");
+  if(!(FdDebug = fopen(buf, mode)))
+    Terminate("error in opening file '%s'\n", buf);
+#endif
+
+  fprintf(FdBalance, "\n");
+
+  fprintf(FdCPUCSV, "STEP, TIME, CPUS, MULTIPLEDOMAIN, HIGHESTTIMEBIN, ");
+
+  for(int i = 0; i < CPU_LAST; i++)
+    {
+      if(Timer_data[i].symb != 0 && Timer_data[i].symbImbal != 0)
+        fprintf(FdBalance, "%-20s = '%c' / '%c'\n", Timer_data[i].longname, Timer_data[i].symb, Timer_data[i].symbImbal);
+
+      fprintf(FdCPUCSV, "%s1, %s2, %s3, ", Timer_data[i].shortname, Timer_data[i].shortname, Timer_data[i].shortname);
+    }
+  fprintf(FdBalance, "\n");
+
+  fprintf(FdCPUCSV, "\n");
+
+#ifdef STARFORMATION
+  sprintf(buf, "%s%s", All.OutputDir, "sfr.txt");
+  if(!(FdSfr = fopen(buf, mode)))
+    Terminate("error in opening file '%s'\n", buf);
+#endif
+}
+
+int logs::flush_everything(void)
+{
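+  /* note: if REDUCE_FLUSH is not enabled, this routine is a no-op; otherwise task 0 flushes all
+     log files here, but at most once every All.FlushCpuTimeDiff seconds of accumulated CPU time */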
+#ifndef REDUCE_FLUSH
+  return 0;
+#else
+  if(ThisTask == 0)
+    {
+      if((CPUThisRun - All.FlushLast) < All.FlushCpuTimeDiff)
+        {
+          return 0;
+        }
+      else
+        {
+          All.FlushLast = CPUThisRun;
+        }
+    }
+  else
+    {
+      return 0;
+    }
+#endif
+
+  mpi_printf("Flushing...\n");
+
+  fflush(FdDomain);
+  fflush(FdTimings);
+  fflush(FdInfo);
+  fflush(FdTimebin);
+  fflush(FdBalance);
+  fflush(FdCPU);
+  fflush(FdEnergy);
+  fflush(FdCPUCSV);
+
+#ifdef STARFORMATION
+  fflush(FdSfr);
+#endif
+
+  return 1;
+}
+
+/*! \brief Write the FdInfo and FdTimeBin files.
+ *
+ * At each time step this function writes on to two log-files.
+ * In FdInfo, it just lists the timesteps that have been done, while in
+ * FdTimeBin it outputs information about the active and occupied time-bins.
+ */
+void logs::output_log_messages(void)
+{
+  TIMER_START(CPU_LOGS);
+
+  long long tot_count_grav[TIMEBINS], tot_count_sph[TIMEBINS];
+  sumup_large_ints(TIMEBINS, Sp->TimeBinsGravity.TimeBinCount, tot_count_grav, Communicator);
+  sumup_large_ints(TIMEBINS, Sp->TimeBinsHydro.TimeBinCount, tot_count_sph, Communicator);
+
+  Mem.report_detailed_memory_usage_of_largest_task();
+
+  if(ThisTask == 0)
+    {
+      if(All.ComovingIntegrationOn)
+        {
+          double z = 1.0 / (All.Time) - 1;
+          fprintf(FdInfo, "\nSync-Point %d, Time: %g, Redshift: %g, Systemstep: %g, Dloga: %g, Nsync-grv: %10llu, Nsync-hyd: %10llu\n",
+                  All.NumCurrentTiStep, All.Time, z, All.TimeStep, log(All.Time) - log(All.Time - All.TimeStep),
+                  All.GlobalNSynchronizedGravity, All.GlobalNSynchronizedHydro);
+          printf("\n\nSync-Point %d, Time: %g, Redshift: %g, Systemstep: %g, Dloga: %g, Nsync-grv: %10llu, Nsync-hyd: %10llu\n",
+                 All.NumCurrentTiStep, All.Time, z, All.TimeStep, log(All.Time) - log(All.Time - All.TimeStep),
+                 All.GlobalNSynchronizedGravity, All.GlobalNSynchronizedHydro);
+          fprintf(FdTimebin, "\nSync-Point %d, Time: %g, Redshift: %g, Systemstep: %g, Dloga: %g\n", All.NumCurrentTiStep, All.Time, z,
+                  All.TimeStep, log(All.Time) - log(All.Time - All.TimeStep));
+          myflush(FdInfo);
+        }
+      else
+        {
+          fprintf(FdInfo, "\nSync-Point %d, Time: %g, Systemstep: %g, Nsync-grv: %10llu, Nsync-hyd: %10llu\n", All.NumCurrentTiStep,
+                  All.Time, All.TimeStep, All.GlobalNSynchronizedGravity, All.GlobalNSynchronizedHydro);
+          printf("\n\nSync-Point %d, Time: %g, Systemstep: %g, Nsync-grv: %10llu, Nsync-hyd: %10llu\n", All.NumCurrentTiStep, All.Time,
+                 All.TimeStep, All.GlobalNSynchronizedGravity, All.GlobalNSynchronizedHydro);
+          fprintf(FdTimebin, "\nSync-Point %d, Time: %g, Systemstep: %g\n", All.NumCurrentTiStep, All.Time, All.TimeStep);
+          myflush(FdInfo);
+        }
+
+      long long tot_cumulative_grav[TIMEBINS], tot_cumulative_sph[TIMEBINS];
+      tot_cumulative_grav[0] = tot_count_grav[0];
+      tot_cumulative_sph[0]  = tot_count_sph[0];
+
+      for(int i = 1; i < TIMEBINS; i++)
+        {
+          tot_cumulative_grav[i] = tot_count_grav[i] + tot_cumulative_grav[i - 1];
+          tot_cumulative_sph[i]  = tot_count_sph[i] + tot_cumulative_sph[i - 1];
+        }
+
+      double avg_CPU_TimeBin[TIMEBINS];
+
+      for(int i = 0; i < TIMEBINS; i++)
+        {
+          double sum = 0;
+          for(int j = 0; j < All.CPU_TimeBinCountMeasurements[i]; j++)
+            sum += All.CPU_TimeBinMeasurements[i][j];
+
+          if(All.CPU_TimeBinCountMeasurements[i])
+            avg_CPU_TimeBin[i] = sum / All.CPU_TimeBinCountMeasurements[i];
+          else
+            avg_CPU_TimeBin[i] = 0;
+        }
+
+      int weight = 1;
+      double sum = 0;
+      double frac_CPU_TimeBin[TIMEBINS];
+
+      for(int i = All.HighestOccupiedTimeBin; i >= 0 && tot_count_grav[i] > 0; i--, weight *= 2)
+        {
+          int corr_weight;
+
+          if(weight > 1)
+            corr_weight = weight / 2;
+          else
+            corr_weight = weight;
+
+          frac_CPU_TimeBin[i] = corr_weight * avg_CPU_TimeBin[i];
+          sum += frac_CPU_TimeBin[i];
+        }
+
+      for(int i = All.HighestOccupiedTimeBin; i >= 0 && tot_count_grav[i] > 0; i--)
+        {
+          if(sum)
+            frac_CPU_TimeBin[i] /= sum;
+        }
+
+      fprintf(FdTimebin,
+              "Occupied timebins: gravity         sph          dt              cumul-grav   cumul-sph A D    avg-time  cpu-frac\n");
+
+      long long tot_grav = 0, tot_sph = 0;
+      for(int i = TIMEBINS - 1; i >= 0; i--)
+        if(tot_count_sph[i] > 0 || tot_count_grav[i] > 0)
+          {
+            fprintf(
+                FdTimebin, " %c  bin=%2d      %10llu  %10llu   %16.12f       %10llu  %10llu %c %c  %10.2f    %5.1f%%\n",
+                Sp->TimeBinSynchronized[i] ? 'X' : ' ', i, tot_count_grav[i], tot_count_sph[i],
+                i > 0 ? (((integertime)1) << i) * All.Timebase_interval : 0.0, tot_cumulative_grav[i], tot_cumulative_sph[i],
+                (i == All.HighestActiveTimeBin) ? '<' : ' ',
+                (All.HighestActiveTimeBin >= All.SmallestTimeBinWithDomainDecomposition && i == All.HighestActiveTimeBin) ? '*' : ' ',
+                avg_CPU_TimeBin[i], 100.0 * frac_CPU_TimeBin[i]);
+
+            if(Sp->TimeBinSynchronized[i])
+              {
+                tot_grav += tot_count_grav[i];
+                tot_sph += tot_count_sph[i];
+              }
+          }
+      fprintf(FdTimebin, "               ------------------------\n");
+#if defined(PMGRID) && !defined(TREEPM_NOTIMESPLIT)
+      if(All.PM_Ti_endstep == All.Ti_Current)
+        {
+          fprintf(FdTimebin, "PM-Step. Total: %10llu  %10llu\n", tot_grav, tot_sph);
+        }
+      else
+#endif
+        {
+          fprintf(FdTimebin, "Total active:   %10llu  %10llu\n", tot_grav, tot_sph);
+        }
+      fprintf(FdTimebin, "\n");
+      myflush(FdTimebin);
+    }
+
+  TIMER_STOP(CPU_LOGS);
+}
+
+void logs::init_cpu_log(simparticles *Sp_ptr)
+{
+  Sp = Sp_ptr;
+
+  for(int i = 0; i < CPU_LAST; i++)
+    {
+      if(Timer_data[i].parent >= 0)
+        Timer_data[i].depth = Timer_data[Timer_data[i].parent].depth + 1;
+      else
+        Timer_data[i].depth = 0;
+    }
+
+  for(int i = 0; i < CPU_LAST; i++)
+    {
+      CPU_Sum[i]  = 0.;
+      CPU_Step[i] = 0.;
+    }
+  CPUThisRun = 0.;
+
+  TimerStackPos = 0;
+  TimerStack[0] = CPU_MISC;
+
+  WallclockTime = Logs.second();
+  StartOfRun    = Logs.second();
+}
+
+/*! \brief Write the FdBalance and FdCPU files.
+ *
+ * At each time step this function writes on to two log-files.
+ * In FdBalance, it outputs in a graphical way the amount of
+ * time spent in the various parts of the code, while
+ * in FdCPU it writes information about the cpu-time consumption
+ * of the various modules.
+ */
+void logs::write_cpu_log(void)
+{
+  TIMER_START(CPU_LOGS);
+
+  double local_total = 0;
+  for(int i = 0; i < CPU_LAST; i++)
+    local_total += CPU_Step[i];
+
+  double max_total = 0;
+  MPI_Reduce(&local_total, &max_total, 1, MPI_DOUBLE, MPI_MAX, 0, Communicator);
+
+  double max_CPU_Step[CPU_LAST], avg_CPU_Step[CPU_LAST];
+  MPI_Reduce(CPU_Step, max_CPU_Step, CPU_LAST, MPI_DOUBLE, MPI_MAX, 0, Communicator);
+  MPI_Reduce(CPU_Step, avg_CPU_Step, CPU_LAST, MPI_DOUBLE, MPI_SUM, 0, Communicator);
+
+  if(ThisTask == 0)
+    {
+      double summed_CPU_Step[CPU_LAST];
+
+      /* sum up cpu items into groups */
+      for(int i = 0; i < CPU_LAST; i++)
+        summed_CPU_Step[i] = avg_CPU_Step[i];
+
+      for(int i = CPU_LAST - 1; i > CPU_ALL; i--)
+        if(Timer_data[i].parent >= 0)
+          summed_CPU_Step[Timer_data[i].parent] += summed_CPU_Step[i];
+
+      /* calc averages, update CPU_Sum */
+      double avg_total = 0;
+      for(int i = 0; i < CPU_LAST; i++)
+        {
+          avg_CPU_Step[i] /= NTask;
+          avg_total += avg_CPU_Step[i];
+
+          summed_CPU_Step[i] /= NTask;
+          CPU_Sum[i] += summed_CPU_Step[i];
+        }
+
+      /* create balance.txt string */
+      char cpu_String[CPU_STRING_LEN + 1];
+      put_symbol(cpu_String, 0., 1.0, '-');
+
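+      /* for every timer, a stretch with total length proportional to its average time is drawn:
+         the fraction avg/max of it with the timer's 'symb' character, the remainder (reflecting
+         the imbalance between the slowest and the average task) with 'symbImbal' */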
+      double tsum = 0.0;
+      for(int i = 1; i < CPU_LAST; i++)
+        {
+          if(max_CPU_Step[i] > 0 && Timer_data[i].symb != 0 && Timer_data[i].symbImbal != 0)
+            {
+              double t0 = tsum;
+              double t1 = tsum + avg_CPU_Step[i] * (avg_CPU_Step[i] / max_CPU_Step[i]);
+              put_symbol(cpu_String, t0 / avg_total, t1 / avg_total, Timer_data[i].symb);
+              tsum += t1 - t0;
+
+              t0 = tsum;
+              t1 = tsum + avg_CPU_Step[i] * ((max_CPU_Step[i] - avg_CPU_Step[i]) / max_CPU_Step[i]);
+              put_symbol(cpu_String, t0 / avg_total, t1 / avg_total, Timer_data[i].symbImbal);
+              tsum += t1 - t0;
+            }
+        }
+
+      // put_symbol(cpu_String, tsum / max_total, 1.0, '-');
+      fprintf(FdBalance, "Step=%7d  sec=%10.3f Nsync-grv=%10llu Nsync-hyd=%10llu  %s\n", All.NumCurrentTiStep, max_total,
+              All.GlobalNSynchronizedGravity, All.GlobalNSynchronizedHydro, cpu_String);
+      myflush(FdBalance);
+
+      if(All.CPU_TimeBinCountMeasurements[All.HighestActiveTimeBin] == NUMBER_OF_MEASUREMENTS_TO_RECORD)
+        {
+          All.CPU_TimeBinCountMeasurements[All.HighestActiveTimeBin]--;
+          memmove(&All.CPU_TimeBinMeasurements[All.HighestActiveTimeBin][0], &All.CPU_TimeBinMeasurements[All.HighestActiveTimeBin][1],
+                  (NUMBER_OF_MEASUREMENTS_TO_RECORD - 1) * sizeof(double));
+        }
+
+      All.CPU_TimeBinMeasurements[All.HighestActiveTimeBin][All.CPU_TimeBinCountMeasurements[All.HighestActiveTimeBin]++] = max_total;
+
+      fprintf(FdCPUCSV, "%d, %g, %d, %d, ", All.NumCurrentTiStep, All.Time, NTask, All.HighestActiveTimeBin);
+
+      fprintf(FdCPU, "Step %d, Time: %g, CPUs: %d, HighestActiveTimeBin: %d\n", All.NumCurrentTiStep, All.Time, NTask,
+              All.HighestActiveTimeBin);
+      fprintf(FdCPU, "                          diff                 cumulative\n");
+
+      for(int i = 0; i < CPU_LAST; i++)
+        {
+          fprintf(FdCPU, "%*s%*s%10.2f  %5.1f%% %10.2f  %5.1f%%\n", 2 * Timer_data[i].depth, "", -20 + 2 * Timer_data[i].depth,
+                  Timer_data[i].longname, summed_CPU_Step[i], summed_CPU_Step[i] / summed_CPU_Step[CPU_ALL] * 100., CPU_Sum[i],
+                  CPU_Sum[i] / CPU_Sum[CPU_ALL] * 100.);
+          fprintf(FdCPUCSV, "%f, %f, %f, ", summed_CPU_Step[i], CPU_Sum[i], CPU_Sum[i] / CPU_Sum[CPU_ALL] * 100.);
+        }
+
+      fprintf(FdCPU, "\n");
+      myflush(FdCPU);
+
+      fprintf(FdCPUCSV, "\n");
+      myflush(FdCPUCSV);
+    }
+
+  CPUThisRun = Logs.timediff(StartOfRun, Logs.second());
+
+  for(int i = 0; i < CPU_LAST; i++)
+    CPU_Step[i] = 0.;
+
+  TIMER_STOP(CPU_LOGS);
+}
+
+/*! \brief Fill the cpu balance string representing the cpu usage in a graphical way
+ *
+ * This function fills a fraction, specified by the parameters t0 and t1,
+ * of the array string with the debug symbol given by c.
+ *
+ * \param string string to fill
+ * \param t0 initial position of the symbol in the array as a fraction of its maximum dimension
+ * \param t1 final position of the symbol in the array as a fraction of its maximum dimension
+ * \param c symbol to be put on string
+ */
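+/* e.g. (illustrative) put_symbol(buf, 0.25, 0.5, 'g') fills roughly the second quarter of the
+   120-character balance string with 'g' */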
+void logs::put_symbol(char *string, double t0, double t1, char c)
+{
+  int i = (int)(t0 * CPU_STRING_LEN + 0.5);
+  int j = (int)(t1 * CPU_STRING_LEN);
+
+  if(i < 0)
+    i = 0;
+  if(j >= CPU_STRING_LEN)
+    j = CPU_STRING_LEN;
+
+  while(i <= j)
+    string[i++] = c;
+
+  string[CPU_STRING_LEN] = 0;
+}
+
+double logs::measure_time(void) /* strategy: call this at end of functions to account for time in this function, and before another
+                                   (nontrivial) function is called */
+{
+  double t      = Logs.second();
+  double dt     = t - WallclockTime;
+  WallclockTime = t;
+
+  return dt;
+}
+
+/* returns the number of cpu-ticks in seconds that
+ * have elapsed. (or the wall-clock time)
+ */
+double logs::second(void)
+{
+  return MPI_Wtime();
+
+  /*
+   * possible alternative:
+   *
+   * return ((double) clock()) / CLOCKS_PER_SEC;
+   *
+   * but note: on AIX and presumably many other 32bit systems,
+   * clock() has only a resolution of 10ms=0.01sec
+   */
+}
+
+/* returns the time difference between two measurements
+ * obtained with Logs.second().
+ */
+double logs::timediff(double t0, double t1)
+{
+  double dt = t1 - t0;
+
+  if(dt < 0)
+    dt = 0;
+
+  return dt;
+}
+
+/*! \brief Computes new global statistics if needed (done by energy_statistics())
+ *
+ */
+void logs::compute_statistics(void)
+{
+  /* check whether we want a full energy statistics */
+  if((All.Time - All.TimeLastStatistics) >= All.TimeBetStatistics &&
+     All.HighestActiveTimeBin == All.HighestOccupiedTimeBin) /* allow only top-level synchronization points */
+    {
+      compute_global_quantities_of_system();
+
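+      /* energy.txt layout: time, total internal/potential/kinetic energy, followed by the
+         (internal, potential, kinetic) energies of each particle type, and finally the mass
+         contained in each type */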
+      if(ThisTask == 0)
+        {
+          fprintf(FdEnergy, "%14.8g %14.8g %14.8g %14.8g", All.Time, SysState.EnergyInt, SysState.EnergyPot, SysState.EnergyKin);
+
+          for(int i = 0; i < NTYPES; i++)
+            fprintf(FdEnergy, " %14.8g %14.8g %14.8g", SysState.EnergyIntComp[i], SysState.EnergyPotComp[i],
+                    SysState.EnergyKinComp[i]);
+
+          for(int i = 0; i < NTYPES; i++)
+            fprintf(FdEnergy, " %14.8g", SysState.MassComp[i]);
+
+          fprintf(FdEnergy, "\n");
+
+          myflush(FdEnergy);
+        }
+
+      All.TimeLastStatistics += All.TimeBetStatistics;
+    }
+}
+
+#ifdef MEASURE_TOTAL_MOMENTUM
+void logs::compute_total_momentum(void)
+{
+  double mom[3] = {0, 0, 0};
+
+  for(int i = 0; i < Sp->NumPart; i++)
+    {
+      for(int j = 0; j < 3; j++)
+        mom[j] += Sp->P[i].getMass() * Sp->P[i].Vel[j];
+    }
+
+  // sum the momenta over all processors
+  double momsum[3] = {0, 0, 0};
+  MPI_Reduce(mom, momsum, 3, MPI_DOUBLE, MPI_SUM, 0, Communicator);
+
+  if(ThisTask == 0)
+    {
+      fprintf(FdMomentum, "%14.8g   %25.15g %25.15g %25.15g\n", All.Time, momsum[0], momsum[1], momsum[2]);
+
+      myflush(FdMomentum);
+    }
+}
+#endif
+
+/*! \brief This routine computes various global properties of the particle
+ * distribution and stores the result in the struct `SysState'.
+ *
+ * Currently, not all the information that's computed here is
+ * actually used (e.g. momentum is not really used anywhere),
+ * just the energies are written to a log-file every once in a while.
+ */
+void logs::compute_global_quantities_of_system(void)
+{
+  state_of_system sys;
+  double egyspec;
+
+  particle_data *P = Sp->P;
+  // sph_particle_data *SphP = Sp->SphP;
+
+  All.set_cosmo_factors_for_current_time();
+
+  for(int n = 0; n < NTYPES; n++)
+    {
+      sys.MassComp[n] = sys.EnergyKinComp[n] = sys.EnergyPotComp[n] = sys.EnergyIntComp[n] = 0;
+
+      for(int j = 0; j < 4; j++)
+        sys.CenterOfMassComp[n][j] = sys.MomentumComp[n][j] = sys.AngMomentumComp[n][j] = 0;
+    }
+
+  for(int i = 0; i < Sp->NumPart; i++)
+    {
+      sys.MassComp[P[i].getType()] += Sp->P[i].getMass();
+
+#if defined(SELFGRAVITY) && defined(EVALPOTENTIAL)
+      sys.EnergyPotComp[P[i].getType()] += 0.5 * Sp->P[i].getMass() * P[i].Potential / All.cf_atime;
+#endif
+
+#if defined(EXTERNALGRAVITY) && defined(EVALPOTENTIAL)
+#if defined(SELFGRAVITY)
+      sys.EnergyPotComp[P[i].getType()] +=
+          0.5 * P[i].getMass() * P[i].ExtPotential;  // note: ExtPotential already included on P[].p.Potential, that's why only 0.5 is
+                                                     // needed here to recover the rest
+#else
+      sys.EnergyPotComp[P[i].getType()] += 1.0 * P[i].getMass() * P[i].ExtPotential;
+#endif
+#endif
+
+      double vel[3] = {0, 0, 0};
+
+      if(P[i].getType() == 0)
+        {
+          for(int j = 0; j < 3; j++)
+            vel[j] = P[i].Vel[j];
+
+          sys.EnergyKinComp[0] += 0.5 * Sp->P[i].getMass() * (vel[0] * vel[0] + vel[1] * vel[1] + vel[2] * vel[2]);
+
+          egyspec = Sp->get_utherm_from_entropy(i);
+
+          sys.EnergyIntComp[0] += Sp->P[i].getMass() * egyspec;
+        }
+#if(NTYPES > 1)
+      else
+        {
+          for(int j = 0; j < 3; j++)
+            vel[j] = P[i].Vel[j];
+
+          sys.EnergyKinComp[P[i].getType()] +=
+              0.5 * Sp->P[i].getMass() * (vel[0] * vel[0] + vel[1] * vel[1] + vel[2] * vel[2]) * All.cf_a2inv;
+        }
+#endif
+
+      double pos[3];
+      Sp->intpos_to_pos(P[i].IntPos, pos);  // converts the integer coordinates to floating point
+
+      for(int j = 0; j < 3; j++)
+        {
+          sys.MomentumComp[P[i].getType()][j] += Sp->P[i].getMass() * vel[j];
+          sys.CenterOfMassComp[P[i].getType()][j] += Sp->P[i].getMass() * pos[j];
+        }
+
+      sys.AngMomentumComp[P[i].getType()][0] += Sp->P[i].getMass() * (pos[1] * vel[2] - pos[2] * vel[1]);
+      sys.AngMomentumComp[P[i].getType()][1] += Sp->P[i].getMass() * (pos[2] * vel[0] - pos[0] * vel[2]);
+      sys.AngMomentumComp[P[i].getType()][2] += Sp->P[i].getMass() * (pos[0] * vel[1] - pos[1] * vel[0]);
+    }
+
+  // sum the quantities over all processors
+  MPI_Reduce(&sys.MassComp[0], &SysState.MassComp[0], NTYPES, MPI_DOUBLE, MPI_SUM, 0, Communicator);
+  MPI_Reduce(&sys.EnergyPotComp[0], &SysState.EnergyPotComp[0], NTYPES, MPI_DOUBLE, MPI_SUM, 0, Communicator);
+  MPI_Reduce(&sys.EnergyIntComp[0], &SysState.EnergyIntComp[0], NTYPES, MPI_DOUBLE, MPI_SUM, 0, Communicator);
+  MPI_Reduce(&sys.EnergyKinComp[0], &SysState.EnergyKinComp[0], NTYPES, MPI_DOUBLE, MPI_SUM, 0, Communicator);
+  MPI_Reduce(&sys.MomentumComp[0][0], &SysState.MomentumComp[0][0], NTYPES * 4, MPI_DOUBLE, MPI_SUM, 0, Communicator);
+  MPI_Reduce(&sys.AngMomentumComp[0][0], &SysState.AngMomentumComp[0][0], NTYPES * 4, MPI_DOUBLE, MPI_SUM, 0, Communicator);
+  MPI_Reduce(&sys.CenterOfMassComp[0][0], &SysState.CenterOfMassComp[0][0], NTYPES * 4, MPI_DOUBLE, MPI_SUM, 0, Communicator);
+
+  if(ThisTask == 0)
+    {
+      for(int i = 0; i < NTYPES; i++)
+        SysState.EnergyTotComp[i] = SysState.EnergyKinComp[i] + SysState.EnergyPotComp[i] + SysState.EnergyIntComp[i];
+
+      SysState.Mass = SysState.EnergyKin = SysState.EnergyPot = SysState.EnergyInt = SysState.EnergyTot = 0;
+
+      for(int j = 0; j < 3; j++)
+        SysState.Momentum[j] = SysState.AngMomentum[j] = SysState.CenterOfMass[j] = 0;
+
+      for(int i = 0; i < NTYPES; i++)
+        {
+          SysState.Mass += SysState.MassComp[i];
+          SysState.EnergyKin += SysState.EnergyKinComp[i];
+          SysState.EnergyPot += SysState.EnergyPotComp[i];
+          SysState.EnergyInt += SysState.EnergyIntComp[i];
+          SysState.EnergyTot += SysState.EnergyTotComp[i];
+
+          for(int j = 0; j < 3; j++)
+            {
+              SysState.Momentum[j] += SysState.MomentumComp[i][j];
+              SysState.AngMomentum[j] += SysState.AngMomentumComp[i][j];
+              SysState.CenterOfMass[j] += SysState.CenterOfMassComp[i][j];
+            }
+        }
+
+      for(int i = 0; i < NTYPES; i++)
+        for(int j = 0; j < 3; j++)
+          if(SysState.MassComp[i] > 0)
+            SysState.CenterOfMassComp[i][j] /= SysState.MassComp[i];
+
+      for(int j = 0; j < 3; j++)
+        if(SysState.Mass > 0)
+          SysState.CenterOfMass[j] /= SysState.Mass;
+
+      for(int i = 0; i < NTYPES; i++)
+        {
+          SysState.CenterOfMassComp[i][3] = SysState.MomentumComp[i][3] = SysState.AngMomentumComp[i][3] = 0;
+          for(int j = 0; j < 3; j++)
+            {
+              SysState.CenterOfMassComp[i][3] += SysState.CenterOfMassComp[i][j] * SysState.CenterOfMassComp[i][j];
+              SysState.MomentumComp[i][3] += SysState.MomentumComp[i][j] * SysState.MomentumComp[i][j];
+              SysState.AngMomentumComp[i][3] += SysState.AngMomentumComp[i][j] * SysState.AngMomentumComp[i][j];
+            }
+          SysState.CenterOfMassComp[i][3] = sqrt(SysState.CenterOfMassComp[i][3]);
+          SysState.MomentumComp[i][3]     = sqrt(SysState.MomentumComp[i][3]);
+          SysState.AngMomentumComp[i][3]  = sqrt(SysState.AngMomentumComp[i][3]);
+        }
+
+      SysState.CenterOfMass[3] = SysState.Momentum[3] = SysState.AngMomentum[3] = 0;
+
+      for(int j = 0; j < 3; j++)
+        {
+          SysState.CenterOfMass[3] += SysState.CenterOfMass[j] * SysState.CenterOfMass[j];
+          SysState.Momentum[3] += SysState.Momentum[j] * SysState.Momentum[j];
+          SysState.AngMomentum[3] += SysState.AngMomentum[j] * SysState.AngMomentum[j];
+        }
+
+      SysState.CenterOfMass[3] = sqrt(SysState.CenterOfMass[3]);
+      SysState.Momentum[3]     = sqrt(SysState.Momentum[3]);
+      SysState.AngMomentum[3]  = sqrt(SysState.AngMomentum[3]);
+    }
+
+  // give everyone the result, maybe they want to do something with it
+  MPI_Bcast(&SysState, sizeof(state_of_system), MPI_BYTE, 0, Communicator);
+}
diff --git a/src/logs/logs.h b/src/logs/logs.h
new file mode 100644
index 0000000000000000000000000000000000000000..680a40a643a334fc45904c6d48bad49ab4e59de9
--- /dev/null
+++ b/src/logs/logs.h
@@ -0,0 +1,261 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file logs.h
+ *
+ *  \brief declares constants and some classes used for log-file handling
+ */
+
+#ifndef LOGS_H
+#define LOGS_H
+
+#include <stdio.h>
+
+#include "../main/simulation.h"
+#include "../mpi_utils/setcomm.h"
+
+#define CPU_STRING_LEN 120
+
+#define TIMER_STACK_DEPTH 30
+
+class simparticles;
+class lcparticles;
+
+class logs : public setcomm
+{
+ public:
+  logs() : setcomm("delayed init")  // constructor
+  {
+
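+    /* timer.h is expanded several times with different macros defined (an X-macro style
+       construction): with TIMER_STRUCT it is meant to fill in the Timer_data[] entries,
+       with TIMER_ENUM it generates the corresponding enum values further below */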
+#define TIMER_STRUCT
+#include "../logs/timer.h"
+#undef TIMER_STRUCT
+
+  }
+
+
+  simparticles *Sp;
+
+  double CPUThisRun; /*!< Sums CPU time of current process */
+
+  FILE *FdInfo,   /**< file handle for info.txt log-file. */
+      *FdEnergy,  /**< file handle for energy.txt log-file. */
+      *FdTimings, /**< file handle for timings.txt log-file. */
+      *FdDensity, /**< file handle for density.txt log-file. */
+      *FdHydro,   /**< file handle for hydro.txt log-file. */
+      *FdBalance, /**< file handle for balance.txt log-file. */
+      *FdTimebin, /**< file handle for timebins.txt log-file. */
+      *FdDomain,  /**< file handle for domain.txt log-file. */
+      *FdCPU;     /**< file handle for cpu.txt log-file. */
+
+  FILE *FdCPUCSV; /**< file handle for cpu.csv log-file. Used if the cpu log is printed in csv format as well. */
+
+#ifdef MEASURE_TOTAL_MOMENTUM
+  FILE *FdMomentum;
+#endif
+
+#ifdef STARFORMATION
+  FILE *FdSfr; /**< file handle for sfr.txt log-file. */
+#endif
+
+#ifdef DEBUG_MD5
+  FILE *FdDebug; /**< file handle for debug_md5.txt log-file. */
+#endif
+
+#ifdef FORCETEST
+  FILE *FdForceTest; /*!< file handle for forcetest.txt log-file. */
+#endif
+
+  void init_cpu_log(simparticles *Sp_ptr);
+  void open_logfiles(void);
+  void write_cpu_log(void);
+  void output_log_messages(void);
+  void compute_statistics(void);
+  void print_particle_info_from_ID(MyIDType ID);
+  void print_particle_info(int i);
+  void log_debug_md5(const char *msg);
+  void calc_memory_checksum(const char *msg, void *base, size_t bytes);
+  void block_checksum(void *base, size_t bytes, int res[4]);
+  int flush_everything(void);
+  void compute_total_momentum(void);
+
+  double measure_time(void);
+  double timediff(double t0, double t1);
+  double second(void);
+
+  enum timers
+  {
+    CPU_NONE = -2, /*!< used for counters without a parent */
+    CPU_ROOT = -1, /*!< root node of the tree */
+
+#define TIMER_ENUM
+#include "../logs/timer.h"
+#undef TIMER_ENUM
+
+  };
+
+  /*! \brief struct containing the information of a CPU timer
+   *
+   */
+  struct timer_d
+  {
+    int parent;         /*!< id of the parent timer */
+    char shortname[30]; /*!< string containing the internal name of the timer */
+    char longname[30];  /*!< name of the timer */
+    char symb;          /*!< symbol used in balance.txt for the active part */
+    char symbImbal;     /*!< symbol used in balance.txt for imbalance */
+    char depth;         /*!< depth in the tree-like structure of this timer */
+  };
+  timer_d Timer_data[CPU_LAST + 1];
+
+  double CPU_Step[CPU_LAST];
+  double CPU_Step_Stored[CPU_LAST];
+  double CPU_Sum[CPU_LAST]; /**< sums wallclock time/CPU consumption in whole run */
+
+  enum timers TimerStack[TIMER_STACK_DEPTH];
+  int TimerStackPos = 0;
+
+ private:
+  double StartOfRun; /*!< This stores the time of the start of the run for evaluating the elapsed time */
+
+  double WallclockTime; /*!< This holds the last wallclock time measurement for timings measurements */
+
+  void put_symbol(char *string, double t0, double t1, char c);
+
+  /* global state of system */
+  struct state_of_system
+  {
+    double Mass, EnergyKin, EnergyPot, EnergyInt, EnergyTot, Momentum[4], AngMomentum[4], CenterOfMass[4], MassComp[NTYPES],
+        EnergyKinComp[NTYPES], EnergyPotComp[NTYPES], EnergyIntComp[NTYPES], EnergyTotComp[NTYPES], MomentumComp[NTYPES][4],
+        AngMomentumComp[NTYPES][4], CenterOfMassComp[NTYPES][4];
+  };
+  state_of_system SysState;
+
+  void compute_global_quantities_of_system(void);
+
+ public:
+  void timer_stop(enum timers counter)
+  {
+    if(TimerStack[TimerStackPos] != counter)
+      Terminate("Wrong use of timer_stop(), you must stop the timer started last");
+
+    CPU_Step[TimerStack[TimerStackPos--]] += measure_time();
+
+    if(TimerStackPos < 0)
+      Terminate("Do not stop the out CPU_MISC timer");
+  }
+
+  void timer_start(enum timers counter)
+  {
+    CPU_Step[TimerStack[TimerStackPos]] += measure_time();
+
+    for(int itimer = 0; itimer <= TimerStackPos; itimer++)
+      if(counter == TimerStack[itimer])
+        Terminate("Try to start timer %d, but it is already running.\n", counter);
+
+    if(++TimerStackPos >= TIMER_STACK_DEPTH)
+      {
+        Terminate("Run out of timer stack space, increase TIMER_STACK_DEPTH");
+      }
+    else
+      TimerStack[TimerStackPos] = counter;
+  }
+};
+
+#include "../data/simparticles.h"
+
+#define TIMER_START_INTERNAL(counter)                                                      \
+  {                                                                                        \
+    Logs.CPU_Step[Logs.TimerStack[Logs.TimerStackPos]] += Logs.measure_time();             \
+    for(int itimer = 0; itimer <= Logs.TimerStackPos; itimer++)                            \
+      if(logs::counter == Logs.TimerStack[itimer])                                         \
+        {                                                                                  \
+          Terminate("Try to start timer %d, but it is already running.\n", logs::counter); \
+        };                                                                                 \
+    if(++Logs.TimerStackPos >= TIMER_STACK_DEPTH)                                          \
+      {                                                                                    \
+        Terminate("Run out of timer stack space, increase TIMER_STACK_DEPTH");             \
+      }                                                                                    \
+    else                                                                                   \
+      {                                                                                    \
+        Logs.TimerStack[Logs.TimerStackPos] = logs::counter;                               \
+      }                                                                                    \
+  }
+
+/*! \def  TIMER_START(counter)
+ * \brief Starts the timer counter
+ *
+ * Use this macro instead of directly accessing the CPU_Step array,
+ * so manual instrumentation APIs can be attached.
+ *
+ * \param counter Name of the timer to start
+ */
+#define TIMER_START(counter) TIMER_START_INTERNAL(counter)
+
+#define TIMER_STOP_INTERNAL(counter)                                                \
+  {                                                                                 \
+    if(Logs.TimerStack[Logs.TimerStackPos] != logs::counter)                        \
+      {                                                                             \
+        Terminate("Wrong use of TIMER_STOP, you must stop the timer started last"); \
+      }                                                                             \
+    Logs.CPU_Step[Logs.TimerStack[Logs.TimerStackPos--]] += Logs.measure_time();    \
+    if(Logs.TimerStackPos < 0)                                                      \
+      {                                                                             \
+        Terminate("Do not stop the out CPU_MISC timer");                            \
+      }                                                                             \
+  }
+
+/*! \def TIMER_STOP(counter)
+ * \brief Stops the timer counter
+ *
+ * Use this macro instead of directly accessing the CPU_Step array,
+ * so manual instrumentation APIs can be attached.
+ *
+ * \param counter Name of the timer to stop
+ */
+#define TIMER_STOP(counter) TIMER_STOP_INTERNAL(counter)
+
+/*! \def TIMER_STOPSTART(stop, start)
+ * \brief Stops the timer 'stop' and starts the timer 'start'
+ *
+ * Use this macro instead of directly accessing the CPU_Step array,
+ * so manual instrumentation APIs can be attached.
+ *
+ * \param stop Name of the timer to stop
+ * \param start Name of the timer to start
+ */
+#define TIMER_STOPSTART(stop, start) \
+  {                                  \
+    TIMER_STOP_INTERNAL(stop);       \
+    TIMER_START_INTERNAL(start);     \
+  }
+
+/*! \def TIMER_ADD(counter, amount)
+ * \brief Adds amount to the timer counter
+
+ * \param counter Name of the timer to add to
+ * \param amount amount to add to timer counter
+ */
+#define TIMER_ADD(counter, amount) Logs.CPU_Step[counter] += (amount);
+
+/*! \def TIMER_DIFF(counter)
+ * \brief Returns amount elapsed for the timer since last save with TIMER_STORE
+
+ * \param counter Name of the timer to add to
+ */
+#define TIMER_DIFF(counter) (Logs.CPU_Step[logs::counter] - Logs.CPU_Step_Stored[logs::counter])
+
+/*! \def TIMER_STORE
+ * \brief Copies the current value of CPU times to a stored variable, such that differences with respect to this reference can be
+ * calculated
+ *
+ */
+#define TIMER_STORE memcpy(Logs.CPU_Step_Stored, Logs.CPU_Step, sizeof(Logs.CPU_Step));
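+
+/* Illustrative usage sketch (not part of the original code): how a code section might be
+ * instrumented with the macros above. The counters CPU_DOMAIN and CPU_LOGS are taken from
+ * timer.h; the surrounding routine is hypothetical.
+ *
+ *   TIMER_STORE;                            // snapshot all CPU_Step counters
+ *   TIMER_START(CPU_DOMAIN);                // push CPU_DOMAIN onto the timer stack
+ *   // ... domain decomposition work ...
+ *   TIMER_STOPSTART(CPU_DOMAIN, CPU_LOGS);  // book the elapsed time, then switch counters
+ *   // ... log-file output ...
+ *   TIMER_STOP(CPU_LOGS);                   // pop CPU_LOGS again
+ *   double dt = TIMER_DIFF(CPU_DOMAIN);     // CPU_DOMAIN time accumulated since TIMER_STORE
+ */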
+
+extern logs Logs;
+
+#endif
diff --git a/src/logs/timer.h b/src/logs/timer.h
new file mode 100644
index 0000000000000000000000000000000000000000..c75dea96830800b32c104733449b70a371e3d510
--- /dev/null
+++ b/src/logs/timer.h
@@ -0,0 +1,104 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file timer.h
+ *
+ *  \brief defines the CPU consumption timers
+ */
+
+/*!
+ *  All CPU timers are defined here. Additionally the macros to use these timers are defined.
+ */
+
+#if defined(TIMER_ENUM) || defined(TIMER_STRUCT)
+
+/*! \def TIMER_CREATE(name, desc, par, symba, symbb)
+ * \brief creates a new CPU timer
+ *
+ * \param name name used in the code to reference this timer
+ * \param desc description string used in output files
+ * \param par parent of this timer, used to build a tree-like hierarchy of timers
+ * \param symba character used for active time in balance.txt
+ * \param symbb character used for imbalance in balance.txt
+ *
+ */
+
+#if defined(TIMER_ENUM)
+
+#define TIMER_CREATE(name, desc, parent, symba, symbb) name,
+
+#else
+
+#define TIMER_CREATE(name, desc, par, symba, symbb) \
+  Timer_data[name].parent = par;                    \
+  strcpy(Timer_data[name].shortname, #name);        \
+  strcpy(Timer_data[name].longname, (desc));        \
+  Timer_data[name].symb      = (symba);             \
+  Timer_data[name].symbImbal = (symbb);
+#endif
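+
+/* Illustrative expansion sketch (not part of the original code): a hypothetical entry
+ *
+ *   TIMER_CREATE(CPU_EXAMPLE, "example", CPU_ALL, 'q', 'Q')
+ *
+ * would expand under TIMER_ENUM to the enum entry "CPU_EXAMPLE," in logs.h, and under
+ * TIMER_STRUCT to the statements that fill Timer_data[CPU_EXAMPLE] with its parent,
+ * short/long names and balance.txt symbols. */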
+
+/* add your counters here; they must appear in the right order */
+
+TIMER_CREATE(CPU_ALL, "total", CPU_ROOT, 0, 0) /*!< root timer, everything should be below this timer */
+TIMER_CREATE(CPU_TREE, "treegrav", CPU_ALL, 0, 0)
+TIMER_CREATE(CPU_TREEBUILD, "treebuild", CPU_TREE, 0, 0)
+TIMER_CREATE(CPU_TREEBUILD_INSERT, "insert", CPU_TREEBUILD, ':', 'r')
+TIMER_CREATE(CPU_TREEBUILD_BRANCHES, "branches", CPU_TREEBUILD, 'D', 'd')
+TIMER_CREATE(CPU_TREEBUILD_TOPLEVEL, "toplevel", CPU_TREEBUILD, 'E', 'e')
+TIMER_CREATE(CPU_TREEFORCE, "treeforce", CPU_TREE, 0, 0)
+TIMER_CREATE(CPU_TREEWALK, "treewalk", CPU_TREEFORCE, '*', 't')
+TIMER_CREATE(CPU_TREEIMBALANCE, "treeimbalance", CPU_TREEFORCE, 'x', 'X')
+TIMER_CREATE(CPU_TREEFETCH, "treefetch", CPU_TREEFORCE, '&', 'e')
+TIMER_CREATE(CPU_TREESTACK, "treestack", CPU_TREEFORCE, '#', 'a')
+#ifdef ALLOW_DIRECT_SUMMATION
+TIMER_CREATE(CPU_TREEDIRECT, "treedirect", CPU_TREE, 'r', '2')
+#endif
+#ifdef PMGRID
+TIMER_CREATE(CPU_PM_GRAVITY, "pm_grav", CPU_ALL, '|', 'n')
+#endif
+TIMER_CREATE(CPU_NGBTREEBUILD, "ngbtreebuild", CPU_ALL, 'A', 'a')
+TIMER_CREATE(CPU_NGBTREEUPDATEVEL, "ngbtreevelupdate", CPU_ALL, 'B', 'b')
+TIMER_CREATE(CPU_NGBTREEUPDATEMAXHSML, "ngbtreehsmlupdate", CPU_ALL, 'h', 'H')
+TIMER_CREATE(CPU_SPH, "sph", CPU_ALL, 0, 0)
+TIMER_CREATE(CPU_DENSITY, "density", CPU_SPH, 'd', 'D')
+TIMER_CREATE(CPU_DENSWALK, "densitywalk", CPU_DENSITY, 'e', 'E')
+TIMER_CREATE(CPU_DENSFETCH, "densityfetch", CPU_DENSITY, 'f', 'F')
+TIMER_CREATE(CPU_DENSIMBALANCE, "densimbalance", CPU_DENSITY, 'c', 'C')
+TIMER_CREATE(CPU_HYDRO, "hydro", CPU_SPH, 'J', 'j')
+TIMER_CREATE(CPU_HYDROWALK, "hydrowalk", CPU_HYDRO, '9', 'H')
+TIMER_CREATE(CPU_HYDROFETCH, "hydrofetch", CPU_HYDRO, '0', 'K')
+TIMER_CREATE(CPU_HYDROIMBALANCE, "hydroimbalance", CPU_HYDRO, '[', 'y')
+TIMER_CREATE(CPU_DOMAIN, "domain", CPU_ALL, '+', 'l')
+TIMER_CREATE(CPU_PEANO, "peano", CPU_ALL, '"', 'o')
+TIMER_CREATE(CPU_DRIFTS, "drift/kicks", CPU_ALL, '?', 'A')
+TIMER_CREATE(CPU_TIMELINE, "timeline", CPU_ALL, ')', 'I')
+TIMER_CREATE(CPU_TREE_TIMESTEPS, "treetimesteps", CPU_ALL, '0', 'W')
+TIMER_CREATE(CPU_SNAPSHOT, "i/o", CPU_ALL, '3', 'B')
+TIMER_CREATE(CPU_LOGS, "logs", CPU_ALL, 'K', 'k')
+#if defined(COOLING) || defined(STARFORMATION)
+TIMER_CREATE(CPU_COOLING_SFR, "sfrcool", CPU_ALL, '1', 'T')
+#endif
+#ifdef FOF
+TIMER_CREATE(CPU_FOF, "fof", CPU_ALL, '5', 'D')
+TIMER_CREATE(CPU_FOFWALK, "fofwalk", CPU_FOF, '6', 'G')
+TIMER_CREATE(CPU_FOFIMBAL, "fofimbal", CPU_FOF, '7', 'H')
+#endif
+#ifdef SUBFIND
+TIMER_CREATE(CPU_SUBFIND, "subfind", CPU_ALL, '6', 'E')
+#endif
+#ifdef NGENIC
+TIMER_CREATE(CPU_NGENIC, "ngenic", CPU_ALL, 'n', 'N')
+#endif
+TIMER_CREATE(CPU_RESTART, "restart", CPU_ALL, 'C', 'c')
+#ifdef FORCETEST
+TIMER_CREATE(CPU_FORCETEST, "forcetest", CPU_ALL, 't', 'T')
+#endif
+TIMER_CREATE(CPU_MISC, "misc", CPU_ALL, '7', 'F')
+TIMER_CREATE(CPU_LAST, "LAST", CPU_NONE, ' ', ' ') /*!<last item, do not use! */
+
+#undef TIMER_CREATE
+
+#endif
diff --git a/src/main/begrun.cc b/src/main/begrun.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cb4bdb73d0794501266c6c19b2c20865d6422837
--- /dev/null
+++ b/src/main/begrun.cc
@@ -0,0 +1,412 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file begrun.cc
+ *
+ *  \brief initial set-up of a simulation run
+ */
+
+#include "compiler-command-line-args.h"
+#include "gadgetconfig.h"
+
+#include <hdf5.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../cooling_sfr/cooling.h"
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../data/simparticles.h"
+#include "../fmm/fmm.h"
+#include "../fof/fof.h"
+#include "../gitversion/version.h"
+#include "../gravity/ewald.h"
+#include "../gravtree/gravtree.h"
+#include "../io/hdf5_util.h"
+#include "../io/io.h"
+#include "../io/parameters.h"
+#include "../lightcone/lightcone.h"
+#include "../logs/logs.h"
+#include "../logs/timer.h"
+#include "../main/main.h"
+#include "../main/simulation.h"
+#include "../mergertree/mergertree.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../mpi_utils/shared_mem_handler.h"
+#include "../ngbtree/ngbtree.h"
+#include "../pm/pm.h"
+#include "../system/system.h"
+#include "../time_integration/driftfac.h"
+#include "../time_integration/timestep.h"
+
+/*!
+ *  This file contains various functions to initialize a simulation run. In
+ *  particular, the parameter file is read in and parsed and global variables
+ *  are initialized to their proper values.
+ */
+
+void sim::hello(void)
+{
+  mpi_printf(
+      "\n  ___    __    ____    ___  ____  ____       __\n / __)  /__\\  (  _ \\  / __)( ___)(_  _)___  /. |\n"
+      "( (_-. /(__)\\  )(_) )( (_-. )__)   )( (___)(_  _)\n \\___/(__)(__)(____/  \\___/(____) (__)       (_)\n\n");
+
+  mpi_printf("This is Gadget, version %s.\nGit commit %s, %s\n\n", GADGET_VERSION, GIT_COMMIT, GIT_DATE);
+
+  mpi_printf("Code was compiled with the following compiler and flags:\n%s\n\n\n", compiler_flags);
+
+  mpi_printf("Code was compiled with the following settings:\n");
+
+  if(ThisTask == 0)
+    {
+      output_compile_time_options();
+    }
+
+  mpi_printf("\n\nRunning on %d MPI tasks.\n\n", NTask);
+
+#ifdef EXPLICIT_VECTORIZATION
+
+  int instrset = instrset_detect();
+  mpi_printf("CPU supports instruction sets: ");
+  if(instrset >= 1)
+    mpi_printf("SSE ");
+  if(instrset >= 2)
+    mpi_printf("SSE2 ");
+  if(instrset >= 3)
+    mpi_printf("SSE3 ");
+  if(instrset >= 4)
+    mpi_printf("SSSE3 ");
+  if(instrset >= 5)
+    mpi_printf("SSE4.1 ");
+  if(instrset >= 6)
+    mpi_printf("SSE4.2 ");
+  if(instrset >= 7)
+    mpi_printf("AVX ");
+  if(instrset >= 8)
+    mpi_printf("AVX2 ");
+  if(instrset >= 9)
+    mpi_printf("AVX512F ");
+  if(instrset >= 10)
+    mpi_printf("AVX512VL AVX512BW AVX512DQ ");
+  mpi_printf("\n\n");
+
+  if(instrset < 7)
+    {
+      mpi_printf(
+          "You compiled with explicit vectorization in terms of AVX instructions, but this CPU does not support AVX or higher.\n\n");
+      endrun();
+    }
+#endif
+
+  mpi_printf("\n");
+  mpi_printf("BEGRUN: Size of particle structure       %4d  [bytes]\n", (int)sizeof(particle_data));
+  mpi_printf("BEGRUN: Size of sph particle structure   %4d  [bytes]\n", (int)sizeof(sph_particle_data));
+  mpi_printf("BEGRUN: Size of gravity tree node        %4d  [bytes]\n", (int)sizeof(gravnode));
+  mpi_printf("BEGRUN: Size of neighbour tree node      %4d  [bytes]\n", (int)sizeof(ngbnode));
+  mpi_printf("BEGRUN: Size of subfind auxiliary data   %4d  [bytes]\n", (int)sizeof(subfind_data));
+
+  mpi_printf("\n");
+}
+
+/*! \brief This function performs the initial set-up of the simulation.
+ *
+ *  First, the parameter file is read by read_parameter_file(),
+ *  then routines for setting units, etc are called. This function only does
+ *  the setup necessary to load the IC file. After the IC file has been loaded
+ *  and prepared by init(), setup continues with begrun2(). This splitting is
+ *  done so that we can return cleanly from operations that don't actually
+ *  start the simulation (converting snapshots, making projected images, etc.)
+ */
+void sim::begrun1(const char *parameterFile)
+{
+  All.register_parameters();
+
+  int errorFlag = All.read_parameter_file(parameterFile); /* ... read in parameters for this run on task 0*/
+
+  if(ThisTask == 0)
+    {
+      int n = strlen(All.OutputDir);
+      if(n > 0)
+        if(All.OutputDir[n - 1] != '/')
+          strcat(All.OutputDir, "/");
+
+      mkdir(All.OutputDir, 02755);
+    }
+
+  /* now communicate the relevant parameters to the other processes, *including* the shared memory handler */
+  /* this also tells the shared memory handler how much memory it may allocate */
+  MPI_Bcast(All.get_data_ptr(), All.get_data_size(), MPI_BYTE, 0, MPI_COMM_WORLD);
+
+#ifdef HOST_MEMORY_REPORTING
+  Mem.check_maxmemsize_setting(All.MaxMemSize);
+#endif
+
+  Mem.mymalloc_init(All.MaxMemSize, All.RestartFlag);
+
+  MPI_Bcast(&errorFlag, 1, MPI_INT, 0, Communicator);
+
+  if(errorFlag)
+    {
+      if(Shmem.Island_ThisTask == 0 && Shmem.Island_NTask != Shmem.World_NTask)
+        {
+          char c = 0;
+          // need to send this flag to our shared memory rank so that it also ends itself
+          MPI_Send(&c, 1, MPI_BYTE, Shmem.MyShmRankInGlobal, TAG_KEY, MPI_COMM_WORLD);
+        }
+
+      mpi_printf("\nWe stop because of an error in the parameterfile.\n\n");
+      MPI_Finalize();
+      exit(0);
+    }
+
+  if(All.OutputListOn)
+    All.read_outputlist(All.OutputListFilename);
+  else
+    All.OutputListLength = 0;
+
+  if(All.RestartFlag != RST_LGALAXIES)
+    All.write_used_parameters(All.OutputDir, "parameters-usedvalues");
+
+  All.some_parameter_checks();
+
+#ifdef ENABLE_HEALTHTEST
+  healthtest();
+#endif
+
+  set_units();
+
+  my_create_HDF5_halfprec_handler();
+  my_create_HDF5_special_integer_types();
+
+  my_mpi_types_init();
+
+#ifdef LIGHTCONE_PARTICLES
+  LightCone.lightcone_init_geometry(All.LightConeDefinitionFile);
+#endif
+
+  /* disable automatic error printing of HDF5 - we check for errors ourselves */
+  H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+
+#ifdef DEBUG
+  enable_core_dumps_and_fpu_exceptions();
+#endif
+
+#ifdef PMGRID
+  GravTree.short_range_init();
+#endif
+
+  Sp.TimeBinsHydro.timebins_init("Hydro", &Sp.MaxPartSph);
+  Sp.TimeBinsGravity.timebins_init("Gravity", &Sp.MaxPart);
+
+#ifdef COOLING
+  All.Time = All.TimeBegin;
+  All.set_cosmo_factors_for_current_time();
+  CoolSfr.InitCool();
+#endif
+
+#ifdef STARFORMATION
+  CoolSfr.init_clouds();
+#endif
+
+#if((!defined(PMGRID) || (defined(PMGRID) && defined(TREEPM_NOTIMESPLIT))) && defined(SELFGRAVITY) && defined(PERIODIC)) || \
+    defined(FORCETEST)
+  Ewald.ewald_init();
+#endif
+
+  init_rng(ThisTask);
+
+#ifdef DEBUG_SYMTENSORS
+  symtensor_test();
+#endif
+
+#ifdef PMGRID
+  if(All.RestartFlag == RST_BEGIN || All.RestartFlag == RST_RESUME || All.RestartFlag == RST_STARTFROMSNAP ||
+     All.RestartFlag == RST_POWERSPEC)
+    {
+#ifdef PERIODIC
+      PM.pm_init_periodic(&Sp);
+#ifdef PLACEHIGHRESREGION
+      PM.pm_init_nonperiodic(&Sp);
+#endif
+#else
+      PM.pm_init_nonperiodic(&Sp);
+#endif
+    }
+#endif
+
+  Logs.open_logfiles();
+
+  All.TimeLastRestartFile = Logs.CPUThisRun;
+
+#ifdef DEBUG_MD5
+  Logs.log_debug_md5("START");
+#endif
+}
+
+/*! \brief This function does late setup, after the IC file has been loaded
+ *  but before run() is called.
+ *
+ *  The output files are opened and various modules are initialized. The next output
+ *  time is determined by find_next_outputtime() and various timers are set.
+ *
+ */
+void sim::begrun2(void)
+{
+  char contfname[MAXLEN_PATH_EXTRA];
+  sprintf(contfname, "%scont", All.OutputDir);
+  unlink(contfname);
+
+  if(All.RestartFlag != RST_BEGIN && All.RestartFlag != RST_RESUME && All.RestartFlag != RST_STARTFROMSNAP)
+    Logs.open_logfiles();
+
+  if(All.ComovingIntegrationOn)
+    {
+      Driftfac.init_drift_table();
+
+      if(Shmem.Island_NTask != Shmem.World_NTask && Shmem.Island_ThisTask == 0)
+        {
+          // We actually have multiple shared memory nodes in which we set aside one MPI rank for shared memory communication.
+          // Tell the shared memory rank to initialize its drift table as well, so that it can be used for particle predictions
+          // on the fly.
+          if(Shmem.Island_ThisTask == 0)
+            {
+              // update All on the shared memory handler, to be sure that we get the correct times
+              MPI_Send(All.get_data_ptr(), All.get_data_size(), MPI_BYTE, Shmem.MyShmRankInGlobal, TAG_DRIFT_INIT, MPI_COMM_WORLD);
+            }
+        }
+    }
+
+#ifdef LIGHTCONE
+#ifdef LIGHTCONE_PARTICLES
+  if(LightCone.lightcone_init_times())
+    endrun();
+#endif
+#ifdef LIGHTCONE_MASSMAPS
+  LightCone.lightcone_init_massmaps();
+  if(LightCone.lightcone_massmap_report_boundaries())
+    endrun();
+#endif
+  double linklength = 0;
+
+#ifdef FOF
+  fof<simparticles> FoF{Communicator, &Sp, &Domain};
+  linklength = FoF.fof_get_comoving_linking_length();
+#endif
+
+  LightCone.lightcone_init_intposconverter(linklength);
+#endif
+
+  if(All.RestartFlag == RST_STARTFROMSNAP)
+    All.Ti_nextoutput = find_next_outputtime(All.Ti_Current + 100);
+  else if(All.RestartFlag == RST_RESUME)
+    All.Ti_nextoutput = find_next_outputtime(All.Ti_Current + 1);
+  else
+    All.Ti_nextoutput = find_next_outputtime(All.Ti_Current);
+
+  All.TimeLastRestartFile = Logs.CPUThisRun;
+
+#ifdef REDUCE_FLUSH
+  All.FlushLast = Logs.CPUThisRun;
+#endif
+
+#if defined(FORCETEST) && defined(FORCETEST_TESTFORCELAW)
+  gravity_forcetest_testforcelaw();
+#endif
+}
+
+/*! \brief Computes conversion factors between internal code units and the
+ *  cgs-system.
+ *
+ *  In addition constants like the gravitation constant are set.
+ */
+void sim::set_units(void)
+{
+  All.UnitTime_in_s         = All.UnitLength_in_cm / All.UnitVelocity_in_cm_per_s;
+  All.UnitTime_in_Megayears = All.UnitTime_in_s / SEC_PER_MEGAYEAR;
+  All.UnitTime_in_years     = All.UnitTime_in_s / SEC_PER_YEAR;
+
+  if(All.GravityConstantInternal == 0)
+    All.G = GRAVITY / pow(All.UnitLength_in_cm, 3) * All.UnitMass_in_g * pow(All.UnitTime_in_s, 2);
+  else
+    All.G = All.GravityConstantInternal;
+
+  All.UnitDensity_in_cgs     = All.UnitMass_in_g / pow(All.UnitLength_in_cm, 3);
+  All.UnitPressure_in_cgs    = All.UnitMass_in_g / All.UnitLength_in_cm / pow(All.UnitTime_in_s, 2);
+  All.UnitCoolingRate_in_cgs = All.UnitPressure_in_cgs / All.UnitTime_in_s;
+  All.UnitEnergy_in_cgs      = All.UnitMass_in_g * pow(All.UnitLength_in_cm, 2) / pow(All.UnitTime_in_s, 2);
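+
+  /* Illustrative example (not part of the original code): for the commonly used choice
+   * UnitLength_in_cm = 3.085678e21 (kpc/h), UnitMass_in_g = 1.989e43 (10^10 Msun/h) and
+   * UnitVelocity_in_cm_per_s = 1e5 (km/s), this yields UnitTime_in_s ~ 3.09e16 s and
+   * G ~ 43000 in internal units (if GravityConstantInternal is left at zero). */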
+
+  if(All.ComovingIntegrationOn)
+    {
+      /* check whether the supplied value of All.Hubble makes sense */
+      if(All.HubbleParam != 1.0)
+        {
+          double Hubble_expected = HUBBLE * All.UnitTime_in_s;
+          if(fabs(Hubble_expected - All.Hubble) > 1.0e-3 * All.Hubble)
+            Terminate(
+                "You have supplied All.Hubble=%g/All.HubbleParam=%g. For this choice, we would expect All.Hubble=%g. We better stop\n",
+                All.Hubble, All.HubbleParam, Hubble_expected);
+        }
+      else
+        {
+          double Hubble_expected = 0.7 * HUBBLE * All.UnitTime_in_s;
+          if(All.Hubble < 0.5 * Hubble_expected || All.Hubble > 2.0 * Hubble_expected)
+            Terminate(
+                "You have supplied All.Hubble=%g/All.HubbleParam=%g. For this choice, we would expect All.Hubble somewhere in the "
+                "ballpark of %g. We better stop\n",
+                All.Hubble, All.HubbleParam, Hubble_expected);
+        }
+    }
+
+  mpi_printf("BEGRUN: Hubble (internal units)  = %g\n", All.Hubble);
+  mpi_printf("BEGRUN: h                        = %g\n", All.HubbleParam);
+  mpi_printf("BEGRUN: G (internal units)       = %g\n", All.G);
+  mpi_printf("BEGRUN: UnitMass_in_g            = %g\n", All.UnitMass_in_g);
+  mpi_printf("BEGRUN: UnitLenth_in_cm          = %g\n", All.UnitLength_in_cm);
+  mpi_printf("BEGRUN: UnitTime_in_s            = %g\n", All.UnitTime_in_s);
+  mpi_printf("BEGRUN: UnitVelocity_in_cm_per_s = %g\n", All.UnitVelocity_in_cm_per_s);
+  mpi_printf("BEGRUN: UnitDensity_in_cgs       = %g\n", All.UnitDensity_in_cgs);
+  mpi_printf("BEGRUN: UnitEnergy_in_cgs        = %g\n", All.UnitEnergy_in_cgs);
+  mpi_printf("\n");
+
+#ifdef STARFORMATION
+  CoolSfr.set_units_sfr();
+#endif
+}
+
+/** \brief This function ends the simulation.
+ *
+ * This method has to be called by all processes. It should be used only
+ * if the simulation ends without an error, or if an error message has already been printed.
+ * Otherwise Terminate() should be used instead.
+ */
+void sim::endrun(void)
+{
+  mpi_printf("endrun called, calling MPI_Finalize()\nbye!\n\n");
+  fflush(stdout);
+
+  if(Shmem.Island_ThisTask == 0 && Shmem.Island_NTask != Shmem.World_NTask)
+    {
+      char c = 0;
+      // need to send this flag to our shared memory rank so that it also ends itself
+      MPI_Send(&c, 1, MPI_BYTE, Shmem.MyShmRankInGlobal, TAG_KEY, MPI_COMM_WORLD);
+    }
+
+  /* The hdf5 library will sometimes register an atexit() handler that calls its error handler.
+   * This is set to my_hdf_error_handler, which calls MPI_Abort.
+   * Calling MPI_Abort after MPI_Finalize is not allowed.
+   * Hence unset the HDF error handler here*/
+  H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+
+  MPI_Finalize();
+  exit(0);
+}
diff --git a/src/main/init.cc b/src/main/init.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3eb3466bbf91a4df7509c6a06435fcedded958b0
--- /dev/null
+++ b/src/main/init.cc
@@ -0,0 +1,693 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file init.cc
+ *
+ *  \brief code for initialization of a simulation from initial conditions
+ */
+
+#include "gadgetconfig.h"
+
+#include <mpi.h>
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include "../cooling_sfr/cooling.h"
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../fof/fof.h"
+#include "../gravtree/gravtree.h"
+#include "../io/io.h"
+#include "../io/snap_io.h"
+#include "../logs/timer.h"
+#include "../main/main.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../ngbtree/ngbtree.h"
+#include "../ngenic/ngenic.h"
+#include "../pm/pm.h"
+#include "../sort/parallel_sort.h"
+#include "../subfind/subfind_readid_io.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+
+using namespace std;
+
+/*! \brief Prepares the loaded initial conditions for the run
+ *
+ *  It is only called if RestartFlag != RST_RESUME. Various counters and variables are initialized.
+ *  Entries of the particle data structures not read from initial conditions are
+ *  initialized or converted, and an initial domain decomposition is performed.
+ *  If SPH particles are present, the initial SPH smoothing lengths are determined.
+ */
+void sim::init(int RestartSnapNum)
+{
+#ifdef NGENIC
+  if(All.RestartFlag == RST_CREATEICS || All.RestartFlag == RST_BEGIN)
+    {
+      Ngenic.ngenic_displace_particles();
+
+      if(All.RestartFlag == RST_CREATEICS)
+        {
+          double fac = 1 / sqrt(All.cf_a3inv);
+          for(int i = 0; i < Sp.NumPart; i++)
+            for(int k = 0; k < 3; k++)
+              Sp.P[i].Vel[k] *= fac;
+
+          strcat(All.SnapshotFileBase, "_ics");
+          mpi_printf("Start writing file %s\nRestartSnapNum %d\n", All.SnapshotFileBase, 0);
+          snap_io Snap(&Sp, Communicator, All.SnapFormat); /* get an I/O object */
+          Snap.write_snapshot(0, NORMAL_SNAPSHOT);
+          endrun();
+        }
+    }
+#else
+  if(All.RestartFlag == RST_CREATEICS)
+    {
+      Terminate("Compile with option NGENIC to create cosmological initial conditions");
+    }
+#endif
+
+#ifdef LIGHTCONE_PARTICLES
+  Lp.MaxPart = LIGHTCONE_ALLOC_FAC * Sp.MaxPart;
+  Lp.NumPart = 0;
+  Lp.allocate_memory();
+#endif
+
+#ifdef LIGHTCONE_MASSMAPS
+  LightCone.Mp->Npix = nside2npix(All.LightConeMassMapsNside);
+
+  subdivide_evenly(LightCone.Mp->Npix, NTask, ThisTask, &LightCone.Mp->FirstPix, &LightCone.Mp->NpixLoc);
+
+  Mp.MaxPart = LIGHTCONE_MASSMAP_ALLOC_FAC * (Sp.TotNumPart / NTask);
+  Mp.NumPart = 0;
+  Mp.allocate_memory();
+  LightCone.MassMap = (double *)Mem.mymalloc_movable_clear(&LightCone.MassMap, "MassMap", LightCone.Mp->NpixLoc * sizeof(double));
+#endif
+
+  /* this makes sure that masses are initialized in the case that the mass-block
+     is empty for this particle type */
+
+  for(int i = 0; i < Sp.NumPart; i++)
+    if(All.MassTable[Sp.P[i].getType()] != 0)
+      {
+#ifndef LEAN
+        Sp.P[i].setMass(All.MassTable[Sp.P[i].getType()]);
+#else
+        All.PartMass = All.MassTable[Sp.P[i].getType()];
+#endif
+      }
+
+#if NSOFTCLASSES > 1
+  for(int i = 0; i < Sp.NumPart; i++)
+    Sp.P[i].setSofteningClass(All.SofteningClassOfPartType[Sp.P[i].getType()]);
+#endif
+
+#ifdef GENERATE_GAS_IN_ICS
+  if(All.RestartFlag == RST_BEGIN)
+    {
+      /* determine maximum ID */
+      MyIDType maxid = 0;
+      for(int i = 0; i < Sp.NumPart; i++)
+        if(Sp.P[i].ID.get() > maxid)
+          maxid = Sp.P[i].ID.get();
+
+      MyIDType *tmp = (MyIDType *)Mem.mymalloc("tmp", NTask * sizeof(MyIDType));
+      MPI_Allgather(&maxid, sizeof(MyIDType), MPI_BYTE, tmp, sizeof(MyIDType), MPI_BYTE, Communicator);
+
+      for(int i = 0; i < NTask; i++)
+        if(tmp[i] > maxid)
+          maxid = tmp[i];
+
+      Mem.myfree(tmp);
+
+      All.FlagICsContainedEntropy = 0;
+
+      int count = 0;
+      for(int i = 0; i < Sp.NumPart; i++)
+#ifdef SPLIT_PARTICLE_TYPE
+        if((1 << Sp.P[i].getType()) & (SPLIT_PARTICLE_TYPE))
+#else
+        if(Sp.P[i].getType() == 1)
+#endif
+          count++;
+
+      int *numpart_list = (int *)Mem.mymalloc("numpart_list", NTask * sizeof(int));
+      MPI_Allgather(&count, 1, MPI_INT, numpart_list, 1, MPI_INT, Communicator);
+
+      maxid++;
+
+      for(int i = 0; i < ThisTask; i++)
+        maxid += numpart_list[i];
+
+      Mem.myfree(numpart_list);
+
+      Domain.domain_resize_storage(count + Sp.NumPart, count, 0);
+
+      memmove(Sp.P + count, Sp.P, sizeof(particle_data) * Sp.NumPart);
+
+      Sp.NumPart += count;
+      Sp.NumGas += count;
+
+      if(Sp.NumGas > Sp.MaxPartSph)
+        Terminate("Task=%d ends up getting more SPH particles (%d) than allowed (%d)\n", ThisTask, Sp.NumGas, Sp.MaxPartSph);
+
+      if(Sp.NumPart > Sp.MaxPart)
+        Terminate("Task=%d ends up getting more particles (%d) than allowed (%d)\n", ThisTask, Sp.NumPart, Sp.MaxPart);
+
+      double fac = All.OmegaBaryon / All.Omega0;
+      double rho = All.Omega0 * 3 * All.Hubble * All.Hubble / (8 * M_PI * All.G);
+
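+      /* each selected DM particle of mass m is split into a gas particle of mass fac*m and a DM particle
+       * of mass (1-fac)*m; the two are displaced in opposite directions along the diagonal by a and b,
+       * chosen such that their common centre of mass stays at the original position */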
+      int j = 0;
+      for(int i = count; i < Sp.NumPart; i++)
+#ifdef SPLIT_PARTICLE_TYPE
+        if((1 << Sp.P[i].getType()) & (SPLIT_PARTICLE_TYPE))
+#else
+        if(Sp.P[i].getType() == 1)
+#endif
+          {
+            double d = pow(Sp.P[i].getMass() / rho, 1.0 / 3);
+            double a = 0.5 * All.OmegaBaryon / All.Omega0 * d;
+            double b = 0.5 * (All.Omega0 - All.OmegaBaryon) / All.Omega0 * d;
+
+            MyIntPosType delta_a[3];
+            double aa[3] = {a, a, a};
+            Sp.pos_to_signedintpos(aa, (MySignedIntPosType *)delta_a);
+
+            MyIntPosType delta_b[3];
+            double bb[3] = {b, b, b};
+            Sp.pos_to_signedintpos(bb, (MySignedIntPosType *)delta_b);
+
+            Sp.P[j] = Sp.P[i];
+
+            Sp.P[j].setMass(Sp.P[j].getMass() * fac);
+            Sp.P[i].setMass(Sp.P[i].getMass() * (1 - fac));
+
+            Sp.P[j].setType(0);
+#if NSOFTCLASSES > 1
+            Sp.P[j].setSofteningClass(All.SofteningClassOfPartType[0]);
+#endif
+            Sp.P[j].ID.set(maxid++);
+            Sp.P[i].IntPos[0] += delta_a[0];
+            Sp.P[i].IntPos[1] += delta_a[1];
+            Sp.P[i].IntPos[2] += delta_a[2];
+            Sp.P[j].IntPos[0] -= delta_b[0];
+            Sp.P[j].IntPos[1] -= delta_b[1];
+            Sp.P[j].IntPos[2] -= delta_b[2];
+
+            j++;
+          }
+
+      All.MassTable[0] = 0;
+
+#ifdef SPLIT_PARTICLE_TYPE
+      for(int i = 1; i < NTYPES; i++)
+        if((1 << i) & (SPLIT_PARTICLE_TYPE))
+          All.MassTable[i] *= (1 - fac);
+#else
+      All.MassTable[1] *= (1 - fac);
+#endif
+
+      mpi_printf("\nGENERATE_GAS_IN_ICS: Generated gas particles from DM particle distribution.  TotNumGas=%lld\n\n", Sp.TotNumGas);
+    }
+#endif
+
+#ifdef STARFORMATION
+  if(All.RestartFlag == RST_BEGIN)
+    {
+      if(All.MassTable[STAR_TYPE] == 0 && All.MassTable[0] > 0)
+        {
+          All.MassTable[0] = 0;
+        }
+    }
+#endif
+
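+  /* convert InitGasTemp to a specific internal energy, u = k_B T / [(gamma-1) mu m_p], expressed in internal units */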
+  double u_init = (1.0 / GAMMA_MINUS1) * (BOLTZMANN / PROTONMASS) * All.InitGasTemp;
+  u_init *= All.UnitMass_in_g / All.UnitEnergy_in_cgs; /* unit conversion */
+
+  double molecular_weight;
+  if(All.InitGasTemp > 1.0e4) /* assuming FULL ionization */
+    molecular_weight = 4 / (8 - 5 * (1 - HYDROGEN_MASSFRAC));
+  else /* assuming NEUTRAL GAS */
+    molecular_weight = 4 / (1 + 3 * HYDROGEN_MASSFRAC);
+
+  u_init /= molecular_weight;
+
+  All.InitGasU = u_init;
+
+  if(All.RestartFlag == RST_BEGIN)
+    {
+      if(All.InitGasTemp > 0)
+        {
+          for(int i = 0; i < Sp.NumGas; i++)
+            {
+              if(ThisTask == 0 && i == 0 && Sp.SphP[i].Entropy == 0)
+                mpi_printf("READIC: Initializing u from InitGasTemp !\n");
+
+              if(Sp.SphP[i].Entropy == 0)
+                Sp.SphP[i].Entropy = All.InitGasU;
+              /* Note: the conversion to entropy is done later in init(),
+                 after the densities have been computed */
+            }
+        }
+    }
+
+  for(int i = 0; i < Sp.NumGas; i++)
+    Sp.SphP[i].Entropy = std::max<double>(All.MinEgySpec, Sp.SphP[i].Entropy);
+
+#ifdef COOLING
+  CoolSfr.IonizeParams();
+#endif
+
+  if(All.ComovingIntegrationOn)
+    {
+      All.Timebase_interval = (log(All.TimeMax) - log(All.TimeBegin)) / TIMEBASE;
+      All.Ti_Current        = 0;
+    }
+  else
+    {
+      All.Timebase_interval = (All.TimeMax - All.TimeBegin) / TIMEBASE;
+      All.Ti_Current        = 0;
+    }
+
+  All.set_cosmo_factors_for_current_time();
+
+  All.NumCurrentTiStep  = 0; /* setup some counters */
+  All.SnapshotFileCount = 0;
+
+  if(All.RestartFlag == RST_STARTFROMSNAP)
+    {
+      if(RestartSnapNum < 0)
+        All.SnapshotFileCount = atoi(All.InitCondFile + strlen(All.InitCondFile) - 3) + 1;
+      else
+        All.SnapshotFileCount = RestartSnapNum + 1;
+    }
+
+  All.TotNumOfForces     = 0;
+  All.TotNumDirectForces = 0;
+  All.TotNumDensity      = 0;
+  All.TotNumHydro        = 0;
+
+  All.TopNodeAllocFactor = 0.08;
+  All.TreeAllocFactor    = 0.3;
+  All.NgbTreeAllocFactor = 0.7;
+
+  All.TimeLastStatistics = All.TimeBegin - All.TimeBetStatistics;
+
+#if defined(EVALPOTENTIAL) && defined(PMGRID) && defined(PERIODIC)
+  double mass_sum = 0;
+
+  for(int i = 0; i < Sp.NumPart; i++)
+    mass_sum += Sp.P[i].getMass();
+
+  MPI_Allreduce(&mass_sum, &All.TotalMass, 1, MPI_DOUBLE, MPI_SUM, Communicator);
+#endif
+
+  if(All.ComovingIntegrationOn) /*  change to new velocity variable */
+    {
+      double afac = sqrt(All.Time) * All.Time;
+
+      for(int i = 0; i < Sp.NumPart; i++)
+        {
+          for(int j = 0; j < 3; j++)
+            Sp.P[i].Vel[j] *= afac; /* for dm/gas particles, p = a^2 xdot */
+        }
+    }
+
+  for(int i = 0; i < TIMEBINS; i++)
+    All.Ti_begstep[i] = 0;
+
+#if defined(PMGRID) && !defined(TREEPM_NOTIMESPLIT)
+  All.PM_Ti_endstep = All.PM_Ti_begstep = 0;
+#endif
+
+  for(int i = 0; i < Sp.NumPart; i++) /*  start-up initialization with non-zero values where required */
+    {
+#ifndef LEAN
+      Sp.P[i].access.clear();
+#endif
+#ifdef MERGERTREE
+      Sp.P[i].PrevSubhaloNr.set(HALONR_MAX);
+#endif
+    }
+
+  for(int i = 0; i < TIMEBINS; i++)
+    Sp.TimeBinSynchronized[i] = 1;
+
+  Sp.reconstruct_timebins();
+
+  for(int i = 0; i < Sp.NumGas; i++) /* initialize sph_properties with non-zero values where required */
+    {
+      Sp.SphP[i].EntropyPred = Sp.SphP[i].Entropy;
+
+      for(int j = 0; j < 3; j++)
+        Sp.SphP[i].VelPred[j] = Sp.P[i].Vel[j];
+
+      if(All.RestartFlag == RST_BEGIN)
+        {
+#ifdef COOLING
+          Sp.SphP[i].Ne = 1.0;
+#endif
+        }
+
+#ifdef PRESSURE_ENTROPY_SPH
+      Sp.SphP[i].EntropyToInvGammaPred = pow(Sp.SphP[i].EntropyPred, 1.0 / GAMMA);
+#endif
+
+#ifdef TIMEDEP_ART_VISC
+#ifdef HIGH_ART_VISC_START
+      Sp.SphP[i].Alpha = All.ArtBulkViscConst;
+#else
+      Sp.SphP[i].Alpha = All.AlphaMin;
+#endif
+#endif
+    }
+
+#ifdef RECREATE_UNIQUE_IDS
+  recreate_unique_ids();
+#endif
+
+  test_id_uniqueness();
+
+  Domain.domain_decomposition(STANDARD); /* do initial domain decomposition (gives equal numbers of particles) */
+
+  GravTree.set_softenings();
+
+#ifdef ADAPTIVE_HYDRO_SOFTENING
+  mpi_printf("INIT: Adaptive hydro softening, minimum gravitational softening for SPH particles: %g\n",
+             All.MinimumComovingHydroSoftening);
+  mpi_printf("INIT: Adaptive hydro softening, maximum gravitational softening for SPH particles: %g\n",
+             All.MinimumComovingHydroSoftening * pow(All.AdaptiveHydroSofteningSpacing, NSOFTCLASSES_HYDRO - 1));
+  mpi_printf("INIT: Adaptive hydro softening, number of softening values: %d\n", NSOFTCLASSES_HYDRO);
+#endif
+
+#ifdef INDIVIDUAL_GRAVITY_SOFTENING
+  Sp.init_individual_softenings();
+#endif
+
+  if(All.RestartFlag == RST_FOF)
+    {
+#ifdef FOF
+
+#if defined(SUBFIND) && defined(MERGERTREE)
+      // we are reading the previous subhalo catalogue, if available, to assign the previous subhalo length to particles
+      MergerTree.get_previous_size_of_subhalo_for_each_particle(RestartSnapNum - 1);
+#endif
+
+      Sp.PS = (subfind_data *)Mem.mymalloc_movable(&Sp.PS, "PS", Sp.MaxPart * sizeof(subfind_data));
+      memset(Sp.PS, 0, Sp.MaxPart * sizeof(subfind_data));
+
+      /* First, we save the original location of the particles, in order to be able to revert to this layout later on */
+      for(int i = 0; i < Sp.NumPart; i++)
+        {
+          Sp.PS[i].OriginTask  = ThisTask;
+          Sp.PS[i].OriginIndex = i;
+        }
+      fof<simparticles> FoF{Communicator, &Sp, &Domain};
+      FoF.fof_fof(RestartSnapNum, "fof", "groups", 0);
+
+      {
+        All.DumpFlag_nextoutput = 1;
+        snap_io Snap(&Sp, Communicator, All.SnapFormat); /* get an I/O object */
+        Snap.write_snapshot(RestartSnapNum, NORMAL_SNAPSHOT);
+      }
+
+#ifdef SUBFIND_ORPHAN_TREATMENT
+      {
+        /* now read the IDs of the most bound particles of a previously existing special dump. The idea is that one can
+         * then, also in postprocessing, construct a cumulative list of all particles that used to be a most bound particle
+         * in any previous dump. This can be accomplished by computing the group catalogues consecutively from 000 to the last
+         * snapshot number.
+         */
+
+        if(RestartSnapNum > 0)
+          {
+            subreadid_io SnapIDread(&Sp.IdStore, Communicator, All.SnapFormat);
+            SnapIDread.previously_bound_read_snap_ids(RestartSnapNum - 1);
+
+            FoF.subfind_match_ids_of_previously_most_bound_ids(&Sp);
+          }
+
+        snap_io Snap(&Sp, Communicator, All.SnapFormat);
+        Snap.write_snapshot(RestartSnapNum, MOST_BOUND_PARTICLE_SNAPHOT); /* write special snapshot file */
+      }
+#endif
+
+#endif
+      endrun();
+    }
+
+  /* build neighbor tree */
+  NgbTree.treeallocate(Sp.NumGas, &Sp, &Domain);
+  NgbTree.treebuild(Sp.NumGas, NULL);
+
+  if(All.RestartFlag == RST_POWERSPEC)
+    {
+#if defined(PMGRID) && defined(PERIODIC)
+
+      PM.calculate_power_spectra(RestartSnapNum);
+#else
+      mpi_printf("\nThis option (Power Spectrum) only works for PERIODIC and enabled PMGRID.\n\n");
+#endif
+      endrun();
+    }
+
+  All.Ti_Current = 0;
+
+  setup_smoothinglengths();
+
+  /* at this point, the entropy variable actually contains the
+   * internal energy, read in from the initial conditions file.
+   * Once the density has been computed, we can convert to entropy.
+   */
+#ifdef PRESSURE_ENTROPY_SPH
+  if(All.FlagICsContainedEntropy == 0)
+    NgbTree.setup_entropy_to_invgamma();
+#endif
+
+  double mass = 0;
+  for(int i = 0; i < Sp.NumGas; i++)
+    {
+      if(All.FlagICsContainedEntropy == 0)
+        {
+          if(ThisTask == 0 && i == 0)
+            printf("INIT: Converting u -> entropy\n");
+
+#if !defined(PRESSURE_ENTROPY_SPH) && !defined(ISOTHERM_EQS)
+          Sp.SphP[i].Entropy = GAMMA_MINUS1 * Sp.SphP[i].Entropy / pow(Sp.SphP[i].Density * All.cf_a3inv, GAMMA_MINUS1);
+#endif
+          Sp.SphP[i].EntropyPred = Sp.SphP[i].Entropy;
+        }
+
+      /* The predicted entropy values have already been set for all SPH formulations, */
+      /* so it should be ok to compute pressure and sound speed now */
+      Sp.SphP[i].set_thermodynamic_variables();
+
+      mass += Sp.P[i].getMass();
+    }
+
+  if(All.ComovingIntegrationOn)
+    {
+#ifdef PERIODIC
+      if(All.RestartFlag == RST_BEGIN || All.RestartFlag == RST_RESUME || All.RestartFlag == RST_STARTFROMSNAP ||
+         All.RestartFlag == RST_CREATEICS)
+        {
+          /* can't do this check when not all particles are loaded */
+          check_omega();
+        }
+      else
+        {
+          mpi_printf("Skipping Omega check since not all particles are loaded\n");
+        }
+#endif
+    }
+
+#ifdef STARFORMATION
+  /* initialize absolute masses in materials */
+  for(int i = 0; i < Sp.NumGas; i++)
+    {
+      Sp.SphP[i].Metallicity = Sp.P[i].Metallicity;  // set above
+
+      Sp.SphP[i].MassMetallicity = Sp.SphP[i].Metallicity * Sp.P[i].getMass();
+    }
+#endif
+
+    // tree_based_timesteps_set_initialmaxtistep();
+
+#ifdef DEBUG_MD5
+  Logs.log_debug_md5("AFTER-INIT");
+#endif
+
+  return;
+}
+
+#ifdef PERIODIC
+/*! \brief This routine computes the mass content of the box and compares it to the
+ * specified value of Omega-matter.
+ *
+ * If discrepant, the run is terminated.
+ */
+void sim::check_omega(void)
+{
+  double mass = 0;
+
+  for(int i = 0; i < Sp.NumPart; i++)
+    mass += Sp.P[i].getMass();
+
+  double masstot;
+  MPI_Allreduce(&mass, &masstot, 1, MPI_DOUBLE, MPI_SUM, Communicator);
+
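+  /* mean matter density of the box compared with Omega0 * rho_crit, where rho_crit = 3 H^2 / (8 pi G) */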
+  double omega = masstot * (LONG_X * LONG_Y * LONG_Z) / (All.BoxSize * All.BoxSize * All.BoxSize) /
+                 (3 * All.Hubble * All.Hubble / (8 * M_PI * All.G));
+  if(fabs(omega - All.Omega0) > 1.0e-2)
+    {
+      mpi_printf(
+          "\n\nI've found something odd!\nThe mass content accounts only for Omega=%g,\nbut you specified Omega=%g in the "
+          "parameterfile.\n\nI better stop.\n",
+          omega, All.Omega0);
+      endrun();
+    }
+}
+#endif
+
+/*! \brief This function is used to find an initial smoothing length for each SPH
+ *  particle.
+ *
+ *  It guarantees that the number of neighbours will be between
+ *  desired_ngb-MAXDEV and desired_ngb+MAXDEV. For simplicity, a first guess
+ *  of the smoothing length is provided to the function density(), which will
+ *  then iterate if needed to find the right smoothing length.
+ */
+void sim::setup_smoothinglengths(void)
+{
+  Sp.TimeBinsGravity.NActiveParticles = 0;
+
+  for(int i = 0; i < Sp.NumGas; i++)
+    Sp.TimeBinsGravity.ActiveParticleList[Sp.TimeBinsGravity.NActiveParticles++] = i;
+
+  sumup_large_ints(1, &Sp.TimeBinsGravity.NActiveParticles, &Sp.TimeBinsGravity.GlobalNActiveParticles, Communicator);
+
+  if(Sp.TimeBinsGravity.GlobalNActiveParticles > 0)
+    {
+      mpi_printf("INIT: Setup smoothing lengths.\n");
+
+      GravTree.treeallocate(Sp.NumPart, &Sp, &Domain);
+      GravTree.treebuild(Sp.TimeBinsGravity.NActiveParticles, Sp.TimeBinsGravity.ActiveParticleList);
+
+      for(int i = 0; i < Sp.NumGas; i++)
+        {
+          int no = GravTree.Father[i];
+
+          while(10 * All.DesNumNgb * Sp.P[i].getMass() > GravTree.get_nodep(no)->mass)
+            {
+              int p = GravTree.get_nodep(no)->father;
+
+              if(p < 0)
+                break;
+
+              no = p;
+            }
+
+          double len;
+          if(GravTree.get_nodep(no)->level > 0)
+            len = (((MyIntPosType)1) << (BITS_FOR_POSITIONS - GravTree.get_nodep(no)->level)) * Sp.FacIntToCoord;
+          else
+            len = Sp.RegionLen;
+
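+          /* first guess: radius of a sphere that would contain about DesNumNgb particle masses,
+             assuming the mean density of the enclosing tree node of size len */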
+          Sp.SphP[i].Hsml = pow(3.0 / (4 * M_PI) * All.DesNumNgb * Sp.P[i].getMass() / GravTree.get_nodep(no)->mass, 1.0 / 3) * len;
+        }
+
+      Sp.TimeBinsHydro.NActiveParticles = 0;
+      for(int i = 0; i < Sp.NumGas; i++)
+        Sp.TimeBinsGravity.ActiveParticleList[Sp.TimeBinsHydro.NActiveParticles++] = i;
+
+      NgbTree.density(Sp.TimeBinsGravity.ActiveParticleList, Sp.TimeBinsHydro.NActiveParticles);
+
+#ifdef PRESSURE_ENTROPY_SPH
+      for(int i = 0; i < Sp.NumGas; i++)
+        Sp.SphP[i].PressureSphDensity = Sp.SphP[i].Density;
+#endif
+
+      GravTree.treefree();
+    }
+}
+
+void sim::recreate_unique_ids(void)
+{
+  mpi_printf("INIT: Setting new unique IDs.\n");
+
+  int *numpart_list = (int *)Mem.mymalloc("numpart_list", NTask * sizeof(int));
+
+  MPI_Allgather(&Sp.NumPart, 1, MPI_INT, numpart_list, 1, MPI_INT, Communicator);
+
+  MyIDType id = 1;
+
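+  /* start our ID range just beyond the cumulative particle count of all lower-ranked tasks */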
+  for(int i = 0; i < ThisTask; i++)
+    id += numpart_list[i];
+
+  for(int i = 0; i < Sp.NumPart; i++)
+    Sp.P[i].ID.set(id++);
+
+  Mem.myfree(numpart_list);
+}
+
+/*! \brief This function checks for unique particle IDs
+ *
+ *  The particle IDs are copied to an array and then sorted among all tasks.
+ *  This array is then checked for duplicates; if any are found, the code terminates.
+ */
+void sim::test_id_uniqueness(void)
+{
+  mpi_printf("INIT: Testing ID uniqueness...\n");
+
+  double t0 = Logs.second();
+
+  MyIDType *ids       = (MyIDType *)Mem.mymalloc("ids", (Sp.NumPart + 1) * sizeof(MyIDType));
+  MyIDType *ids_first = (MyIDType *)Mem.mymalloc("ids_first", NTask * sizeof(MyIDType));
+  int *num_list       = (int *)Mem.mymalloc("num_list", NTask * sizeof(int));
+
+  for(int i = 0; i < Sp.NumPart; i++)
+    ids[i] = Sp.P[i].ID.get();
+
+  mycxxsort_parallel(ids, ids + Sp.NumPart, Sp.compare_IDs, Communicator);
+
+  for(int i = 1; i < Sp.NumPart; i++)
+    {
+      if(ids[i] == ids[i - 1])
+        Terminate("non-unique ID=%lld found on task=%d (i=%d Sp.NumPart=%d type=%d)\n", (long long)ids[i], ThisTask, i, Sp.NumPart,
+                  Sp.P[i].getType());
+    }
+
+  MPI_Allgather(&ids[0], sizeof(MyIDType), MPI_BYTE, ids_first, sizeof(MyIDType), MPI_BYTE, Communicator);
+  MPI_Allgather(&Sp.NumPart, 1, MPI_INT, num_list, 1, MPI_INT, Communicator);
+
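+  /* also compare our last ID with the first ID of the next non-empty task to catch duplicates that straddle task boundaries */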
+  int next_non_empty_task = ThisTask + 1;
+
+  while(next_non_empty_task < NTask)
+    if(num_list[next_non_empty_task] == 0)
+      next_non_empty_task++;
+    else
+      break;
+
+  if(Sp.NumPart > 0 && next_non_empty_task < NTask)
+    {
+      if(ids[Sp.NumPart - 1] == ids_first[next_non_empty_task])
+        Terminate("non-unique ID=%lld found on task=%d\n", (long long)ids[Sp.NumPart - 1], ThisTask);
+    }
+
+  Mem.myfree(num_list);
+  Mem.myfree(ids_first);
+  Mem.myfree(ids);
+
+  double t1 = Logs.second();
+
+  mpi_printf("INIT: success.  took=%g sec\n\n", Logs.timediff(t0, t1));
+}
diff --git a/src/main/main.cc b/src/main/main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..021f6c1065ccfb92d8daac95ac4280c17ac0668c
--- /dev/null
+++ b/src/main/main.cc
@@ -0,0 +1,355 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  main.cc
+ *
+ *  \brief start of the program
+ */
+
+#include "gadgetconfig.h"
+
+#include <gsl/gsl_math.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "../cooling_sfr/cooling.h"
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../half/half.hpp"
+#include "../io/io.h"
+#include "../io/restart.h"
+#include "../io/snap_io.h"
+#include "../logs/logs.h"
+#include "../main/main.h"
+#include "../main/simulation.h"
+#include "../mergertree/mergertree.h"
+#include "../mpi_utils/shared_mem_handler.h"
+#include "../ngenic/ngenic.h"
+#include "../system/system.h"
+#include "../time_integration/driftfac.h"
+
+/* create instances of global objects */
+
+global_data_all_processes All;
+driftfac Driftfac;
+ewald Ewald; /* get an instance of the Ewald correction tables */
+logs Logs;
+memory Mem; /* our instance of the memory object */
+shmem Shmem;
+
+/*!
+ *  This function initializes the MPI communication packages, and sets
+ *  cpu-time counters to 0.  Then begrun1() is called, which sets up
+ *  the simulation. Then either IC's or restart files are loaded. In
+ *  case of IC's init() is called which prepares the IC's for the run.
+ *  A call to begrun2() finishes the initialization. Finally, run() is
+ *  started, the main simulation loop, which iterates over the timesteps.
+ */
+int main(int argc, char **argv)
+{
+  /* find out how many cores we have per CPU and which ones we can use */
+  pinning Pin;
+  Pin.detect_topology();
+  Pin.get_core_set();
+
+  /* initialize MPI, this may already impose some pinning */
+  MPI_Init(&argc, &argv);
+
+  MPI_Comm_rank(MPI_COMM_WORLD, &Shmem.World_ThisTask);
+  MPI_Comm_size(MPI_COMM_WORLD, &Shmem.World_NTask);
+
+#if NUMBER_OF_MPI_LISTENERS_PER_NODE > 1
+  MPI_Comm fullsharedmemnode;
+  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &fullsharedmemnode);
+
+  int fullsharedmemnode_ThisTask, fullsharedmemnode_NTask;
+  MPI_Comm_rank(fullsharedmemnode, &fullsharedmemnode_ThisTask);
+  MPI_Comm_size(fullsharedmemnode, &fullsharedmemnode_NTask);
+
+  int bin;
+  subdivide_evenly_get_bin(fullsharedmemnode_NTask, NUMBER_OF_MPI_LISTENERS_PER_NODE, fullsharedmemnode_ThisTask, &bin);
+
+  MPI_Comm_split(fullsharedmemnode, bin, fullsharedmemnode_ThisTask, &Shmem.SharedMemComm);
+#else
+  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &Shmem.SharedMemComm);
+#endif
+
+  MPI_Comm_rank(Shmem.SharedMemComm, &Shmem.Island_ThisTask);
+  MPI_Comm_size(Shmem.SharedMemComm, &Shmem.Island_NTask);
+
+  int min_ntask, max_ntask;
+  MPI_Allreduce(&Shmem.Island_NTask, &max_ntask, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
+  MPI_Allreduce(&Shmem.Island_NTask, &min_ntask, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+  MPI_Allreduce(&Shmem.World_ThisTask, &Shmem.Island_Smallest_WorldTask, 1, MPI_INT, MPI_MIN, Shmem.SharedMemComm);
+
+  if(Shmem.World_ThisTask == 0)
+    printf("Shared memory islands host a minimum of %d and a maximum of %d MPI ranks.\n", min_ntask, max_ntask);
+
+  Shmem.GhostRank = 0;
+
+  if(max_ntask < Shmem.World_NTask)
+    {
+      if(Shmem.Island_ThisTask == Shmem.Island_NTask - 1)  // select the ghost MPI rank on each shared-memory island
+        Shmem.GhostRank = 1;
+
+      if(min_ntask > 1)
+        {
+          int comm_ranks;
+          MPI_Allreduce(&Shmem.GhostRank, &comm_ranks, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+
+          if(Shmem.World_ThisTask == 0)
+            printf("We shall use %d MPI ranks in total for assisting one-sided communication (%d per shared memory node).\n",
+                   comm_ranks, NUMBER_OF_MPI_LISTENERS_PER_NODE);
+        }
+      else
+        {
+          if(Shmem.World_ThisTask == 0)
+            Terminate("We have shared memory islands with just one MPI rank -- can't put aside this one just for communication");
+        }
+
+      if(max_ntask > MAX_NUMBER_OF_RANKS_WITH_SHARED_MEMORY)
+        {
+          if(Shmem.World_ThisTask == 0)
+            Terminate(
+                "We have shared memory islands with %d MPI ranks, which is larger than  MAX_NUMBER_OF_RANKS_WITH_SHARED_MEMORY=%d\n"
+                "You may consider increasing NUMBER_OF_MPI_LISTENERS_PER_NODE, current value is %d\n",
+                max_ntask, MAX_NUMBER_OF_RANKS_WITH_SHARED_MEMORY, NUMBER_OF_MPI_LISTENERS_PER_NODE);
+        }
+    }
+
+  /* we can now split the communicator into the processing ones, and the ones reserved for communication */
+  MPI_Comm_split(MPI_COMM_WORLD, Shmem.GhostRank, Shmem.World_ThisTask, &Shmem.SimulationComm);
+
+  MPI_Comm_rank(Shmem.SimulationComm, &Shmem.Sim_ThisTask);
+  MPI_Comm_size(Shmem.SimulationComm, &Shmem.Sim_NTask);
+
+  /* Let's now find out for everyone the global rank of the responsible shared memory ghost */
+  Shmem.MyShmRankInGlobal = Shmem.World_ThisTask;
+
+  MPI_Bcast(&Shmem.MyShmRankInGlobal, 1, MPI_INT, Shmem.Island_NTask - 1, Shmem.SharedMemComm);
+
+  if(Shmem.GhostRank == 1)
+    {
+      Mem.initcomm(Shmem.SimulationComm);
+      Shmem.shared_memory_handler();  // note: this call will not return
+    }
+
+  /* creating main simulation object, holding all the data and objects of our simulation */
+  sim Sim{Shmem.SimulationComm};
+
+  Sim.determine_compute_nodes();
+
+  /* initialize the communicator structures of our global objects */
+  Ewald.initcomm(Sim.Communicator);  // a global class
+  Logs.initcomm(Sim.Communicator);   // a global class
+  All.initcomm(Sim.Communicator);    // a global class
+  Mem.initcomm(Sim.Communicator);    // a global class
+  Mem.determine_compute_nodes();
+
+  /* output a welcome message */
+  Sim.hello();
+
+  /* initialize CPU-time/Wallclock-time measurement */
+  Logs.init_cpu_log(&Sim.Sp);
+
+  /* pin the MPI ranks to the available core set */
+  Pin.pin_to_core_set(&Sim);
+
+#ifdef HOST_MEMORY_REPORTING
+  Sim.mpi_report_comittable_memory();
+  Mem.MemoryOnNode       = Sim.MemoryOnNode;
+  Mem.SharedMemoryOnNode = Sim.SharedMemoryOnNode;
+#endif
+
+  if(argc < 2)
+    {
+      if(Sim.ThisTask == 0)
+        {
+          printf("\nParameters are missing.\n");
+          printf("Start as ./Gadget4 <ParameterFile> [<RestartFlag>] [<SpecialOptions>]\n");
+          printf("\n");
+          printf("   RestartFlag    Action\n");
+          printf("      %2d          Read initial conditions and start simulation\n", RST_BEGIN);
+          printf("      %2d          Read restart files and resume simulation\n", RST_RESUME);
+          printf("      %2d          Restart from specified snapshot dump and resume simulation\n", RST_STARTFROMSNAP);
+          printf("      %2d          Run FOF and optionally SUBFIND\n", RST_FOF);
+          printf("      %2d          Calculate a matter power spectrum\n", RST_POWERSPEC);
+          printf("      %2d          Convert snapshot file to different format [input=ICFormat  output=SnapFormat\n", RST_CONVERTSNAP);
+          printf("      %2d          Create cosmological initial conditions\n", RST_CREATEICS);
+          printf("      %2d          Calculate descendants/progenitors [connecting group catalogues SnapNum-1 and SnapNum]\n",
+                 RST_CALCDESC);
+          printf("      %2d          Arrange halos in merger trees [using group catalogues up to SnapNum]\n", RST_MAKETREES);
+          printf("      %2d          Carry out I/O bandwidth test to determine best setting for number of concurrent reads/writes\n",
+                 RST_IOBANDWIDTH);
+          printf("      %2d          Make an image of a lightcone particle output [image parameterfile]\n", RST_LCIMAGE);
+          printf("      %2d          Produce semi-analytic galaxies by running LGalaxies [lgal-parameterfile <conenr>]\n",
+                 RST_LGALAXIES);
+          printf("      %2d          Rearrange particle-lightcone data in merger tree order <conenr>  <firstnum>  <lastnum>\n",
+                 RST_LCREARRANGE);
+          printf("      %2d          Rearrange most-bound snapshot data in merger tree order <firstnum>  <lastnum>\n",
+                 RST_SNPREARRANGE);
+          printf("\n");
+        }
+      Sim.endrun();
+    }
+
+  /*  argv[1]  holds the parameterfile */
+
+  if(argc >= 3)
+    All.RestartFlag = (enum restart_options)atoi(argv[2]);
+  else
+    All.RestartFlag = RST_BEGIN;
+
+  int restartSnapNum = -1;
+  if(argc >= 4)
+    restartSnapNum = atoi(argv[3]);
+
+  /* Do minimal validation of arguments here rather than in random places in the code */
+  if((All.RestartFlag == RST_FOF || All.RestartFlag == RST_POWERSPEC || All.RestartFlag == RST_CONVERTSNAP ||
+      All.RestartFlag == RST_CALCDESC || All.RestartFlag == RST_MAKETREES) &&
+     restartSnapNum < 0)
+    {
+      Terminate("Need to give the snapshot number for the chosen option.\n");
+    }
+
+  /* set-up run based on parameterfile */
+  Sim.begrun1(argv[1]);
+
+  /* see if we are loading a restart file or an IC file */
+  if(All.RestartFlag == RST_RESUME)
+    {
+      restart Restart{Sim.Communicator};
+      Restart.load(&Sim);
+    }
+  else
+    {
+      /* We're reading an IC file. Is it a snapshot or really an IC? */
+      char fname[MAXLEN_PATH_EXTRA];
+
+      if(All.RestartFlag != RST_BEGIN && All.RestartFlag != RST_RESUME && restartSnapNum >= 0)
+        {
+          if(All.NumFilesPerSnapshot > 1)
+            sprintf(fname, "%s/snapdir_%03d/%s_%03d", All.OutputDir, restartSnapNum, All.SnapshotFileBase, restartSnapNum);
+          else
+            sprintf(fname, "%s%s_%03d", All.OutputDir, All.SnapshotFileBase, restartSnapNum);
+        }
+      else
+        {
+          strcpy(fname, All.InitCondFile);
+        }
+
+      if(All.RestartFlag == RST_STARTFROMSNAP)
+        {
+          All.ICFormat = All.SnapFormat;
+        }
+
+      if(All.RestartFlag == RST_CALCDESC)
+        {
+#ifdef MERGERTREE
+          Sim.MergerTree.descendants_in_postprocessing(restartSnapNum);
+          Sim.endrun();
+#else
+          Terminate("Compile with option MERGERTREE for this option");
+#endif
+        }
+
+      if(All.RestartFlag == RST_MAKETREES)
+        {
+#ifdef MERGERTREE
+          Sim.MergerTree.halotrees_construct(restartSnapNum);
+          Sim.endrun();
+#else
+          Terminate("Compile with option MERGERTREE for this option");
+#endif
+        }
+
+      if(All.RestartFlag == RST_IOBANDWIDTH)
+        {
+          Sim.measure_io_bandwidth();
+          Sim.endrun();
+        }
+
+      if(All.RestartFlag == RST_LCIMAGE)
+        {
+#if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES)
+          Sim.LightCone.makeimage(argc, argv);
+          Sim.endrun();
+#else
+          Terminate("Compile with option LIGHTCONE_PARTICLES for this option");
+#endif
+        }
+
+      if(All.RestartFlag == RST_LGALAXIES)
+        {
+#ifdef LGALAXIES
+          Sim.LGalaxies.compute_semi_analytic_galaxies(argc, argv);
+          Sim.endrun();
+#else
+          Terminate("Compile with option LGALAXIES for this option");
+#endif
+        }
+
+      if(All.RestartFlag == RST_LCREARRANGE)
+        {
+#if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES) && defined(REARRANGE_OPTION)
+          Sim.rearrange_lightcone(argc, argv);
+#else
+          Terminate("need to compile with REARRANGE_OPTION for this option to work\n");
+#endif
+          Sim.endrun();
+        }
+
+      if(All.RestartFlag == RST_SNPREARRANGE)
+        {
+#if defined(MERGERTREE) && defined(REARRANGE_OPTION)
+          Sim.rearrange_snapshot(argc, argv);
+#else
+          Terminate("need to compile with REARRANGE_OPTION for this option to work\n");
+#endif
+          Sim.endrun();
+        }
+
+#ifdef CREATE_GRID
+      if(All.RestartFlag == RST_BEGIN || All.RestartFlag == RST_CREATEICS)
+        Sim.Ngenic.create_grid();
+      else
+#endif
+        {
+          snap_io Snap(&Sim.Sp, Sim.Communicator, All.ICFormat); /* get an I/O object */
+
+          Snap.read_ic(fname);
+        }
+
+      /* If we are supposed to just convert the file, write and exit here. */
+      if(All.RestartFlag == RST_CONVERTSNAP)
+        {
+#ifdef COOLING
+          Sim.CoolSfr.InitCool();
+#endif
+          strcat(All.SnapshotFileBase, "_converted");
+          Sim.mpi_printf("Start writing file %s\nSnapNum %d\n", All.SnapshotFileBase, restartSnapNum);
+
+          snap_io Snap(&Sim.Sp, Sim.Communicator, All.SnapFormat); /* get an I/O object */
+          Snap.write_snapshot(restartSnapNum, NORMAL_SNAPSHOT);
+
+          Sim.endrun();
+        }
+
+      Sim.init(restartSnapNum);
+    }
+
+  Sim.begrun2();
+
+  Sim.run(); /* main simulation loop */
+
+  Sim.endrun(); /* clean up & finalize MPI */
+
+  return 0;
+}
diff --git a/src/main/main.h b/src/main/main.h
new file mode 100644
index 0000000000000000000000000000000000000000..fc8a17c961bef36e282e712f22432d0f2c2e25e5
--- /dev/null
+++ b/src/main/main.h
@@ -0,0 +1,20 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file main.h
+ *
+ * \brief declares some global auxiliary functions
+ */
+
+#ifndef MAIN_H
+#define MAIN_H
+
+#include "gadgetconfig.h"
+
+int instrset_detect(void);
+void output_compile_time_options(void);
+
+#endif
diff --git a/src/main/run.cc b/src/main/run.cc
new file mode 100644
index 0000000000000000000000000000000000000000..48bb4053c47c16a2f1b3efd92a75acb7fcc2559f
--- /dev/null
+++ b/src/main/run.cc
@@ -0,0 +1,749 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  run.cc
+ *
+ *  \brief contains the basic simulation loop that iterates over timesteps
+ */
+
+#include "gadgetconfig.h"
+
+#include <ctype.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "../cooling_sfr/cooling.h"
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../gravtree/gravtree.h"
+#include "../io/io.h"
+#include "../io/snap_io.h"
+#include "../lightcone/lightcone_massmap_io.h"
+#include "../lightcone/lightcone_particle_io.h"
+#include "../logs/logs.h"
+#include "../main/main.h"
+#include "../main/simulation.h"
+#include "../ngbtree/ngbtree.h"
+#include "../sort/parallel_sort.h"
+#include "../system/system.h"
+
+/* TODO:
+ * - modify restart code to still support continuation of run beyond a new MaxTime from restart files.
+ * - consolidate global variables into ones that need to be kept across a restart, and ones that don't.
+ */
+
+/*!
+ * Main driver routine for advancing the simulation forward in time.
+ * The loop terminates when the cpu-time limit is reached, when a `stop' file
+ * is found in the output directory, or when the simulation ends because we
+ * arrived at TimeMax.
+ *
+ * If the simulation is started from initial conditions, an initial domain
+ * decomposition is performed, the gravitational forces are computed and
+ * initial hydro forces are calculated.
+ */
+void sim::run(void)
+{
+#if defined(NGENIC_TEST) && defined(PERIODIC) && defined(PMGRID)
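+  /* NGENIC_TEST mode: we only write a snapshot file (and optionally a power spectrum) and then return,
+   * without entering the time integration loop */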
+  snap_io Snap(&Sp, Communicator, All.SnapFormat);             /* get an I/O object */
+  Snap.write_snapshot(All.SnapshotFileCount, NORMAL_SNAPSHOT); /* write snapshot file */
+#if defined(POWERSPEC_ON_OUTPUT)
+  PM.calculate_power_spectra(All.SnapshotFileCount);
+#endif
+  return;
+#endif
+
+  while(1) /* main loop over synchronization points */
+    {
+      /* store old time for logging purposes */
+      All.TimeOld = All.Time;
+
+      /* determine the next synchronization time at which we have active particles */
+      integertime ti_next_kick_global = Sp.find_next_sync_point();
+
+#ifdef OUTPUT_NON_SYNCHRONIZED_ALLOWED
+      while(ti_next_kick_global > All.Ti_nextoutput && All.Ti_nextoutput >= 0)
+        {
+          All.Ti_Current = All.Ti_nextoutput;
+          All.Time       = All.get_absolutetime_from_integertime(All.Ti_Current);
+          All.set_cosmo_factors_for_current_time();
+
+          Sp.drift_all_particles();
+          create_snapshot_if_desired();
+        }
+#endif
+
+      All.Ti_Current = ti_next_kick_global;
+      All.Time       = All.get_absolutetime_from_integertime(All.Ti_Current);
+      All.set_cosmo_factors_for_current_time();
+      All.TimeStep = All.Time - All.TimeOld;
+
+#ifdef LIGHTCONE
+#ifdef LIGHTCONE_PARTICLES
+      mpi_printf("LIGHTCONE_PARTICLES: Lp.NumPart=%d   Checked %d box replicas out of list of length %d\n", Lp.NumPart,
+                 LightCone.NumLastCheck, LightCone.NumBoxes);
+#endif
+#ifdef LIGHTCONE_MASSMAPS
+      mpi_printf("LIGHTCONE_MASSMAPS:  Mp.NumPart=%d \n", Mp.NumPart);
+#endif
+#if defined(LIGHTCONE_MASSMAPS) || defined(LIGHTCONE_PARTICLES_GROUPS)
+      Sp.drift_all_particles();  // we do this here to be able to avoid large buffer sizes; if needed, multiple binning
+                                 // operations are done
+#endif
+#endif
+
+      /* mark the timebins that are active on this step */
+      Sp.mark_active_timebins();
+
+      /* create lists with the particles that are synchronized on this step */
+      Sp.make_list_of_active_particles();
+
+      /* produce some further log messages */
+      Logs.output_log_messages();
+
+      /* call functions that update certain 'extra' physics settings to new current time */
+      set_non_standard_physics_for_current_time();
+
+      /* for sufficiently large steps, carry out a new domain decomposition */
+      if(All.HighestActiveTimeBin >= All.SmallestTimeBinWithDomainDecomposition)
+        {
+          NgbTree.treefree();
+          Domain.domain_free();
+
+          Sp.drift_all_particles();
+
+#ifdef LIGHTCONE_PARTICLES
+          LightCone.lightcone_clear_boxlist(All.Time);
+#endif
+
+#ifdef DEBUG_MD5
+          Logs.log_debug_md5("C");
+#endif
+          Domain.domain_decomposition(STANDARD);
+
+#ifdef DEBUG_MD5
+          Logs.log_debug_md5("D");
+#endif
+
+          NgbTree.treeallocate(Sp.NumGas, &Sp, &Domain);
+          NgbTree.treebuild(Sp.NumGas, NULL);
+        }
+
+      /* compute SPH densities and smoothing lengths for active SPH particles, and optionally those
+       * accessed passively. This also creates the list of active hydro particles at this
+       * synchronization point, which is stored in the list TimeBinsHydro.ActiveParticleList[].
+       * This list is reused for the subsequent second and first hydro step. */
+      NgbTree.compute_densities();
+
+      /* if particles have increased their smoothing lengths, this is recorded in parent tree nodes */
+      NgbTree.update_maxhsml();
+
+      /* hydro-forces, second half-step. This will also update the predicted velocities/entropies with the new current ones */
+      do_hydro_step_second_half();
+
+      /* this does the closing gravity half-step for the timebins that end at the current synchronization point */
+      do_gravity_step_second_half();
+
+      /* do any extra physics, in a Strang-split way, for the timesteps that are finished */
+      calculate_non_standard_physics_end_of_step();
+
+#ifdef DEBUG_MD5
+      Logs.log_debug_md5("A");
+#endif
+
+      Logs.compute_statistics();
+
+      Logs.flush_everything();
+
+#ifdef DEBUG_MD5
+      Logs.log_debug_md5("BEFORE SNAP");
+#endif
+      create_snapshot_if_desired();
+
+#ifdef DEBUG_MD5
+      Logs.log_debug_md5("AFTER SNAP");
+#endif
+
+      if(All.Ti_Current >= TIMEBASE) /* have we reached the final time? */
+        {
+          mpi_printf("\nFinal time=%g reached. Simulation ends.\n", All.TimeMax);
+
+          if(All.Ti_lastoutput != All.Ti_Current) /* make a snapshot at the final time in case none has been produced at this time */
+            {
+              snap_io Snap(&Sp, Communicator, All.SnapFormat); /* get an I/O object */
+              /* this snapshot will be overwritten if All.TimeMax is increased and the run is continued */
+              Snap.write_snapshot(All.SnapshotFileCount++, NORMAL_SNAPSHOT);
+            }
+
+          break;
+        }
+
+      /* kicks particles by half a gravity step */
+      find_timesteps_and_do_gravity_step_first_half();
+
+#ifdef DEBUG_MD5
+      Logs.log_debug_md5("B");
+#endif
+
+      /* Find new hydro timesteps. This will not change the set of active hydro particles at this synchronization point,
+       * but it can change how they are distributed over timebins. */
+      find_hydro_timesteps();
+
+      /* compute hydro-forces and apply momentum changes to interacting particle pairs for first half-steps */
+      do_hydro_step_first_half();
+
+      /* update the neighbor tree with the new velocities */
+      NgbTree.update_velocities();
+
+      /* output some CPU usage log-info (accounts for everything needed to complete the previous timestep) */
+      Logs.write_cpu_log();
+
+#ifdef STOP_AFTER_STEP
+      if(All.NumCurrentTiStep == STOP_AFTER_STEP)
+        {
+          mpi_printf("RUN: We have reached the timestep specified with STOP_AFTER_STEP and therefore stop.");
+          endrun();
+        }
+#endif
+
+      All.NumCurrentTiStep++;
+
+      /* check whether the run must be interrupted; this call also takes care of writing restart files when needed */
+      if(check_for_interruption_of_run())
+        return;
+    }
+
+  restart Restart{Communicator};
+  Restart.write(this); /* write a restart file at final time - can be used to continue simulation beyond final time */
+
+  Logs.write_cpu_log(); /* output final cpu measurements */
+}
+
+/*! \brief calls extra modules after the drift operator
+ *
+ * This routine is called after a new synchronization time has been determined.
+ */
+void sim::set_non_standard_physics_for_current_time(void)
+{
+#ifdef COOLING
+  CoolSfr.IonizeParams(); /* set UV background for the current time */
+#endif
+}
+
+/*! \brief calls extra modules at the end of the run loop
+ *
+ * The second gravitational half kick has already been applied to the
+ * particles at this time, i.e. the particles at the sync-point have finished their regular timestep.
+ *
+ */
+void sim::calculate_non_standard_physics_end_of_step(void)
+{
+#ifdef COOLING
+#ifdef STARFORMATION
+  CoolSfr.sfr_create_star_particles(&Sp);
+  CoolSfr.cooling_and_starformation(&Sp);
+#else
+  CoolSfr.cooling_only(&Sp);
+#endif
+#endif
+
+#ifdef MEASURE_TOTAL_MOMENTUM
+  Logs.compute_total_momentum();
+#endif
+}
+
+/*! \brief checks whether the run must be interrupted
+ *
+ * The run is interrupted either if the stop file is present or
+ * if 85% of the CPU time is used up. This routine also handles the
+ * regular writing of restart files. A restart file is also
+ * written if a file named 'restart' is present in the output directory.
+ *
+ * \return 1 if the run has to be interrupted, 0 otherwise
+ */
+int sim::check_for_interruption_of_run(void)
+{
+  /* Check whether we need to interrupt the run */
+  int stopflag = 0;
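+  /* meaning of stopflag: 1 = stop file found, 2 = CPU time limit nearly exhausted, 3 = write restart files but continue running */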
+  if(ThisTask == 0)
+    {
+      FILE *fd;
+      char stopfname[MAXLEN_PATH_EXTRA];
+
+      sprintf(stopfname, "%sstop", All.OutputDir);
+      if((fd = fopen(stopfname, "r"))) /* Is the stop-file present? If yes, interrupt the run. */
+        {
+          fclose(fd);
+          printf("stop-file detected. stopping.\n");
+          stopflag = 1;
+          unlink(stopfname);
+        }
+
+      sprintf(stopfname, "%srestart", All.OutputDir);
+      if((fd = fopen(stopfname, "r"))) /* Is the restart-file present? If yes, write a user-requested restart file. */
+        {
+          fclose(fd);
+          printf("restart-file detected. writing restart files.\n");
+          stopflag = 3;
+          unlink(stopfname);
+        }
+
+      if(Logs.CPUThisRun > 0.85 * All.TimeLimitCPU) /* are we running out of CPU-time ? If yes, interrupt run. */
+        {
+          printf("reaching time-limit. stopping.\n");
+          stopflag = 2;
+        }
+    }
+
+  MPI_Bcast(&stopflag, 1, MPI_INT, 0, Communicator);
+
+  if(stopflag)
+    {
+      restart Restart{Communicator};
+      Restart.write(this); /* write restart file */
+      MPI_Barrier(Communicator);
+
+      if(stopflag == 3)
+        return 0;
+
+      if(stopflag == 2 && ThisTask == 0)
+        {
+          FILE *fd;
+          char contfname[MAXLEN_PATH_EXTRA];
+          sprintf(contfname, "%scont", All.OutputDir);
+          if((fd = fopen(contfname, "w")))
+            fclose(fd);
+        }
+      return 1;
+    }
+
+  /* is it time to write a regular restart-file? (for security) */
+  if(ThisTask == 0)
+    {
+      if((Logs.CPUThisRun - All.TimeLastRestartFile) >= All.CpuTimeBetRestartFile)
+        {
+          All.TimeLastRestartFile = Logs.CPUThisRun;
+          stopflag                = 3;
+        }
+      else
+        stopflag = 0;
+    }
+
+  MPI_Bcast(&stopflag, 1, MPI_INT, 0, Communicator);
+
+  if(stopflag == 3)
+    {
+      restart Restart{Communicator};
+      Restart.write(this); /* write an occasional restart file */
+      stopflag = 0;
+    }
+  return 0;
+}
+
+/*! \brief Returns the next output time that is equal to or larger than
+ *  ti_curr
+ *
+ *  \param ti_curr current simulation time
+ *
+ *  \return the integer time of the next output, or 2 * TIMEBASE if no further output will be produced
+ */
+integertime sim::find_next_outputtime(integertime ti_curr)
+{
+  integertime ti;
+  integertime ti_next = -1;
+  double time;
+
+  All.DumpFlag_nextoutput = 1;
+
+  if(All.OutputListOn)
+    {
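+      /* scan the output list for the smallest integer output time that is >= ti_curr and lies within [TimeBegin, TimeMax],
+       * remembering the corresponding dump flag */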
+      for(int i = 0; i < All.OutputListLength; i++)
+        {
+          time = All.OutputListTimes[i];
+
+          if(time >= All.TimeBegin && time <= All.TimeMax)
+            {
+              if(All.ComovingIntegrationOn)
+                ti = (integertime)(log(time / All.TimeBegin) / All.Timebase_interval);
+              else
+                ti = (integertime)((time - All.TimeBegin) / All.Timebase_interval);
+
+#ifndef OUTPUT_NON_SYNCHRONIZED_ALLOWED
+              /* We will now modify 'ti' to map it to the closest available output time according to the specified MaxSizeTimestep.
+               * The real output time may hence deviate by  +/- 0.5*MaxSizeTimestep from the desired output time.
+               */
+
+              /* first, determine maximum output interval based on All.MaxSizeTimestep */
+              integertime timax = (integertime)(All.MaxSizeTimestep / All.Timebase_interval);
+
+              /* make it a power-of-two subdivision */
+              integertime ti_min = TIMEBASE;
+              while(ti_min > timax)
+                ti_min >>= 1;
+              timax = ti_min;
+
+              double multiplier = ti / ((double)timax);
+
+              /* now round this to the nearest multiple of timax */
+              ti = ((integertime)(multiplier + 0.5)) * timax;
+#endif
+
+              if(ti >= ti_curr)
+                {
+                  if(ti_next == -1)
+                    {
+                      ti_next                 = ti;
+                      All.DumpFlag_nextoutput = All.OutputListFlag[i];
+                    }
+
+                  if(ti_next > ti)
+                    {
+                      ti_next                 = ti;
+                      All.DumpFlag_nextoutput = All.OutputListFlag[i];
+                    }
+                }
+            }
+        }
+    }
+  else
+    {
+      if(All.ComovingIntegrationOn)
+        {
+          if(All.TimeBetSnapshot <= 1.0)
+            Terminate("TimeBetSnapshot > 1.0 required for your simulation.\n");
+        }
+      else
+        {
+          if(All.TimeBetSnapshot <= 0.0)
+            Terminate("TimeBetSnapshot > 0.0 required for your simulation.\n");
+        }
+      time     = All.TimeOfFirstSnapshot;
+      int iter = 0;
+
+      while(time < All.TimeBegin)
+        {
+          if(All.ComovingIntegrationOn)
+            time *= All.TimeBetSnapshot;
+          else
+            time += All.TimeBetSnapshot;
+
+          iter++;
+
+          if(iter > 1000000)
+            Terminate("Can't determine next output time.\n");
+        }
+
+      while(time <= All.TimeMax)
+        {
+          if(All.ComovingIntegrationOn)
+            ti = (integertime)(log(time / All.TimeBegin) / All.Timebase_interval);
+          else
+            ti = (integertime)((time - All.TimeBegin) / All.Timebase_interval);
+
+#ifndef OUTPUT_NON_SYNCHRONIZED_ALLOWED
+          /* We will now modify 'ti' to map it to the closest available output time according to the specified MaxSizeTimestep.
+           * The real output time may hence deviate by  +/- 0.5*MaxSizeTimestep from the desired output time.
+           */
+
+          /* first, determine maximum output interval based on All.MaxSizeTimestep */
+          integertime timax = (integertime)(All.MaxSizeTimestep / All.Timebase_interval);
+
+          /* make it a power-of-two subdivision */
+          integertime ti_min = TIMEBASE;
+          while(ti_min > timax)
+            ti_min >>= 1;
+          timax = ti_min;
+
+          double multiplier = ti / ((double)timax);
+
+          /* now round this to the nearest multiple of timax */
+          ti = ((integertime)(multiplier + 0.5)) * timax;
+#endif
+
+          if(ti >= ti_curr)
+            {
+              ti_next = ti;
+              break;
+            }
+
+          if(All.ComovingIntegrationOn)
+            time *= All.TimeBetSnapshot;
+          else
+            time += All.TimeBetSnapshot;
+
+          iter++;
+
+          if(iter > MAXITER)
+            Terminate("Can't determine next output time.\n");
+        }
+    }
+
+  if(ti_next == -1)
+    {
+      ti_next = 2 * TIMEBASE; /* this will prevent any further output */
+
+      mpi_printf("\nSNAPSHOT: There is no valid time for a further snapshot file.\n");
+    }
+  else
+    {
+      double next = All.get_absolutetime_from_integertime(ti_next);
+
+      mpi_printf("\nSNAPSHOT: Setting next time for snapshot file to Time_next= %g  (DumpFlag=%d)\n\n", next, All.DumpFlag_nextoutput);
+    }
+
+  return ti_next;
+}
+
+/*! \brief Check if a snapshot should be saved
+ *
+ * This function checks whether a snapshot file or other kinds of output files,
+ * such as a projection, should be saved at the current time-step.
+ * If that is the case, the appropriate functions to produce the
+ * desired file are called and the parameters controlling the output are updated
+ * accordingly.
+ */
+void sim::create_snapshot_if_desired(void)
+{
+#if defined(LIGHTCONE_MASSMAPS)
+  /* we may do this on partial timesteps since for massmaps we always drift all particles, i.e. the lightcone is complete up to
+   * All.Time */
+  LightCone.lightcone_massmap_flush(1);
+#endif
+
+#ifndef OUTPUT_NON_SYNCHRONIZED_ALLOWED
+  if(All.HighestActiveTimeBin == All.HighestOccupiedTimeBin) /* allow only top-level synchronization points */
+#endif
+    if(All.Ti_Current >= All.Ti_nextoutput && All.Ti_nextoutput >= 0)
+      {
+        for(int i = 0; i < Sp.NumPart; i++)
+          if(Sp.P[i].Ti_Current != All.Ti_Current)
+            Terminate("P[i].Ti_Current != All.Ti_Current");
+
+        NgbTree.treefree();
+        Sp.TimeBinsGravity.timebins_free();
+        Sp.TimeBinsHydro.timebins_free();
+
+#ifdef FOF
+        mpi_printf("\nFOF: We shall first compute a group catalog for this snapshot file\n");
+
+        /* this structure will hold auxiliary information for each particle, needed only during group finding */
+        Sp.PS = (subfind_data *)Mem.mymalloc_movable(&Sp.PS, "PS", Sp.MaxPart * sizeof(subfind_data));
+        memset(Sp.PS, 0, Sp.MaxPart * sizeof(subfind_data));
+
+        /* First, we save the original location of the particles, in order to be able to revert to this layout later on */
+        for(int i = 0; i < Sp.NumPart; i++)
+          {
+            Sp.PS[i].OriginTask  = ThisTask;
+            Sp.PS[i].OriginIndex = i;
+          }
+
+        fof<simparticles> FoF{Communicator, &Sp, &Domain};
+
+        FoF.fof_fof(All.SnapshotFileCount, "fof", "groups", 0);
+
+#if defined(MERGERTREE) && defined(SUBFIND)
+        MergerTree.CurrTotNsubhalos = FoF.TotNsubhalos;
+        MergerTree.CurrNsubhalos    = FoF.Nsubhalos;
+
+        MergerTree.mergertree_determine_descendants_on_the_fly(All.SnapshotFileCount);
+
+        MergerTree.PrevTotNsubhalos = FoF.TotNsubhalos;
+        MergerTree.PrevNsubhalos    = FoF.Nsubhalos;
+
+        for(int n = 0; n < Sp.NumPart; n++)
+          {
+            Sp.P[n].PrevSubhaloNr     = Sp.PS[n].SubhaloNr;
+            Sp.P[n].PrevSizeOfSubhalo = Sp.PS[n].SizeOfSubhalo;
+            Sp.P[n].PrevRankInSubhalo = Sp.PS[n].RankInSubhalo;
+
+            if(Sp.P[n].PrevSubhaloNr.get() >= MergerTree.PrevTotNsubhalos && Sp.P[n].PrevSubhaloNr.get() != HALONR_MAX)
+              Terminate("Sp.P[n].PrevSubhaloNr=%lld  MergerTree.PrevTotNsubhalos=%lld\n", (long long)Sp.P[n].PrevSubhaloNr.get(),
+                        (long long)MergerTree.PrevTotNsubhalos);
+
+            if(Sp.P[n].PrevSizeOfSubhalo.get() > 0 && Sp.P[n].PrevSubhaloNr.get() == HALONR_MAX)
+              Terminate("Sp.P[n].PrevSizeOfSubhalo=%d  Sp.P[n].PrevSubhaloNr=%lld\n", (int)Sp.P[n].PrevSizeOfSubhalo.get(),
+                        (long long)Sp.P[n].PrevSubhaloNr.get());
+          }
+#endif
+#endif
+
+        if(All.DumpFlag_nextoutput)
+          {
+            snap_io Snap(&Sp, Communicator, All.SnapFormat);             /* get an I/O object */
+            Snap.write_snapshot(All.SnapshotFileCount, NORMAL_SNAPSHOT); /* write snapshot file */
+          }
+
+#ifdef SUBFIND_ORPHAN_TREATMENT
+        {
+          snap_io Snap(&Sp, Communicator, All.SnapFormat);
+          Snap.write_snapshot(All.SnapshotFileCount, MOST_BOUND_PARTICLE_SNAPHOT); /* write special snapshot file */
+        }
+#endif
+
+#ifdef FOF
+        /* now revert from output order to the original order */
+        for(int n = 0; n < Sp.NumPart; n++)
+          {
+            Sp.PS[n].TargetTask  = Sp.PS[n].OriginTask;
+            Sp.PS[n].TargetIndex = Sp.PS[n].OriginIndex;
+          }
+
+        TIMER_START(CPU_FOF);
+
+        Domain.particle_exchange_based_on_PS(Communicator);
+
+        TIMER_STOP(CPU_FOF);
+
+        Mem.myfree(Sp.PS);
+#endif
+
+#if defined(POWERSPEC_ON_OUTPUT) && defined(PERIODIC) && defined(PMGRID)
+        PM.calculate_power_spectra(All.SnapshotFileCount);
+#endif
+
+        All.SnapshotFileCount++;
+        All.Ti_nextoutput = find_next_outputtime(All.Ti_Current + 1);
+
+        Sp.TimeBinsHydro.timebins_allocate();
+        Sp.TimeBinsGravity.timebins_allocate();
+
+        /* we need to reconstruct the timebins here. Even though the particles are in the same place again,
+         * it could have happened that Sp.P was reduced in size temporarily below NumPart on a certain task,
+         * in which case the timebin data may have become invalid.
+         */
+        Sp.reconstruct_timebins();
+
+        NgbTree.treeallocate(Sp.NumGas, &Sp, &Domain);
+        NgbTree.treebuild(Sp.NumGas, NULL);
+      }
+
+#if defined(LIGHTCONE_PARTICLES)
+  if(Lp.TestIfAboveFillFactor(std::min<int>(Lp.MaxPart, Sp.MaxPart)))
+    {
+#if defined(LIGHTCONE_PARTICLES_GROUPS) && defined(FOF)
+      /* do this only on full timesteps if groups are calculated on lightcone */
+      if(All.Ti_Current >= TIMEBASE || All.HighestActiveTimeBin == All.HighestOccupiedTimeBin)
+        {
+          mpi_printf("\nLIGHTCONE_PARTICLES_GROUPS: We shall first compute a group catalogue for the lightcone particles\n");
+
+          /* assign unique IDs to Lp particles */
+
+          int *numlist = (int *)Mem.mymalloc("numlist", Lp.NumPart * sizeof(int));
+
+          MPI_Allgather(&Lp.NumPart, 1, MPI_INT, numlist, 1, MPI_INT, Communicator);
+
+          long long newID = 1;
+          for(int i = 0; i < ThisTask; i++)
+            newID += numlist[i];
+
+          for(int i = 0; i < Lp.NumPart; i++)
+            Lp.P[i].ID.set(newID++);
+
+          Mem.myfree(numlist);
+
+          domain<lcparticles> LcDomain(Communicator, &Lp);
+
+          LcDomain.domain_decomposition(STANDARD);
+
+          /* this structure will hold auxiliary information for each particle, needed only during group finding */
+          Lp.PS = (subfind_data *)Mem.mymalloc_movable(&Lp.PS, "PS", Lp.MaxPart * sizeof(subfind_data));
+          memset(Lp.PS, 0, Lp.MaxPart * sizeof(subfind_data));
+
+          /* First, we save the original location of the particles, in order to be able to revert to this layout later on */
+          for(int i = 0; i < Lp.NumPart; i++)
+            {
+              Lp.PS[i].OriginTask  = ThisTask;
+              Lp.PS[i].OriginIndex = i;
+            }
+
+          fof<lcparticles> FoF{Communicator, &Lp, &LcDomain};
+
+          double inner_distance = Driftfac.get_comoving_distance_for_scalefactor(All.Time);
+
+          FoF.fof_fof(All.LightconeFileCount, "lc_fof", "lc_groups", inner_distance);
+
+#endif
+
+          {
+            MergerTree.Ntrees = 0;
+            lightcone_particle_io Lcone(&Lp, &LightCone, &MergerTree, Communicator, All.SnapFormat); /* get an I/O object */
+
+            long long NumLP_tot = Lp.NumPart;
+            MPI_Allreduce(MPI_IN_PLACE, &NumLP_tot, 1, MPI_LONG_LONG, MPI_SUM, Communicator);
+            mpi_printf("\nLIGHTCONE: writing particle lightcone conesnap files #%d ... (NumLP_tot = %lld)\n", All.LightconeFileCount,
+                       NumLP_tot);
+
+            for(int i = 0; i < Lp.NumPart; i++)
+              {
+                double pos[3];
+                Lp.signedintpos_to_pos((MySignedIntPosType *)Lp.P[i].IntPos, pos);
+                vec2pix_ring(LIGHTCONE_ORDER_NSIDE, pos, &Lp.P[i].ipnest);
+              }
+
+#if !defined(LIGHTCONE_PARTICLES_GROUPS)
+            /* let's now sort the lightcone_particle_data according to healpix pixel number */
+            mycxxsort_parallel(Lp.P, Lp.P + Lp.NumPart, Lp.compare_ipnest, Communicator);
+#endif
+
+            for(int conenr = 0; conenr < LightCone.Nlightcones; conenr++)
+              Lcone.lightcone_save(All.LightconeFileCount, conenr, false);
+
+            mpi_printf("LIGHTCONE: done with writing files.\n");
+
+            All.LightconeFileCount++;
+            /* note: this block has its own scope so that the I/O object's destructor is already invoked at this point */
+          }
+
+#if defined(LIGHTCONE_PARTICLES_GROUPS) && defined(FOF)
+          /* now revert from output order to the original order */
+          for(int n = 0; n < Lp.NumPart; n++)
+            {
+              Lp.PS[n].TargetTask  = Lp.PS[n].OriginTask;
+              Lp.PS[n].TargetIndex = Lp.PS[n].OriginIndex;
+            }
+
+          TIMER_START(CPU_FOF);
+
+          LcDomain.particle_exchange_based_on_PS(Communicator);
+
+          Mem.myfree(Lp.PS);
+
+          LcDomain.domain_free();
+
+          TIMER_STOP(CPU_FOF);
+
+          int ncount[2] = {0, 0};
+          long long nsum[2];
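+          /* compact the lightcone particle buffer: entries whose save-distance flag is set can be dropped, which is done by
+           * moving the last buffer element into their slot; ncount[0] counts dropped entries, ncount[1] those we must keep */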
+          for(int n = 0; n < Lp.NumPart; n++)
+            {
+              if(Lp.P[n].getFlagSaveDistance())
+                {
+                  Lp.P[n--] = Lp.P[--Lp.NumPart];
+                  ncount[0]++;
+                }
+              else
+                {
+                  ncount[1]++;
+                }
+            }
+
+          sumup_large_ints(2, ncount, nsum, Communicator);
+          mpi_printf("LIGHTCONE_PARTICLES_GROUPS: We could store %lld particles from the buffer, but had to keep %lld\n", nsum[0],
+                     nsum[1]);
+        }
+#else
+      Lp.NumPart = 0;
+#endif
+
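+      /* if the buffer had been enlarged beyond its target size and its current occupancy again fits, shrink it back */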
+      if(Lp.MaxPart > LIGHTCONE_ALLOC_FAC * Sp.MaxPart + 1 && Lp.NumPart < LIGHTCONE_ALLOC_FAC * Sp.MaxPart)
+        Lp.reallocate_memory_maxpart(LIGHTCONE_ALLOC_FAC * Sp.MaxPart);
+    }
+#endif
+}
diff --git a/src/main/simulation.h b/src/main/simulation.h
new file mode 100644
index 0000000000000000000000000000000000000000..324234c6d88962df0edef2c36faa4f7435a7dc89
--- /dev/null
+++ b/src/main/simulation.h
@@ -0,0 +1,192 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  simulation.h
+ *
+ *  \brief declares the main simulation class holding its data and principal modules
+ */
+
+#ifndef SIMULATION_H
+#define SIMULATION_H
+
+#include "gadgetconfig.h"
+
+#include <mpi.h>
+
+#include "../cooling_sfr/cooling.h"
+#include "../data/allvars.h"
+#include "../data/constants.h"
+#include "../data/dtypes.h"
+#include "../data/lcparticles.h"
+#include "../data/macros.h"
+#include "../data/mmparticles.h"
+#include "../data/mymalloc.h"
+#include "../data/simparticles.h"
+#include "../domain/domain.h"
+#include "../fmm/fmm.h"
+#include "../fof/fof.h"
+#include "../gravity/ewald.h"
+#include "../gravity/grav_forcetest.h"
+#include "../gravtree/gravtree.h"
+#include "../gravtree/gwalk.h"
+#include "../io/parameters.h"
+#include "../io/restart.h"
+#include "../io/test_io_bandwidth.h"
+#include "../lgalaxies/lgalaxies.h"
+#include "../lightcone/lightcone.h"
+#include "../logs/logs.h"
+#include "../mergertree/mergertree.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../mpi_utils/setcomm.h"
+#include "../ngbtree/ngbtree.h"
+#include "../ngenic/ngenic.h"
+#include "../pm/pm.h"
+#include "../sph/sph.h"
+#include "../system/pinning.h"
+
+class sim : public pinning, public test_io_bandwidth
+{
+ public:
+  sim(MPI_Comm comm) : setcomm(comm), test_io_bandwidth(comm) {}
+
+  /* here come the main classes the code operates on */
+
+  simparticles Sp{Communicator}; /* stores all the simulation particles of the simulation */
+
+  domain<simparticles> Domain{Communicator, &Sp}; /* get an instance of a domain decomposition, operating on Sp */
+
+  /* Note: GravTree/NgbTree  inherit their communicator and NTask/ThisTask from their associated domain object  */
+
+#ifdef FMM
+  fmm GravTree; /* get an instance of a gravitational tree */
+#else
+  gwalk GravTree; /* get an instance of a gravitational tree */
+#endif
+
+  sph NgbTree; /* get an instance of a neighbour search tree */
+
+#ifdef PMGRID
+  pm PM{Communicator};
+#endif
+
+#ifdef MERGERTREE
+  mergertree MergerTree{Communicator, &Sp};
+#endif
+
+#ifdef COOLING
+  coolsfr CoolSfr{Communicator};
+#endif
+
+#ifdef NGENIC
+  ngenic Ngenic{Communicator, &Sp};
+#endif
+
+#ifdef LIGHTCONE
+#ifdef LIGHTCONE_PARTICLES
+  lcparticles Lp{Communicator}; /* stores all the buffered light-cone particles of the simulation */
+#endif
+#ifdef LIGHTCONE_MASSMAPS
+  mmparticles Mp{Communicator}; /* stores buffered particles to construct projected mass maps on the lightcone */
+#endif
+
+#if defined(LIGHTCONE_PARTICLES) && defined(LIGHTCONE_MASSMAPS) /* both particles and massmaps */
+  lightcone LightCone{Communicator, &Sp, &Lp, &Mp};
+#else
+#if defined(LIGHTCONE_PARTICLES)
+  lightcone LightCone{Communicator, &Sp, &Lp};
+#else
+  lightcone LightCone{Communicator, &Sp, &Mp};
+#endif
+#endif
+#endif  // end of LIGHTCONE
+
+#ifdef LGALAXIES
+#if defined(LIGHTCONE_PARTICLES)
+  lgalaxies LGalaxies{Communicator, &LightCone};
+#else
+  lgalaxies LGalaxies{Communicator};
+#endif
+#endif
+
+#ifdef FORCETEST
+  gravtest GravTest{&Sp, &GravTree, &Domain};
+#endif
+
+  void rearrange_lightcone(int argc, char **argv);
+  void rearrange_snapshot(int argc, char **argv);
+
+  template <typename partset>
+  void rearrange_generic(partset &Tp, int conenr, int firstnr, int lastnr);
+
+  template <typename partset>
+  void rearrange_fill_treetable(partset &Tp);
+
+  template <typename partset>
+  void rearrange_read(partset &Tp, int num, int conenr);
+
+  template <typename partset>
+  void rearrange_write(partset &Tp, int num, int conenr);
+
+ private:
+#ifdef PERIODIC
+  void check_omega(void);
+#endif
+
+  void setup_smoothinglengths(void);
+  void recreate_unique_ids(void);
+  void test_id_uniqueness(void);
+  int check_for_interruption_of_run(void);
+  void set_non_standard_physics_for_current_time(void);
+  void calculate_non_standard_physics_end_of_step(void);
+  integertime find_next_outputtime(integertime ti_curr);
+  double measure_cpu_performance(MPI_Comm Communicator);
+  double measure_hyper_cube_speed(const char *tag, MPI_Comm Communicator);
+  void measure_iprobe_performance(const char *tag);
+
+#ifdef SECOND_ORDER_LPT_ICS
+  double F1_Omega(double a);
+  double F2_Omega(double a);
+  int executed = 0;
+  void second_order_ic_correction(void);
+#endif
+
+ public:
+  void hello(void);
+  void endrun(void);
+  void begrun1(const char *parameterFile);
+  void begrun2(void);
+  void init(int RestartSnapNum);
+  void run(void);
+  void set_units(void);
+  void create_snapshot_if_desired(void);
+  void healthtest(void);
+  void mpi_report_comittable_memory(void);
+  long long report_comittable_memory(long long *MemTotal, long long *Committed_AS, long long *SwapTotal, long long *SwapFree);
+  long long report_free_size_in_tmpfs(void);
+  void do_gravity_step_second_half(void);
+  void find_timesteps_and_do_gravity_step_first_half(void);
+  void do_hydro_step_first_half(void);
+  void do_hydro_step_second_half(void);
+  void find_global_timesteps(void);
+  void find_hydro_timesteps(void);
+  void gravity(int timebin);
+  void gravity_long_range_force(void);
+  void gravity_comoving_factors(int timebin);
+  void gravity_pm(int timebin);
+  void gravity_set_oldacc(int timebin);
+
+  void hydro_force(int step_indicator);
+  void compute_grav_accelerations(int timebin);
+#ifdef FORCETEST_TESTFORCELAW
+  void gravity_forcetest_testforcelaw(void);
+#endif
+
+#ifdef EXTERNALGRAVITY
+  void gravity_external(void) { Terminate("Currently, no external gravity field is implemented.\n"); }
+#endif
+};
+
+#endif
diff --git a/src/mergertree/descendant.cc b/src/mergertree/descendant.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b4f79ce7bc357756963e9e6935745c6ea868d968
--- /dev/null
+++ b/src/mergertree/descendant.cc
@@ -0,0 +1,951 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  descendant.cc
+ *
+ *  \brief code to determine, for every subhalo in one catalogue, its descendant subhalo in another catalogue
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef MERGERTREE
+
+#include <gsl/gsl_rng.h>
+#include <hdf5.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../fof/fof.h"
+#include "../io/hdf5_util.h"
+#include "../io/io.h"
+#include "../logs/timer.h"
+#include "../main/main.h"
+#include "../main/simulation.h"
+#include "../mergertree/io_descendant.h"
+#include "../mergertree/io_progenitors.h"
+#include "../mergertree/mergertree.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/parallel_sort.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+
+/* This function allocates and fills the "Descendants" array, which gives the number of the descendant subhalo in the newly created
+ * subhalo catalogue for every subhalo from the previous catalogue.
+ */
+void mergertree::mergertree_determine_descendants_postproc(int num)
+{
+  Descendants = (desc_list *)Mem.mymalloc_movable(&Descendants, "Descendants", (PrevNsubhalos + 1) * sizeof(desc_list));
+  Progenitors = (prog_list *)Mem.mymalloc_movable(&Progenitors, "Progenitors", (CurrNsubhalos + 1) * sizeof(prog_list));
+
+  Sp->NumPart = MtrP_NumPart;
+
+  /* allocate a work structure */
+  desc = (desc_partdata *)Mem.mymalloc_movable(&desc, "desc", sizeof(desc_partdata) * Sp->NumPart);
+
+  /* let's fill in some relevant data into the work-structure */
+  for(int i = 0; i < Sp->NumPart; i++)
+    {
+      desc[i].CurrSubhaloNr.set(MtrP[i].SubhaloNr);
+      desc[i].CurrRankInSubhalo.set(MtrP[i].RankInSubhalo);
+
+      desc[i].PrevSubhaloNr.set(MtrP[i].PrevSubhaloNr);
+      desc[i].PrevRankInSubhalo.set(MtrP[i].PrevRankInSubhalo);
+
+      if(desc[i].PrevSubhaloNr.get() >= PrevTotNsubhalos && desc[i].PrevSubhaloNr.get() != HALONR_MAX)
+        Terminate("strange: i=%d  desc[i].PrevSubhaloNr=%lld  PrevTotNsubhalos=%lld", i, (long long)desc[i].PrevSubhaloNr.get(),
+                  (long long)PrevTotNsubhalos);
+    }
+
+  mergertree_determine_descendants(num);
+
+  mpi_printf("DONE!\n");
+}
+
+/* This function allocates and fills the "Descendants" array, which gives the number of the descendant subhalo in the newly created
+ * subhalo catalogue for every subhalo from the previous catalogue.
+ */
+void mergertree::mergertree_determine_descendants_on_the_fly(int num)
+{
+  if(num == 0)  // for the first output, we don't yet have anything to link
+    return;
+
+  Descendants = (desc_list *)Mem.mymalloc_movable(&Descendants, "Descendants", (PrevNsubhalos + 1) * sizeof(desc_list));
+  Progenitors = (prog_list *)Mem.mymalloc_movable(&Progenitors, "Progenitors", (CurrNsubhalos + 1) * sizeof(prog_list));
+
+  /* allocate a work structure */
+  desc = (desc_partdata *)Mem.mymalloc_movable(&desc, "desc", sizeof(desc_partdata) * Sp->NumPart);
+
+  /* Let's fill in some relevant data into a work-structure defined for every particle
+   * For each particle, we know the new and the old subhalo number, as well as its rank in the previous subhalo
+   */
+  for(int i = 0; i < Sp->NumPart; i++)
+    {
+      desc[i].CurrSubhaloNr     = Sp->PS[i].SubhaloNr;
+      desc[i].CurrRankInSubhalo = Sp->PS[i].RankInSubhalo;
+
+      desc[i].PrevSubhaloNr     = Sp->P[i].PrevSubhaloNr;
+      desc[i].PrevRankInSubhalo = Sp->P[i].PrevRankInSubhalo;
+
+      if(desc[i].PrevSubhaloNr.get() >= PrevTotNsubhalos && desc[i].PrevSubhaloNr.get() != HALONR_MAX)
+        Terminate("strange: i=%d  desc[i].PrevSubhaloNr=%lld  PrevTotNsubhalos=%lld", i, (long long)desc[i].PrevSubhaloNr.get(),
+                  (long long)PrevTotNsubhalos);
+    }
+
+  mergertree_determine_descendants(num);
+
+  Mem.myfree(desc);
+  Mem.myfree(Progenitors);
+  Mem.myfree(Descendants);
+}
+
+void mergertree::mergertree_determine_descendants(int num)
+{
+  /* determine matching pieces of subhalos and their mutual scores */
+  int nmatch = mergertree_find_matching_segments_and_scores();
+
+  /* select the main descendants */
+  mergertree_select_maximum_score_descendants(nmatch);
+
+  /* select the main progenitors */
+  mergertree_select_maximum_score_progenitors(nmatch);
+
+  /* let's determine the next-progenitor fields, which chain up those subhalos that have the same descendant */
+  mergertree_chain_up_progenitors_with_same_descendant();
+
+  /* set first progenitor field for chaining up those subhalos that have the same descendant  */
+  mergertree_set_first_progenitor_with_same_descendant();
+
+  /* let's determine the next-descendant fields, which chain up those subhalos that have the same progenitor */
+  mergertree_chain_up_descendants_with_same_progenitor();
+
+  /* set first descendant field for chaining up those subhalos that have the same progenitor */
+  mergertree_set_first_descendant_with_same_progenitor();
+
+  /**** write stuff to files *****/
+
+  descendant_io Desc(this, this->Communicator, All.SnapFormat); /* get an I/O object */
+  Desc.mergertree_save_descendants(num);
+
+  progenitors_io Prog(this, this->Communicator, All.SnapFormat);
+  Prog.mergertree_save_progenitors(num);
+
+  /* calculate and output some statistics to characterize linking */
+
+  int count_links = 0;
+  for(int i = 0; i < PrevNsubhalos; i++)
+    if(Descendants[i].DescSubhaloNr != HALONR_MAX)
+      count_links++;
+
+  long long tot_count_links;
+  sumup_large_ints(1, &count_links, &tot_count_links, Communicator);
+
+  mpi_printf("MERGERTREE: Was able to identify descendants for %lld out of %lld subhalos, i.e. for a fraction of %g\n",
+             tot_count_links, (long long)PrevTotNsubhalos, tot_count_links / (PrevTotNsubhalos + SMALLNUM));
+
+  int count_splits = 0;
+  for(int i = 0; i < CurrNsubhalos; i++)
+    if(Progenitors[i].NextDescSubhaloNr != HALONR_MAX)
+      count_splits++;
+
+  long long tot_count_splits;
+  sumup_large_ints(1, &count_splits, &tot_count_splits, Communicator);
+
+  mpi_printf("MERGERTREE: We have found secondary descendants for %lld halos out of %lld subhalos, i.e. for a fraction of %g\n",
+             tot_count_splits, (long long)CurrTotNsubhalos, tot_count_splits / (CurrTotNsubhalos + SMALLNUM));
+}
+
+int mergertree::mergertree_find_matching_segments_and_scores(void)
+{
+  /*  let's eliminate unbound particles from the work list */
+  int nmatch = Sp->NumPart;
+
+  for(int i = 0; i < nmatch; i++)
+    if(desc[i].CurrSubhaloNr.get() == HALONR_MAX || desc[i].PrevSubhaloNr.get() == HALONR_MAX)
+      {
+        desc[i] = desc[nmatch - 1];
+        nmatch--;
+        i--;
+      }
+
+  /* let's do the scoring */
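+  /* only the NUM_MOST_BOUND_PARTICLES_USED_FOR_TRACKING most-bound particles contribute, with weight 1/(1 + sqrt(rank)),
+   * so that the most tightly bound particles dominate the match */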
+  for(int i = 0; i < nmatch; i++)
+    {
+      if(desc[i].PrevRankInSubhalo.get() < NUM_MOST_BOUND_PARTICLES_USED_FOR_TRACKING)
+        desc[i].DescScore = 1.0 / (1 + pow(desc[i].PrevRankInSubhalo.get(), 0.5));
+      else
+        desc[i].DescScore = 0;
+
+      if(desc[i].CurrRankInSubhalo.get() < NUM_MOST_BOUND_PARTICLES_USED_FOR_TRACKING)
+        desc[i].ProgScore = 1.0 / (1 + pow(desc[i].CurrRankInSubhalo.get(), 0.5));
+      else
+        desc[i].ProgScore = 0;
+    }
+
+  /* Now we sort the list such that the old subhalos are grouped together, and the new subhalo numbers are consecutive within them
+   */
+  mycxxsort_parallel(desc, desc + nmatch, mergertree_compare_PrevSubNr_NewSubNr, Communicator);
+
+  /* eliminate duplicate matched pairs on the local processor and sum up the scores
+   */
+  int start = 0;
+  int count = 0;
+
+  if(nmatch > 0)
+    count = 1;
+
+  for(int i = 1; i < nmatch; i++)
+    if(desc[i].PrevSubhaloNr == desc[start].PrevSubhaloNr && desc[i].CurrSubhaloNr == desc[start].CurrSubhaloNr)
+      {
+        desc[start].DescScore += desc[i].DescScore;
+        desc[start].ProgScore += desc[i].ProgScore;
+      }
+    else
+      {
+        desc[count] = desc[i];
+        start       = count;
+        count++;
+      }
+
+  nmatch = count;
+
+  /* now we consolidate duplicate matched pairs on different processors. The list is still ordered, but there could be gaps
+   * (i.e. processors with only one or potentially zero entries)
+   */
+
+  /* obtain last and first element of each processor, and the counts from each processor */
+  desc_partdata *desc_first = (desc_partdata *)Mem.mymalloc("desc_first", NTask * sizeof(desc_partdata));
+  desc_partdata *desc_last  = (desc_partdata *)Mem.mymalloc("desc_last", NTask * sizeof(desc_partdata));
+  int *nmatch_list          = (int *)Mem.mymalloc("nmatch_list", NTask * sizeof(int));
+
+  MPI_Allgather(&desc[0], sizeof(desc_partdata), MPI_BYTE, desc_first, sizeof(desc_partdata), MPI_BYTE,
+                Communicator); /* note: the 0th element is guaranteed to be allocated */
+  MPI_Allgather(&desc[nmatch > 0 ? nmatch - 1 : 0], sizeof(desc_partdata), MPI_BYTE, desc_last, sizeof(desc_partdata), MPI_BYTE,
+                Communicator);
+  MPI_Allgather(&nmatch, 1, MPI_INT, nmatch_list, 1, MPI_INT, Communicator);
+
+  /* We walk through the tasks from the back; for the first element of each task, we move it ahead in the list as far as possible.
+   * Eliminated items are marked with DescScore = -1, and are then filtered out later.
+   */
+  for(int i = NTask - 1; i > 0; i--)
+    if(nmatch_list[i] > 0)
+      {
+        int target = -1;
+
+        for(int j = i - 1; j >= 0; j--)
+          {
+            if(nmatch_list[j] > 0)
+              {
+                if(nmatch_list[j] > 1)
+                  {
+                    if(desc_last[j].PrevSubhaloNr == desc_first[i].PrevSubhaloNr &&
+                       desc_last[j].CurrSubhaloNr == desc_first[i].CurrSubhaloNr)
+                      target = j;
+                  }
+                else /* nmatch_list[j] == 1 */
+                  {
+                    if(desc_first[j].PrevSubhaloNr == desc_first[i].PrevSubhaloNr &&
+                       desc_first[j].CurrSubhaloNr == desc_first[i].CurrSubhaloNr)
+                      target = j;
+                  }
+
+                break;
+              }
+          }
+
+        if(target >= 0)
+          {
+            if(nmatch_list[target] > 1)
+              {
+                desc_last[target].DescScore += desc_first[i].DescScore;
+                desc_last[target].ProgScore += desc_first[i].ProgScore;
+
+                if(ThisTask == target)
+                  {
+                    desc[nmatch - 1].DescScore += desc_first[i].DescScore;
+                    desc[nmatch - 1].ProgScore += desc_first[i].ProgScore;
+                  }
+              }
+            else
+              {
+                desc_first[target].DescScore += desc_first[i].DescScore;
+                desc_first[target].ProgScore += desc_first[i].ProgScore;
+
+                if(ThisTask == target)
+                  {
+                    desc[0].DescScore += desc_first[i].DescScore;
+                    desc[0].ProgScore += desc_first[i].ProgScore;
+                  }
+              }
+
+            desc_first[i].DescScore = -1;
+            desc_first[i].ProgScore = -1;
+
+            if(ThisTask == i)
+              {
+                desc[0].DescScore = -1;
+                desc[0].ProgScore = -1;
+              }
+          }
+      }
+
+  Mem.myfree(nmatch_list);
+  Mem.myfree(desc_last);
+  Mem.myfree(desc_first);
+
+  /* now eliminate the ones with negative score
+   */
+  if(nmatch > 0 && desc[0].DescScore < 0)
+    {
+      nmatch--;
+      memmove(desc, desc + 1, nmatch * sizeof(desc_partdata));
+    }
+
+  return nmatch;
+}
+
+void mergertree::mergertree_chain_up_descendants_with_same_progenitor(void)
+{
+  /* sort by progenitor to bring equal ones next to each other */
+  mycxxsort_parallel(Progenitors, Progenitors + CurrNsubhalos, mergertree_compare_ProgSubhaloNr, Communicator);
+
+  prog_list *elem_first = (prog_list *)Mem.mymalloc("elem_first", NTask * sizeof(prog_list));
+  prog_list *elem_last  = (prog_list *)Mem.mymalloc("elem_last", NTask * sizeof(prog_list));
+
+  /* note: the 0th element is guaranteed to be allocated even on ranks with zero CurrNsubhalos */
+  MPI_Allgather(&Progenitors[0], sizeof(prog_list), MPI_BYTE, elem_first, sizeof(prog_list), MPI_BYTE, Communicator);
+  MPI_Allgather(&Progenitors[CurrNsubhalos > 0 ? CurrNsubhalos - 1 : 0], sizeof(prog_list), MPI_BYTE, elem_last, sizeof(prog_list),
+                MPI_BYTE, Communicator);
+
+  /* get list of the subhalo count on each processor, and the cumulative number stored before */
+  int *tab_CurrNsubhalos = (int *)Mem.mymalloc("tab_CurrNsubhalos", sizeof(int) * NTask);
+  MPI_Allgather(&CurrNsubhalos, 1, MPI_INT, tab_CurrNsubhalos, 1, MPI_INT, Communicator);
+
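+  /* find the nearest tasks before and after this one that hold at least one subhalo, so that boundary elements can be
+   * compared across task borders */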
+  int next_task = -1;
+  for(int i = ThisTask + 1; i < NTask; i++)
+    if(tab_CurrNsubhalos[i] > 0)
+      {
+        next_task = i;
+        break;
+      }
+
+  int prev_task = -1;
+  for(int i = ThisTask - 1; i >= 0; i--)
+    if(tab_CurrNsubhalos[i] > 0)
+      {
+        prev_task = i;
+        break;
+      }
+
+  for(int i = 0; i < CurrNsubhalos; i++)
+    {
+      if(i < CurrNsubhalos - 1)
+        {
+          if(Progenitors[i].ProgSubhaloNr == Progenitors[i + 1].ProgSubhaloNr && Progenitors[i].ProgSubhaloNr != HALONR_MAX)
+            Progenitors[i].NextDescSubhaloNr = Progenitors[i + 1].SubhaloNr;
+          else
+            Progenitors[i].NextDescSubhaloNr = HALONR_MAX;
+        }
+      else
+        {
+          if(next_task >= 0 && Progenitors[i].ProgSubhaloNr == elem_first[next_task].ProgSubhaloNr &&
+             Progenitors[i].ProgSubhaloNr != HALONR_MAX)
+            Progenitors[i].NextDescSubhaloNr = elem_first[next_task].SubhaloNr;
+          else
+            Progenitors[i].NextDescSubhaloNr = HALONR_MAX;
+        }
+
+      if(i > 0)
+        {
+          if(Progenitors[i].ProgSubhaloNr != Progenitors[i - 1].ProgSubhaloNr && Progenitors[i].ProgSubhaloNr != HALONR_MAX)
+            Progenitors[i].FirstDescFlag = 1; /* flags the first progenitors */
+          else
+            Progenitors[i].FirstDescFlag = 0;
+        }
+      else
+        {
+          if(Progenitors[i].ProgSubhaloNr != HALONR_MAX &&
+             (ThisTask == 0 || (prev_task >= 0 && Progenitors[i].ProgSubhaloNr != elem_last[prev_task].ProgSubhaloNr)))
+            Progenitors[i].FirstDescFlag = 1; /* flags the first progenitors */
+          else
+            Progenitors[i].FirstDescFlag = 0;
+        }
+    }
+
+  Mem.myfree(tab_CurrNsubhalos);
+  Mem.myfree(elem_last);
+  Mem.myfree(elem_first);
+
+  /* bring back into original order */
+  mycxxsort_parallel(Progenitors, Progenitors + CurrNsubhalos, mergertree_compare_SubhaloNr, Communicator);
+}
+
+/* This function determines the next progenitor field, which chains up those subhalos that have the same descendant */
+void mergertree::mergertree_chain_up_progenitors_with_same_descendant(void)
+{
+  /* sort by descendant to bring equal ones next to each other */
+  mycxxsort_parallel(Descendants, Descendants + PrevNsubhalos, mergertree_compare_DescSubhaloNr, Communicator);
+
+  desc_list *elem_first = (desc_list *)Mem.mymalloc("elem_first", NTask * sizeof(desc_list));
+  desc_list *elem_last  = (desc_list *)Mem.mymalloc("elem_last", NTask * sizeof(desc_list));
+
+  /* note: the 0th element is guaranteed to be allocated even on ranks with zero PrevNsubhalos */
+  MPI_Allgather(&Descendants[0], sizeof(desc_list), MPI_BYTE, elem_first, sizeof(desc_list), MPI_BYTE, Communicator);
+  MPI_Allgather(&Descendants[PrevNsubhalos > 0 ? PrevNsubhalos - 1 : 0], sizeof(desc_list), MPI_BYTE, elem_last, sizeof(desc_list),
+                MPI_BYTE, Communicator);
+
+  /* get list of the subhalo count on each processor, and the cumulative number stored before */
+  int *tab_PrevNsubhalos = (int *)Mem.mymalloc("tab_PrevNsubhalos", sizeof(int) * NTask);
+  MPI_Allgather(&PrevNsubhalos, 1, MPI_INT, tab_PrevNsubhalos, 1, MPI_INT, Communicator);
+
+  int next_task = -1;
+  for(int i = ThisTask + 1; i < NTask; i++)
+    if(tab_PrevNsubhalos[i] > 0)
+      {
+        next_task = i;
+        break;
+      }
+
+  int prev_task = -1;
+  for(int i = ThisTask - 1; i >= 0; i--)
+    if(tab_PrevNsubhalos[i] > 0)
+      {
+        prev_task = i;
+        break;
+      }
+
+  for(int i = 0; i < PrevNsubhalos; i++)
+    {
+      if(i < PrevNsubhalos - 1)
+        {
+          if(Descendants[i].DescSubhaloNr == Descendants[i + 1].DescSubhaloNr && Descendants[i].DescSubhaloNr != HALONR_MAX)
+            Descendants[i].NextProgSubhaloNr = Descendants[i + 1].PrevSubhaloNr;
+          else
+            Descendants[i].NextProgSubhaloNr = HALONR_MAX;
+        }
+      else
+        {
+          if(next_task >= 0 && Descendants[i].DescSubhaloNr == elem_first[next_task].DescSubhaloNr &&
+             Descendants[i].DescSubhaloNr != HALONR_MAX)
+            Descendants[i].NextProgSubhaloNr = elem_first[next_task].PrevSubhaloNr;
+          else
+            Descendants[i].NextProgSubhaloNr = HALONR_MAX;
+        }
+
+      if(i > 0)
+        {
+          if(Descendants[i].DescSubhaloNr != Descendants[i - 1].DescSubhaloNr && Descendants[i].DescSubhaloNr != HALONR_MAX)
+            Descendants[i].FirstProgFlag = 1; /* flags the first progenitors */
+          else
+            Descendants[i].FirstProgFlag = 0;
+        }
+      else
+        {
+          if(Descendants[i].DescSubhaloNr != HALONR_MAX &&
+             (ThisTask == 0 || (prev_task >= 0 && Descendants[i].DescSubhaloNr != elem_last[prev_task].DescSubhaloNr)))
+            Descendants[i].FirstProgFlag = 1; /* flags the first progenitors */
+          else
+            Descendants[i].FirstProgFlag = 0;
+        }
+    }
+
+  Mem.myfree(tab_PrevNsubhalos);
+  Mem.myfree(elem_last);
+  Mem.myfree(elem_first);
+
+  /* bring back into original order */
+  mycxxsort_parallel(Descendants, Descendants + PrevNsubhalos, mergertree_compare_PrevSubhaloNr, Communicator);
+}
+
+/* This function sets the first progenitor field: the flagged first progenitor of each descendant is sent to the task
+ * that stores the corresponding subhalo of the current catalogue, where it is recorded in FirstProgSubhaloNr. */
+void mergertree::mergertree_set_first_progenitor_with_same_descendant(void)
+{
+  /* sort by descendant to bring equal ones next to each other */
+  mycxxsort_parallel(Descendants, Descendants + PrevNsubhalos, mergertree_compare_DescSubhaloNr, Communicator);
+
+  int *Send_count  = (int *)Mem.mymalloc("Send_count", sizeof(int) * NTask);
+  int *Send_offset = (int *)Mem.mymalloc("Send_offset", sizeof(int) * NTask);
+  int *Recv_count  = (int *)Mem.mymalloc("Recv_count", sizeof(int) * NTask);
+  int *Recv_offset = (int *)Mem.mymalloc("Recv_offset", sizeof(int) * NTask);
+
+  int *tab_CurrNsubhalos = (int *)Mem.mymalloc("tab_CurrNsubhalos", sizeof(int) * NTask);
+  MPI_Allgather(&CurrNsubhalos, 1, MPI_INT, tab_CurrNsubhalos, 1, MPI_INT, Communicator);
+
+  long long cumul_currnsubhalos = 0;
+  for(int i = 0; i < ThisTask; i++)
+    cumul_currnsubhalos += tab_CurrNsubhalos[i];
+
+  struct pair_data
+  {
+    long long subhalonr;
+    long long firstprognr;
+  };
+
+  pair_data *send_data = NULL;
+  pair_data *recv_data = NULL;
+  int nexport = 0, nimport = 0;
+
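+  /* standard two-pass exchange pattern: the first pass only counts how many entries go to each task, the offsets are
+   * then derived with the help of an MPI_Alltoall of the counts, and the second pass fills the send buffer at the
+   * correct offsets before the data are exchanged pairwise in the hypercube loop below */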
+  for(int mode = 0; mode < 2; mode++)  // go through this twice to simplify bookkeeping
+    {
+      for(int i = 0; i < NTask; i++)
+        Send_count[i] = 0;
+
+      int task        = 0;
+      long long first = 0;
+
+      for(int i = 0; i < PrevNsubhalos; i++)
+        {
+          if(Descendants[i].FirstProgFlag && Descendants[i].DescSubhaloNr != HALONR_MAX)
+            {
+              while(task < NTask - 1 && Descendants[i].DescSubhaloNr >= first + tab_CurrNsubhalos[task])
+                {
+                  first += tab_CurrNsubhalos[task];
+                  task++;
+                }
+
+              if(mode == 0)
+                Send_count[task]++;
+              else
+                {
+                  int off = Send_offset[task] + Send_count[task]++;
+
+                  send_data[off].subhalonr   = Descendants[i].DescSubhaloNr;
+                  send_data[off].firstprognr = Descendants[i].PrevSubhaloNr;
+                }
+            }
+        }
+
+      if(mode == 0)  // prepare offset tables
+        {
+          MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
+
+          Recv_offset[0] = 0;
+          Send_offset[0] = 0;
+
+          for(int j = 0; j < NTask; j++)
+            {
+              nexport += Send_count[j];
+              nimport += Recv_count[j];
+
+              if(j > 0)
+                {
+                  Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
+                  Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
+                }
+            }
+
+          send_data = (pair_data *)Mem.mymalloc("pair_data", nexport * sizeof(pair_data));
+          recv_data = (pair_data *)Mem.mymalloc("pair_data", nimport * sizeof(pair_data));
+        }
+    }
+
+  /* exchange data */
+  for(int ngrp = 0; ngrp < (1 << PTask); ngrp++)
+    {
+      int recvTask = ThisTask ^ ngrp;
+
+      if(recvTask < NTask)
+        {
+          if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+            MPI_Sendrecv(&send_data[Send_offset[recvTask]], Send_count[recvTask] * sizeof(pair_data), MPI_BYTE, recvTask, TAG_DENS_A,
+                         &recv_data[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(pair_data), MPI_BYTE, recvTask, TAG_DENS_A,
+                         Communicator, MPI_STATUS_IGNORE);
+        }
+    }
+
+  for(int i = 0; i < CurrNsubhalos; i++)
+    Progenitors[i].FirstProgSubhaloNr = HALONR_MAX;
+
+  for(int i = 0; i < nimport; i++)
+    {
+      int off = recv_data[i].subhalonr - cumul_currnsubhalos;
+
+      if(off < 0 || off >= CurrNsubhalos)
+        Terminate("off = %d  CurrNsubhalos = %d", off, CurrNsubhalos);
+
+      Progenitors[off].FirstProgSubhaloNr = recv_data[i].firstprognr;
+    }
+
+  Mem.myfree(recv_data);
+  Mem.myfree(send_data);
+
+  Mem.myfree(tab_CurrNsubhalos);
+
+  Mem.myfree(Recv_offset);
+  Mem.myfree(Recv_count);
+  Mem.myfree(Send_offset);
+  Mem.myfree(Send_count);
+
+  /* bring back into original order */
+  mycxxsort_parallel(Descendants, Descendants + PrevNsubhalos, mergertree_compare_PrevSubhaloNr, Communicator);
+}
+
+/********** pick the progenitor with the maximum score *****/
+void mergertree::mergertree_select_maximum_score_progenitors(int nmatch)
+{
+  mycxxsort_parallel(desc, desc + nmatch, mergertree_compare_NewSubNr_PrevSubNr, Communicator);
+
+  int *Send_count  = (int *)Mem.mymalloc("Send_count", sizeof(int) * NTask);
+  int *Send_offset = (int *)Mem.mymalloc("Send_offset", sizeof(int) * NTask);
+  int *Recv_count  = (int *)Mem.mymalloc("Recv_count", sizeof(int) * NTask);
+  int *Recv_offset = (int *)Mem.mymalloc("Recv_offset", sizeof(int) * NTask);
+
+  int *tab_CurrNsubhalos = (int *)Mem.mymalloc("tab_CurrNsubhalos", sizeof(int) * NTask);
+  MPI_Allgather(&CurrNsubhalos, 1, MPI_INT, tab_CurrNsubhalos, 1, MPI_INT, Communicator);
+
+  long long cumul_currnsubhalos = 0;
+  for(int i = 0; i < ThisTask; i++)
+    cumul_currnsubhalos += tab_CurrNsubhalos[i];
+
+  desc_partdata *send_data = NULL;
+  desc_partdata *recv_data = NULL;
+  int nexport = 0, nimport = 0;
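+
+  /* distribute the matched candidate pairs to the tasks that store the corresponding subhalos of the current
+   * catalogue, so that each task can pick, for each of its subhalos, the progenitor candidate with the highest score */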
+  for(int mode = 0; mode < 2; mode++)  // go through this twice to simplify bookkeeping
+    {
+      for(int i = 0; i < NTask; i++)
+        Send_count[i] = 0;
+
+      int task                 = 0;
+      unsigned long long first = 0;
+      for(int i = 0; i < nmatch; i++)
+        {
+          while(task < NTask - 1 && desc[i].CurrSubhaloNr.get() >= first + tab_CurrNsubhalos[task])
+            {
+              first += tab_CurrNsubhalos[task];
+              task++;
+            }
+
+          if(mode == 0)
+            Send_count[task]++;
+          else
+            {
+              int off = Send_offset[task] + Send_count[task]++;
+
+              send_data[off] = desc[i];
+            }
+        }
+
+      if(mode == 0)  // prepare offset tables
+        {
+          MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
+
+          Recv_offset[0] = 0;
+          Send_offset[0] = 0;
+
+          for(int j = 0; j < NTask; j++)
+            {
+              nexport += Send_count[j];
+              nimport += Recv_count[j];
+
+              if(j > 0)
+                {
+                  Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
+                  Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
+                }
+            }
+
+          send_data = (desc_partdata *)Mem.mymalloc("send_data", nexport * sizeof(desc_partdata));
+          recv_data = (desc_partdata *)Mem.mymalloc("recv_data", nimport * sizeof(desc_partdata));
+        }
+    }
+
+  /* exchange data */
+  for(int ngrp = 0; ngrp < (1 << PTask); ngrp++)
+    {
+      int recvTask = ThisTask ^ ngrp;
+
+      if(recvTask < NTask)
+        {
+          if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+            MPI_Sendrecv(&send_data[Send_offset[recvTask]], Send_count[recvTask] * sizeof(desc_partdata), MPI_BYTE, recvTask,
+                         TAG_DENS_A, &recv_data[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(desc_partdata), MPI_BYTE,
+                         recvTask, TAG_DENS_A, Communicator, MPI_STATUS_IGNORE);
+        }
+    }
+
+  for(int i = 0; i < CurrNsubhalos; i++)
+    {
+      Progenitors[i].SubhaloNr     = cumul_currnsubhalos + i;
+      Progenitors[i].ProgSubhaloNr = HALONR_MAX;
+      Progenitors[i].MaxScore      = 0;
+    }
+
+  for(int i = 0; i < nimport; i++)
+    {
+      int index = recv_data[i].CurrSubhaloNr.get() - cumul_currnsubhalos;
+
+      if(index < 0 || index >= CurrNsubhalos)
+        Terminate("index=%d  CurrNsubhalos=%d", index, CurrNsubhalos);
+
+      if(recv_data[i].ProgScore > Progenitors[index].MaxScore)
+        {
+          Progenitors[index].MaxScore      = recv_data[i].ProgScore;
+          Progenitors[index].ProgSubhaloNr = recv_data[i].PrevSubhaloNr.get();
+        }
+    }
+
+  Mem.myfree(recv_data);
+  Mem.myfree(send_data);
+
+  Mem.myfree(tab_CurrNsubhalos);
+
+  Mem.myfree(Recv_offset);
+  Mem.myfree(Recv_count);
+  Mem.myfree(Send_offset);
+  Mem.myfree(Send_count);
+}
+
+/********** determine the descendant with the maximum score *****/
+void mergertree::mergertree_select_maximum_score_descendants(int nmatch)
+{
+  int *Send_count  = (int *)Mem.mymalloc("Send_count", sizeof(int) * NTask);
+  int *Send_offset = (int *)Mem.mymalloc("Send_offset", sizeof(int) * NTask);
+  int *Recv_count  = (int *)Mem.mymalloc("Recv_count", sizeof(int) * NTask);
+  int *Recv_offset = (int *)Mem.mymalloc("Recv_offset", sizeof(int) * NTask);
+
+  /* get the list of subhalo counts on each processor, and the cumulative number stored before this task */
+  int *tab_PrevNsubhalos = (int *)Mem.mymalloc("tab_PrevNsubhalos", sizeof(int) * NTask);
+  MPI_Allgather(&PrevNsubhalos, 1, MPI_INT, tab_PrevNsubhalos, 1, MPI_INT, Communicator);
+
+  long long cumul_prevnsubhalos = 0;
+  for(int i = 0; i < ThisTask; i++)
+    cumul_prevnsubhalos += tab_PrevNsubhalos[i];
+
+  desc_partdata *send_data = NULL;
+  desc_partdata *recv_data = NULL;
+  int nexport = 0, nimport = 0;
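+
+  /* distribute the matched candidate pairs to the tasks that store the corresponding subhalos of the previous
+   * catalogue, so that each task can pick, for each of its subhalos, the descendant candidate with the highest score */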
+  for(int mode = 0; mode < 2; mode++)  // go through this twice to simplify bookkeeping
+    {
+      for(int i = 0; i < NTask; i++)
+        Send_count[i] = 0;
+
+      int task                 = 0;
+      unsigned long long first = 0;
+      for(int i = 0; i < nmatch; i++)
+        {
+          if(PrevTotNsubhalos < 1)
+            Terminate("PrevTotNsubhalos = %lld", PrevTotNsubhalos);
+
+          while(task < NTask - 1 && desc[i].PrevSubhaloNr.get() >= first + tab_PrevNsubhalos[task])
+            {
+              first += tab_PrevNsubhalos[task];
+              task++;
+            }
+
+          if(mode == 0)
+            Send_count[task]++;
+          else
+            {
+              int off = Send_offset[task] + Send_count[task]++;
+
+              send_data[off] = desc[i];
+            }
+        }
+
+      if(mode == 0)  // prepare offset tables
+        {
+          MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
+
+          Recv_offset[0] = 0;
+          Send_offset[0] = 0;
+
+          for(int j = 0; j < NTask; j++)
+            {
+              nexport += Send_count[j];
+              nimport += Recv_count[j];
+
+              if(j > 0)
+                {
+                  Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
+                  Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
+                }
+            }
+
+          send_data = (desc_partdata *)Mem.mymalloc("send_data", nexport * sizeof(desc_partdata));
+          recv_data = (desc_partdata *)Mem.mymalloc("recv_data", nimport * sizeof(desc_partdata));
+        }
+    }
+
+  /* exchange data */
+  for(int ngrp = 0; ngrp < (1 << PTask); ngrp++)
+    {
+      int recvTask = ThisTask ^ ngrp;
+
+      if(recvTask < NTask)
+        {
+          if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+            MPI_Sendrecv(&send_data[Send_offset[recvTask]], Send_count[recvTask] * sizeof(desc_partdata), MPI_BYTE, recvTask,
+                         TAG_DENS_A, &recv_data[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(desc_partdata), MPI_BYTE,
+                         recvTask, TAG_DENS_A, Communicator, MPI_STATUS_IGNORE);
+        }
+    }
+
+  for(int i = 0; i < PrevNsubhalos; i++)
+    {
+      Descendants[i].PrevSubhaloNr = cumul_prevnsubhalos + i;
+      Descendants[i].DescSubhaloNr = HALONR_MAX;
+      Descendants[i].MaxScore      = 0;
+    }
+
+  for(int i = 0; i < nimport; i++)
+    {
+      int index = recv_data[i].PrevSubhaloNr.get() - cumul_prevnsubhalos;
+
+      if(index < 0 || index >= PrevNsubhalos)
+        Terminate(
+            "index=%d i=%d  nimport=%d  PrevNsubhalos=%d  recv_data[i].PrevSubhaloNr=%lld  PrevTotNsubhalos=%lld "
+            "cumul_prevnsubhalos=%lld",
+            index, i, nimport, PrevNsubhalos, (long long)recv_data[i].PrevSubhaloNr.get(), PrevTotNsubhalos, cumul_prevnsubhalos);
+
+      if(recv_data[i].DescScore > Descendants[index].MaxScore)
+        {
+          Descendants[index].MaxScore      = recv_data[i].DescScore;
+          Descendants[index].DescSubhaloNr = recv_data[i].CurrSubhaloNr.get();
+        }
+    }
+
+  Mem.myfree(recv_data);
+  Mem.myfree(send_data);
+
+  Mem.myfree(tab_PrevNsubhalos);
+
+  Mem.myfree(Recv_offset);
+  Mem.myfree(Recv_count);
+  Mem.myfree(Send_offset);
+  Mem.myfree(Send_count);
+}
+
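+/* This function sets the first descendant field: the flagged first descendant of each progenitor is sent to the task
+ * that stores the corresponding subhalo of the previous catalogue, where it is recorded in FirstDescSubhaloNr. */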
+void mergertree::mergertree_set_first_descendant_with_same_progenitor(void)
+{
+  /* sort by progenitor to bring equal ones next to each other */
+  mycxxsort_parallel(Progenitors, Progenitors + CurrNsubhalos, mergertree_compare_ProgSubhaloNr, Communicator);
+
+  int *Send_count  = (int *)Mem.mymalloc("Send_count", sizeof(int) * NTask);
+  int *Send_offset = (int *)Mem.mymalloc("Send_offset", sizeof(int) * NTask);
+  int *Recv_count  = (int *)Mem.mymalloc("Recv_count", sizeof(int) * NTask);
+  int *Recv_offset = (int *)Mem.mymalloc("Recv_offset", sizeof(int) * NTask);
+
+  int *tab_PrevNsubhalos = (int *)Mem.mymalloc("tab_PrevNsubhalos", sizeof(int) * NTask);
+  MPI_Allgather(&PrevNsubhalos, 1, MPI_INT, tab_PrevNsubhalos, 1, MPI_INT, Communicator);
+
+  long long cumul_prevnsubhalos = 0;
+  for(int i = 0; i < ThisTask; i++)
+    cumul_prevnsubhalos += tab_PrevNsubhalos[i];
+
+  struct pair_data
+  {
+    long long subhalonr;
+    long long firstdescnr;
+  };
+
+  pair_data *send_data = NULL;
+  pair_data *recv_data = NULL;
+  int nexport = 0, nimport = 0;
+
+  for(int mode = 0; mode < 2; mode++)  // go through this twice to simplify bookkeeping
+    {
+      for(int i = 0; i < NTask; i++)
+        Send_count[i] = 0;
+
+      int task        = 0;
+      long long first = 0;
+
+      for(int i = 0; i < CurrNsubhalos; i++)
+        {
+          if(Progenitors[i].FirstDescFlag && Progenitors[i].ProgSubhaloNr != HALONR_MAX)
+            {
+              while(task < NTask - 1 && Progenitors[i].ProgSubhaloNr >= first + tab_PrevNsubhalos[task])
+                {
+                  first += tab_PrevNsubhalos[task];
+                  task++;
+                }
+
+              if(mode == 0)
+                Send_count[task]++;
+              else
+                {
+                  int off = Send_offset[task] + Send_count[task]++;
+
+                  send_data[off].subhalonr   = Progenitors[i].ProgSubhaloNr;
+                  send_data[off].firstdescnr = Progenitors[i].SubhaloNr;
+                }
+            }
+        }
+
+      if(mode == 0)  // prepare offset tables
+        {
+          MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
+
+          Recv_offset[0] = 0;
+          Send_offset[0] = 0;
+
+          for(int j = 0; j < NTask; j++)
+            {
+              nexport += Send_count[j];
+              nimport += Recv_count[j];
+
+              if(j > 0)
+                {
+                  Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
+                  Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
+                }
+            }
+
+          send_data = (pair_data *)Mem.mymalloc("pair_data", nexport * sizeof(pair_data));
+          recv_data = (pair_data *)Mem.mymalloc("pair_data", nimport * sizeof(pair_data));
+        }
+    }
+
+  /* exchange data */
+  for(int ngrp = 0; ngrp < (1 << PTask); ngrp++)
+    {
+      int recvTask = ThisTask ^ ngrp;
+
+      if(recvTask < NTask)
+        {
+          if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+            MPI_Sendrecv(&send_data[Send_offset[recvTask]], Send_count[recvTask] * sizeof(pair_data), MPI_BYTE, recvTask, TAG_DENS_A,
+                         &recv_data[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(pair_data), MPI_BYTE, recvTask, TAG_DENS_A,
+                         Communicator, MPI_STATUS_IGNORE);
+        }
+    }
+
+  for(int i = 0; i < PrevNsubhalos; i++)
+    Descendants[i].FirstDescSubhaloNr = HALONR_MAX;
+
+  for(int i = 0; i < nimport; i++)
+    {
+      int off = recv_data[i].subhalonr - cumul_prevnsubhalos;
+
+      if(off < 0 || off >= PrevNsubhalos)
+        Terminate("off = %d  PrevNsubhalos = %d", off, PrevNsubhalos);
+
+      Descendants[off].FirstDescSubhaloNr = recv_data[i].firstdescnr;
+    }
+
+  Mem.myfree(recv_data);
+  Mem.myfree(send_data);
+
+  Mem.myfree(tab_PrevNsubhalos);
+
+  Mem.myfree(Recv_offset);
+  Mem.myfree(Recv_count);
+  Mem.myfree(Send_offset);
+  Mem.myfree(Send_count);
+
+  /* bring back into original order */
+  mycxxsort_parallel(Progenitors, Progenitors + CurrNsubhalos, mergertree_compare_SubhaloNr, Communicator);
+}
+
+#endif
diff --git a/src/mergertree/halotrees.cc b/src/mergertree/halotrees.cc
new file mode 100644
index 0000000000000000000000000000000000000000..52b2d0a03071e736960638d4084e8fad8ac757a9
--- /dev/null
+++ b/src/mergertree/halotrees.cc
@@ -0,0 +1,2166 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  halotrees.cc
+ *
+ *  \brief constructs trees that link together just the subhalos related by descendant relationships
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef MERGERTREE
+
+#include <gsl/gsl_rng.h>
+#include <hdf5.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../fof/fof.h"
+#include "../fof/fof_io.h"
+#include "../io/hdf5_util.h"
+#include "../io/io.h"
+#include "../logs/timer.h"
+#include "../main/main.h"
+#include "../main/simulation.h"
+#include "../mergertree/io_descendant.h"
+#include "../mergertree/io_halotrees.h"
+#include "../mergertree/io_progenitors.h"
+#include "../mergertree/io_treelinks.h"
+#include "../mergertree/mergertree.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/parallel_sort.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+
+/* This is the main function for constructing the halo trees.
+ */
+void mergertree::halotrees_construct(int lastsnapnr)
+{
+  domain<simparticles> Domain{Communicator, Sp};
+  fof<simparticles> FoF{Communicator, Sp, &Domain};
+
+  LastSnapShotNr = lastsnapnr;
+
+  Cats     = (halo_catalogue *)Mem.mymalloc_clear("Cats", sizeof(halo_catalogue) * (LastSnapShotNr + 1));
+  CatTimes = (times_catalogue *)Mem.mymalloc("CatTimes", sizeof(times_catalogue) * (LastSnapShotNr + 1));
+
+  /* Let's load all catalogs. */
+  halotrees_load_catalogues(&FoF);
+  mpi_printf("\nMERGERTREE: Catalogues loaded\n");
+
+  /* As preliminary work, let everybody know the number of subhalos on each rank, and assign a global SubhaloNr to each. */
+  halotrees_assign_global_subhalonr_and_groupnr();
+  mpi_printf("\nMERGERTREE: SubhaloNr assigned\n");
+
+  /* Assign the halos to disjoint trees - initially, each halo is in its own tree, which will then be linked up to form bigger trees.
+   * To get a reasonably memory-balanced distribution, we assign random tasks to them initially. */
+  halotrees_initial_treeassignment();
+  mpi_printf("\nMERGERTREE: Initial tree assignment\n");
+
+  /* We now proceed by linking the halos according to the descendant information */
+  halotrees_link_trees();
+  mpi_printf("\nMERGERTREE: Halo trees linked\n");
+
+  /* because the previously selected first progenitor is not necessarily the most significant branch, we make a new
+   * choice by sorting the progenitors by their significance and then redetermining the first progenitor among this set */
+  halotrees_determine_mainprogenitor();
+  mpi_printf("\nMERGERTREE: Determined the main progenitor\n");
+
+  /* Now the trees are linked, i.e. all halos belonging to the same tree have a common tree-id. We proceed by assigning new indices to
+   * all halos within the same treeid. */
+  halotrees_assign_new_treeindices();
+  mpi_printf("\nMERGERTREE: New tree indices assigned\n");
+
+  /* next, we need to remap the global descendant and first/next progenitor pointers to another set of such pointers that use the local
+   * tree ids */
+  halotrees_remap_treepointers();
+  mpi_printf("\nMERGERTREE: Remapping done\n");
+
+  /* now we can go ahead and move the halos belonging to the same tree together, and set up a table with the trees */
+  halotrees_collect_treehalos();
+  mpi_printf("\nMERGERTREE: Treehalos collected\n\n");
+
+  /* finally, output the trees to files */
+  halotrees_io TreesIO{this, this->Communicator, All.SnapFormat};
+  TreesIO.halotrees_save_trees();
+
+  mpi_printf("\nMERGERTREE: Trees saved:  A total of %lld subhalos have been assembled into %lld trees.", TotNhalos, TotNtrees);
+  mpi_printf("\nMERGERTREE: Largest tree contains %d halos.\n\n", LargestHaloCount);
+
+  /* now also output links from the subhalo catalogues to the trees */
+  halotrees_save_subhalo_treelinks();
+  mpi_printf("\nMERGERTREE: Tree links saved\n\n");
+}
+
+/* This function is in charge of loading all the group catalogues.
+ */
+void mergertree::halotrees_load_catalogues(fof<simparticles> *FoF)
+{
+  /* let's first get the catalogs */
+  for(int num = 0; num <= LastSnapShotNr; num++)
+    {
+      /* load the group catalog */
+      fof_io<simparticles> FoF_IO{FoF, Communicator, All.SnapFormat};
+      FoF_IO.fof_subfind_load_groups(num);
+
+      /* bring all groups and subhalos into the order in which they were stored in the files, because the
+       * parallel load routine may have mangled up the order.
+       */
+      mycxxsort_parallel(FoF->Group, FoF->Group + FoF->Ngroups, compare_Group_FileOffset, Communicator);
+      mycxxsort_parallel(FoF->Subhalo, FoF->Subhalo + FoF->Nsubhalos, compare_Subhalo_FileOffset, Communicator);
+
+      if(FoF_IO.LegacyFormat)
+        {
+          mpi_printf("\nFOF/SUBFIND: Legacy format from Arepo detected, trying to adjust for this.\n");
+          FoF->subfind_redetermine_groupnr();
+          FoF->fof_assign_group_offset();
+          FoF->subfind_assign_subhalo_offsettype();
+        }
+
+      /* save the groups info for later use */
+      Cats[num].Ngroups    = FoF->Ngroups;
+      Cats[num].TotNgroups = FoF->TotNgroups;
+
+      Cats[num].Nsubhalos    = FoF->Nsubhalos;
+      Cats[num].TotNsubhalos = FoF->TotNsubhalos;
+
+      CatTimes[num].Time     = FoF->Time;
+      CatTimes[num].Redshift = FoF->Redshift;
+
+      // we copy the group catalogue data here. Simply setting the pointer would violate the rule that a (movable)
+      // memory pointer must not be copied into another variable, and would later cause corruption once the IO object is destroyed...
+      Cats[num].Group = (fof<simparticles>::group_properties *)Mem.mymalloc_movable(
+          &Cats[num].Group, "Cats[num].Group", FoF->Ngroups * sizeof(fof<simparticles>::group_properties));
+      memcpy(Cats[num].Group, FoF->Group, FoF->Ngroups * sizeof(fof<simparticles>::group_properties));
+      Mem.myfree_movable(FoF->Group);
+
+      Cats[num].Subhalo = (fof<simparticles>::subhalo_properties *)Mem.mymalloc_movable(
+          &Cats[num].Subhalo, "Subhalo", Cats[num].Nsubhalos * sizeof(fof<simparticles>::subhalo_properties));
+      memcpy(Cats[num].Subhalo, FoF->Subhalo, Cats[num].Nsubhalos * sizeof(fof<simparticles>::subhalo_properties));
+      Mem.myfree_movable(FoF->Subhalo);
+
+      /* allocate some storage for extra subhalo info */
+      Cats[num].SubExt =
+          (subhalo_extension *)Mem.mymalloc_movable(&Cats[num].SubExt, "Cats[num].SubExt", FoF->Nsubhalos * sizeof(subhalo_extension));
+    }
+
+  /* now let's load the descendant tree information (except for the last snapshot) */
+  for(int num = 0; num < LastSnapShotNr; num++)
+    {
+      /* fetch the descendants for the catalogs  */
+      descendant_io DescIO{this, this->Communicator, All.SnapFormat};
+      DescIO.mergertree_read_descendants(num);
+
+      /* if needed, restore proper order as in files */
+      mycxxsort_parallel(Descendants, Descendants + DescIO.Nsubhalos, compare_Desc_FileOffset, Communicator);
+
+      if(DescIO.TotNsubhalos != Cats[num].TotNsubhalos)
+        Terminate("inconsistency: DescIO.TotNsubhalos=%lld != Cats[num].TotNsubhalos=%lld", DescIO.TotNsubhalos,
+                  Cats[num].TotNsubhalos);
+
+      /* it's possible that the local number of descendants, Nsubhalos, does not match Cats[num].Nsubhalos,
+       * implying that we need to reshuffle how things are distributed over processors.
+       */
+      halotrees_reshuffle((char **)&Descendants, sizeof(desc_list), DescIO.Nsubhalos, Cats[num].Nsubhalos);
+
+      /* retain info for later use
+       * we copy the data here. Simply setting the pointer would violate the rule that a (movable) memory pointer
+       * must not be copied into another variable, and would later cause corruption once the IO object is destroyed...
+       */
+      Cats[num].Descendants =
+          (desc_list *)Mem.mymalloc_movable(&Cats[num].Descendants, "Cats[num].Descendants", Cats[num].Nsubhalos * sizeof(desc_list));
+      memcpy(Cats[num].Descendants, Descendants, Cats[num].Nsubhalos * sizeof(desc_list));
+      Mem.myfree_movable(Descendants);
+    }
+
+  /* load the progenitor tree information (except for the first snapshot) */
+  for(int num = 1; num <= LastSnapShotNr; num++)
+    {
+      /* fetch from files */
+      progenitors_io ProgIO{this, this->Communicator, All.SnapFormat};
+      ProgIO.mergertree_read_progenitors(num);
+
+      /* if needed, restore proper order as in files */
+      mycxxsort_parallel(Progenitors, Progenitors + ProgIO.Nsubhalos, compare_Prog_FileOffset, Communicator);
+
+      if(ProgIO.TotNsubhalos != Cats[num].TotNsubhalos)
+        Terminate("inconsistency: ProgIO.TotNsubhalos=%lld != Cats[num].TotNsubhalos=%lld", ProgIO.TotNsubhalos,
+                  Cats[num].TotNsubhalos);
+
+      /* it's possible that the local number of progenitors, Nsubhalos, does not match Cats[num].Nsubhalos,
+       * implying that we need to reshuffle how things are distributed over processors.
+       */
+      halotrees_reshuffle((char **)&Progenitors, sizeof(prog_list), ProgIO.Nsubhalos, Cats[num].Nsubhalos);
+
+      /* retain info for later use
+       * we copy the data here. Simply setting the pointer would violate the rule that a (movable) memory pointer
+       * must not be copied into another variable, and would later cause corruption once the IO object is destroyed...
+       */
+      Cats[num].Progenitors =
+          (prog_list *)Mem.mymalloc_movable(&Cats[num].Progenitors, "Cats[num].Progenitors", Cats[num].Nsubhalos * sizeof(prog_list));
+      memcpy(Cats[num].Progenitors, Progenitors, Cats[num].Nsubhalos * sizeof(prog_list));
+      Mem.myfree_movable(Progenitors);
+    }
+}
+
+void mergertree::halotrees_save_subhalo_treelinks(void)
+{
+  for(int num = 0; num <= LastSnapShotNr; num++)
+    {
+      treelinks_io TreeLinkIO{this, this->Communicator, All.SnapFormat};
+
+      TreeLinkIO.Nsubhalos    = Cats[num].Nsubhalos;
+      TreeLinkIO.TotNsubhalos = Cats[num].TotNsubhalos;
+
+      TreeLink = (treelink_data *)Mem.mymalloc("TreeLink", TreeLinkIO.Nsubhalos * sizeof(treelink_data));
+      for(int i = 0; i < TreeLinkIO.Nsubhalos; i++)
+        {
+          TreeLink[i].TreeID    = Cats[num].Subhalo[i].TreeID;
+          TreeLink[i].TreeIndex = Cats[num].Subhalo[i].TreeIndex;
+        }
+
+      /* save the tree-link info */
+      TreeLinkIO.treelinks_save(num);
+
+      Mem.myfree(TreeLink);
+    }
+}
+
+/* Here we collect the group and subhalo numbers stored on each processor for each snapshot, for communication purposes.
+ * We also assign to the subhalos of each catalogue a running global subhalo number, which serves as a unique
+ * identifier within that snapshot's catalogue.
+ */
+void mergertree::halotrees_assign_global_subhalonr_and_groupnr(void)
+{
+  long long grprev = 0;
+
+  for(int num = 0; num <= LastSnapShotNr; num++)
+    {
+      Cats[num].TabNgroups = (int *)Mem.mymalloc("Cats[num].TabNgroups", NTask * sizeof(int));
+      MPI_Allgather(&Cats[num].Ngroups, 1, MPI_INT, Cats[num].TabNgroups, 1, MPI_INT, Communicator);
+
+      Cats[num].TabNsubhalos = (int *)Mem.mymalloc("Cats[num].TabNsubhalos", NTask * sizeof(int));
+      MPI_Allgather(&Cats[num].Nsubhalos, 1, MPI_INT, Cats[num].TabNsubhalos, 1, MPI_INT, Communicator);
+
+      long long subprev = 0;
+
+      for(int i = 0; i < ThisTask; i++)
+        subprev += Cats[num].TabNsubhalos[i];
+
+      for(int i = 0; i < Cats[num].Nsubhalos; i++)
+        Cats[num].Subhalo[i].SubhaloNr = subprev + i;
+
+      /* Note: SubhaloNr should now be the quantity to which Descendant/FirstProgenitor/NextProgenitor refer.
+       */
+
+      /* also set the Group[].GroupNr field (it was not stored in the fields that were read in);
+       * in contrast, Subhalo[].GroupNr was read in */
+      long long nbefore = 0;
+
+      for(int i = 0; i < ThisTask; i++)
+        nbefore += Cats[num].TabNgroups[i];
+
+      for(int i = 0; i < Cats[num].Ngroups; i++)
+        Cats[num].Group[i].GroupNr = nbefore + i;
+
+      /* define a UniqueGroupNr field that gives a group number unique across all snapshot catalogues */
+      for(int i = 0; i < Cats[num].Nsubhalos; i++)
+        Cats[num].Subhalo[i].UniqueGroupNr = Cats[num].Subhalo[i].GroupNr + grprev;
+
+      grprev += Cats[num].TotNgroups;
+
+      /* assign some special properties to the subhalos in the tree which come from the FOF group catalogue (like M200, etc.) */
+
+      for(int i = 0; i < Cats[num].Nsubhalos; i++)
+        Cats[num].Subhalo[i].M_Crit200 = 0;
+
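+      /* since the FOF group a subhalo belongs to may be stored on a different task, a small communication step is
+       * needed: the main subhalo of each group requests M_Crit200 from the task holding the corresponding group entry */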
+      int *Send_count  = (int *)Mem.mymalloc("Send_count", sizeof(int) * NTask);
+      int *Send_offset = (int *)Mem.mymalloc("Send_offset", sizeof(int) * NTask);
+      int *Recv_count  = (int *)Mem.mymalloc("Recv_count", sizeof(int) * NTask);
+      int *Recv_offset = (int *)Mem.mymalloc("Recv_offset", sizeof(int) * NTask);
+
+      struct exch_data
+      {
+        long long GroupNr;
+        MyFloat M_Crit200;
+        int loc_index;
+      };
+
+      exch_data *export_data = NULL, *import_data = NULL;
+      int nimport = 0, nexport = 0;
+
+      /* for communication bookkeeping reasons, we traverse the counting pattern twice */
+      for(int mode = 0; mode < 2; mode++)
+        {
+          for(int i = 0; i < NTask; i++)
+            Send_count[i] = 0;
+
+          int target                = 0;
+          long long ngroup_previous = 0;
+
+          for(int i = 0; i < Cats[num].Nsubhalos; i++)
+            {
+              /* select only the main subhalos */
+              if(Cats[num].Subhalo[i].SubRankInGr == 0)
+                {
+                  while(target < NTask - 1 && Cats[num].Subhalo[i].GroupNr >= (ngroup_previous + Cats[num].TabNgroups[target]))
+                    {
+                      ngroup_previous += Cats[num].TabNgroups[target];
+                      target++;
+                    }
+
+                  if(mode == 0)
+                    Send_count[target]++;
+                  else
+                    {
+                      int off = Send_offset[target] + Send_count[target]++;
+
+                      export_data[off].loc_index = i;
+                      export_data[off].GroupNr   = Cats[num].Subhalo[i].GroupNr;
+                    }
+                }
+            }
+
+          if(mode == 0)
+            {
+              MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
+              Recv_offset[0] = Send_offset[0] = 0;
+              for(int j = 0; j < NTask; j++)
+                {
+                  nimport += Recv_count[j];
+                  nexport += Send_count[j];
+                  if(j > 0)
+                    {
+                      Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
+                      Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
+                    }
+                }
+
+              export_data = (exch_data *)Mem.mymalloc("export_data", nexport * sizeof(exch_data));
+              import_data = (exch_data *)Mem.mymalloc("import_data", nimport * sizeof(exch_data));
+            }
+        }
+
+      /* send data to target processors  */
+      for(int ngrp = 0; ngrp < (1 << PTask); ngrp++)
+        {
+          int recvTask = ThisTask ^ ngrp;
+          if(recvTask < NTask)
+            if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+              MPI_Sendrecv(&export_data[Send_offset[recvTask]], Send_count[recvTask] * sizeof(exch_data), MPI_BYTE, recvTask,
+                           TAG_DENS_B, &import_data[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(exch_data), MPI_BYTE,
+                           recvTask, TAG_DENS_B, Communicator, MPI_STATUS_IGNORE);
+        }
+
+      long long firstgrnr = 0;
+      for(int i = 0; i < ThisTask; i++)
+        firstgrnr += Cats[num].TabNgroups[i];
+
+      /* now read out the information we want from the groups */
+
+      for(int i = 0; i < nimport; i++)
+        {
+          int index = import_data[i].GroupNr - firstgrnr;
+
+          if(Cats[num].Group[index].GroupNr != import_data[i].GroupNr)
+            Terminate(
+                "bummer: num=%d i=%d Cats[num].Ngroups=%d nimport=%d  index=%d Cats[num].Group[index].GroupNr=%lld "
+                "import_data[i].GroupNr=%lld\n",
+                num, i, Cats[num].Ngroups, nimport, index, Cats[num].Group[index].GroupNr, import_data[i].GroupNr);
+
+          import_data[i].M_Crit200 = Cats[num].Group[index].M_Crit200;
+        }
+
+      /* send the results back */
+      for(int ngrp = 0; ngrp < (1 << PTask); ngrp++) /* note: here we also have a transfer from each task to itself (for ngrp=0) */
+        {
+          int recvTask = ThisTask ^ ngrp;
+          if(recvTask < NTask)
+            if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+              MPI_Sendrecv(&import_data[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(exch_data), MPI_BYTE, recvTask,
+                           TAG_DENS_B, &export_data[Send_offset[recvTask]], Send_count[recvTask] * sizeof(exch_data), MPI_BYTE,
+                           recvTask, TAG_DENS_B, Communicator, MPI_STATUS_IGNORE);
+        }
+
+      /* now read it out and assign the data */
+      for(int i = 0; i < nexport; i++)
+        Cats[num].Subhalo[export_data[i].loc_index].M_Crit200 = export_data[i].M_Crit200;
+
+      Mem.myfree(import_data);
+      Mem.myfree(export_data);
+
+      Mem.myfree(Recv_offset);
+      Mem.myfree(Recv_count);
+      Mem.myfree(Send_offset);
+      Mem.myfree(Send_count);
+    }
+}
+
+/* Initially, give each subhalo its own unique TreeID, and assign it to a randomly selected processor.
+ */
+void mergertree::halotrees_initial_treeassignment(void)
+{
+  long long previous = 0;
+  for(int num = LastSnapShotNr; num >= 0; num--)
+    {
+      long long prevtask = 0;
+      for(int i = 0; i < ThisTask; i++)
+        prevtask += Cats[num].TabNsubhalos[i];
+
+      for(int i = 0; i < Cats[num].Nsubhalos; i++)
+        {
+          Cats[num].Subhalo[i].TreeTask = (int)(get_random_number() * NTask);
+          Cats[num].Subhalo[i].TreeID   = previous + prevtask + i;
+        }
+
+      previous += Cats[num].TotNsubhalos;
+    }
+}
+
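+/* This helper sweeps forward and backward over the locally stored, sorted list of tree halos and propagates the
+ * smallest NewTreeID (and its NewTreeTask) among neighbouring entries that share the same UniqueGroupNr (mode 0)
+ * or the same TreeID (mode 1). */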
+void mergertree::halotrees_select_interior_min_newtreeid(int mode, tlink *treehalos, long long totnsubs)
+{
+  for(int backforth = -1; backforth <= 1; backforth += 2)
+    {
+      long long i;
+
+      if(backforth == -1)
+        i = 0;
+      else
+        i = totnsubs - 2;
+
+      while(i >= 0 && i <= totnsubs - 2)
+        {
+          if((mode == 0 && treehalos[i].UniqueGroupNr == treehalos[i + 1].UniqueGroupNr) ||
+             (mode == 1 && treehalos[i].TreeID == treehalos[i + 1].TreeID))
+            {
+              if(treehalos[i].NewTreeID > treehalos[i + 1].NewTreeID)
+                {
+                  treehalos[i].NewTreeID   = treehalos[i + 1].NewTreeID;
+                  treehalos[i].NewTreeTask = treehalos[i + 1].NewTreeTask;
+                }
+              else if(treehalos[i].NewTreeID < treehalos[i + 1].NewTreeID)
+                {
+                  treehalos[i + 1].NewTreeID   = treehalos[i].NewTreeID;
+                  treehalos[i + 1].NewTreeTask = treehalos[i].NewTreeTask;
+                }
+            }
+          if(backforth == -1)
+            i++;
+          else
+            i--;
+        }
+    }
+}
+
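+/* This function merges trees that are bridged either by membership in a common FOF group (mode 0) or by a common
+ * most bound particle ID (mode 1). All subhalos of all catalogues are collected into one global list, which is
+ * sorted alternately by the bridging quantity and by the old TreeID, and the smallest NewTreeID found among linked
+ * halos is propagated. The return value is the number of subhalos whose TreeID was changed. */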
+long long mergertree::halotrees_join_trees_via_fof_or_mostboundid_bridges(int mode)
+{
+  long long totnsubs = 0;
+
+  for(int num = 0; num <= LastSnapShotNr; num++)
+    totnsubs += Cats[num].Nsubhalos;
+
+  tlink *treehalos = (tlink *)Mem.mymalloc("treehalos", (totnsubs + 1) * sizeof(tlink));
+
+  long long count = 0;
+
+  for(int num = 0; num <= LastSnapShotNr; num++)
+    for(int i = 0; i < Cats[num].Nsubhalos; i++)
+      {
+        treehalos[count].TreeID   = Cats[num].Subhalo[i].TreeID;
+        treehalos[count].TreeTask = Cats[num].Subhalo[i].TreeTask;
+
+        treehalos[count].NewTreeID   = Cats[num].Subhalo[i].TreeID;
+        treehalos[count].NewTreeTask = Cats[num].Subhalo[i].TreeTask;
+
+        treehalos[count].OrigTask   = ThisTask;
+        treehalos[count].OrderIndex = count;
+
+        if(mode == 0)
+          treehalos[count].UniqueGroupNr = Cats[num].Subhalo[i].UniqueGroupNr;
+        else
+          treehalos[count].UniqueGroupNr = Cats[num].Subhalo[i].SubMostBoundID;
+
+        count++;
+      }
+
+  long long *count_list = (long long *)Mem.mymalloc("count_list", NTask * sizeof(long long));
+  MPI_Allgather(&totnsubs, sizeof(long long), MPI_BYTE, count_list, sizeof(long long), MPI_BYTE, Communicator);
+
+  int next_task = -1;
+  for(int i = ThisTask + 1; i < NTask; i++)
+    if(count_list[i] > 0)
+      {
+        next_task = i;
+        break;
+      }
+
+  int prev_task = -1;
+  for(int i = ThisTask - 1; i >= 0; i--)
+    if(count_list[i] > 0)
+      {
+        prev_task = i;
+        break;
+      }
+
+  tlink *elem_first = (tlink *)Mem.mymalloc("elem_first", NTask * sizeof(tlink));
+  tlink *elem_last  = (tlink *)Mem.mymalloc("elem_last", NTask * sizeof(tlink));
+
+  for(int mode = 0; mode < 2; mode++)
+    {
+      if(mode == 0)
+        {
+          /* bring halos in the same group together */
+          mycxxsort_parallel(treehalos, treehalos + totnsubs, compare_tlink_GroupNr, Communicator);
+          halotrees_select_interior_min_newtreeid(mode, treehalos, totnsubs);
+        }
+      else
+        {
+          /* bring halos in the same tree together */
+          mycxxsort_parallel(treehalos, treehalos + totnsubs, compare_tlink_TreeID, Communicator);
+          halotrees_select_interior_min_newtreeid(mode, treehalos, totnsubs);
+        }
+
+      long long totchanges = 0;
+      int iter             = 0;
+      do
+        {
+          int changes = 0;
+
+          /* note: the 0th element is guaranteed to be allocated even on ranks with zero totnsubs */
+          MPI_Allgather(&treehalos[0], sizeof(tlink), MPI_BYTE, elem_first, sizeof(tlink), MPI_BYTE, Communicator);
+          MPI_Allgather(&treehalos[totnsubs > 0 ? totnsubs - 1 : 0], sizeof(tlink), MPI_BYTE, elem_last, sizeof(tlink), MPI_BYTE,
+                        Communicator);
+
+          if(prev_task >= 0 && totnsubs > 0 &&
+             ((mode == 0 && elem_last[prev_task].UniqueGroupNr == treehalos[0].UniqueGroupNr) ||
+              (mode == 1 && elem_last[prev_task].TreeID == treehalos[0].TreeID)) &&
+             elem_last[prev_task].NewTreeID < treehalos[0].NewTreeID)
+            {
+              treehalos[0].NewTreeID   = elem_last[prev_task].NewTreeID;
+              treehalos[0].NewTreeTask = elem_last[prev_task].NewTreeTask;
+              changes++;
+            }
+
+          if(next_task >= 0 && totnsubs > 0 &&
+             ((mode == 0 && elem_first[next_task].UniqueGroupNr == treehalos[totnsubs - 1].UniqueGroupNr) ||
+              (mode == 1 && elem_first[next_task].TreeID == treehalos[totnsubs - 1].TreeID)) &&
+             elem_first[next_task].NewTreeID < treehalos[totnsubs - 1].NewTreeID)
+            {
+              treehalos[totnsubs - 1].NewTreeID   = elem_first[next_task].NewTreeID;
+              treehalos[totnsubs - 1].NewTreeTask = elem_first[next_task].NewTreeTask;
+              changes++;
+            }
+
+          halotrees_select_interior_min_newtreeid(mode, treehalos, totnsubs);
+
+          sumup_large_ints(1, &changes, &totchanges, Communicator);
+
+          if(++iter > MAXITER)
+            Terminate("too many iterations. mode=%d changes=%d", mode, changes);
+        }
+      while(totchanges > 0);
+    }
+
+  Mem.myfree(elem_last);
+  Mem.myfree(elem_first);
+  Mem.myfree(count_list);
+
+  /* now bring halos into original order */
+  mycxxsort_parallel(treehalos, treehalos + totnsubs, compare_tlink_OrigTask_OrderIndex, Communicator);
+
+  /* transfer new TreeIDs back */
+  count = 0;
+
+  long long flips = 0;
+
+  for(int num = 0; num <= LastSnapShotNr; num++)
+    for(int i = 0; i < Cats[num].Nsubhalos; i++)
+      {
+        if(treehalos[count].NewTreeID != treehalos[count].TreeID)
+          {
+            Cats[num].Subhalo[i].TreeID   = treehalos[count].NewTreeID;
+            Cats[num].Subhalo[i].TreeTask = treehalos[count].NewTreeTask;
+
+            flips++;
+          }
+
+        count++;
+      }
+
+  Mem.myfree(treehalos);
+
+  return flips;
+}
+
+/* This function is in charge of linking the halos according to the descendant information.
+ */
+void mergertree::halotrees_link_trees(void)
+{
+  /* we build up the trees by assigning the same TreeID and the same TreeTask to any two subhalos that belong to the same tree */
+
+  long long changesA = 0, changesB = 0;
+  int iter = 0;
+
+  do
+    {
+      changesA = 0;
+
+      /* propagate the TreeIDs from level num to level num-1, via the descendant relationship from num-1 to num */
+      if(iter & 1)
+        for(int num = 1; num <= LastSnapShotNr; num++)
+          changesA += halotrees_join_via_descendants(num);
+      else
+        for(int num = LastSnapShotNr; num > 0; num--)
+          changesA += halotrees_join_via_descendants(num);
+
+      changesB = 0;
+
+      /* propagate the TreeIDs from level num to level num+1, via the progenitor relationship from num+1 to num */
+      if(iter & 1)
+        for(int num = LastSnapShotNr - 1; num >= 0; num--)
+          changesB += halotrees_join_via_progenitors(num);
+      else
+        for(int num = 0; num < LastSnapShotNr; num++)
+          changesB += halotrees_join_via_progenitors(num);
+
+      MPI_Allreduce(MPI_IN_PLACE, &changesA, 1, MPI_LONG_LONG, MPI_SUM, Communicator);
+      MPI_Allreduce(MPI_IN_PLACE, &changesB, 1, MPI_LONG_LONG, MPI_SUM, Communicator);
+
+      mpi_printf("MERGERTREE: iteration %d of joining trees via descendants and progenitors,  %lld   %lld  links\n", iter, changesA,
+                 changesB);
+
+      if(++iter > MAXITER)
+        Terminate("too many iterations");
+    }
+  while(changesA + changesB > 0);
+
+  /* Now also link all subhalos that are in a common FOF halo or that have their most bound particle ID in common,
+   * so that they appear in the same tree */
+  for(int mode = 0; mode < 2; mode++)
+    {
+      long long totlinks = 0;
+      int iter           = 0;
+      do
+        {
+          long long links = halotrees_join_trees_via_fof_or_mostboundid_bridges(mode);
+
+          MPI_Allreduce(&links, &totlinks, 1, MPI_LONG_LONG, MPI_SUM, Communicator);
+
+          if(mode == 0)
+            mpi_printf("MERGERTREE: iteration %d of joining trees via common FOF bridges yielded %lld links\n", iter, totlinks);
+          else
+            mpi_printf("MERGERTREE: iteration %d of joining trees via common MostboundID bridges yielded %lld links\n", iter,
+                       totlinks);
+
+          if(++iter > MAXITER)
+            Terminate("too many iterations");
+        }
+      while(totlinks > 0);
+    }
+}
+
+/* This function brings the halos of the trees together according to their TreeID, so that they are ready for output
+ * to a file. We also assign new TreeIDs to obtain a contiguous numbering, and we set up a table that gives the number
+ * of halos in each tree, as well as a cumulative offset into the output file where each tree starts
+ */
+void mergertree::halotrees_collect_treehalos(void)
+{
+  Nhalos = 0;
+
+  for(int num = 0; num <= LastSnapShotNr; num++)
+    Nhalos += Cats[num].Nsubhalos;
+
+  /* get the total number of all subhalos in all trees */
+  sumup_large_ints(1, &Nhalos, &TotNhalos, Communicator);
+
+  /* we now release some memory that is not needed any more in order to reduce peak memory usage */
+  for(int num = LastSnapShotNr; num >= 0; num--)
+    {
+      Mem.myfree(Cats[num].TabNsubhalos);
+      Cats[num].TabNsubhalos = NULL;
+
+      Mem.myfree(Cats[num].TabNgroups);
+      Cats[num].TabNgroups = NULL;
+    }
+
+  for(int num = LastSnapShotNr; num >= 1; num--)
+    {
+      Mem.myfree(Cats[num].Progenitors);
+      Cats[num].Progenitors = NULL;
+    }
+
+  for(int num = LastSnapShotNr - 1; num >= 0; num--)
+    {
+      Mem.myfree(Cats[num].Descendants);
+      Cats[num].Descendants = NULL;
+    }
+
+  for(int num = LastSnapShotNr; num >= 0; num--)
+    {
+      Mem.myfree_movable(Cats[num].Group);
+      Cats[num].Group = NULL;
+    }
+
+  Halos = (treehalo_type *)Mem.mymalloc_movable(&Halos, "Halos", (Nhalos + 1) * sizeof(treehalo_type));
+
+  /* set up the halo data for the tree output */
+  long long off = 0;
+  for(int num = 0; num <= LastSnapShotNr; num++)
+    for(int i = 0; i < Cats[num].Nsubhalos; i++)
+      {
+        Halos[off].TreeID    = Cats[num].Subhalo[i].TreeID;
+        Halos[off].TreeIndex = Cats[num].Subhalo[i].TreeIndex;
+
+        Halos[off].TreeMainProgenitor  = Cats[num].SubExt[i].TreeMainProgenitor;
+        Halos[off].TreeFirstProgenitor = Cats[num].SubExt[i].TreeFirstProgenitor;
+        Halos[off].TreeNextProgenitor  = Cats[num].SubExt[i].TreeNextProgenitor;
+        Halos[off].TreeDescendant      = Cats[num].SubExt[i].TreeDescendant;
+
+        Halos[off].TreeFirstDescendant = Cats[num].SubExt[i].TreeFirstDescendant;
+        Halos[off].TreeNextDescendant  = Cats[num].SubExt[i].TreeNextDescendant;
+        Halos[off].TreeProgenitor      = Cats[num].SubExt[i].TreeProgenitor;
+
+        Halos[off].SnapNum       = num;
+        Halos[off].SubhaloNr     = Cats[num].Subhalo[i].SubhaloNr;
+        Halos[off].GroupNr       = Cats[num].Subhalo[i].GroupNr;
+        Halos[off].UniqueGroupNr = Cats[num].Subhalo[i].UniqueGroupNr;
+
+        Halos[off].SubProp = Cats[num].Subhalo[i];
+
+        off++;
+      }
+
+  /* release some memory to reduce peak memory usage */
+  for(int num = LastSnapShotNr; num >= 0; num--)
+    {
+      Mem.myfree_movable(Cats[num].SubExt);
+      Cats[num].SubExt = NULL;
+    }
+
+  long long *count_list = (long long *)Mem.mymalloc("count_list", NTask * sizeof(long long));
+  long long totnsubs    = Nhalos;
+  MPI_Allgather(&totnsubs, sizeof(long long), MPI_BYTE, count_list, sizeof(long long), MPI_BYTE, Communicator);
+
+  /* first set the fields TreeFirstHaloInFOFgroup and TreeNextHaloInFOFgroup */
+  mycxxsort_parallel(Halos, Halos + Nhalos, compare_Halos_UniqueGroupNr_SubhaloNr, Communicator);
+
+  long long previous_UniqueGroupNr = HALONR_MAX;
+  long long new_treeindex          = HALONR_MAX;
+  for(int i = 0; i < Nhalos; i++)
+    {
+      if(Halos[i].UniqueGroupNr != previous_UniqueGroupNr)
+        {
+          previous_UniqueGroupNr = Halos[i].UniqueGroupNr;
+          new_treeindex          = Halos[i].TreeIndex;
+        }
+
+      Halos[i].TreeFirstHaloInFOFgroup = new_treeindex;
+
+      if(i < Nhalos - 1 && Halos[i].UniqueGroupNr == Halos[i + 1].UniqueGroupNr)
+        Halos[i].TreeNextHaloInFOFgroup = Halos[i + 1].TreeIndex;
+      else
+        Halos[i].TreeNextHaloInFOFgroup = -1;
+    }
+
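+  /* exchange the first and last local entries with all other tasks so that FOF groups straddling a task boundary
+   * obtain consistent TreeFirstHaloInFOFgroup and TreeNextHaloInFOFgroup pointers */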
+  treehalo_type *efirst = (treehalo_type *)Mem.mymalloc("efirst", NTask * sizeof(treehalo_type));
+  treehalo_type *elast  = (treehalo_type *)Mem.mymalloc("elast", NTask * sizeof(treehalo_type));
+  /* note: the 0th element is guaranteed to be allocated even on ranks with zero totnsubs */
+  MPI_Allgather(&Halos[0], sizeof(treehalo_type), MPI_BYTE, efirst, sizeof(treehalo_type), MPI_BYTE, Communicator);
+  MPI_Allgather(&Halos[Nhalos > 0 ? Nhalos - 1 : 0], sizeof(treehalo_type), MPI_BYTE, elast, sizeof(treehalo_type), MPI_BYTE,
+                Communicator);
+
+  if(Nhalos > 0)
+    for(int task = ThisTask + 1; task < NTask; task++)
+      if(count_list[task] > 0)
+        {
+          if(Halos[Nhalos - 1].UniqueGroupNr == efirst[task].UniqueGroupNr)
+            Halos[Nhalos - 1].TreeNextHaloInFOFgroup = efirst[task].TreeIndex;
+
+          break;
+        }
+
+  if(Nhalos > 0)
+    {
+      long long previous_UniqueGroupNr = HALONR_MAX;
+      long long new_treeindex          = HALONR_MAX;
+
+      for(int task = ThisTask - 1; task >= 0; task--)
+        {
+          if(count_list[task] > 0)
+            {
+              if(elast[task].UniqueGroupNr == Halos[0].UniqueGroupNr)
+                {
+                  previous_UniqueGroupNr = elast[task].UniqueGroupNr;
+                  new_treeindex          = elast[task].TreeFirstHaloInFOFgroup;
+                }
+            }
+        }
+
+      for(int i = 0; i < Nhalos; i++)
+        {
+          if(Halos[i].UniqueGroupNr != previous_UniqueGroupNr)
+            {
+              previous_UniqueGroupNr = Halos[i].UniqueGroupNr;
+              new_treeindex          = Halos[i].TreeIndex;
+              break;
+            }
+
+          Halos[i].TreeFirstHaloInFOFgroup = new_treeindex;
+        }
+    }
+
+  Mem.myfree(elast);
+  Mem.myfree(efirst);
+
+  /* parallel sort according to treeid and treeindex - this brings the tree halos together, in ascending order */
+  mycxxsort_parallel(Halos, Halos + Nhalos, compare_Halos_TreeID_TreeIndex, Communicator);
+
+  treehalo_type *elem_last = (treehalo_type *)Mem.mymalloc("elem_last", NTask * sizeof(treehalo_type));
+  /* now count the trees and overwrite the TreeIDs with new continuous IDs starting from zero */
+  MPI_Allgather(&Halos[Nhalos > 0 ? Nhalos - 1 : 0], sizeof(treehalo_type), MPI_BYTE, elem_last, sizeof(treehalo_type), MPI_BYTE,
+                Communicator);
+
+  long long treeid_previous = -1;
+  for(int task = ThisTask - 1; task >= 0; task--)
+    {
+      if(count_list[task] > 0)
+        {
+          treeid_previous = elem_last[task].TreeID;
+          break;
+        }
+    }
+
+  Ntrees = 0;
+  for(int i = 0; i < Nhalos; i++)
+    {
+      if(Halos[i].TreeID != treeid_previous)
+        Ntrees++;
+
+      treeid_previous = Halos[i].TreeID;
+      Halos[i].TreeID = Ntrees - 1;
+    }
+
+  int *ntrees_list = (int *)Mem.mymalloc("ntrees_list", NTask * sizeof(int));
+  MPI_Allgather(&Ntrees, 1, MPI_INT, ntrees_list, 1, MPI_INT, Communicator);
+
+  TotNtrees               = 0;
+  long long ntrees_before = 0;
+  for(int i = 0; i < NTask; i++)
+    {
+      TotNtrees += ntrees_list[i];
+      if(i < ThisTask)
+        ntrees_before += ntrees_list[i];
+    }
+
+  for(int i = 0; i < Nhalos; i++)
+    Halos[i].TreeID += ntrees_before;
+
+  /* let's now allocate the table of trees, with an extra [-1] element */
+  TreeTable = (halotrees_table *)Mem.mymalloc_movable(&TreeTable, "TreeTable", (Ntrees + 1) * sizeof(halotrees_table));
+  memset(TreeTable, 0, (Ntrees + 1) * sizeof(halotrees_table));
+  TreeTable += 1;
+
+  /* update what we have stored for the last element */
+  MPI_Allgather(&Halos[Nhalos > 0 ? Nhalos - 1 : 0], sizeof(treehalo_type), MPI_BYTE, elem_last, sizeof(treehalo_type), MPI_BYTE,
+                Communicator);
+
+  treeid_previous = -1;
+  for(int task = ThisTask - 1; task >= 0; task--)
+    {
+      if(count_list[task] > 0)
+        {
+          treeid_previous = elem_last[task].TreeID;
+          break;
+        }
+    }
+
+  Ntrees = 0;
+  for(int i = 0; i < Nhalos; i++)
+    {
+      if(Halos[i].TreeID != treeid_previous)
+        Ntrees++;
+
+      treeid_previous = Halos[i].TreeID;
+      TreeTable[Ntrees - 1].HaloCount++;
+      TreeTable[Ntrees - 1].TreeID = Halos[i].TreeID;
+    }
+
+  /* in the TreeTable[-1] element we have counted segments of trees that do not start on the local processor */
+
+  halotrees_table *elem_first = (halotrees_table *)Mem.mymalloc("elem_first", NTask * sizeof(halotrees_table));
+
+  MPI_Allgather(&TreeTable[-1], sizeof(halotrees_table), MPI_BYTE, elem_first, sizeof(halotrees_table), MPI_BYTE, Communicator);
+
+  if(Ntrees > 0)
+    for(int task = ThisTask + 1; task < NTask; task++)
+      {
+        if(TreeTable[Ntrees - 1].TreeID == elem_first[task].TreeID)
+          TreeTable[Ntrees - 1].HaloCount += elem_first[task].HaloCount;
+        else
+          break;
+      }
+
+  Mem.myfree(elem_first);
+
+  long long sumhalos = 0;
+  LargestHaloCount   = 0;
+  for(int i = 0; i < Ntrees; i++)
+    {
+      sumhalos += TreeTable[i].HaloCount;
+      if(TreeTable[i].HaloCount > LargestHaloCount)
+        LargestHaloCount = TreeTable[i].HaloCount;
+    }
+  MPI_Allreduce(MPI_IN_PLACE, &LargestHaloCount, 1, MPI_INT, MPI_MAX, Communicator);
+
+  long long *list_sumhalos = (long long *)Mem.mymalloc("list_sumhalos", NTask * sizeof(long long));
+  MPI_Allgather(&sumhalos, sizeof(long long), MPI_BYTE, list_sumhalos, sizeof(long long), MPI_BYTE, Communicator);
+
+  long long sum_check = 0;
+  for(int i = 0; i < NTask; i++)
+    sum_check += list_sumhalos[i];
+
+  if(sum_check != TotNhalos)
+    Terminate("TotNTrees=%lld  sum_check=%lld  !=  TotNhalos=%lld", (long long)TotNtrees, sum_check, (long long)TotNhalos);
+
+  if(Ntrees > 0)
+    {
+      TreeTable[0].FirstHalo = 0;
+      for(int task = 0; task < ThisTask; task++)
+        TreeTable[0].FirstHalo += list_sumhalos[task];
+    }
+
+  Mem.myfree(list_sumhalos);
+
+  for(int i = 1; i < Ntrees; i++)
+    TreeTable[i].FirstHalo = TreeTable[i - 1].FirstHalo + TreeTable[i - 1].HaloCount;
+
+  Mem.myfree_movable(ntrees_list);
+  Mem.myfree_movable(elem_last);
+  Mem.myfree_movable(count_list);
+}
+
+/*--------------------------------------------------------------------------------------------------------------*/
+
+/* This function redistributes a subdivided global array with local pieces stored in an address pointed to by ptr.
+ * On the local processor, there are currently 'ncurrent' elements, but we want 'ntarget' elements. The size of
+ * one element is given by 'len'. The buffer, whose address is stored in *ptr, must be resizable and will be
+ * reallocated to the new size of ntarget*len bytes.
+ */
+void mergertree::halotrees_reshuffle(char **ptr, size_t len, int ncurrent, int ntarget)
+{
+  int *Send_count  = (int *)Mem.mymalloc_movable(&Send_count, "Send_count", sizeof(int) * NTask);
+  int *Send_offset = (int *)Mem.mymalloc_movable(&Send_offset, "Send_offset", sizeof(int) * NTask);
+  int *Recv_count  = (int *)Mem.mymalloc_movable(&Recv_count, "Recv_count", sizeof(int) * NTask);
+  int *Recv_offset = (int *)Mem.mymalloc_movable(&Recv_offset, "Recv_offset", sizeof(int) * NTask);
+
+  /* copy current data to an auxiliary buffer */
+  char *buf = (char *)Mem.mymalloc_movable(&buf, "buf", ncurrent * len);
+  memcpy(buf, *ptr, ncurrent * len);
+
+  /* now resize the source buffer so that it can accommodate the data of the desired target size */
+  *ptr = (char *)Mem.myrealloc_movable(*ptr, ntarget * len);
+
+  /* collect the current and target layout of the array */
+  int *tab_ncurrent = (int *)Mem.mymalloc("tab_ncurrent", NTask * sizeof(int));
+  MPI_Allgather(&ncurrent, 1, MPI_INT, tab_ncurrent, 1, MPI_INT, Communicator);
+
+  int *tab_ntarget = (int *)Mem.mymalloc("tab_ntarget", NTask * sizeof(int));
+  MPI_Allgather(&ntarget, 1, MPI_INT, tab_ntarget, 1, MPI_INT, Communicator);
+
+  /* now work out where our local data should go */
+  int nexport = 0, nimport = 0;
+
+  for(int i = 0; i < NTask; i++)
+    Send_count[i] = 0;
+
+  int nbefore = 0;
+  for(int i = 0; i < ThisTask; i++)
+    nbefore += tab_ncurrent[i];
+
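+  /* each local element i has global index nbefore + i; advance 'target' until this index falls into that task's
+   * slot of the new layout, and count the element for that task */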
+  int target = 0, ncum = 0;
+  for(int i = 0; i < ncurrent; i++)
+    {
+      while(target < NTask - 1 && nbefore + i >= ncum + tab_ntarget[target])
+        ncum += tab_ntarget[target++];
+
+      Send_count[target]++;
+    }
+
+  MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
+  Recv_offset[0] = Send_offset[0] = 0;
+
+  for(int j = 0; j < NTask; j++)
+    {
+      nimport += Recv_count[j];
+      nexport += Send_count[j];
+
+      if(j > 0)
+        {
+          Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
+          Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
+        }
+    }
+
+  if(nimport != ntarget)
+    Terminate("nimport != ntarget");
+
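+  /* exchange the pieces pairwise following a hypercube communication pattern; for ngrp=0 the task simply copies
+   * its own contribution from the auxiliary buffer into place */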
+  for(int ngrp = 0; ngrp < (1 << PTask); ngrp++)
+    {
+      int recvTask = ThisTask ^ ngrp;
+      if(recvTask < NTask)
+        if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+          MPI_Sendrecv(&buf[Send_offset[recvTask] * len], Send_count[recvTask] * len, MPI_BYTE, recvTask, TAG_DENS_B,
+                       *ptr + Recv_offset[recvTask] * len, Recv_count[recvTask] * len, MPI_BYTE, recvTask, TAG_DENS_B, Communicator,
+                       MPI_STATUS_IGNORE);
+    }
+
+  Mem.myfree(tab_ntarget);
+  Mem.myfree(tab_ncurrent);
+  Mem.myfree(buf);
+
+  Mem.myfree(Recv_offset);
+  Mem.myfree(Recv_count);
+  Mem.myfree(Send_offset);
+  Mem.myfree(Send_count);
+}
+
+/* This function sets up the new pointers for navigating within a given tree based on TreeIndex.
+ * This is done by translating the old pointers, which refer to global subhalo numbers, into tree-local TreeIndex values.
+ */
+void mergertree::halotrees_remap_treepointers(void)
+{
+  int *Send_count  = (int *)Mem.mymalloc("Send_count", sizeof(int) * NTask);
+  int *Send_offset = (int *)Mem.mymalloc("Send_offset", sizeof(int) * NTask);
+  int *Recv_count  = (int *)Mem.mymalloc("Recv_count", sizeof(int) * NTask);
+  int *Recv_offset = (int *)Mem.mymalloc("Recv_offset", sizeof(int) * NTask);
+
+  /* initialize new pointers to default values */
+  for(int num = 0; num <= LastSnapShotNr; num++)
+    for(int i = 0; i < Cats[num].Nsubhalos; i++)
+      {
+        Cats[num].SubExt[i].TreeMainProgenitor  = -1;
+        Cats[num].SubExt[i].TreeFirstProgenitor = -1;
+        Cats[num].SubExt[i].TreeNextProgenitor  = -1;
+        Cats[num].SubExt[i].TreeDescendant      = -1;
+        Cats[num].SubExt[i].TreeFirstDescendant = -1;
+        Cats[num].SubExt[i].TreeNextDescendant  = -1;
+        Cats[num].SubExt[i].TreeProgenitor      = -1;
+      }
+
+  /* the pointers we have to set either point one snapshot back (e.g. FirstProgenitor, delta=-1),
+   * lie within the same snapshot (NextProgenitor, delta=0), or point one snapshot into the future (Descendant, delta=+1)
+   */
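+  /* for delta = -1, 0, +1 respectively, set=0 fills (TreeFirstProgenitor, TreeNextProgenitor, TreeDescendant),
+   * set=1 fills (TreeProgenitor, TreeNextDescendant, TreeFirstDescendant), and set=2 fills only TreeMainProgenitor (delta=-1) */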
+  for(int set = 0; set < 3; set++)
+    for(int delta = 1; delta >= -1; delta--)
+      {
+        if(set == 2 && delta != -1)
+          continue;
+
+        /* note that we take advantage of the fact that the Subgroups are ordered according to their global SubhaloNr */
+
+        /* in snapshot number 'base' we want to remap the pointers */
+        for(int base = 0; base <= LastSnapShotNr; base++)
+          {
+            /* this is the snapshot we are pointing to */
+            int num = base + delta;
+
+            if(set == 1 && delta != 1 && num == 0)
+              continue;
+
+            if((delta == -1 && num >= 0) || (delta >= 0 && base < LastSnapShotNr))
+              {
+                /* let's get the minimum and maximum subhalo numbers for the target snapshot number as a function of the task */
+                long long *list_min_subhalonr = (long long *)Mem.mymalloc("list_min_subhalonr", NTask * sizeof(long long));
+                long long *list_max_subhalonr = (long long *)Mem.mymalloc("list_max_subhalonr", NTask * sizeof(long long));
+
+                long long min_subhalonr =
+                    Cats[num].Nsubhalos > 0 ? Cats[num].Subhalo[0].SubhaloNr : HALONR_MAX;  // HALONR_MAX means empty here
+                long long max_subhalonr = Cats[num].Nsubhalos > 0 ? Cats[num].Subhalo[Cats[num].Nsubhalos - 1].SubhaloNr
+                                                                  : HALONR_MAX;  // HALONR_MAX means empty here
+
+                MPI_Allgather(&min_subhalonr, sizeof(long long), MPI_BYTE, list_min_subhalonr, sizeof(long long), MPI_BYTE,
+                              Communicator);
+                MPI_Allgather(&max_subhalonr, sizeof(long long), MPI_BYTE, list_max_subhalonr, sizeof(long long), MPI_BYTE,
+                              Communicator);
+
+                /* prepare a list of the current pointer values on the local processor */
+                data_list *list = (data_list *)Mem.mymalloc("list", Cats[base].Nsubhalos * sizeof(data_list));
+                int count       = 0;
+                for(int i = 0; i < Cats[base].Nsubhalos; i++)
+                  {
+                    if(set == 0)
+                      switch(delta)
+                        {
+                          case -1:
+                            list[count].targetsubhalonr = Cats[base].Progenitors[i].FirstProgSubhaloNr;
+                            break;
+                          case 0:
+                            list[count].targetsubhalonr = Cats[base].Descendants[i].NextProgSubhaloNr;
+                            break;
+                          case +1:
+                            list[count].targetsubhalonr = Cats[base].Descendants[i].DescSubhaloNr;
+                            break;
+                        }
+                    else if(set == 1)
+                      switch(delta)
+                        {
+                          case -1:
+                            list[count].targetsubhalonr = Cats[base].Progenitors[i].ProgSubhaloNr;
+                            break;
+                          case 0:
+                            list[count].targetsubhalonr = Cats[base].Progenitors[i].NextDescSubhaloNr;
+                            break;
+                          case +1:
+                            list[count].targetsubhalonr = Cats[base].Descendants[i].FirstDescSubhaloNr;
+                            break;
+                        }
+                    else
+                      switch(delta)
+                        {
+                          case -1:
+                            list[count].targetsubhalonr = Cats[base].Progenitors[i].MainProgSubhaloNr;
+                            break;
+                        }
+
+                    list[count].originsubhalonr = Cats[base].Subhalo[i].SubhaloNr;
+
+                    if(list[count].targetsubhalonr >= 0 &&
+                       list[count].targetsubhalonr < (long long)HALONR_MAX)  // need to only process existing pointers
+                      {
+                        if(list[count].targetsubhalonr >= Cats[num].TotNsubhalos)
+                          Terminate("set=%d delta=%d num=%d base=%d list[count].targetsubhalonr=%lld >= Cats[num].TotNsubhalos=%lld\n",
+                                    set, delta, num, base, list[count].targetsubhalonr, Cats[num].TotNsubhalos);
+
+                        list[count].origin   = i;
+                        list[count].intreeid = Cats[base].Subhalo[i].TreeID;
+                        count++;
+                      }
+                  }
+
+                /* sort it by current pointer value (which is the subhalonr) */
+                mycxxsort(list, list + count, compare_data_list_subhalonnr);
+
+                /* we now need to send them to other target processors since they may contain the corresponding subhalos */
+
+                int nexport = 0, nimport = 0;
+
+                remap_data *import_data = NULL, *export_data = NULL;
+
+                for(int mode = 0; mode < 2; mode++)  // go through this twice to simplify bookkeeping
+                  {
+                    for(int i = 0; i < NTask; i++)
+                      Send_count[i] = 0;
+
+                    int target = 0;
+
+                    for(int i = 0; i < count; i++)
+                      {
+                        while(target < NTask - 1 &&
+                              (list_min_subhalonr[target] == HALONR_MAX || list[i].targetsubhalonr > list_max_subhalonr[target]))
+                          target++;
+
+                        if(list_min_subhalonr[target] != HALONR_MAX && list[i].targetsubhalonr >= list_min_subhalonr[target] &&
+                           list[i].targetsubhalonr <= list_max_subhalonr[target])
+                          {
+                            if(mode == 0)
+                              Send_count[target]++;
+                            else
+                              {
+                                int off = Send_offset[target] + Send_count[target]++;
+
+                                export_data[off].loc_index       = i;
+                                export_data[off].targetsubhalonr = list[i].targetsubhalonr;
+                                export_data[off].originsubhalonr = list[i].originsubhalonr;
+                                export_data[off].intreeid        = list[i].intreeid;
+                              }
+                          }
+                        else
+                          Terminate(
+                              "this shouldn't happen:  set=%d delta=%d num=%d base=%d delta=%d   i=%d|count=%d   "
+                              "list[i].targetsubhalonr=%lld   "
+                              "target=%d  "
+                              "list_min_subhalonr[target]=%lld  list_max_subhalonr[target]=%lld\n",
+                              set, delta, num, base, delta, i, count, (long long)list[i].targetsubhalonr, target,
+                              list_min_subhalonr[target], list_max_subhalonr[target]);
+                      }
+
+                    if(mode == 0)  // prepare offset tables
+                      {
+                        MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
+                        Recv_offset[0] = Send_offset[0] = 0;
+                        for(int j = 0; j < NTask; j++)
+                          {
+                            nimport += Recv_count[j];
+                            nexport += Send_count[j];
+                            if(j > 0)
+                              {
+                                Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
+                                Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
+                              }
+                          }
+
+                        export_data = (remap_data *)Mem.mymalloc("export_data", nexport * sizeof(remap_data));
+                        import_data = (remap_data *)Mem.mymalloc("import_data", nimport * sizeof(remap_data));
+                      }
+                  }
+
+                for(int ngrp = 0; ngrp < (1 << PTask); ngrp++)
+                  {
+                    /* note: here we also have a transfer from each task to itself (for ngrp=0) */
+                    int recvTask = ThisTask ^ ngrp;
+                    if(recvTask < NTask)
+                      if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+                        MPI_Sendrecv(&export_data[Send_offset[recvTask]], Send_count[recvTask] * sizeof(remap_data), MPI_BYTE,
+                                     recvTask, TAG_DENS_B, &import_data[Recv_offset[recvTask]],
+                                     Recv_count[recvTask] * sizeof(remap_data), MPI_BYTE, recvTask, TAG_DENS_B, Communicator,
+                                     MPI_STATUS_IGNORE);
+                  }
+
+                /* the incoming data is not necessarily sorted according to subhalonr, which is why we need to sort it now */
+                for(int i = 0; i < nimport; i++)
+                  import_data[i].orig_index = i;
+
+                mycxxsort(import_data, import_data + nimport, compare_remap_data_subhalonr);
+
+                /* now we check the incoming data, and prepare new values for the pointers */
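+                /* both Cats[num].Subhalo[] and import_data[] are sorted by subhalo number, so a single merge-style
+                 * sweep with the two indices i and j is sufficient */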
+                for(int i = 0, j = 0; i < Cats[num].Nsubhalos && j < nimport;)
+                  {
+                    if(Cats[num].Subhalo[i].SubhaloNr < import_data[j].targetsubhalonr)
+                      i++;
+                    else if(Cats[num].Subhalo[i].SubhaloNr > import_data[j].targetsubhalonr)
+                      {
+                        Terminate("Can't find targetsubhalonr=%lld Cats[num].Subhalo[i].SubhaloNr=%lld  i=%d|%d j=%d|%d",
+                                  import_data[j].targetsubhalonr, Cats[num].Subhalo[i].SubhaloNr, i, Cats[num].Nsubhalos, j, nimport);
+                      }
+                    else  // we have a match
+                      {
+                        import_data[j].new_treeindexptr = Cats[num].Subhalo[i].TreeIndex;
+                        import_data[j].treeid           = Cats[num].Subhalo[i].TreeID;
+
+                        if(Cats[num].Subhalo[i].TreeID != import_data[j].intreeid)
+                          Terminate(
+                              "\nWe are not in the same tree, which shouldn't be the case:  set=%d delta=%d  num=%d\n"
+                              "Cats[num].Subhalo[i].SubhaloNr=%lld \nCats[num].Subhalo[i].DescSubhaloNr=%lld\n"
+                              "Cats[num].Subhalo[i].NextProgSubhalor=%lld\nCats[num].Subhalo[i].FirstProgSubhalor=%lld\n"
+                              "Cats[num].Subhalo[i].NextDescSubhalor=%lld\nCats[num].Subhalo[i].FirstDescSubhalor=%lld\n"
+                              "Cats[num].Subhalo[i].ProgSubhalorNr=%lld\n"
+                              "originsubgalonr=%lld targetsubgalonr=%lld Cats[num].Subhalo[i].TreeID=%lld   "
+                              "import_data[j].intreeid=%lld   i=%d|%d  j=%d|%d  num=%d",
+                              set, delta, num, Cats[num].Subhalo[i].SubhaloNr, Cats[num].Descendants[i].DescSubhaloNr,
+                              Cats[num].Descendants[i].NextProgSubhaloNr, Cats[num].Progenitors[i].FirstProgSubhaloNr,
+                              Cats[num].Progenitors[i].NextDescSubhaloNr, Cats[num].Descendants[i].FirstDescSubhaloNr,
+                              Cats[num].Progenitors[i].ProgSubhaloNr, import_data[j].originsubhalonr, import_data[j].targetsubhalonr,
+                              Cats[num].Subhalo[i].TreeID, import_data[j].intreeid, i, Cats[num].Nsubhalos, j, nimport, num);
+
+                        j++;
+                      }
+                  }
+
+                mycxxsort(import_data, import_data + nimport, compare_remap_data_orig_index);
+
+                /* send the results back */
+                for(int ngrp = 0; ngrp < (1 << PTask); ngrp++)
+                  {
+                    /* note: here we also have a transfer from each task to itself (for ngrp=0) */
+                    int recvTask = ThisTask ^ ngrp;
+                    if(recvTask < NTask)
+                      if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+                        MPI_Sendrecv(&import_data[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(remap_data), MPI_BYTE,
+                                     recvTask, TAG_DENS_B, &export_data[Send_offset[recvTask]],
+                                     Send_count[recvTask] * sizeof(remap_data), MPI_BYTE, recvTask, TAG_DENS_B, Communicator,
+                                     MPI_STATUS_IGNORE);
+                  }
+
+                for(int i = 0; i < nexport; i++)
+                  {
+                    int idx = list[export_data[i].loc_index].origin;
+
+                    if(Cats[base].Subhalo[idx].TreeID != export_data[i].treeid)
+                      Terminate("something is wrong: delta=%d  i=%d|%d  idx=%d|%d     %lld %lld %lld", delta, i, nexport, idx,
+                                Cats[base].Nsubhalos, Cats[base].Subhalo[idx].TreeID, export_data[i].treeid, export_data[i].intreeid);
+
+                    if(set == 0)
+                      switch(delta)
+                        {
+                          case -1:
+                            Cats[base].SubExt[idx].TreeFirstProgenitor = export_data[i].new_treeindexptr;
+                            break;
+
+                          case 0:
+                            Cats[base].SubExt[idx].TreeNextProgenitor = export_data[i].new_treeindexptr;
+                            break;
+
+                          case +1:
+                            Cats[base].SubExt[idx].TreeDescendant = export_data[i].new_treeindexptr;
+                            break;
+                        }
+                    else if(set == 1)
+                      switch(delta)
+                        {
+                          case -1:
+                            Cats[base].SubExt[idx].TreeProgenitor = export_data[i].new_treeindexptr;
+                            break;
+
+                          case 0:
+                            Cats[base].SubExt[idx].TreeNextDescendant = export_data[i].new_treeindexptr;
+                            break;
+
+                          case +1:
+                            Cats[base].SubExt[idx].TreeFirstDescendant = export_data[i].new_treeindexptr;
+                            break;
+                        }
+                    else
+                      switch(delta)
+                        {
+                          case -1:
+                            Cats[base].SubExt[idx].TreeMainProgenitor = export_data[i].new_treeindexptr;
+                            break;
+                        }
+                  }
+
+                Mem.myfree(import_data);
+                Mem.myfree(export_data);
+                Mem.myfree(list);
+                Mem.myfree(list_max_subhalonr);
+                Mem.myfree(list_min_subhalonr);
+              }
+          }
+      }
+
+  Mem.myfree(Recv_offset);
+  Mem.myfree(Recv_count);
+  Mem.myfree(Send_offset);
+  Mem.myfree(Send_count);
+}
+
+/*--------------------------------------------------------------------------------------------------------------*/
+
+/* This function creates a new subhalo numbering within each tree, as identified by the set of subhalos with the same treeid.
+ */
+void mergertree::halotrees_assign_new_treeindices(void)
+{
+  long long totnsubs = 0;
+
+  for(int num = 0; num <= LastSnapShotNr; num++)
+    totnsubs += Cats[num].Nsubhalos;
+
+  assign_data *a_data = (assign_data *)Mem.mymalloc("a_data", (totnsubs + 1) * sizeof(assign_data));
+
+  long long count = 0;
+  for(int num = 0; num <= LastSnapShotNr; num++)
+    for(int i = 0; i < Cats[num].Nsubhalos; i++)
+      {
+        a_data[count].origin_task  = ThisTask;
+        a_data[count].origin_num   = num;
+        a_data[count].origin_index = i;
+        a_data[count].treeid       = Cats[num].Subhalo[i].TreeID;
+        count++;
+      }
+
+  mycxxsort_parallel(a_data, a_data + totnsubs, compare_assign_data_treeid_origin_num_origin_task_origin_index, Communicator);
+
+  long long newid = 0;
+
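+  /* walk through the globally sorted list: the within-tree index restarts at zero whenever a new TreeID begins, and
+   * 'newid' counts how many tree boundaries occur in the local part; offsets across task boundaries are fixed up below */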
+  for(long long i = 0, treeindex = 0; i < totnsubs; i++)
+    {
+      if(i == 0 || a_data[i].treeid != a_data[i - 1].treeid)
+        treeindex = 0;
+
+      a_data[i].treeindex = treeindex++;
+
+      if(i > 0 && a_data[i].treeid != a_data[i - 1].treeid)
+        newid++;
+
+      a_data[i].newtreeid = newid;
+    }
+
+  long long *count_list = (long long *)Mem.mymalloc("count_list", NTask * sizeof(long long));
+  long long *newid_list = (long long *)Mem.mymalloc("newid_list", NTask * sizeof(long long));
+  MPI_Allgather(&totnsubs, sizeof(long long), MPI_BYTE, count_list, sizeof(long long), MPI_BYTE, Communicator);
+  MPI_Allgather(&newid, sizeof(long long), MPI_BYTE, newid_list, sizeof(long long), MPI_BYTE, Communicator);
+
+  assign_data *elem_last  = (assign_data *)Mem.mymalloc("elem_last", NTask * sizeof(assign_data));
+  assign_data *elem_first = (assign_data *)Mem.mymalloc("elem_first", NTask * sizeof(assign_data));
+
+  /* note: the 0th element is guaranteed to be allocated even on ranks with zero totnsubs */
+  MPI_Allgather(&a_data[totnsubs > 0 ? totnsubs - 1 : 0], sizeof(assign_data), MPI_BYTE, elem_last, sizeof(assign_data), MPI_BYTE,
+                Communicator);
+  MPI_Allgather(&a_data[0], sizeof(assign_data), MPI_BYTE, elem_first, sizeof(assign_data), MPI_BYTE, Communicator);
+
+  if(count_list[ThisTask] > 0)
+    {
+      long long count_before = 0;
+      for(int task = 0; task < ThisTask; task++)
+        if(count_list[task] > 0)
+          {
+            if(a_data[0].treeid == elem_last[task].treeid)
+              count_before += (elem_last[task].treeindex + 1);
+          }
+      if(count_before > 0)
+        {
+          for(long long i = 0; i < totnsubs; i++)
+            {
+              if(a_data[i].treeid != a_data[0].treeid)
+                break;
+
+              a_data[i].treeindex += count_before;
+            }
+        }
+
+      long long newidoff = 0;
+      for(int task = 0; task < ThisTask; task++)
+        newidoff += newid_list[task];
+
+      /* we now also need to check whether there are treeid changes that line up with task boundaries; be aware that some tasks
+       * could be empty */
+
+      int taleft = 0;
+
+      do
+        {
+          while(taleft < ThisTask && count_list[taleft] == 0)
+            taleft++;
+
+          if(taleft < ThisTask)
+            {
+              int taright = taleft + 1;
+
+              while(count_list[taright] == 0)
+                taright++;
+
+              // taright may be at most equal to ThisTask here
+
+              if(elem_last[taleft].treeid != elem_first[taright].treeid)
+                newidoff++;
+
+              taleft = taright;
+            }
+        }
+      while(taleft < ThisTask);
+
+      /* now assign new TreeIDs that form consecutive numbers without gaps */
+      for(long long i = 0; i < totnsubs; i++)
+        a_data[i].newtreeid += newidoff;
+    }
+
+  Mem.myfree(elem_first);
+  Mem.myfree(elem_last);
+  Mem.myfree(newid_list);
+  Mem.myfree(count_list);
+
+  mycxxsort_parallel(a_data, a_data + totnsubs, compare_assign_data_origin_task_origin_num_origin_index, Communicator);
+
+  count = 0;
+  for(int num = 0; num <= LastSnapShotNr; num++)
+    for(int i = 0; i < Cats[num].Nsubhalos; i++)
+      {
+        Cats[num].Subhalo[i].TreeIndex = a_data[count].treeindex;
+        Cats[num].Subhalo[i].TreeID    = a_data[count].newtreeid;
+        count++;
+      }
+
+  Mem.myfree(a_data);
+}
+
+/*--------------------------------------------------------------------------------------------------------------*/
+
+/* This function puts subhalos from snapshot "num" that are linked by a descendant relation from snapshot "num-1" into the same tree
+ * by unifying their tree-id.
+ */
+int mergertree::halotrees_join_via_descendants(int num)
+{
+  int *Send_count  = (int *)Mem.mymalloc("Send_count", sizeof(int) * NTask);
+  int *Send_offset = (int *)Mem.mymalloc("Send_offset", sizeof(int) * NTask);
+  int *Recv_count  = (int *)Mem.mymalloc("Recv_count", sizeof(int) * NTask);
+  int *Recv_offset = (int *)Mem.mymalloc("Recv_offset", sizeof(int) * NTask);
+
+  /* note: we take advantage of the fact that the Subgroups are ordered according to their global SubhaloNr for each output number */
+
+  /* the following lists store the minimum and maximum subhalo number to be found on a given processor */
+  long long *list_min_subhalonr = (long long *)Mem.mymalloc("list_min_subhalonr", NTask * sizeof(long long));
+  long long *list_max_subhalonr = (long long *)Mem.mymalloc("list_max_subhalonr", NTask * sizeof(long long));
+
+  /* this value flags that there are no subhalos on the corresponding processor */
+  long long empty = HALONR_MAX;
+
+  MPI_Allgather(Cats[num].Nsubhalos > 0 ? &Cats[num].Subhalo[0].SubhaloNr : &empty, sizeof(long long), MPI_BYTE, list_min_subhalonr,
+                sizeof(long long), MPI_BYTE, Communicator);
+  MPI_Allgather(Cats[num].Nsubhalos > 0 ? &Cats[num].Subhalo[Cats[num].Nsubhalos - 1].SubhaloNr : &empty, sizeof(long long), MPI_BYTE,
+                list_max_subhalonr, sizeof(long long), MPI_BYTE, Communicator);
+
+  int nexport = 0, nimport = 0;
+
+  halotrees_data *import_data = NULL, *export_data = NULL;
+
+  /* for efficiency reasons, we need to traverse the local descendant pointers in increasing order of their target subhalo number, so
+   * let's create an auxiliary list for facilitating this.
+   */
+  descnr_data *sorted_list = (descnr_data *)Mem.mymalloc("sorted_list", Cats[num - 1].Nsubhalos * sizeof(descnr_data));
+
+  for(int i = 0; i < Cats[num - 1].Nsubhalos; i++)
+    {
+      sorted_list[i].DescSubhaloNr = Cats[num - 1].Descendants[i].DescSubhaloNr;
+      sorted_list[i].TreeID        = Cats[num - 1].Subhalo[i].TreeID;
+      sorted_list[i].TreeTask      = Cats[num - 1].Subhalo[i].TreeTask;
+      sorted_list[i].orig_index    = i;
+    }
+
+  mycxxsort(sorted_list, sorted_list + Cats[num - 1].Nsubhalos, compare_sorted_list_descsubhalonr);
+
+  /* for communication bookkeeping reasons, we traverse the counting pattern twice */
+  for(int mode = 0; mode < 2; mode++)
+    {
+      for(int i = 0; i < NTask; i++)
+        Send_count[i] = 0;
+
+      int target = 0;
+
+      for(int i = 0; i < Cats[num - 1].Nsubhalos; i++)
+        {
+          while(target < NTask - 1 &&
+                (list_min_subhalonr[target] == empty || sorted_list[i].DescSubhaloNr > list_max_subhalonr[target]))
+            target++;
+
+          if(list_min_subhalonr[target] != empty && sorted_list[i].DescSubhaloNr >= list_min_subhalonr[target] &&
+             sorted_list[i].DescSubhaloNr <= list_max_subhalonr[target])
+            {
+              if(mode == 0)
+                Send_count[target]++;
+              else
+                {
+                  int off = Send_offset[target] + Send_count[target]++;
+
+                  export_data[off].loc_index    = sorted_list[i].orig_index;
+                  export_data[off].descendantnr = sorted_list[i].DescSubhaloNr;
+                  export_data[off].treeid       = sorted_list[i].TreeID;
+                  export_data[off].treetask     = sorted_list[i].TreeTask;
+                }
+            }
+        }
+
+      if(mode == 0)
+        {
+          MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
+          Recv_offset[0] = Send_offset[0] = 0;
+          for(int j = 0; j < NTask; j++)
+            {
+              nimport += Recv_count[j];
+              nexport += Send_count[j];
+              if(j > 0)
+                {
+                  Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
+                  Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
+                }
+            }
+
+          export_data = (halotrees_data *)Mem.mymalloc("export_data", nexport * sizeof(halotrees_data));
+          import_data = (halotrees_data *)Mem.mymalloc("import_data", nimport * sizeof(halotrees_data));
+        }
+    }
+
+  /* send data to those target processors that hold the descendant subhalos in order to fetch their tree-ids */
+  for(int ngrp = 0; ngrp < (1 << PTask); ngrp++)
+    {
+      int recvTask = ThisTask ^ ngrp;
+      if(recvTask < NTask)
+        if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+          MPI_Sendrecv(&export_data[Send_offset[recvTask]], Send_count[recvTask] * sizeof(halotrees_data), MPI_BYTE, recvTask,
+                       TAG_DENS_B, &import_data[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(halotrees_data), MPI_BYTE,
+                       recvTask, TAG_DENS_B, Communicator, MPI_STATUS_IGNORE);
+    }
+
+  /* the collection of incoming data is not necessarily sorted according to descendantnr, so we need to sort it for efficient matching
+   */
+  for(int i = 0; i < nimport; i++)
+    import_data[i].orig_order = i;
+
+  mycxxsort(import_data, import_data + nimport, compare_halotrees_data_descendantnr);
+
+  int changes = 0;
+
+  /* now do the matching */
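+  /* both sides adopt the smaller of the two tree-ids, so linked subhalos are gradually unified into the tree with
+   * the smallest id; 'changes' counts how many links were updated in this pass */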
+  for(int i = 0, j = 0; i < Cats[num].Nsubhalos && j < nimport;)
+    {
+      if(Cats[num].Subhalo[i].SubhaloNr < import_data[j].descendantnr)
+        i++;
+      else if(Cats[num].Subhalo[i].SubhaloNr > import_data[j].descendantnr)
+        j++;
+      else
+        {
+          if(import_data[j].treeid > Cats[num].Subhalo[i].TreeID)
+            {
+              import_data[j].treeid   = Cats[num].Subhalo[i].TreeID;
+              import_data[j].treetask = Cats[num].Subhalo[i].TreeTask;
+              changes++;
+            }
+          else if(import_data[j].treeid < Cats[num].Subhalo[i].TreeID)
+            {
+              Cats[num].Subhalo[i].TreeID   = import_data[j].treeid;
+              Cats[num].Subhalo[i].TreeTask = import_data[j].treetask;
+              changes++;
+            }
+          j++;
+        }
+    }
+
+  /* reestablish original order */
+  mycxxsort(import_data, import_data + nimport, compare_halotrees_data_orig_order);
+
+  /* send the results back */
+  for(int ngrp = 0; ngrp < (1 << PTask); ngrp++) /* note: here we also have a transfer from each task to itself (for ngrp=0) */
+    {
+      int recvTask = ThisTask ^ ngrp;
+      if(recvTask < NTask)
+        if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+          MPI_Sendrecv(&import_data[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(halotrees_data), MPI_BYTE, recvTask,
+                       TAG_DENS_B, &export_data[Send_offset[recvTask]], Send_count[recvTask] * sizeof(halotrees_data), MPI_BYTE,
+                       recvTask, TAG_DENS_B, Communicator, MPI_STATUS_IGNORE);
+    }
+
+  /* now read it out and assign the new treeid/treetask value to the halos in the previous output (which are the progenitors) */
+  for(int i = 0; i < nexport; i++)
+    {
+      if(export_data[i].treeid != HALONR_MAX)
+        {
+          Cats[num - 1].Subhalo[export_data[i].loc_index].TreeID   = export_data[i].treeid;
+          Cats[num - 1].Subhalo[export_data[i].loc_index].TreeTask = export_data[i].treetask;
+        }
+    }
+
+  Mem.myfree(import_data);
+  Mem.myfree(export_data);
+  Mem.myfree(sorted_list);
+  Mem.myfree(list_max_subhalonr);
+  Mem.myfree(list_min_subhalonr);
+
+  Mem.myfree(Recv_offset);
+  Mem.myfree(Recv_count);
+  Mem.myfree(Send_offset);
+  Mem.myfree(Send_count);
+
+  return changes;
+}
+
+int mergertree::halotrees_join_via_progenitors(int num)
+{
+  int *Send_count  = (int *)Mem.mymalloc("Send_count", sizeof(int) * NTask);
+  int *Send_offset = (int *)Mem.mymalloc("Send_offset", sizeof(int) * NTask);
+  int *Recv_count  = (int *)Mem.mymalloc("Recv_count", sizeof(int) * NTask);
+  int *Recv_offset = (int *)Mem.mymalloc("Recv_offset", sizeof(int) * NTask);
+
+  /* note: we take advantage of the fact that the Subgroups are ordered according to their global SubhaloNr for each output number */
+
+  /* the following lists store the minimum and maximum subhalo number to be found on a given processor */
+  long long *list_min_subhalonr = (long long *)Mem.mymalloc("list_min_subhalonr", NTask * sizeof(long long));
+  long long *list_max_subhalonr = (long long *)Mem.mymalloc("list_max_subhalonr", NTask * sizeof(long long));
+
+  /* this value flags that there are no subhalos on the corresponding processor */
+  long long empty = HALONR_MAX;
+
+  MPI_Allgather(Cats[num].Nsubhalos > 0 ? &Cats[num].Subhalo[0].SubhaloNr : &empty, sizeof(long long), MPI_BYTE, list_min_subhalonr,
+                sizeof(long long), MPI_BYTE, Communicator);
+  MPI_Allgather(Cats[num].Nsubhalos > 0 ? &Cats[num].Subhalo[Cats[num].Nsubhalos - 1].SubhaloNr : &empty, sizeof(long long), MPI_BYTE,
+                list_max_subhalonr, sizeof(long long), MPI_BYTE, Communicator);
+
+  int nexport = 0, nimport = 0;
+
+  halotrees_data *import_data = NULL, *export_data = NULL;
+
+  /* for efficiency reasons, we need to traverse the local progenitor pointers in increasing order of their target subhalo number, so
+   * let's create an auxiliary list for facilitating this.
+   */
+  prognr_data *sorted_list = (prognr_data *)Mem.mymalloc("sorted_list", Cats[num + 1].Nsubhalos * sizeof(prognr_data));
+
+  for(int i = 0; i < Cats[num + 1].Nsubhalos; i++)
+    {
+      sorted_list[i].ProgSubhaloNr = Cats[num + 1].Progenitors[i].ProgSubhaloNr;
+      sorted_list[i].TreeID        = Cats[num + 1].Subhalo[i].TreeID;
+      sorted_list[i].TreeTask      = Cats[num + 1].Subhalo[i].TreeTask;
+      sorted_list[i].orig_index    = i;
+    }
+
+  mycxxsort(sorted_list, sorted_list + Cats[num + 1].Nsubhalos, compare_sorted_list_progsubhalonr);
+
+  /* for communication bookkeeping reasons, we traverse the counting pattern twice */
+  for(int mode = 0; mode < 2; mode++)
+    {
+      for(int i = 0; i < NTask; i++)
+        Send_count[i] = 0;
+
+      int target = 0;
+
+      for(int i = 0; i < Cats[num + 1].Nsubhalos; i++)
+        {
+          while(target < NTask - 1 &&
+                (list_min_subhalonr[target] == empty || sorted_list[i].ProgSubhaloNr > list_max_subhalonr[target]))
+            target++;
+
+          if(list_min_subhalonr[target] != empty && sorted_list[i].ProgSubhaloNr >= list_min_subhalonr[target] &&
+             sorted_list[i].ProgSubhaloNr <= list_max_subhalonr[target])
+            {
+              if(mode == 0)
+                Send_count[target]++;
+              else
+                {
+                  int off = Send_offset[target] + Send_count[target]++;
+
+                  export_data[off].loc_index    = sorted_list[i].orig_index;
+                  export_data[off].progenitornr = sorted_list[i].ProgSubhaloNr;
+                  export_data[off].treeid       = sorted_list[i].TreeID;
+                  export_data[off].treetask     = sorted_list[i].TreeTask;
+                }
+            }
+        }
+
+      if(mode == 0)
+        {
+          MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
+          Recv_offset[0] = Send_offset[0] = 0;
+          for(int j = 0; j < NTask; j++)
+            {
+              nimport += Recv_count[j];
+              nexport += Send_count[j];
+              if(j > 0)
+                {
+                  Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
+                  Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
+                }
+            }
+
+          export_data = (halotrees_data *)Mem.mymalloc("export_data", nexport * sizeof(halotrees_data));
+          import_data = (halotrees_data *)Mem.mymalloc("import_data", nimport * sizeof(halotrees_data));
+        }
+    }
+
+  /* send data to those target processors that hold the progenitor subhalos in order to fetch their tree-ids */
+  for(int ngrp = 0; ngrp < (1 << PTask); ngrp++)
+    {
+      int recvTask = ThisTask ^ ngrp;
+      if(recvTask < NTask)
+        if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+          MPI_Sendrecv(&export_data[Send_offset[recvTask]], Send_count[recvTask] * sizeof(halotrees_data), MPI_BYTE, recvTask,
+                       TAG_DENS_B, &import_data[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(halotrees_data), MPI_BYTE,
+                       recvTask, TAG_DENS_B, Communicator, MPI_STATUS_IGNORE);
+    }
+
+  /* the collection of incoming data is not necessarily sorted according to progenitornr, so we need to sort it for efficient
+   * matching
+   */
+  for(int i = 0; i < nimport; i++)
+    import_data[i].orig_order = i;
+
+  mycxxsort(import_data, import_data + nimport, compare_halotrees_data_progenitornr);
+
+  int changes = 0;
+
+  /* now do the matching */
+  for(int i = 0, j = 0; i < Cats[num].Nsubhalos && j < nimport;)
+    {
+      if(Cats[num].Subhalo[i].SubhaloNr < import_data[j].progenitornr)
+        i++;
+      else if(Cats[num].Subhalo[i].SubhaloNr > import_data[j].progenitornr)
+        j++;
+      else
+        {
+          if(import_data[j].treeid > Cats[num].Subhalo[i].TreeID)
+            {
+              import_data[j].treeid   = Cats[num].Subhalo[i].TreeID;
+              import_data[j].treetask = Cats[num].Subhalo[i].TreeTask;
+              changes++;
+            }
+          else if(import_data[j].treeid < Cats[num].Subhalo[i].TreeID)
+            {
+              Cats[num].Subhalo[i].TreeID   = import_data[j].treeid;
+              Cats[num].Subhalo[i].TreeTask = import_data[j].treetask;
+              changes++;
+            }
+          j++;
+        }
+    }
+
+  /* reestablish original order */
+  mycxxsort(import_data, import_data + nimport, compare_halotrees_data_orig_order);
+
+  /* send the results back */
+  for(int ngrp = 0; ngrp < (1 << PTask); ngrp++) /* note: here we also have a transfer from each task to itself (for ngrp=0) */
+    {
+      int recvTask = ThisTask ^ ngrp;
+      if(recvTask < NTask)
+        if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+          MPI_Sendrecv(&import_data[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(halotrees_data), MPI_BYTE, recvTask,
+                       TAG_DENS_B, &export_data[Send_offset[recvTask]], Send_count[recvTask] * sizeof(halotrees_data), MPI_BYTE,
+                       recvTask, TAG_DENS_B, Communicator, MPI_STATUS_IGNORE);
+    }
+
+  /* now read it out and assign the new treeid/treetask value to the halos in the subsequent output (which are the descendants) */
+  for(int i = 0; i < nexport; i++)
+    {
+      if(export_data[i].treeid != HALONR_MAX)
+        {
+          Cats[num + 1].Subhalo[export_data[i].loc_index].TreeID   = export_data[i].treeid;
+          Cats[num + 1].Subhalo[export_data[i].loc_index].TreeTask = export_data[i].treetask;
+        }
+    }
+
+  Mem.myfree(import_data);
+  Mem.myfree(export_data);
+  Mem.myfree(sorted_list);
+  Mem.myfree(list_max_subhalonr);
+  Mem.myfree(list_min_subhalonr);
+
+  Mem.myfree(Recv_offset);
+  Mem.myfree(Recv_count);
+  Mem.myfree(Send_offset);
+  Mem.myfree(Send_count);
+
+  return changes;
+}
+
+void mergertree::halotrees_determine_mainprogenitor(void)
+{
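+  /* the main progenitor of a subhalo is defined as the progenitor that maximizes the cumulative length (particle count)
+   * of the progenitor branch behind it, which is accumulated snapshot by snapshot below */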
+  for(int num = 1; num <= LastSnapShotNr; num++)
+    for(int i = 0; i < Cats[num].Nsubhalos; i++)
+      Cats[num].Progenitors[i].MainProgSubhaloNr = HALONR_MAX;
+
+  /* initialize the maximum branch length field */
+  for(int num = 0; num <= LastSnapShotNr; num++)
+    for(int i = 0; i < Cats[num].Nsubhalos; i++)
+      Cats[num].SubExt[i].MaxLenProgBranch = Cats[num].Subhalo[i].Len;
+
+  /* propagate the branch length of subhalos from snapshot "num-1" to those in snapshot "num" via the
+   * descendant information */
+  for(int num = 1; num <= LastSnapShotNr; num++)
+    halotrees_propagate_max_branch_length_descendants(num);
+
+  /* propagate the branch length of subhalos from snapshot "num-1" to those in snapshot "num" via the
+   * progenitor information */
+  /* we disable this as it could cause a spurious jump to a larger progenitor that shed a few particles to a newly formed group */
+  // for(int num = 1; num <= LastSnapShotNr; num++)
+  //   halotrees_propagate_max_branch_length_progenitors(num);
+
+  mpi_printf("MERGERTREE: determination of main progenitor branch done\n");
+}
+
+/* This function propagates the maximum progenitor branch length from the subhalos in snapshot "num-1" to those in
+ * snapshot "num" via the descendant information
+ */
+void mergertree::halotrees_propagate_max_branch_length_descendants(int num)
+{
+  int *Send_count  = (int *)Mem.mymalloc("Send_count", sizeof(int) * NTask);
+  int *Send_offset = (int *)Mem.mymalloc("Send_offset", sizeof(int) * NTask);
+  int *Recv_count  = (int *)Mem.mymalloc("Recv_count", sizeof(int) * NTask);
+  int *Recv_offset = (int *)Mem.mymalloc("Recv_offset", sizeof(int) * NTask);
+
+  /* note: we take advantage of the fact that the Subgroups are ordered according to their global SubhaloNr for each output number */
+
+  /* the following lists store the minimum and maximum subhalo number to be found on a given processor */
+  long long *list_min_subhalonr = (long long *)Mem.mymalloc("list_min_subhalonr", NTask * sizeof(long long));
+  long long *list_max_subhalonr = (long long *)Mem.mymalloc("list_max_subhalonr", NTask * sizeof(long long));
+
+  /* this value flags that there are no subhalos on the corresponding processor */
+  long long empty = HALONR_MAX;
+
+  MPI_Allgather(Cats[num].Nsubhalos > 0 ? &Cats[num].Subhalo[0].SubhaloNr : &empty, sizeof(long long), MPI_BYTE, list_min_subhalonr,
+                sizeof(long long), MPI_BYTE, Communicator);
+  MPI_Allgather(Cats[num].Nsubhalos > 0 ? &Cats[num].Subhalo[Cats[num].Nsubhalos - 1].SubhaloNr : &empty, sizeof(long long), MPI_BYTE,
+                list_max_subhalonr, sizeof(long long), MPI_BYTE, Communicator);
+
+  int nexport = 0, nimport = 0;
+
+  halotrees_propagate_data *import_data = NULL, *export_data = NULL;
+
+  /* for efficiency reasons, we need to traverse the local descendant pointers in increasing order of their target subhalo number, so
+   * let's create an auxiliary list for facilitating this.
+   */
+  halotrees_propagate_data *sorted_list =
+      (halotrees_propagate_data *)Mem.mymalloc("sorted_list", Cats[num - 1].Nsubhalos * sizeof(halotrees_propagate_data));
+
+  for(int i = 0; i < Cats[num - 1].Nsubhalos; i++)
+    {
+      sorted_list[i].DescSubhaloNr    = Cats[num - 1].Descendants[i].DescSubhaloNr;
+      sorted_list[i].SubhaloNr        = Cats[num - 1].Descendants[i].PrevSubhaloNr;
+      sorted_list[i].MaxLenProgBranch = Cats[num - 1].SubExt[i].MaxLenProgBranch;
+    }
+
+  mycxxsort(sorted_list, sorted_list + Cats[num - 1].Nsubhalos, compare_halotrees_propagate_data_DescSubhaloNr);
+
+  /* for communication bookkeeping reasons, we traverse the counting pattern twice */
+  for(int mode = 0; mode < 2; mode++)
+    {
+      for(int i = 0; i < NTask; i++)
+        Send_count[i] = 0;
+
+      int target = 0;
+
+      for(int i = 0; i < Cats[num - 1].Nsubhalos; i++)
+        {
+          while(target < NTask - 1 &&
+                (list_min_subhalonr[target] == empty || sorted_list[i].DescSubhaloNr > list_max_subhalonr[target]))
+            target++;
+
+          if(list_min_subhalonr[target] != empty && sorted_list[i].DescSubhaloNr >= list_min_subhalonr[target] &&
+             sorted_list[i].DescSubhaloNr <= list_max_subhalonr[target])
+            {
+              if(mode == 0)
+                Send_count[target]++;
+              else
+                {
+                  int off = Send_offset[target] + Send_count[target]++;
+
+                  export_data[off].DescSubhaloNr    = sorted_list[i].DescSubhaloNr;
+                  export_data[off].SubhaloNr        = sorted_list[i].SubhaloNr;
+                  export_data[off].MaxLenProgBranch = sorted_list[i].MaxLenProgBranch;
+                }
+            }
+        }
+
+      if(mode == 0)
+        {
+          MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
+          Recv_offset[0] = Send_offset[0] = 0;
+          for(int j = 0; j < NTask; j++)
+            {
+              nimport += Recv_count[j];
+              nexport += Send_count[j];
+              if(j > 0)
+                {
+                  Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
+                  Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
+                }
+            }
+
+          export_data = (halotrees_propagate_data *)Mem.mymalloc("export_data", nexport * sizeof(halotrees_propagate_data));
+          import_data = (halotrees_propagate_data *)Mem.mymalloc("import_data", nimport * sizeof(halotrees_propagate_data));
+        }
+    }
+
+  /* send the accumulated branch lengths to those target processors that hold the descendant subhalos */
+  for(int ngrp = 0; ngrp < (1 << PTask); ngrp++)
+    {
+      int recvTask = ThisTask ^ ngrp;
+      if(recvTask < NTask)
+        if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+          MPI_Sendrecv(&export_data[Send_offset[recvTask]], Send_count[recvTask] * sizeof(halotrees_propagate_data), MPI_BYTE,
+                       recvTask, TAG_DENS_B, &import_data[Recv_offset[recvTask]],
+                       Recv_count[recvTask] * sizeof(halotrees_propagate_data), MPI_BYTE, recvTask, TAG_DENS_B, Communicator,
+                       MPI_STATUS_IGNORE);
+    }
+
+  /* the collection of incoming data is not necessarily sorted according to DescSubhaloNr, so we need to sort it for efficient
+   * matching
+   */
+  mycxxsort(import_data, import_data + nimport, compare_halotrees_propagate_data_DescSubhaloNr);
+
+  /* now do the matching */
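+  /* a progenitor is adopted as main progenitor if its accumulated branch length exceeds the best candidate found so far,
+   * i.e. the current MaxLenProgBranch minus the subhalo's own Len */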
+  for(int i = 0, j = 0; i < Cats[num].Nsubhalos && j < nimport;)
+    {
+      if(Cats[num].Subhalo[i].SubhaloNr < import_data[j].DescSubhaloNr)
+        i++;
+      else if(Cats[num].Subhalo[i].SubhaloNr > import_data[j].DescSubhaloNr)
+        j++;
+      else
+        {
+          if(import_data[j].MaxLenProgBranch > Cats[num].SubExt[i].MaxLenProgBranch - Cats[num].Subhalo[i].Len)
+            {
+              Cats[num].SubExt[i].MaxLenProgBranch       = import_data[j].MaxLenProgBranch + Cats[num].Subhalo[i].Len;
+              Cats[num].Progenitors[i].MainProgSubhaloNr = import_data[j].SubhaloNr;
+            }
+
+          j++;
+        }
+    }
+
+  Mem.myfree(import_data);
+  Mem.myfree(export_data);
+  Mem.myfree(sorted_list);
+  Mem.myfree(list_max_subhalonr);
+  Mem.myfree(list_min_subhalonr);
+
+  Mem.myfree(Recv_offset);
+  Mem.myfree(Recv_count);
+  Mem.myfree(Send_offset);
+  Mem.myfree(Send_count);
+}
+
+/* This function propagates the maximum progenitor branch length from the subhalos in snapshot "num-1" to those in
+ * snapshot "num" via the progenitor information
+ */
+void mergertree::halotrees_propagate_max_branch_length_progenitors(int num)
+{
+  int *Send_count  = (int *)Mem.mymalloc("Send_count", sizeof(int) * NTask);
+  int *Send_offset = (int *)Mem.mymalloc("Send_offset", sizeof(int) * NTask);
+  int *Recv_count  = (int *)Mem.mymalloc("Recv_count", sizeof(int) * NTask);
+  int *Recv_offset = (int *)Mem.mymalloc("Recv_offset", sizeof(int) * NTask);
+
+  /* note: we take advantage of the fact that the Subgroups are ordered according to their global SubhaloNr for each output number */
+
+  /* the following lists store the minimum and maximum subhalo number to be found on a given processor */
+  long long *list_min_subhalonr = (long long *)Mem.mymalloc("list_min_subhalonr", NTask * sizeof(long long));
+  long long *list_max_subhalonr = (long long *)Mem.mymalloc("list_max_subhalonr", NTask * sizeof(long long));
+
+  /* this value flags that there are no subhalos on the corresponding processor */
+  long long empty = HALONR_MAX;
+
+  MPI_Allgather(Cats[num - 1].Nsubhalos > 0 ? &Cats[num - 1].Subhalo[0].SubhaloNr : &empty, sizeof(long long), MPI_BYTE,
+                list_min_subhalonr, sizeof(long long), MPI_BYTE, Communicator);
+  MPI_Allgather(Cats[num - 1].Nsubhalos > 0 ? &Cats[num - 1].Subhalo[Cats[num - 1].Nsubhalos - 1].SubhaloNr : &empty,
+                sizeof(long long), MPI_BYTE, list_max_subhalonr, sizeof(long long), MPI_BYTE, Communicator);
+
+  int nexport = 0, nimport = 0;
+
+  halotrees_propagate_data *import_data = NULL, *export_data = NULL;
+
+  /* for efficiency reasons, we need to traverse the local progenitor pointers in increasing order of their target subhalo number, so
+   * let's create an auxiliary list for facilitating this.
+   */
+  halotrees_propagate_data *sorted_list =
+      (halotrees_propagate_data *)Mem.mymalloc("sorted_list", Cats[num].Nsubhalos * sizeof(halotrees_propagate_data));
+
+  for(int i = 0; i < Cats[num].Nsubhalos; i++)
+    {
+      sorted_list[i].ProgSubhaloNr = Cats[num].Progenitors[i].ProgSubhaloNr;
+      sorted_list[i].index         = i;
+    }
+
+  mycxxsort(sorted_list, sorted_list + Cats[num].Nsubhalos, compare_halotrees_propagate_data_ProgSubhaloNr);
+
+  /* for communication bookkeeping reasons, we traverse the counting pattern twice */
+  for(int mode = 0; mode < 2; mode++)
+    {
+      for(int i = 0; i < NTask; i++)
+        Send_count[i] = 0;
+
+      int target = 0;
+
+      for(int i = 0; i < Cats[num].Nsubhalos; i++)
+        {
+          while(target < NTask - 1 &&
+                (list_min_subhalonr[target] == empty || sorted_list[i].ProgSubhaloNr > list_max_subhalonr[target]))
+            target++;
+
+          if(list_min_subhalonr[target] != empty && sorted_list[i].ProgSubhaloNr >= list_min_subhalonr[target] &&
+             sorted_list[i].ProgSubhaloNr <= list_max_subhalonr[target])
+            {
+              if(mode == 0)
+                Send_count[target]++;
+              else
+                {
+                  int off = Send_offset[target] + Send_count[target]++;
+
+                  export_data[off].ProgSubhaloNr = sorted_list[i].ProgSubhaloNr;
+                  export_data[off].index         = sorted_list[i].index;
+                }
+            }
+        }
+
+      if(mode == 0)
+        {
+          MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
+          Recv_offset[0] = Send_offset[0] = 0;
+          for(int j = 0; j < NTask; j++)
+            {
+              nimport += Recv_count[j];
+              nexport += Send_count[j];
+              if(j > 0)
+                {
+                  Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
+                  Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
+                }
+            }
+
+          export_data = (halotrees_propagate_data *)Mem.mymalloc("export_data", nexport * sizeof(halotrees_propagate_data));
+          import_data = (halotrees_propagate_data *)Mem.mymalloc("import_data", nimport * sizeof(halotrees_propagate_data));
+        }
+    }
+
+  /* send data to those target processors that hold the progenitor subhalos in order to fetch their maximum branch lengths */
+  for(int ngrp = 0; ngrp < (1 << PTask); ngrp++)
+    {
+      int recvTask = ThisTask ^ ngrp;
+      if(recvTask < NTask)
+        if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+          MPI_Sendrecv(&export_data[Send_offset[recvTask]], Send_count[recvTask] * sizeof(halotrees_propagate_data), MPI_BYTE,
+                       recvTask, TAG_DENS_B, &import_data[Recv_offset[recvTask]],
+                       Recv_count[recvTask] * sizeof(halotrees_propagate_data), MPI_BYTE, recvTask, TAG_DENS_B, Communicator,
+                       MPI_STATUS_IGNORE);
+    }
+
+  for(int i = 0; i < nimport; i++)
+    import_data[i].orig_order = i;
+
+  /* the collection of incoming data is not necessarily sorted according to ProgSubhaloNr, so we need to sort it for efficient
+   * matching
+   */
+  mycxxsort(import_data, import_data + nimport, compare_halotrees_propagate_data_ProgSubhaloNr);
+
+  /* now do the matching */
+  for(int i = 0, j = 0; i < Cats[num - 1].Nsubhalos && j < nimport;)
+    {
+      if(Cats[num - 1].Subhalo[i].SubhaloNr < import_data[j].ProgSubhaloNr)
+        i++;
+      else if(Cats[num - 1].Subhalo[i].SubhaloNr > import_data[j].ProgSubhaloNr)
+        j++;
+      else
+        {
+          import_data[j].MaxLenProgBranch = Cats[num - 1].SubExt[i].MaxLenProgBranch;
+          import_data[j].SubhaloNr        = Cats[num - 1].Subhalo[i].SubhaloNr;
+
+          if(Cats[num - 1].Subhalo[i].SubhaloNr != import_data[j].ProgSubhaloNr)
+            Terminate("Cats[num - 1].Subhalo[i].SubhaloNr != import_data[j].ProgSubhaloNr");
+
+          j++;
+        }
+    }
+
+  /* reestablish original order */
+  mycxxsort(import_data, import_data + nimport, compare_halotrees_propagate_data_orig_order);
+
+  /* send data back */
+  for(int ngrp = 0; ngrp < (1 << PTask); ngrp++)
+    {
+      int recvTask = ThisTask ^ ngrp;
+      if(recvTask < NTask)
+        if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+          MPI_Sendrecv(&import_data[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(halotrees_propagate_data), MPI_BYTE,
+                       recvTask, TAG_DENS_B, &export_data[Send_offset[recvTask]],
+                       Send_count[recvTask] * sizeof(halotrees_propagate_data), MPI_BYTE, recvTask, TAG_DENS_B, Communicator,
+                       MPI_STATUS_IGNORE);
+    }
+
+  /* now read out the returned branch lengths and adopt a longer progenitor branch (and its main progenitor) wherever one is found */
+  for(int i = 0; i < nexport; i++)
+    {
+      int q = export_data[i].index;
+
+      if(export_data[i].MaxLenProgBranch > Cats[num].SubExt[q].MaxLenProgBranch - Cats[num].Subhalo[q].Len)
+        {
+          Cats[num].SubExt[q].MaxLenProgBranch       = export_data[i].MaxLenProgBranch + Cats[num].Subhalo[q].Len;
+          Cats[num].Progenitors[q].MainProgSubhaloNr = export_data[i].SubhaloNr;
+        }
+    }
+
+  Mem.myfree(import_data);
+  Mem.myfree(export_data);
+  Mem.myfree(sorted_list);
+  Mem.myfree(list_max_subhalonr);
+  Mem.myfree(list_min_subhalonr);
+
+  Mem.myfree(Recv_offset);
+  Mem.myfree(Recv_count);
+  Mem.myfree(Send_offset);
+  Mem.myfree(Send_count);
+}
+
+#endif
diff --git a/src/mergertree/io_descendant.cc b/src/mergertree/io_descendant.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4883393d9d517cb69bc5c9245c0306d6c49038d1
--- /dev/null
+++ b/src/mergertree/io_descendant.cc
@@ -0,0 +1,283 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  io_descendant.cc
+ *
+ *  \brief defines routines for the I/O of the descendant fields of subhalos
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef MERGERTREE
+
+#include <gsl/gsl_rng.h>
+#include <hdf5.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../fof/fof.h"
+#include "../io/hdf5_util.h"
+#include "../io/io.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mergertree/io_descendant.h"
+#include "../mergertree/mergertree.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/parallel_sort.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+
+descendant_io::descendant_io(mergertree *MergerTree_ptr, MPI_Comm comm, int format) : IO_Def(comm, format)
+{
+  MergerTree = MergerTree_ptr;
+
+  this->N_IO_Fields  = 0;
+  this->N_DataGroups = 1;
+  this->header_size  = sizeof(header);
+  this->header_buf   = &header;
+  this->type_of_file = FILE_IS_DESCCAT;
+  sprintf(this->info, "MERGERTREE: writing descendant information");
+
+  init_field("DSNR", "DescSubhaloNr", MEM_INT64, FILE_INT64, READ_IF_PRESENT, 1, A_DESC, NULL, io_func_descsubhalonr, PREVSUBS, 0, 0,
+             0, 0, 0, 0, 0, true);
+
+  init_field("FDNR", "FirstDescSubhaloNr", MEM_INT64, FILE_INT64, READ_IF_PRESENT, 1, A_DESC, NULL, io_func_firstdescsubhalonr,
+             PREVSUBS, 0, 0, 0, 0, 0, 0, 0, true);
+
+  init_field("NSNR", "NextProgSubhaloNr", MEM_INT64, FILE_INT64, READ_IF_PRESENT, 1, A_DESC, NULL, io_func_nextsubhalonr, PREVSUBS, 0,
+             0, 0, 0, 0, 0, 0, true);
+
+  init_field("SHNR", "SubhaloNr", MEM_INT64, FILE_INT64, READ_IF_PRESENT, 1, A_DESC, &MergerTree->Descendants[0].PrevSubhaloNr, NULL,
+             PREVSUBS, 0, 0, 0, 0, 0, 0, 0, true);  // this field can in principle be deleted
+
+  init_field("DFLO", "DescFileOffset", MEM_MY_FILEOFFSET, FILE_NONE, SKIP_ON_READ, 1, A_DESC, &MergerTree->Descendants[0].FileOffset,
+             NULL, PREVSUBS, 0, 0, 0, 0, 0, 0, 0, true);
+}
+
+void descendant_io::mergertree_save_descendants(int num)
+{
+  char buf[MAXLEN_PATH_EXTRA];
+
+  /* write Descendants and Nextsubhalos */
+
+  if(All.NumFilesPerSnapshot > 1)
+    {
+      if(ThisTask == 0)
+        {
+          sprintf(buf, "%s/groups_%03d", All.OutputDir, num - 1);
+          mkdir(buf, 02755);
+        }
+      MPI_Barrier(Communicator);
+    }
+
+  if(All.NumFilesPerSnapshot > 1)
+    sprintf(buf, "%s/groups_%03d/%s_%03d", All.OutputDir, num - 1, "subhalo_desc", num - 1);
+  else
+    sprintf(buf, "%s%s_%03d", All.OutputDir, "subhalo_desc", num - 1);
+
+  write_multiple_files(buf, All.NumFilesPerSnapshot);
+}
+
+void descendant_io::mergertree_read_descendants(int num)
+{
+  char fname[MAXLEN_PATH_EXTRA], fname_multiple[MAXLEN_PATH_EXTRA];
+
+  sprintf(fname_multiple, "%s/groups_%03d/%s_%03d", All.OutputDir, num, "subhalo_desc", num);
+  sprintf(fname, "%s%s_%03d", All.OutputDir, "subhalo_desc", num);
+
+  TotNsubhalos = 0;
+
+  int num_files = find_files(fname, fname_multiple);
+
+  if(num_files > 1)
+    strcpy(fname, fname_multiple);
+
+  /* we read the headers of the files twice. In the first iteration, only the
+   * subhalo numbers ending up on each processor are assembled, followed by memory allocation.
+   * In the second iteration, the data is actually read in.
+   */
+  for(int rep = 0; rep < 2; rep++)
+    {
+      Nsubhalos = 0;
+
+      read_files_driver(fname, rep, num_files);
+
+      /* now do the memory allocation */
+      if(rep == 0)
+        {
+          MergerTree->Descendants = (mergertree::desc_list *)Mem.mymalloc_movable(&MergerTree->Descendants, "Descendants",
+                                                                                  Nsubhalos * sizeof(mergertree::desc_list));
+        }
+    }
+
+  MPI_Barrier(Communicator);
+}
+
+void descendant_io::fill_file_header(int writeTask, int lastTask, long long *n_type, long long *ntot_type)
+{
+  /* determine group/id numbers of each type in file */
+
+  n_type[0] = MergerTree->PrevNsubhalos;
+
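+  /* the write task gathers the local counts from all tasks in [writeTask, lastTask]
+     and then sends the resulting file totals back, so that every participating task
+     knows the header values */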
+  if(ThisTask == writeTask)
+    {
+      for(int n = 0; n < 1; n++)
+        ntot_type[n] = n_type[n];
+
+      for(int task = writeTask + 1; task <= lastTask; task++)
+        {
+          long long nn[3];
+          MPI_Recv(&nn[0], 1, MPI_LONG_LONG, task, TAG_LOCALN, Communicator, MPI_STATUS_IGNORE);
+          for(int n = 0; n < 1; n++)
+            ntot_type[n] += nn[n];
+        }
+
+      for(int task = writeTask + 1; task <= lastTask; task++)
+        MPI_Send(&ntot_type[0], 1, MPI_LONG_LONG, task, TAG_N, Communicator);
+    }
+  else
+    {
+      MPI_Send(&n_type[0], 1, MPI_LONG_LONG, writeTask, TAG_LOCALN, Communicator);
+      MPI_Recv(&ntot_type[0], 1, MPI_LONG_LONG, writeTask, TAG_N, Communicator, MPI_STATUS_IGNORE);
+    }
+
+  /* fill file header */
+
+  header.Nsubhalos    = ntot_type[0];
+  header.TotNsubhalos = MergerTree->PrevTotNsubhalos;
+  header.num_files    = All.NumFilesPerSnapshot;
+}
+
+void descendant_io::read_file_header(const char *fname, int filenr, int readTask, int lastTask, long long *n_type,
+                                     long long *ntot_type, int *nstart)
+{
+  if(ThisTask == readTask)
+    {
+      if(filenr == 0 && nstart == NULL)
+        {
+          mpi_printf("\nREAD-DESCENDANTS: filenr=%d, '%s' contains  (subhalos):  %8lld\n", filenr, fname, header.Nsubhalos);
+        }
+    }
+
+  if(TotNsubhalos == 0)
+    TotNsubhalos = header.TotNsubhalos;
+
+  for(int k = 0; k < 1; k++)
+    n_type[k] = ntot_type[k] = 0;
+
+  /* in case several files are read on the current task, the entries read so far are
+     moved such that a gap of the right size is created at the beginning of the
+     array for the entries of this file */
+
+  ntot_type[0] = header.Nsubhalos;
+
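+  /* distribute the n_in_file entries evenly over the ntask reading tasks; the first
+     n_in_file % ntask tasks receive one extra entry (e.g. 10 subhalos on 4 tasks
+     gives 3, 3, 2, 2) */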
+  long long n_in_file = header.Nsubhalos;
+  int ntask           = lastTask - readTask + 1;
+  int n_for_this_task = n_in_file / ntask;
+  if((ThisTask - readTask) < (n_in_file % ntask))
+    n_for_this_task++;
+
+  n_type[0] = n_for_this_task;
+
+  if(nstart)
+    {
+      memmove(&MergerTree->Descendants[n_for_this_task], &MergerTree->Descendants[0], Nsubhalos * sizeof(mergertree::desc_list));
+      *nstart = 0;
+    }
+}
+
+void descendant_io::write_header_fields(hid_t handle)
+{
+  write_scalar_attribute(handle, "Nsubhalos_ThisFile", &header.Nsubhalos, H5T_NATIVE_UINT64);
+
+  write_scalar_attribute(handle, "Nsubhalos_Total", &header.TotNsubhalos, H5T_NATIVE_UINT64);
+
+  write_scalar_attribute(handle, "NumFiles", &header.num_files, H5T_NATIVE_INT);
+}
+
+/*! \brief This function reads the snapshot header in case of hdf5 files (i.e. format 3)
+ *
+ * \param fname file name of the snapshot as given in the parameter file
+ */
+void descendant_io::read_header_fields(const char *fname)
+{
+  memset(&header, 0, sizeof(io_header));
+
+  hid_t hdf5_file = my_H5Fopen(fname, H5F_ACC_RDONLY, H5P_DEFAULT);
+  hid_t handle    = my_H5Gopen(hdf5_file, "/Header");
+
+  /* now read the header fields */
+  read_scalar_attribute(handle, "Nsubhalos_ThisFile", "Nsubgroups_ThisFile", &header.Nsubhalos, H5T_NATIVE_UINT64);
+
+  read_scalar_attribute(handle, "Nsubhalos_Total", "Nsubgroups_Total", &header.TotNsubhalos, H5T_NATIVE_UINT64);
+
+  read_scalar_attribute(handle, "NumFiles", &header.num_files, H5T_NATIVE_INT);
+
+  my_H5Gclose(handle, "/Header");
+  my_H5Fclose(hdf5_file, fname);
+}
+
+int descendant_io::get_filenr_from_header(void) { return header.num_files; }
+
+void descendant_io::set_filenr_in_header(int numfiles) { header.num_files = numfiles; }
+
+void descendant_io::read_increase_numbers(int type, int n_for_this_task)
+{
+  switch(type)
+    {
+      case 0:
+        Nsubhalos += n_for_this_task;
+        break;
+      default:
+        Terminate("wrong group");
+        break;
+    }
+}
+
+void descendant_io::get_datagroup_name(int type, char *buf)
+{
+  switch(type)
+    {
+      case 0:
+        sprintf(buf, "/Subhalo");
+        break;
+      default:
+        Terminate("wrong group");
+        break;
+    }
+}
+
+int descendant_io::get_type_of_element(int index)
+{
+  /* empty */
+  return 0;
+}
+
+void descendant_io::set_type_of_element(int index, int type)
+{ /* empty */
+}
+
+void *descendant_io::get_base_address_of_structure(enum arrays array, int index)
+{
+  switch(array)
+    {
+      case A_DESC:
+        return (void *)(MergerTree->Descendants + index);
+      default:
+        Terminate("strange, we don't expect to get here");
+    }
+}
+
+#endif
diff --git a/src/mergertree/io_descendant.h b/src/mergertree/io_descendant.h
new file mode 100644
index 0000000000000000000000000000000000000000..a4b1fb47120d6436da21bc764776a06f6665565f
--- /dev/null
+++ b/src/mergertree/io_descendant.h
@@ -0,0 +1,135 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  io_descendant.h
+ *
+ *  \brief declares the class needed for the I/O of descendant relationships
+ */
+
+#ifndef DESCENDANT_IO_H
+#define DESCENDANT_IO_H
+
+#include "gadgetconfig.h"
+
+#ifdef MERGERTREE
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../fof/fof.h"
+#include "../io/hdf5_util.h"
+#include "../io/io.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mergertree/mergertree.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/parallel_sort.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+
+class descendant_io : public IO_Def
+{
+ private:
+  mergertree *MergerTree;
+
+ public:
+  descendant_io(mergertree *MergerTree_ptr, MPI_Comm comm, int format);
+
+  void mergertree_save_descendants(int num);
+  void mergertree_read_descendants(int num);
+
+  /* supplied virtual functions */
+  void fill_file_header(int writeTask, int lastTask, long long *nloc_part, long long *npart);
+  void read_file_header(const char *fname, int filenr, int readTask, int lastTask, long long *nloc_part, long long *npart,
+                        int *nstart);
+  void get_datagroup_name(int grnr, char *gname);
+  void write_header_fields(hid_t);
+  void read_header_fields(const char *fname);
+  void read_increase_numbers(int type, int n_for_this_task);
+  int get_filenr_from_header(void);
+  void set_filenr_in_header(int);
+  void *get_base_address_of_structure(enum arrays array, int index);
+  int get_type_of_element(int index);
+  void set_type_of_element(int index, int type);
+
+  /** Header for the standard file format.
+   */
+  struct io_header
+  {
+    long long Nsubhalos;
+    long long TotNsubhalos;
+    int num_files;
+  };
+  io_header header;
+
+  int Nsubhalos;
+  long long TotNsubhalos;
+
+ private:
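+  /* The static helper functions below translate between the internal sentinel value
+   * HALONR_MAX (presumably meaning "no such link") and the value -1 stored in the
+   * files: mode 0 writes the field into the output buffer, any other mode reads it
+   * back in. */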
+  static void io_func_descsubhalonr(IO_Def *ptr, int particle, int components, void *buffer, int mode)
+  {
+    descendant_io *thisobj = (descendant_io *)ptr;
+
+    if(mode == 0)
+      {
+        long long *out_buffer = (long long *)buffer;
+        out_buffer[0]         = thisobj->MergerTree->Descendants[particle].DescSubhaloNr;
+        if(out_buffer[0] == HALONR_MAX)
+          out_buffer[0] = -1;
+      }
+    else
+      {
+        long long *in_buffer                                     = (long long *)buffer;
+        thisobj->MergerTree->Descendants[particle].DescSubhaloNr = in_buffer[0];
+        if(thisobj->MergerTree->Descendants[particle].DescSubhaloNr == -1)
+          thisobj->MergerTree->Descendants[particle].DescSubhaloNr = HALONR_MAX;
+      }
+  }
+
+  static void io_func_firstdescsubhalonr(IO_Def *ptr, int particle, int components, void *buffer, int mode)
+  {
+    descendant_io *thisobj = (descendant_io *)ptr;
+
+    if(mode == 0)
+      {
+        long long *out_buffer = (long long *)buffer;
+        out_buffer[0]         = thisobj->MergerTree->Descendants[particle].FirstDescSubhaloNr;
+        if(out_buffer[0] == HALONR_MAX)
+          out_buffer[0] = -1;
+      }
+    else
+      {
+        long long *in_buffer                                          = (long long *)buffer;
+        thisobj->MergerTree->Descendants[particle].FirstDescSubhaloNr = in_buffer[0];
+        if(thisobj->MergerTree->Descendants[particle].FirstDescSubhaloNr == -1)
+          thisobj->MergerTree->Descendants[particle].FirstDescSubhaloNr = HALONR_MAX;
+      }
+  }
+
+  static void io_func_nextsubhalonr(IO_Def *ptr, int particle, int components, void *buffer, int mode)
+  {
+    descendant_io *thisobj = (descendant_io *)ptr;
+
+    if(mode == 0)
+      {
+        long long *out_buffer = (long long *)buffer;
+        out_buffer[0]         = thisobj->MergerTree->Descendants[particle].NextProgSubhaloNr;
+        if(out_buffer[0] == HALONR_MAX)
+          out_buffer[0] = -1;
+      }
+    else
+      {
+        long long *in_buffer                                         = (long long *)buffer;
+        thisobj->MergerTree->Descendants[particle].NextProgSubhaloNr = in_buffer[0];
+        if(thisobj->MergerTree->Descendants[particle].NextProgSubhaloNr == -1)
+          thisobj->MergerTree->Descendants[particle].NextProgSubhaloNr = HALONR_MAX;
+      }
+  }
+};
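+
+/* Illustrative usage sketch (not part of the original code): a merger-tree driver
+ * would presumably construct this object once the descendant links for output 'num'
+ * have been determined and then write (or read back) the corresponding files, e.g.
+ *
+ *   descendant_io DescIO(&MergerTree, Communicator, All.SnapFormat);
+ *   DescIO.mergertree_save_descendants(num);
+ *
+ * where MergerTree, Communicator and All.SnapFormat are assumed names for the
+ * already set-up merger-tree object, the MPI communicator, and the output format
+ * selector, respectively. */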
+
+#endif
+
+#endif /* DESCENDANT_IO_H */
diff --git a/src/mergertree/io_halotrees.cc b/src/mergertree/io_halotrees.cc
new file mode 100644
index 0000000000000000000000000000000000000000..06bc6c22609a538f09a59e839ccb8e8da3befcf8
--- /dev/null
+++ b/src/mergertree/io_halotrees.cc
@@ -0,0 +1,367 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  io_halotrees.cc
+ *
+ *  \brief routines to store the constructed trees made of subhalos
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef MERGERTREE
+
+#include <gsl/gsl_rng.h>
+#include <hdf5.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../fof/fof.h"
+#include "../io/hdf5_util.h"
+#include "../io/io.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mergertree/io_halotrees.h"
+#include "../mergertree/mergertree.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/parallel_sort.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+
+halotrees_io::halotrees_io(mergertree *MergerTree_ptr, MPI_Comm comm, int format) : IO_Def(comm, format)
+{
+  MergerTree = MergerTree_ptr;
+
+  this->N_IO_Fields  = 0;
+  this->N_DataGroups = 3;
+  this->header_size  = sizeof(header);
+  this->header_buf   = &header;
+  this->type_of_file = FILE_IS_TREECAT;
+  sprintf(this->info, "MERGERTREE: writing mergertrees");
+
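+  /* the three data groups (N_DataGroups = 3) end up as /TreeTable, /TreeHalos and
+   * /TreeTimes in the output files; see get_datagroup_name() further below */
+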
+  /* overview table for trees in the file */
+
+  init_field("MTRL", "Length", MEM_INT, FILE_INT, SKIP_ON_READ, 1, A_TT, &MergerTree->TreeTable[0].HaloCount, NULL, TREELENGTH, 0, 0,
+             0, 0, 0, 0, 0);
+  init_field("MTRS", "StartOffset", MEM_INT64, FILE_INT64, SKIP_ON_READ, 1, A_TT, &MergerTree->TreeTable[0].FirstHalo, NULL,
+             TREELENGTH, 0, 0, 0, 0, 0, 0, 0);
+  init_field("MTRI", "TreeID", MEM_INT64, FILE_INT64, SKIP_ON_READ, 1, A_TT, &MergerTree->TreeTable[0].TreeID, NULL, TREELENGTH, 0, 0,
+             0, 0, 0, 0, 0);
+
+  /* link pointers of each subhalo in the trees */
+
+  init_field("TMPR", "TreeMainProgenitor", MEM_INT, FILE_INT, SKIP_ON_READ, 1, A_H, &MergerTree->Halos[0].TreeMainProgenitor, NULL,
+             TREEHALOS, 0, 0, 0, 0, 0, 0, 0, true);
+  init_field("TDES", "TreeDescendant", MEM_INT, FILE_INT, SKIP_ON_READ, 1, A_H, &MergerTree->Halos[0].TreeDescendant, NULL, TREEHALOS,
+             0, 0, 0, 0, 0, 0, 0, true);
+  init_field("TFPR", "TreeFirstProgenitor", MEM_INT, FILE_INT, SKIP_ON_READ, 1, A_H, &MergerTree->Halos[0].TreeFirstProgenitor, NULL,
+             TREEHALOS, 0, 0, 0, 0, 0, 0, 0, true);
+  init_field("TNPR", "TreeNextProgenitor", MEM_INT, FILE_INT, SKIP_ON_READ, 1, A_H, &MergerTree->Halos[0].TreeNextProgenitor, NULL,
+             TREEHALOS, 0, 0, 0, 0, 0, 0, 0, true);
+  init_field("TFHF", "TreeFirstHaloInFOFgroup", MEM_INT, FILE_INT, SKIP_ON_READ, 1, A_H, &MergerTree->Halos[0].TreeFirstHaloInFOFgroup,
+             NULL, TREEHALOS, 0, 0, 0, 0, 0, 0, 0, true);
+  init_field("TNHF", "TreeNextHaloInFOFgroup", MEM_INT, FILE_INT, SKIP_ON_READ, 1, A_H, &MergerTree->Halos[0].TreeNextHaloInFOFgroup,
+             NULL, TREEHALOS, 0, 0, 0, 0, 0, 0, 0);
+  init_field("TPRO", "TreeProgenitor", MEM_INT, FILE_INT, SKIP_ON_READ, 1, A_H, &MergerTree->Halos[0].TreeProgenitor, NULL, TREEHALOS,
+             0, 0, 0, 0, 0, 0, 0, true);
+  init_field("TFDE", "TreeFirstDescendant", MEM_INT, FILE_INT, SKIP_ON_READ, 1, A_H, &MergerTree->Halos[0].TreeFirstDescendant, NULL,
+             TREEHALOS, 0, 0, 0, 0, 0, 0, 0, true);
+  init_field("TNDE", "TreeNextDescendant", MEM_INT, FILE_INT, SKIP_ON_READ, 1, A_H, &MergerTree->Halos[0].TreeNextDescendant, NULL,
+             TREEHALOS, 0, 0, 0, 0, 0, 0, 0, true);
+
+  /* properties of each subhalo in the trees */
+
+  init_field("SLEN", "SubhaloLen", mem_len_type, file_len_type, SKIP_ON_READ, 1, A_H, &MergerTree->Halos[0].SubProp.Len, NULL,
+             SUBGROUPS, 0, 0, 0, 0, 0, 0, 0, true);
+  init_field("MASS", "SubhaloMass", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, SKIP_ON_READ, 1, A_H, &MergerTree->Halos[0].SubProp.Mass, NULL,
+             TREEHALOS, 0, 0, 0, 0, 0, 0, 0);
+  init_field("FMC2", "Group_M_Crit200", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, SKIP_ON_READ, 1, A_H, &MergerTree->Halos[0].SubProp.M_Crit200,
+             NULL, TREEHALOS, 0, 0, 0, 0, 0, 0, 0);
+  init_field("SPOS", "SubhaloPos", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, SKIP_ON_READ, 3, A_H, &MergerTree->Halos[0].SubProp.Pos[0], NULL,
+             TREEHALOS, 0, 0, 0, 0, 0, 0, 0);
+  init_field("SVEL", "SubhaloVel", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, SKIP_ON_READ, 3, A_H, &MergerTree->Halos[0].SubProp.Vel[0], NULL,
+             TREEHALOS, 0, 0, 0, 0, 0, 0, 0);
+  init_field("SSPI", "SubhaloVelDisp", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, SKIP_ON_READ, 1, A_H, &MergerTree->Halos[0].SubProp.SubVelDisp,
+             NULL, TREEHALOS, 0, 0, 0, 0, 0, 0, 0);
+  init_field("SVMX", "SubhaloVmax", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, SKIP_ON_READ, 1, A_H, &MergerTree->Halos[0].SubProp.SubVmax, NULL,
+             TREEHALOS, 0, 0, 0, 0, 0, 0, 0);
+  init_field("SVRX", "SubhaloVmaxRad", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, SKIP_ON_READ, 1, A_H, &MergerTree->Halos[0].SubProp.SubVmaxRad,
+             NULL, TREEHALOS, 0, 0, 0, 0, 0, 0, 0);
+  init_field("SHMR", "SubhaloHalfmassRad", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, SKIP_ON_READ, 1, A_H,
+             &MergerTree->Halos[0].SubProp.SubHalfMassRad, NULL, TREEHALOS, 0, 0, 0, 0, 0, 0, 0);
+  init_field("SSPI", "SubhaloSpin", MEM_MY_FLOAT, FILE_MY_IO_FLOAT, SKIP_ON_READ, 3, A_H, &MergerTree->Halos[0].SubProp.Spin[0], NULL,
+             TREEHALOS, 0, 0, 0, 0, 0, 0, 0);
+  init_field("SIDM", "SubhaloIDMostbound", MEM_MY_ID_TYPE, FILE_MY_ID_TYPE, READ_IF_PRESENT, 1, A_H,
+             &MergerTree->Halos[0].SubProp.SubMostBoundID, NULL, TREEHALOS, 0, 0, 0, 0, 0, 0, 0, true);
+
+  /* where this subhalo came from */
+
+  init_field("TSNP", "SnapNum", MEM_INT, FILE_INT, SKIP_ON_READ, 1, A_H, &MergerTree->Halos[0].SnapNum, NULL, TREEHALOS, 0, 0, 0, 0, 0,
+             0, 0, true);
+  init_field("TSNR", "SubhaloNr", MEM_INT64, FILE_INT64, SKIP_ON_READ, 1, A_H, &MergerTree->Halos[0].SubhaloNr, NULL, TREEHALOS, 0, 0,
+             0, 0, 0, 0, 0, true);
+  init_field("GRNR", "GroupNr", MEM_INT64, FILE_INT64, SKIP_ON_READ, 1, A_H, &MergerTree->Halos[0].GroupNr, NULL, TREEHALOS, 0, 0, 0,
+             0, 0, 0, 0, true);
+
+  /* the fields TreeID and TreeIndex are in principle redundant but kept for convenience */
+
+  init_field("TRID", "TreeID", MEM_INT64, FILE_INT64, SKIP_ON_READ, 1, A_H, &MergerTree->Halos[0].TreeID, NULL, TREEHALOS, 0, 0, 0, 0,
+             0, 0, 0, true);
+  init_field("TRIX", "TreeIndex", MEM_INT, FILE_INT, SKIP_ON_READ, 1, A_H, &MergerTree->Halos[0].TreeIndex, NULL, TREEHALOS, 0, 0, 0,
+             0, 0, 0, 0, true);
+
+  /* we don't need to store the links in terms of the global subhalo numbers for full timeslices */
+  /*
+  init_field("UGRN", "UniqueGroupNr", MEM_INT64, FILE_INT64, SKIP_ON_READ, 1, A_H, &MergerTree->Halos[0].UniqueGroupNr, NULL,
+             TREEHALOS, 0, 0, 0, 0, 0, 0, 0);
+  init_field("DESC", "DescSubhaloNr", MEM_INT64, FILE_INT64, SKIP_ON_READ, 1, A_H, &MergerTree->Halos[0].DescSubhaloNr, NULL,
+             TREEHALOS, 0, 0, 0, 0, 0, 0, 0);
+  init_field("NEXT", "NextProgSubhaloNr", MEM_INT64, FILE_INT64, SKIP_ON_READ, 1, A_H, &MergerTree->Halos[0].NextProgSubhaloNr, NULL,
+             TREEHALOS, 0, 0, 0, 0, 0, 0, 0);
+  init_field("FRST", "FirstProgSubhaloNr", MEM_INT64, FILE_INT64, SKIP_ON_READ, 1, A_H, &MergerTree->Halos[0].FirstProgSubhaloNr, NULL,
+             TREEHALOS, 0, 0, 0, 0, 0, 0, 0);
+*/
+
+  /**** output times */
+
+  init_field("REDS", "Redshift", MEM_DOUBLE, FILE_DOUBLE, SKIP_ON_READ, 1, A_CT, &MergerTree->CatTimes[0].Redshift, NULL, TREETIMES, 0,
+             0, 0, 0, 0, 0, 0);
+
+  init_field("OUTT", "Time", MEM_DOUBLE, FILE_DOUBLE, SKIP_ON_READ, 1, A_CT, &MergerTree->CatTimes[0].Time, NULL, TREETIMES, 0, 0, 0,
+             0, 0, 0, 0);
+}
+
+void halotrees_io::halotrees_save_trees(void)
+{
+  char buf[MAXLEN_PATH_EXTRA];
+
+  /* write trees */
+  if(All.NumFilesPerSnapshot > 1)
+    {
+      if(ThisTask == 0)
+        {
+          sprintf(buf, "%s/treedata", All.OutputDir);
+          mkdir(buf, 02755);
+        }
+      MPI_Barrier(Communicator);
+    }
+
+  if(All.NumFilesPerSnapshot > 1)
+    sprintf(buf, "%s/treedata/%s", All.OutputDir, "trees");
+  else
+    sprintf(buf, "%s%s", All.OutputDir, "trees");
+
+  write_multiple_files(buf, All.NumFilesPerSnapshot);
+}
+
+void halotrees_io::fill_file_header(int writeTask, int lastTask, long long *n_type, long long *ntot_type)
+{
+  /* determine group/id numbers of each type in file */
+
+  n_type[0] = MergerTree->Ntrees;
+  n_type[1] = MergerTree->Nhalos;
+  if(ThisTask == writeTask)
+    n_type[2] = MergerTree->LastSnapShotNr + 1;
+  else
+    n_type[2] = 0;
+
+  if(ThisTask == writeTask)
+    {
+      for(int n = 0; n < N_DataGroups; n++)
+        ntot_type[n] = n_type[n];
+
+      for(int task = writeTask + 1; task <= lastTask; task++)
+        {
+          long long nn[N_DataGroups];
+          MPI_Recv(&nn[0], N_DataGroups, MPI_LONG_LONG, task, TAG_LOCALN, Communicator, MPI_STATUS_IGNORE);
+          for(int n = 0; n < N_DataGroups; n++)
+            ntot_type[n] += nn[n];
+        }
+
+      for(int task = writeTask + 1; task <= lastTask; task++)
+        MPI_Send(&ntot_type[0], N_DataGroups, MPI_LONG_LONG, task, TAG_N, Communicator);
+    }
+  else
+    {
+      MPI_Send(&n_type[0], N_DataGroups, MPI_LONG_LONG, writeTask, TAG_LOCALN, Communicator);
+      MPI_Recv(&ntot_type[0], N_DataGroups, MPI_LONG_LONG, writeTask, TAG_N, Communicator, MPI_STATUS_IGNORE);
+    }
+
+  /* fill file header */
+  header.Ntrees = ntot_type[0];
+  header.Nhalos = ntot_type[1];
+
+  header.TotNtrees = MergerTree->TotNtrees;
+  header.TotNhalos = MergerTree->TotNhalos;
+
+  header.lastsnapshotnr = MergerTree->LastSnapShotNr;
+
+  header.num_files = All.NumFilesPerSnapshot;
+}
+
+void halotrees_io::write_header_fields(hid_t handle)
+{
+  write_scalar_attribute(handle, "Ntrees_ThisFile", &header.Ntrees, H5T_NATIVE_UINT64);
+  write_scalar_attribute(handle, "Ntrees_Total", &header.TotNtrees, H5T_NATIVE_UINT64);
+
+  write_scalar_attribute(handle, "Nhalos_ThisFile", &header.Nhalos, H5T_NATIVE_UINT64);
+  write_scalar_attribute(handle, "Nhalos_Total", &header.TotNhalos, H5T_NATIVE_UINT64);
+
+  write_scalar_attribute(handle, "NumFiles", &header.num_files, H5T_NATIVE_INT);
+
+  write_scalar_attribute(handle, "LastSnapShotNr", &header.lastsnapshotnr, H5T_NATIVE_INT);
+}
+
+int halotrees_io::get_filenr_from_header(void) { return header.num_files; }
+
+void halotrees_io::set_filenr_in_header(int numfiles) { header.num_files = numfiles; }
+
+void halotrees_io::read_increase_numbers(int type, int n_for_this_task)
+{
+  switch(type)
+    {
+      case 0:
+        MergerTree->Ntrees += n_for_this_task;
+        break;
+      case 1:
+        MergerTree->Nhalos += n_for_this_task;
+        break;
+      case 2:
+        MergerTree->CatTimes += n_for_this_task;
+        break;
+
+      default:
+        Terminate("wrong group");
+        break;
+    }
+}
+
+void halotrees_io::read_file_header(const char *fname, int filenr, int readTask, int lastTask, long long *n_type, long long *ntot_type,
+                                    int *nstart)
+{
+  if(ThisTask == readTask)
+    {
+      if(filenr == 0 && nstart == NULL)
+        {
+          mpi_printf(
+              "\nREAD-TREES: filenr=%d, '%s' contains %lld trees out of a total of %lld, and %lld halos out of a total of %lld\n",
+              filenr, fname, header.Ntrees, header.TotNtrees, header.Nhalos, header.TotNhalos);
+        }
+    }
+
+  if(MergerTree->TotNtrees == 0)
+    {
+      MergerTree->TotNtrees = header.TotNtrees;
+      MergerTree->TotNhalos = header.TotNhalos;
+    }
+
+  for(int k = 0; k < 2; k++)
+    n_type[k] = ntot_type[k] = 0;
+
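+  /* the tree-table and tree-halo entries of this file are split over the reading
+     tasks in the same way: an even share per task, with the first n_in_file % ntask
+     tasks getting one extra entry */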
+  {
+    ntot_type[0]        = header.Ntrees;
+    long long n_in_file = header.Ntrees;
+    int ntask           = lastTask - readTask + 1;
+    int n_for_this_task = n_in_file / ntask;
+    if((ThisTask - readTask) < (n_in_file % ntask))
+      n_for_this_task++;
+    n_type[0] = n_for_this_task;
+
+    if(nstart)
+      memmove(&MergerTree->TreeTable[n_for_this_task], &MergerTree->TreeTable[0], MergerTree->Ntrees * sizeof(halotrees_table));
+  }
+
+  {
+    ntot_type[1]        = header.Nhalos;
+    long long n_in_file = header.Nhalos;
+    int ntask           = lastTask - readTask + 1;
+    int n_for_this_task = n_in_file / ntask;
+    if((ThisTask - readTask) < (n_in_file % ntask))
+      n_for_this_task++;
+    n_type[1] = n_for_this_task;
+
+    if(nstart)
+      memmove(&MergerTree->Halos[n_for_this_task], &MergerTree->Halos[0], MergerTree->Nhalos * sizeof(treehalo_type));
+  }
+
+  if(nstart)
+    *nstart = 0;
+}
+
+void halotrees_io::read_header_fields(const char *fname)
+{
+  memset(&header, 0, sizeof(header));
+
+  hid_t hdf5_file = my_H5Fopen(fname, H5F_ACC_RDONLY, H5P_DEFAULT);
+  hid_t handle    = my_H5Gopen(hdf5_file, "/Header");
+
+  read_scalar_attribute(handle, "Ntrees_ThisFile", &header.Ntrees, H5T_NATIVE_UINT64);
+  read_scalar_attribute(handle, "Ntrees_Total", &header.TotNtrees, H5T_NATIVE_UINT64);
+
+  read_scalar_attribute(handle, "Nhalos_ThisFile", &header.Nhalos, H5T_NATIVE_UINT64);
+  read_scalar_attribute(handle, "Nhalos_Total", &header.TotNhalos, H5T_NATIVE_UINT64);
+
+  read_scalar_attribute(handle, "NumFiles", &header.num_files, H5T_NATIVE_INT);
+
+  read_scalar_attribute(handle, "LastSnapShotNr", &header.lastsnapshotnr, H5T_NATIVE_INT);
+
+  my_H5Gclose(handle, "/Header");
+  my_H5Fclose(hdf5_file, fname);
+}
+
+void halotrees_io::get_datagroup_name(int type, char *buf)
+{
+  switch(type)
+    {
+      case 0:
+        sprintf(buf, "/TreeTable");
+        break;
+      case 1:
+        sprintf(buf, "/TreeHalos");
+        break;
+      case 2:
+        sprintf(buf, "/TreeTimes");
+        break;
+      default:
+        Terminate("wrong group");
+        break;
+    }
+}
+
+int halotrees_io::get_type_of_element(int index)
+{
+  /* empty */
+  return 0;
+}
+
+void halotrees_io::set_type_of_element(int index, int type)
+{ /* empty */
+}
+
+void *halotrees_io::get_base_address_of_structure(enum arrays array, int index)
+{
+  switch(array)
+    {
+      case A_TT:
+        return (void *)(MergerTree->TreeTable + index);
+      case A_H:
+        return (void *)(MergerTree->Halos + index);
+      case A_CT:
+        return (void *)(MergerTree->CatTimes + index);
+      default:
+        Terminate("strange, we don't expect to get here");
+    }
+}
+
+#endif
diff --git a/src/mergertree/io_halotrees.h b/src/mergertree/io_halotrees.h
new file mode 100644
index 0000000000000000000000000000000000000000..4b6721451cb269ab122151c0b7e28cf80546af17
--- /dev/null
+++ b/src/mergertree/io_halotrees.h
@@ -0,0 +1,76 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  io_halotrees.h
+ *
+ *  \brief definition of a class for storing the halo trees
+ */
+
+#ifndef HALOTREES_IO_H
+#define HALOTREES_IO_H
+
+#include "gadgetconfig.h"
+
+#ifdef MERGERTREE
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../fof/fof.h"
+#include "../io/hdf5_util.h"
+#include "../io/io.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mergertree/mergertree.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/parallel_sort.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+
+class halotrees_io : public IO_Def
+{
+ private:
+  mergertree *MergerTree;
+
+ public:
+  halotrees_io(mergertree *MergerTree_ptr, MPI_Comm comm, int format);
+
+  void halotrees_save_trees(void);
+
+  /* supplied virtual functions */
+  void fill_file_header(int writeTask, int lastTask, long long *nloc_part, long long *npart);
+  void read_file_header(const char *fname, int filenr, int readTask, int lastTask, long long *nloc_part, long long *npart,
+                        int *nstart);
+  void get_datagroup_name(int grnr, char *gname);
+  void write_header_fields(hid_t);
+  void read_header_fields(const char *fname);
+  void read_increase_numbers(int type, int n_for_this_task);
+  int get_filenr_from_header(void);
+  void set_filenr_in_header(int);
+  void *get_base_address_of_structure(enum arrays array, int index);
+  int get_type_of_element(int index);
+  void set_type_of_element(int index, int type);
+
+  /** Header for the standard file format.
+   */
+
+  struct io_header
+  {
+    long long Nhalos;
+    long long TotNhalos;
+
+    long long Ntrees;
+    long long TotNtrees;
+
+    int num_files;
+    int lastsnapshotnr;
+  };
+  io_header header;
+};
+
+#endif
+
+#endif /* HALOTREES_IO_H */
diff --git a/src/mergertree/io_progenitors.cc b/src/mergertree/io_progenitors.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f490ff28031bedc39bb8ccc728f78c1ae591e9bd
--- /dev/null
+++ b/src/mergertree/io_progenitors.cc
@@ -0,0 +1,283 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  io_progenitors.cc
+ *
+ *  \brief routines for I/O of the progenitor links between two subhalo catalogues
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef MERGERTREE
+
+#include <gsl/gsl_rng.h>
+#include <hdf5.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../fof/fof.h"
+#include "../io/hdf5_util.h"
+#include "../io/io.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mergertree/io_progenitors.h"
+#include "../mergertree/mergertree.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/parallel_sort.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+
+progenitors_io::progenitors_io(mergertree *MergerTree_ptr, MPI_Comm comm, int format) : IO_Def(comm, format)
+{
+  MergerTree = MergerTree_ptr;
+
+  this->N_IO_Fields  = 0;
+  this->N_DataGroups = 1;
+  this->header_size  = sizeof(header);
+  this->header_buf   = &header;
+  this->type_of_file = FILE_IS_PROGCAT;
+  sprintf(this->info, "MERGERTREE: writing progenitor information");
+
+  init_field("PSNR", "ProgSubhaloNr", MEM_INT64, FILE_INT64, READ_IF_PRESENT, 1, A_PROG, NULL, io_func_progsubhalonr, CURRSUBS, 0, 0,
+             0, 0, 0, 0, 0, true);
+
+  init_field("FPNR", "FirstProgSubhaloNr", MEM_INT64, FILE_INT64, READ_IF_PRESENT, 1, A_PROG, NULL, io_func_firstprogsubhalonr,
+             CURRSUBS, 0, 0, 0, 0, 0, 0, 0, true);
+
+  init_field("NDNR", "NextDescSubhaloNr", MEM_INT64, FILE_INT64, READ_IF_PRESENT, 1, A_PROG, NULL, io_func_nextdescsubhalonr, CURRSUBS,
+             0, 0, 0, 0, 0, 0, 0, true);
+
+  init_field("SHNR", "SubhaloNr", MEM_INT64, FILE_INT64, READ_IF_PRESENT, 1, A_PROG, &MergerTree->Progenitors[0].SubhaloNr, NULL,
+             CURRSUBS, 0, 0, 0, 0, 0, 0, 0, true);  // this field can in principle be deleted
+
+  init_field("PFLO", "ProgFileOffset", MEM_MY_FILEOFFSET, FILE_NONE, SKIP_ON_READ, 1, A_PROG, &MergerTree->Progenitors[0].FileOffset,
+             NULL, CURRSUBS, 0, 0, 0, 0, 0, 0, 0, true);
+}
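+
+/* The structure of this file closely mirrors io_descendant.cc: the same field
+ * registration and header handling, but for the progenitor links attached to the
+ * current subhalo catalogue instead of the descendant links of the previous one. */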
+
+void progenitors_io::mergertree_save_progenitors(int num)
+{
+  char buf[MAXLEN_PATH_EXTRA];
+
+  /* write FirstProgenitor */
+  if(All.NumFilesPerSnapshot > 1)
+    {
+      if(ThisTask == 0)
+        {
+          sprintf(buf, "%s/groups_%03d", All.OutputDir, num);
+          mkdir(buf, 02755);
+        }
+      MPI_Barrier(Communicator);
+    }
+
+  if(All.NumFilesPerSnapshot > 1)
+    sprintf(buf, "%s/groups_%03d/%s_%03d", All.OutputDir, num, "subhalo_prog", num);
+  else
+    sprintf(buf, "%s%s_%03d", All.OutputDir, "subhalo_prog", num);
+
+  write_multiple_files(buf, All.NumFilesPerSnapshot);
+}
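+
+/* note: the progenitor files are stored together with subhalo catalogue 'num' itself,
+ * whereas the corresponding descendant files (see io_descendant.cc) are stored with
+ * catalogue 'num - 1' */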
+
+void progenitors_io::mergertree_read_progenitors(int num)
+{
+  char fname[MAXLEN_PATH_EXTRA], fname_multiple[MAXLEN_PATH_EXTRA];
+
+  sprintf(fname_multiple, "%s/groups_%03d/%s_%03d", All.OutputDir, num, "subhalo_prog", num);
+  sprintf(fname, "%s%s_%03d", All.OutputDir, "subhalo_prog", num);
+
+  TotNsubhalos = 0;
+
+  int num_files = find_files(fname, fname_multiple);
+
+  if(num_files > 1)
+    strcpy(fname, fname_multiple);
+
+  /* we read the headers of the files twice. In the first iteration, only the
+   * subhalo numbers ending up on each processor are assembled, followed by memory allocation.
+   * In the second iteration, the data is actually read in.
+   */
+  for(int rep = 0; rep < 2; rep++)
+    {
+      Nsubhalos = 0;
+
+      read_files_driver(fname, rep, num_files);
+
+      /* now do the memory allocation */
+      if(rep == 0)
+        {
+          MergerTree->Progenitors = (mergertree::prog_list *)Mem.mymalloc_movable(&MergerTree->Progenitors, "Progenitors",
+                                                                                  Nsubhalos * sizeof(mergertree::prog_list));
+        }
+    }
+
+  MPI_Barrier(Communicator);
+}
+
+void progenitors_io::fill_file_header(int writeTask, int lastTask, long long *n_type, long long *ntot_type)
+{
+  /* determine group/id numbers of each type in file */
+
+  n_type[0] = MergerTree->CurrNsubhalos;
+
+  if(ThisTask == writeTask)
+    {
+      for(int n = 0; n < 1; n++)
+        ntot_type[n] = n_type[n];
+
+      for(int task = writeTask + 1; task <= lastTask; task++)
+        {
+          long long nn[3];
+          MPI_Recv(&nn[0], 1, MPI_LONG_LONG, task, TAG_LOCALN, Communicator, MPI_STATUS_IGNORE);
+          for(int n = 0; n < 1; n++)
+            ntot_type[n] += nn[n];
+        }
+
+      for(int task = writeTask + 1; task <= lastTask; task++)
+        MPI_Send(&ntot_type[0], 1, MPI_LONG_LONG, task, TAG_N, Communicator);
+    }
+  else
+    {
+      MPI_Send(&n_type[0], 1, MPI_LONG_LONG, writeTask, TAG_LOCALN, Communicator);
+      MPI_Recv(&ntot_type[0], 1, MPI_LONG_LONG, writeTask, TAG_N, Communicator, MPI_STATUS_IGNORE);
+    }
+
+  /* fill file header */
+
+  header.Nsubhalos    = ntot_type[0];
+  header.TotNsubhalos = MergerTree->CurrTotNsubhalos;
+  header.num_files    = All.NumFilesPerSnapshot;
+}
+
+void progenitors_io::read_file_header(const char *fname, int filenr, int readTask, int lastTask, long long *n_type,
+                                      long long *ntot_type, int *nstart)
+{
+  if(ThisTask == readTask)
+    {
+      if(filenr == 0 && nstart == NULL)
+        {
+          mpi_printf("\nREAD-PROGENITORS: filenr=%d, '%s' contains  (subhalos):  %8lld  out of a total of %lld\n", filenr, fname,
+                     header.Nsubhalos, header.TotNsubhalos);
+        }
+    }
+
+  if(TotNsubhalos == 0)
+    TotNsubhalos = header.TotNsubhalos;
+
+  for(int k = 0; k < 1; k++)
+    n_type[k] = ntot_type[k] = 0;
+
+  /* in case several files are read on the current task, the entries read so far are
+     moved such that a gap of the right size is created at the beginning of the
+     array for the entries of this file */
+
+  ntot_type[0] = header.Nsubhalos;
+
+  long long n_in_file = header.Nsubhalos;
+  int ntask           = lastTask - readTask + 1;
+  int n_for_this_task = n_in_file / ntask;
+  if((ThisTask - readTask) < (n_in_file % ntask))
+    n_for_this_task++;
+
+  n_type[0] = n_for_this_task;
+
+  if(nstart)
+    {
+      memmove(&MergerTree->Progenitors[n_for_this_task], &MergerTree->Progenitors[0], Nsubhalos * sizeof(mergertree::prog_list));
+      *nstart = 0;
+    }
+}
+
+void progenitors_io::write_header_fields(hid_t handle)
+{
+  write_scalar_attribute(handle, "Nsubhalos_ThisFile", &header.Nsubhalos, H5T_NATIVE_UINT64);
+
+  write_scalar_attribute(handle, "Nsubhalos_Total", &header.TotNsubhalos, H5T_NATIVE_UINT64);
+
+  write_scalar_attribute(handle, "NumFiles", &header.num_files, H5T_NATIVE_INT);
+}
+
+/*! \brief This function reads the snapshot header in case of hdf5 files (i.e. format 3)
+ *
+ * \param fname file name of the snapshot as given in the parameter file
+ */
+void progenitors_io::read_header_fields(const char *fname)
+{
+  memset(&header, 0, sizeof(io_header));
+
+  hid_t hdf5_file = my_H5Fopen(fname, H5F_ACC_RDONLY, H5P_DEFAULT);
+  hid_t handle    = my_H5Gopen(hdf5_file, "/Header");
+
+  /* now read the header fields */
+  read_scalar_attribute(handle, "Nsubhalos_ThisFile", "Nsubgroups_ThisFile", &header.Nsubhalos, H5T_NATIVE_UINT64);
+
+  read_scalar_attribute(handle, "Nsubhalos_Total", "Nsubgroups_Total", &header.TotNsubhalos, H5T_NATIVE_UINT64);
+
+  read_scalar_attribute(handle, "NumFiles", &header.num_files, H5T_NATIVE_INT);
+
+  my_H5Gclose(handle, "/Header");
+  my_H5Fclose(hdf5_file, fname);
+}
+
+int progenitors_io::get_filenr_from_header(void) { return header.num_files; }
+
+void progenitors_io::set_filenr_in_header(int numfiles) { header.num_files = numfiles; }
+
+void progenitors_io::read_increase_numbers(int type, int n_for_this_task)
+{
+  switch(type)
+    {
+      case 0:
+        Nsubhalos += n_for_this_task;
+        break;
+      default:
+        Terminate("wrong group");
+        break;
+    }
+}
+
+void progenitors_io::get_datagroup_name(int type, char *buf)
+{
+  switch(type)
+    {
+      case 0:
+        sprintf(buf, "/Subhalo");
+        break;
+      default:
+        Terminate("wrong group");
+        break;
+    }
+}
+
+int progenitors_io::get_type_of_element(int index)
+{
+  /* empty */
+  return 0;
+}
+
+void progenitors_io::set_type_of_element(int index, int type)
+{ /* empty */
+}
+
+void *progenitors_io::get_base_address_of_structure(enum arrays array, int index)
+{
+  switch(array)
+    {
+      case A_PROG:
+        return (void *)(MergerTree->Progenitors + index);
+      default:
+        Terminate("strange, we don't expect to get here");
+    }
+}
+
+#endif
diff --git a/src/mergertree/io_progenitors.h b/src/mergertree/io_progenitors.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d1f94591da1cec75f02f68e6cdda14df3238289
--- /dev/null
+++ b/src/mergertree/io_progenitors.h
@@ -0,0 +1,135 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  io_progenitors.h
+ *
+ *  \brief declaration of the class used for I/O of the progenitor links
+ */
+
+#ifndef PROGENITORS_IO_H
+#define PROGENITORS_IO_H
+
+#include "gadgetconfig.h"
+
+#ifdef MERGERTREE
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../fof/fof.h"
+#include "../io/hdf5_util.h"
+#include "../io/io.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mergertree/mergertree.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/parallel_sort.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+
+class progenitors_io : public IO_Def
+{
+ private:
+  mergertree *MergerTree;
+
+ public:
+  progenitors_io(mergertree *MergerTree_ptr, MPI_Comm comm, int format);
+
+  void mergertree_read_progenitors(int num);
+  void mergertree_save_progenitors(int num);
+
+  /* supplied virtual functions */
+  void fill_file_header(int writeTask, int lastTask, long long *nloc_part, long long *npart);
+  void read_file_header(const char *fname, int filenr, int readTask, int lastTask, long long *nloc_part, long long *npart,
+                        int *nstart);
+  void get_datagroup_name(int grnr, char *gname);
+  void write_header_fields(hid_t);
+  void read_header_fields(const char *fname);
+  void read_increase_numbers(int type, int n_for_this_task);
+  int get_filenr_from_header(void);
+  void set_filenr_in_header(int);
+  void *get_base_address_of_structure(enum arrays array, int index);
+  int get_type_of_element(int index);
+  void set_type_of_element(int index, int type);
+
+  /** Header for the standard file format.
+   */
+  struct io_header
+  {
+    long long Nsubhalos;
+    long long TotNsubhalos;
+    int num_files;
+  };
+  io_header header;
+
+  int Nsubhalos;
+  long long TotNsubhalos;
+
+ private:
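+  /* as in io_descendant.h, the static helpers below map the internal sentinel
+   * HALONR_MAX to -1 in the files on output (mode 0) and back on input */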
+  static void io_func_progsubhalonr(IO_Def *ptr, int particle, int components, void *buffer, int mode)
+  {
+    progenitors_io *thisobj = (progenitors_io *)ptr;
+
+    if(mode == 0)
+      {
+        long long *out_buffer = (long long *)buffer;
+        out_buffer[0]         = thisobj->MergerTree->Progenitors[particle].ProgSubhaloNr;
+        if(out_buffer[0] == HALONR_MAX)
+          out_buffer[0] = -1;
+      }
+    else
+      {
+        long long *in_buffer                                     = (long long *)buffer;
+        thisobj->MergerTree->Progenitors[particle].ProgSubhaloNr = in_buffer[0];
+        if(thisobj->MergerTree->Progenitors[particle].ProgSubhaloNr == -1)
+          thisobj->MergerTree->Progenitors[particle].ProgSubhaloNr = HALONR_MAX;
+      }
+  }
+
+  static void io_func_firstprogsubhalonr(IO_Def *ptr, int particle, int components, void *buffer, int mode)
+  {
+    progenitors_io *thisobj = (progenitors_io *)ptr;
+
+    if(mode == 0)
+      {
+        long long *out_buffer = (long long *)buffer;
+        out_buffer[0]         = thisobj->MergerTree->Progenitors[particle].FirstProgSubhaloNr;
+        if(out_buffer[0] == HALONR_MAX)
+          out_buffer[0] = -1;
+      }
+    else
+      {
+        long long *in_buffer                                          = (long long *)buffer;
+        thisobj->MergerTree->Progenitors[particle].FirstProgSubhaloNr = in_buffer[0];
+        if(thisobj->MergerTree->Progenitors[particle].FirstProgSubhaloNr == -1)
+          thisobj->MergerTree->Progenitors[particle].FirstProgSubhaloNr = HALONR_MAX;
+      }
+  }
+
+  static void io_func_nextdescsubhalonr(IO_Def *ptr, int particle, int components, void *buffer, int mode)
+  {
+    progenitors_io *thisobj = (progenitors_io *)ptr;
+
+    if(mode == 0)
+      {
+        long long *out_buffer = (long long *)buffer;
+        out_buffer[0]         = thisobj->MergerTree->Progenitors[particle].NextDescSubhaloNr;
+        if(out_buffer[0] == HALONR_MAX)
+          out_buffer[0] = -1;
+      }
+    else
+      {
+        long long *in_buffer                                         = (long long *)buffer;
+        thisobj->MergerTree->Progenitors[particle].NextDescSubhaloNr = in_buffer[0];
+        if(thisobj->MergerTree->Progenitors[particle].NextDescSubhaloNr == -1)
+          thisobj->MergerTree->Progenitors[particle].NextDescSubhaloNr = HALONR_MAX;
+      }
+  }
+};
+
+#endif
+
+#endif /* PROGENITORS_IO_H */
diff --git a/src/mergertree/io_readsnap.cc b/src/mergertree/io_readsnap.cc
new file mode 100644
index 0000000000000000000000000000000000000000..11b07714fcfa3384c6dc8f5866d58b9671a0b40d
--- /dev/null
+++ b/src/mergertree/io_readsnap.cc
@@ -0,0 +1,248 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  io_readsnap.cc
+ *
+ *  \brief routines for reading snapshot data needed for merger tree building
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef MERGERTREE
+
+#include <errno.h>
+#include <hdf5.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+
+#include "../cooling_sfr/cooling.h"
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../fof/fof.h"
+#include "../io/hdf5_util.h"
+#include "../io/io.h"
+#include "../logs/timer.h"
+#include "../main/main.h"
+#include "../main/simulation.h"
+#include "../mergertree/io_readsnap.h"
+#include "../mergertree/mergertree.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../system/system.h"
+
+readsnap_io::readsnap_io(mergertree *MergerTree_ptr, MPI_Comm comm, int format) : IO_Def(comm, format)
+{
+  MergerTree = MergerTree_ptr;
+
+  this->N_IO_Fields  = 0;
+  this->N_DataGroups = NTYPES;
+  this->header_size  = sizeof(header);
+  this->header_buf   = &header;
+  this->type_of_file = FILE_IS_SNAPSHOT;
+  sprintf(this->info, "MERGERTREE: reading snapshot IDs");
+
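+  /* only the particle IDs are read from the snapshot (plus a file-offset bookkeeping
+   * field that is skipped on input), which is presumably all the merger-tree
+   * matching needs here */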
+  init_field("ID  ", "ParticleIDs", MEM_MY_ID_TYPE, FILE_MY_ID_TYPE, READ_IF_PRESENT, 1, A_MTRP, &MergerTree->MtrP[0].ID, NULL,
+             ALL_TYPES, 0, 0, 0, 0, 0, 0, 0);
+
+  init_field("FLOF", "FileOffset", MEM_MY_FILEOFFSET, FILE_NONE, SKIP_ON_READ, 1, A_MTRP, &MergerTree->MtrP[0].FileOffset, NULL,
+             ALL_TYPES, 0, 0, 0, 0, 0, 0, 0);
+}
+
+/*! \brief This function reads the particle IDs of a snapshot for merger tree building.
+ *
+ * The snapshot must be stored in one of the default file formats of Gadget
+ * (All.ICFormat is checked for validity). The file headers are read twice: in the
+ * first pass only the particle numbers ending up on each processor are determined
+ * and the required memory is allocated, in the second pass the IDs are actually
+ * read in. The IDs are then used by the merger-tree code to link subhalos between
+ * successive outputs.
+ *
+ * \param num number of the snapshot to be read
+ */
+void readsnap_io::mergertree_read_snap_ids(int num)
+{
+  if(All.ICFormat < 1 || All.ICFormat > 4)
+    Terminate("ICFormat=%d not supported.\n", All.ICFormat);
+
+  char fname[MAXLEN_PATH_EXTRA], fname_multiple[MAXLEN_PATH_EXTRA];
+  sprintf(fname_multiple, "%s/snapdir_%03d/%s_%03d", All.OutputDir, num, All.SnapshotFileBase, num);
+  sprintf(fname, "%s%s_%03d", All.OutputDir, All.SnapshotFileBase, num);
+
+  TIMER_START(CPU_SNAPSHOT);
+
+  int num_files = find_files(fname, fname_multiple);
+
+  if(num_files > 1)
+    strcpy(fname, fname_multiple);
+
+  /* we repeat reading the headers of the files two times. In the first iteration, only the
+   * particle numbers ending up on each processor are assembled, followed by memory allocation.
+   * In the second iteration, the data is actually read in.
+   */
+  for(int rep = 0; rep < 2; rep++)
+    {
+      MergerTree->MtrP_NumPart = 0;
+
+      read_files_driver(fname, rep, num_files);
+
+      /* now do the memory allocation */
+      if(rep == 0)
+        {
+          MergerTree->MtrP = (mergertree::mergertree_particle_data *)Mem.mymalloc_movable_clear(
+              &MergerTree->MtrP, "MtrP", (MergerTree->MtrP_NumPart + 1) * sizeof(mergertree::mergertree_particle_data));
+        }
+    }
+
+  MPI_Barrier(Communicator);
+
+  mpi_printf("READSNAPID: reading done.\n");
+
+  TIMER_STOP(CPU_SNAPSHOT);
+}
+
+void readsnap_io::fill_file_header(int writeTask, int lastTask, long long *n_type, long long *ntot_type)
+{ /* empty */
+}
+
+void readsnap_io::read_file_header(const char *fname, int filenr, int readTask, int lastTask, long long *n_type, long long *ntot_type,
+                                   int *nstart)
+{
+  if(ThisTask == readTask)
+    {
+      if(filenr == 0 && nstart == NULL)
+        {
+          mpi_printf(
+              "\nREADSNAPID: filenr=%d, '%s' contains:\n"
+              "READSNAPID: Type 0 (gas):   %8lld  (tot=%15lld) masstab= %g\n",
+              filenr, fname, (long long)header.npart[0], (long long)header.npartTotal[0], All.MassTable[0]);
+
+          for(int type = 1; type < NTYPES; type++)
+            {
+              mpi_printf("READSNAPID: Type %d:         %8lld  (tot=%15lld) masstab= %g\n", type, (long long)header.npart[type],
+                         (long long)header.npartTotal[type], All.MassTable[type]);
+            }
+          mpi_printf("\n");
+        }
+    }
+
+  /* to collect the gas particles all at the beginning (in case several
+     snapshot files are read on the current CPU) we move the collisionless
+     particles such that a gap of the right size is created */
+
+  long long nall = 0;
+  for(int type = 0; type < NTYPES; type++)
+    {
+      ntot_type[type] = header.npart[type];
+
+      long long n_in_file = header.npart[type];
+      int ntask           = lastTask - readTask + 1;
+      int n_for_this_task = n_in_file / ntask;
+      if((ThisTask - readTask) < (n_in_file % ntask))
+        n_for_this_task++;
+
+      n_type[type] = n_for_this_task;
+
+      nall += n_for_this_task;
+    }
+
+  if(nstart)
+    {
+      memmove(&MergerTree->MtrP[nall], &MergerTree->MtrP[0], MergerTree->MtrP_NumPart * sizeof(mergertree::mergertree_particle_data));
+      *nstart = 0;
+    }
+}
+
+void readsnap_io::write_header_fields(hid_t handle)
+{ /* empty */
+}
+
+/*! \brief This function reads the snapshot header in case of hdf5 files (i.e. format 3)
+ *
+ * \param fname file name of the snapshot as given in the parameter file
+ */
+void readsnap_io::read_header_fields(const char *fname)
+{
+  for(int i = 0; i < NTYPES; i++)
+    {
+      header.npart[i]      = 0;
+      header.npartTotal[i] = 0;
+      header.mass[i]       = 0;
+    }
+
+  hsize_t ntypes = NTYPES;
+
+  hid_t hdf5_file = my_H5Fopen(fname, H5F_ACC_RDONLY, H5P_DEFAULT);
+  hid_t handle    = my_H5Gopen(hdf5_file, "/Header");
+
+  /* check if the file in question actually has this number of types */
+  hid_t hdf5_attribute = my_H5Aopen_name(handle, "NumPart_ThisFile");
+  hid_t space          = H5Aget_space(hdf5_attribute);
+  hsize_t dims, len;
+  H5Sget_simple_extent_dims(space, &dims, &len);
+  H5Sclose(space);
+  if(len != ntypes)
+    Terminate("Length of NumPart_ThisFile attribute (%d) does not match NTYPES(ICS) (%d)", (int)len, (int)ntypes);
+  my_H5Aclose(hdf5_attribute, "NumPart_ThisFile");
+
+  /* now read the header fields */
+
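+  /* with GADGET2_HEADER the per-file particle counts are stored as 32-bit unsigned
+     integers, otherwise as 64-bit integers (see the header struct in io_readsnap.h) */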
+#ifdef GADGET2_HEADER
+  read_vector_attribute(handle, "NumPart_ThisFile", header.npart, H5T_NATIVE_UINT, ntypes);
+#else
+  read_vector_attribute(handle, "NumPart_ThisFile", header.npart, H5T_NATIVE_UINT64, ntypes);
+#endif
+
+  read_vector_attribute(handle, "NumPart_Total", header.npartTotal, H5T_NATIVE_UINT64, ntypes);
+
+  read_scalar_attribute(handle, "BoxSize", &header.BoxSize, H5T_NATIVE_DOUBLE);
+  read_vector_attribute(handle, "MassTable", header.mass, H5T_NATIVE_DOUBLE, ntypes);
+  read_scalar_attribute(handle, "Time", &header.time, H5T_NATIVE_DOUBLE);
+  read_scalar_attribute(handle, "Redshift", &header.redshift, H5T_NATIVE_DOUBLE);
+  read_scalar_attribute(handle, "NumFilesPerSnapshot", &header.num_files, H5T_NATIVE_INT);
+
+  my_H5Gclose(handle, "/Header");
+  my_H5Fclose(hdf5_file, fname);
+}
+
+int readsnap_io::get_filenr_from_header(void) { return header.num_files; }
+
+void readsnap_io::set_filenr_in_header(int numfiles) { header.num_files = numfiles; }
+
+void readsnap_io::read_increase_numbers(int type, int n_for_this_task) { MergerTree->MtrP_NumPart += n_for_this_task; }
+
+void readsnap_io::get_datagroup_name(int type, char *buf) { sprintf(buf, "/PartType%d", type); }
+
+int readsnap_io::get_type_of_element(int index) { return MergerTree->MtrP[index].Type; }
+
+void readsnap_io::set_type_of_element(int index, int type) { MergerTree->MtrP[index].Type = type; }
+
+void *readsnap_io::get_base_address_of_structure(enum arrays array, int index)
+{
+  switch(array)
+    {
+      case A_MTRP:
+        return (void *)(MergerTree->MtrP + index);
+      default:
+        Terminate("we don't expect to get here");
+    }
+
+  return NULL;
+}
+#endif
diff --git a/src/mergertree/io_readsnap.h b/src/mergertree/io_readsnap.h
new file mode 100644
index 0000000000000000000000000000000000000000..3cbc3d650aeda9d34878a9f9d3f6ec15e7df10d0
--- /dev/null
+++ b/src/mergertree/io_readsnap.h
@@ -0,0 +1,114 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  io_readsnap.h
+ *
+ *  \brief declaration of the class for reading snapshot files for the purpose of merger tree construction
+ */
+
+#ifndef READSNAP_IO_H
+#define READSNAP_IO_H
+
+#include "gadgetconfig.h"
+
+#ifdef MERGERTREE
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../fof/fof.h"
+#include "../io/hdf5_util.h"
+#include "../io/io.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mergertree/mergertree.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/parallel_sort.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+
+class readsnap_io : public IO_Def
+{
+ private:
+  mergertree *MergerTree;
+
+ public:
+  readsnap_io(mergertree *MergerTree_ptr, MPI_Comm comm, int format);
+
+  void mergertree_read_snap_ids(int num);
+
+  /* supplied virtual functions */
+  void fill_file_header(int writeTask, int lastTask, long long *nloc_part, long long *npart);
+  void read_file_header(const char *fname, int filenr, int readTask, int lastTask, long long *nloc_part, long long *npart,
+                        int *nstart);
+  void get_datagroup_name(int grnr, char *gname);
+  void write_header_fields(hid_t);
+  void read_header_fields(const char *fname);
+  void read_increase_numbers(int type, int n_for_this_task);
+  int get_filenr_from_header(void);
+  void set_filenr_in_header(int);
+  void *get_base_address_of_structure(enum arrays array, int index);
+  int get_type_of_element(int index);
+  void set_type_of_element(int index, int type);
+
+  /** Header for the standard file format.
+   */
+#ifdef GADGET2_HEADER
+  struct io_header
+  {
+    int npart[NTYPES_HEADER];                       /**< number of particles of each type in this file */
+    double mass[NTYPES_HEADER];                     /**< mass of particles of each type. If 0, then the masses are explicitly
+                                                           stored in the mass-block of the snapshot file, otherwise they are omitted */
+    double time;                                    /**< time of snapshot file */
+    double redshift;                                /**< redshift of snapshot file */
+    int flag_sfr;                                   /**< flags whether the simulation was including star formation */
+    int flag_feedback;                              /**< flags whether feedback was included (obsolete) */
+    unsigned int npartTotalLowWord[NTYPES_HEADER];  /**< total number of particles of each type in this snapshot. This can be
+                                        different from npart if one is dealing with a multi-file snapshot. */
+    int flag_cooling;                               /**< flags whether cooling was included  */
+    int num_files;                                  /**< number of files in multi-file snapshot */
+    double BoxSize;                                 /**< box-size of simulation in case periodic boundaries were used */
+    double Omega0;                                  /**< matter density in units of critical density */
+    double OmegaLambda;                             /**< cosmological constant parameter */
+    double HubbleParam;                             /**< Hubble parameter in units of 100 km/sec/Mpc */
+    double Hubble;                                  /**< Hubble constant in internal units */
+    unsigned int npartTotalHighWord[NTYPES_HEADER]; /**< High word of the total number of particles of each type */
+    int flag_entropy_instead_u;                     /**< flags that IC-file contains entropy instead of u */
+    int flag_doubleprecision;                       /**< flags that snapshot contains double-precision instead of single precision */
+    int flag_ic_info;                    /*!< flag to inform whether IC files are generated with the ordinary Zeldovich approximation,
+                                                or whether they contain 2nd order Lagrangian perturbation theory initial conditions.
+                                                For snapshot files, the value informs whether the simulation was evolved from
+                                                Zeldovich or 2lpt ICs. Encoding is as follows:
+                                                  FLAG_ZELDOVICH_ICS     (1)   - IC file based on Zeldovich
+                                                  FLAG_SECOND_ORDER_ICS  (2)   - Special IC-file containing 2lpt masses
+                                                 All other values, including 0, are interpreted as "don't know" for backwards compatibility.
+                                            */
+    float lpt_scalingfactor;             /*!< scaling factor for 2lpt initial conditions */
+    long long npartTotal[NTYPES_HEADER]; /**< fills the header to 256 bytes, and is kept for compatibility with Gadget2/3 */
+  };
+  io_header header; /**< holds header for snapshot files */
+#else
+
+  /* new simplified header */
+  struct io_header
+  {
+    long long npart[NTYPES_HEADER]; /**< number of particles of each type in this file */
+    long long npartTotal[NTYPES_HEADER];
+    double mass[NTYPES_HEADER]; /**< mass of particles of each type. If 0, then the masses are explicitly
+                                       stored in the mass-block of the snapshot file, otherwise they are omitted */
+    double time;                /**< time of snapshot file */
+    double redshift;            /**< redshift of snapshot file */
+    double BoxSize;             /**< box-size of simulation in case periodic boundaries were used */
+    int num_files;              /**< number of files in multi-file snapshot */
+  };
+  io_header header; /**< holds header for snapshot files */
+
+#endif
+};
+
+#endif
+
+#endif /* READSNAP_IO_H */
diff --git a/src/mergertree/io_readtrees_mbound.cc b/src/mergertree/io_readtrees_mbound.cc
new file mode 100644
index 0000000000000000000000000000000000000000..af6d0c00d2d37d79e846e40b27c58105b6530377
--- /dev/null
+++ b/src/mergertree/io_readtrees_mbound.cc
@@ -0,0 +1,273 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  io_readtrees_mbound.cc
+ *
+ *  \brief routines for I/O of most-bound particles belonging to merger trees
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef MERGERTREE
+
+#include <gsl/gsl_rng.h>
+#include <hdf5.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../fof/fof.h"
+#include "../io/hdf5_util.h"
+#include "../io/io.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mergertree/io_readtrees_mbound.h"
+#include "../mergertree/mergertree.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/parallel_sort.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+
+/*
+ struct treehalo_ids_type
+  {
+    MyIDType SubMostBoundID;
+    long long TreeID;
+  };
+  treehalo_ids_type *HaloIDdata;
+*/
+
+readtrees_mbound_io::readtrees_mbound_io(mergertree *MergerTree_ptr, MPI_Comm comm, int format) : IO_Def(comm, format)
+{
+  MergerTree = MergerTree_ptr;
+
+  this->N_IO_Fields  = 0;
+  this->N_DataGroups = 2;
+  this->header_size  = sizeof(header);
+  this->header_buf   = &header;
+  this->type_of_file = FILE_IS_TREECAT;
+  sprintf(this->info, "MERGERTREE: reading/writing mergertrees");
+
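+  /* register the datasets to be read: the per-tree table entries target the A_TT array (HDF5 group
+   * "/TreeTable"), while the per-halo TreeID and most-bound particle ID target the A_TID array (HDF5
+   * group "/TreeHalos"); all fields are registered with READ_IF_PRESENT, i.e. they are optional in the file */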
+  init_field("MTRL", "Length", MEM_INT, FILE_INT, READ_IF_PRESENT, 1, A_TT, &MergerTree->TreeTable[0].HaloCount, NULL, TREELENGTH, 0,
+             0, 0, 0, 0, 0, 0);
+  init_field("MTRS", "StartOffset", MEM_INT64, FILE_INT64, READ_IF_PRESENT, 1, A_TT, &MergerTree->TreeTable[0].FirstHalo, NULL,
+             TREELENGTH, 0, 0, 0, 0, 0, 0, 0);
+  init_field("MTRI", "TreeID", MEM_INT64, FILE_INT64, READ_IF_PRESENT, 1, A_TT, &MergerTree->TreeTable[0].TreeID, NULL, TREELENGTH, 0,
+             0, 0, 0, 0, 0, 0);
+
+  /* just read most-bound ID */
+
+  init_field("TRID", "TreeID", MEM_INT64, FILE_INT64, READ_IF_PRESENT, 1, A_TID, &MergerTree->HaloIDdata[0].TreeID, NULL, TREEHALOS, 0,
+             0, 0, 0, 0, 0, 0);
+
+  init_field("SIDM", "SubhaloIDMostbound", MEM_MY_ID_TYPE, FILE_MY_ID_TYPE, READ_IF_PRESENT, 1, A_TID,
+             &MergerTree->HaloIDdata[0].SubMostBoundID, NULL, TREEHALOS, 0, 0, 0, 0, 0, 0, 0);
+}
+
+void readtrees_mbound_io::read_trees_mostbound(void)
+{
+  double t0 = Logs.second();
+
+  MergerTree->TotNtrees = 0;
+  MergerTree->TotNhalos = 0;
+
+  char fname[MAXLEN_PATH_EXTRA];
+
+  if(All.NumFilesPerSnapshot > 1)
+    sprintf(fname, "%s/treedata/%s", All.OutputDir, "trees");
+  else
+    sprintf(fname, "%s%s", All.OutputDir, "trees");
+
+  int num_files = find_files(fname, fname);
+
+  reset_io_byte_count();
+
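+  /* two-pass read: in the first pass (rep=0) only the local counts Ntrees and Nhalos are accumulated so
+   * that TreeTable and HaloIDdata can be allocated, the second pass (rep=1) then actually fills them */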
+  for(int rep = 0; rep < 2; rep++)
+    {
+      MergerTree->Ntrees = 0;
+      MergerTree->Nhalos = 0;
+
+      read_files_driver(fname, rep, num_files);
+
+      /* now do the memory allocation */
+      if(rep == 0)
+        {
+          MergerTree->TreeTable  = (halotrees_table *)Mem.mymalloc_movable(&MergerTree->TreeTable, "TreeTable",
+                                                                          (MergerTree->Ntrees + 1) * sizeof(halotrees_table));
+          MergerTree->HaloIDdata = (treehalo_ids_type *)Mem.mymalloc_movable(&MergerTree->HaloIDdata, "HaloIDdata",
+                                                                             (MergerTree->Nhalos + 1) * sizeof(treehalo_ids_type));
+        }
+    }
+
+  MPI_Barrier(Communicator);
+
+  long long byte_count = get_io_byte_count(), byte_count_all;
+  sumup_longs(1, &byte_count, &byte_count_all, Communicator);
+
+  double t1 = Logs.second();
+
+  mpi_printf("MERGERTREE-READ: reading done. Took %g sec, total size %g MB, corresponds to effective I/O rate of %g MB/sec\n",
+             Logs.timediff(t0, t1), byte_count_all / (1024.0 * 1024.0), byte_count_all / (1024.0 * 1024.0) / Logs.timediff(t0, t1));
+
+  mpi_printf("\nMERGERTREE-READ: Total number of trees=%ldd, total number of halos=%lld\n\n", MergerTree->TotNtrees,
+             MergerTree->TotNhalos);
+
+  MPI_Barrier(Communicator);
+
+  for(int i = 0; i < MergerTree->Ntrees; i++)
+    {
+      if(MergerTree->HaloIDdata[i].TreeID > MergerTree->TotNtrees)
+        Terminate("i=%d  MergerTree->Ntrees=%d  MergerTree->HaloIDdata[i].TreeID=%lld   MergerTree->TotNtrees=%lld ", i,
+                  MergerTree->Ntrees, MergerTree->HaloIDdata[i].TreeID, MergerTree->TotNtrees);
+    }
+}
+
+void readtrees_mbound_io::fill_file_header(int writeTask, int lastTask, long long *n_type, long long *ntot_type) {}
+
+void readtrees_mbound_io::write_header_fields(hid_t handle) {}
+
+int readtrees_mbound_io::get_filenr_from_header(void) { return header.num_files; }
+
+void readtrees_mbound_io::set_filenr_in_header(int numfiles) { header.num_files = numfiles; }
+
+void readtrees_mbound_io::read_increase_numbers(int type, int n_for_this_task)
+{
+  switch(type)
+    {
+      case 0:
+        MergerTree->Ntrees += n_for_this_task;
+        break;
+      case 1:
+        MergerTree->Nhalos += n_for_this_task;
+        break;
+
+      default:
+        Terminate("wrong group");
+        break;
+    }
+}
+
+void readtrees_mbound_io::read_file_header(const char *fname, int filenr, int readTask, int lastTask, long long *n_type,
+                                           long long *ntot_type, int *nstart)
+{
+  if(ThisTask == readTask)
+    {
+      if(filenr == 0 && nstart == NULL)
+        {
+          mpi_printf(
+              "\nREAD-TREES: filenr=%d, '%s' contains %lld trees out of a total of %lld, and %lld halos out of a total of %lld\n",
+              filenr, fname, header.Ntrees, header.TotNtrees, header.Nhalos, header.TotNhalos);
+        }
+    }
+
+  if(MergerTree->TotNtrees == 0)
+    {
+      MergerTree->TotNtrees = header.TotNtrees;
+      MergerTree->TotNhalos = header.TotNhalos;
+    }
+
+  for(int k = 0; k < 2; k++)
+    n_type[k] = ntot_type[k] = 0;
+
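+  /* distribute the trees/halos of this file evenly over the tasks readTask..lastTask, giving the first
+   * (n_in_file % ntask) tasks one extra element; entries already read from earlier files are shifted
+   * upwards by memmove so that the new data can be placed at the front (*nstart = 0) */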
+  {
+    ntot_type[0]        = header.Ntrees;
+    long long n_in_file = header.Ntrees;
+    int ntask           = lastTask - readTask + 1;
+    int n_for_this_task = n_in_file / ntask;
+    if((ThisTask - readTask) < (n_in_file % ntask))
+      n_for_this_task++;
+    n_type[0] = n_for_this_task;
+
+    if(nstart)
+      memmove(&MergerTree->TreeTable[n_for_this_task], &MergerTree->TreeTable[0], MergerTree->Ntrees * sizeof(halotrees_table));
+  }
+
+  {
+    ntot_type[1]        = header.Nhalos;
+    long long n_in_file = header.Nhalos;
+    int ntask           = lastTask - readTask + 1;
+    int n_for_this_task = n_in_file / ntask;
+    if((ThisTask - readTask) < (n_in_file % ntask))
+      n_for_this_task++;
+    n_type[1] = n_for_this_task;
+
+    if(nstart)
+      memmove(&MergerTree->HaloIDdata[n_for_this_task], &MergerTree->HaloIDdata[0], MergerTree->Nhalos * sizeof(treehalo_ids_type));
+  }
+
+  if(nstart)
+    *nstart = 0;
+}
+
+void readtrees_mbound_io::read_header_fields(const char *fname)
+{
+  memset(&header, 0, sizeof(header));
+
+  hid_t hdf5_file = my_H5Fopen(fname, H5F_ACC_RDONLY, H5P_DEFAULT);
+  hid_t handle    = my_H5Gopen(hdf5_file, "/Header");
+
+  read_scalar_attribute(handle, "Ntrees_ThisFile", &header.Ntrees, H5T_NATIVE_UINT64);
+  read_scalar_attribute(handle, "Ntrees_Total", &header.TotNtrees, H5T_NATIVE_UINT64);
+
+  read_scalar_attribute(handle, "Nhalos_ThisFile", &header.Nhalos, H5T_NATIVE_UINT64);
+  read_scalar_attribute(handle, "Nhalos_Total", &header.TotNhalos, H5T_NATIVE_UINT64);
+
+  read_scalar_attribute(handle, "NumFiles", &header.num_files, H5T_NATIVE_INT);
+
+  read_scalar_attribute(handle, "LastSnapShotNr", &header.lastsnapshotnr, H5T_NATIVE_INT);
+
+  my_H5Gclose(handle, "/Header");
+  my_H5Fclose(hdf5_file, fname);
+}
+
+void readtrees_mbound_io::get_datagroup_name(int type, char *buf)
+{
+  switch(type)
+    {
+      case 0:
+        sprintf(buf, "/TreeTable");
+        break;
+      case 1:
+        sprintf(buf, "/TreeHalos");
+        break;
+      default:
+        Terminate("wrong group");
+        break;
+    }
+}
+
+int readtrees_mbound_io::get_type_of_element(int index)
+{
+  /* empty */
+  return 0;
+}
+
+void readtrees_mbound_io::set_type_of_element(int index, int type)
+{ /* empty */
+}
+
+void *readtrees_mbound_io::get_base_address_of_structure(enum arrays array, int index)
+{
+  switch(array)
+    {
+      case A_TT:
+        return (void *)(MergerTree->TreeTable + index);
+      case A_TID:
+        return (void *)(MergerTree->HaloIDdata + index);
+      default:
+        Terminate("strange, we don't expect to get here");
+    }
+}
+
+#endif
diff --git a/src/mergertree/io_readtrees_mbound.h b/src/mergertree/io_readtrees_mbound.h
new file mode 100644
index 0000000000000000000000000000000000000000..7c6545623dc6e5cac4e3161c162f1d4433e71230
--- /dev/null
+++ b/src/mergertree/io_readtrees_mbound.h
@@ -0,0 +1,78 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  io_readtrees_mbound.h
+ *
+ *  \brief definition of I/O class for most-bound particles belonging to merger trees
+ */
+
+#ifndef READTREES_MBOUND_IO_H
+#define READTREES_MBOUND_IO_H
+
+#include "gadgetconfig.h"
+
+#ifdef MERGERTREE
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../fof/fof.h"
+#include "../io/hdf5_util.h"
+#include "../io/io.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mergertree/mergertree.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/parallel_sort.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+
+class readtrees_mbound_io : public IO_Def
+{
+ private:
+  mergertree *MergerTree;
+
+ public:
+  readtrees_mbound_io(mergertree *MergerTree_ptr, MPI_Comm comm, int format);
+
+  void read_trees_mostbound(void);
+
+  /* supplied virtual functions */
+  void fill_file_header(int writeTask, int lastTask, long long *nloc_part, long long *npart);
+  void read_file_header(const char *fname, int filenr, int readTask, int lastTask, long long *nloc_part, long long *npart,
+                        int *nstart);
+  void get_datagroup_name(int grnr, char *gname);
+  void write_header_fields(hid_t);
+  void read_header_fields(const char *fname);
+  void read_increase_numbers(int type, int n_for_this_task);
+  int get_filenr_from_header(void);
+  void set_filenr_in_header(int);
+  void *get_base_address_of_structure(enum arrays array, int index);
+  int get_type_of_element(int index);
+  void set_type_of_element(int index, int type);
+
+  /** Header for the merger tree catalogue files.
+   */
+
+  struct io_header
+  {
+    long long Nhalos;
+    long long TotNhalos;
+
+    long long Ntrees;
+    long long TotNtrees;
+
+    int num_files;
+    int lastsnapshotnr;
+  };
+  io_header header;
+
+  typedef mergertree::treehalo_ids_type treehalo_ids_type;
+};
+
+#endif
+
+#endif /* READTREES_MBOUND_IO_H */
diff --git a/src/mergertree/io_treelinks.cc b/src/mergertree/io_treelinks.cc
new file mode 100644
index 0000000000000000000000000000000000000000..274fcf07c67bfd91eb446433dd694acd18eb1c20
--- /dev/null
+++ b/src/mergertree/io_treelinks.cc
@@ -0,0 +1,179 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  io_treelinks.cc
+ *
+ *  \brief routines for the I/O needed for the merger treelink files
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef MERGERTREE
+
+#include <gsl/gsl_rng.h>
+#include <hdf5.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../fof/fof.h"
+#include "../io/hdf5_util.h"
+#include "../io/io.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mergertree/io_treelinks.h"
+#include "../mergertree/mergertree.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/parallel_sort.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+
+treelinks_io::treelinks_io(mergertree *MergerTree_ptr, MPI_Comm comm, int format) : IO_Def(comm, format)
+{
+  MergerTree = MergerTree_ptr;
+
+  this->N_IO_Fields  = 0;
+  this->N_DataGroups = 1;
+  this->header_size  = sizeof(header);
+  this->header_buf   = &header;
+  this->type_of_file = FILE_IS_TREELINK;
+  sprintf(this->info, "TREELINK: writing treelink information");
+
+  init_field("TRNR", "TreeID", MEM_INT64, FILE_INT64, READ_IF_PRESENT, 1, A_TL, &MergerTree->TreeLink[0].TreeID, NULL, TREELINK, 0, 0,
+             0, 0, 0, 0, 0, true);
+
+  init_field("TRIX", "TreeIndex", MEM_INT, FILE_INT, READ_IF_PRESENT, 1, A_TL, &MergerTree->TreeLink[0].TreeIndex, NULL, TREELINK, 0,
+             0, 0, 0, 0, 0, 0, true);
+}
+
+void treelinks_io::treelinks_save(int num)
+{
+  char buf[MAXLEN_PATH_EXTRA];
+
+  /* write treelink info */
+  if(All.NumFilesPerSnapshot > 1)
+    {
+      if(ThisTask == 0)
+        {
+          sprintf(buf, "%s/groups_%03d", All.OutputDir, num);
+          mkdir(buf, 02755);
+        }
+      MPI_Barrier(Communicator);
+    }
+
+  if(All.NumFilesPerSnapshot > 1)
+    sprintf(buf, "%s/groups_%03d/%s_%03d", All.OutputDir, num, "subhalo_treelink", num);
+  else
+    sprintf(buf, "%s%s_%03d", All.OutputDir, "subhalo_treelink", num);
+
+  write_multiple_files(buf, All.NumFilesPerSnapshot);
+}
+
+void treelinks_io::fill_file_header(int writeTask, int lastTask, long long *n_type, long long *ntot_type)
+{
+  /* determine group/id numbers of each type in file */
+
+  n_type[0] = Nsubhalos;
+
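+  /* the write master (writeTask) collects the local subhalo counts of all tasks writing into this file,
+   * sums them up, and sends the file total back to every task in the write group */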
+  if(ThisTask == writeTask)
+    {
+      for(int n = 0; n < 1; n++)
+        ntot_type[n] = n_type[n];
+
+      for(int task = writeTask + 1; task <= lastTask; task++)
+        {
+          long long nn[1];
+          MPI_Recv(&nn[0], 1, MPI_LONG_LONG, task, TAG_LOCALN, Communicator, MPI_STATUS_IGNORE);
+          for(int n = 0; n < 1; n++)
+            ntot_type[n] += nn[n];
+        }
+
+      for(int task = writeTask + 1; task <= lastTask; task++)
+        MPI_Send(&ntot_type[0], 1, MPI_LONG_LONG, task, TAG_N, Communicator);
+    }
+  else
+    {
+      MPI_Send(&n_type[0], 1, MPI_LONG_LONG, writeTask, TAG_LOCALN, Communicator);
+      MPI_Recv(&ntot_type[0], 1, MPI_LONG_LONG, writeTask, TAG_N, Communicator, MPI_STATUS_IGNORE);
+    }
+
+  /* fill file header */
+
+  header.Nsubhalos    = ntot_type[0];
+  header.TotNsubhalos = TotNsubhalos;
+  header.num_files    = All.NumFilesPerSnapshot;
+}
+
+void treelinks_io::read_file_header(const char *fname, int filenr, int readTask, int lastTask, long long *n_type, long long *ntot_type,
+                                    int *nstart)
+{
+  /* empty */
+}
+
+void treelinks_io::write_header_fields(hid_t handle)
+{
+  write_scalar_attribute(handle, "Nsubhalos_ThisFile", &header.Nsubhalos, H5T_NATIVE_UINT64);
+
+  write_scalar_attribute(handle, "Nsubhalos_Total", &header.TotNsubhalos, H5T_NATIVE_UINT64);
+
+  write_scalar_attribute(handle, "NumFiles", &header.num_files, H5T_NATIVE_INT);
+}
+
+void treelinks_io::read_header_fields(const char *fname)
+{ /* empty */
+}
+
+int treelinks_io::get_filenr_from_header(void) { return header.num_files; }
+
+void treelinks_io::set_filenr_in_header(int numfiles) { header.num_files = numfiles; }
+
+void treelinks_io::read_increase_numbers(int type, int n_for_this_task)
+{ /* empty */
+}
+
+void treelinks_io::get_datagroup_name(int type, char *buf)
+{
+  switch(type)
+    {
+      case 0:
+        sprintf(buf, "/Subhalo");
+        break;
+      default:
+        Terminate("wrong group");
+        break;
+    }
+}
+
+int treelinks_io::get_type_of_element(int index)
+{
+  /* empty */
+  return 0;
+}
+
+void treelinks_io::set_type_of_element(int index, int type)
+{ /* empty */
+}
+
+void *treelinks_io::get_base_address_of_structure(enum arrays array, int index)
+{
+  switch(array)
+    {
+      case A_TL:
+        return (void *)(MergerTree->TreeLink + index);
+      default:
+        Terminate("strange, we don't expect to get here");
+    }
+}
+
+#endif
diff --git a/src/mergertree/io_treelinks.h b/src/mergertree/io_treelinks.h
new file mode 100644
index 0000000000000000000000000000000000000000..fd2e5995440958608363561a006fe025c3eff758
--- /dev/null
+++ b/src/mergertree/io_treelinks.h
@@ -0,0 +1,75 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  io_treelinks.h
+ *
+ *  \brief declaration of the class used for I/O of the merger treelink files
+ */
+
+#ifndef TREELINKS_IO_H
+#define TREELINKS_IO_H
+
+#include "gadgetconfig.h"
+
+#ifdef MERGERTREE
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../fof/fof.h"
+#include "../io/hdf5_util.h"
+#include "../io/io.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mergertree/mergertree.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/parallel_sort.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+
+class treelinks_io : public IO_Def
+{
+ private:
+  mergertree *MergerTree;
+
+ public:
+  treelinks_io(mergertree *MergerTree_ptr, MPI_Comm comm, int format);
+
+  void treelinks_save(int num);
+
+  /* supplied virtual functions */
+  void fill_file_header(int writeTask, int lastTask, long long *nloc_part, long long *npart);
+  void read_file_header(const char *fname, int filenr, int readTask, int lastTask, long long *nloc_part, long long *npart,
+                        int *nstart);
+  void get_datagroup_name(int grnr, char *gname);
+  void write_header_fields(hid_t);
+  void read_header_fields(const char *fname);
+  void read_increase_numbers(int type, int n_for_this_task);
+  int get_filenr_from_header(void);
+  void set_filenr_in_header(int);
+  void *get_base_address_of_structure(enum arrays array, int index);
+  int get_type_of_element(int index);
+  void set_type_of_element(int index, int type);
+
+  /** Header for the treelink files.
+   */
+  struct io_header
+  {
+    long long Nsubhalos;
+    long long TotNsubhalos;
+    int num_files;
+  };
+  io_header header;
+
+  int Nsubhalos;
+  long long TotNsubhalos;
+
+ private:
+};
+
+#endif
+
+#endif /* TREELINKS_IO_H */
diff --git a/src/mergertree/mergertree.h b/src/mergertree/mergertree.h
new file mode 100644
index 0000000000000000000000000000000000000000..8ebda0653eebf013601384c163fe988fb540c70c
--- /dev/null
+++ b/src/mergertree/mergertree.h
@@ -0,0 +1,574 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  mergertree.h
+ *
+ *  \brief definition of a class that provides the merger tree functionality
+ */
+
+#ifndef MERGERTREE_H
+#define MERGERTREE_H
+
+#ifdef MERGERTREE
+
+#include <hdf5.h>
+
+#include "../data/simparticles.h"
+#include "../fof/fof.h"
+#include "../subfind/subfind.h"
+
+#define NUM_MOST_BOUND_PARTICLES_USED_FOR_TRACKING 16  // may not be larger than 255 due to storage limits
+
+class mergertree : public setcomm
+{
+ private:
+  simparticles *Sp;
+
+ public:
+  mergertree(MPI_Comm comm, simparticles *Sp_ptr) : setcomm(comm)
+  {
+    Sp               = Sp_ptr;
+    PrevTotNsubhalos = 0;
+    PrevNsubhalos    = 0;
+    MtrP_NumPart     = 0;
+  }
+
+  void mergertree_determine_descendants_on_the_fly(int num);
+  void descendants_in_postprocessing(int num);
+  void halotrees_construct(int lastsnapnr);
+  void get_previous_size_of_subhalo_for_each_particle(int num);
+
+  unsigned long long CurrTotNsubhalos;
+  unsigned long long PrevTotNsubhalos;
+
+  int CurrNsubhalos;
+  int PrevNsubhalos;
+
+  int MtrP_NumPart;
+  int PrevMtrP_NumPart;
+
+  int Nhalos;
+  long long TotNhalos;
+
+  int Ntrees;
+  long long TotNtrees;
+
+  int LargestHaloCount;
+  int LastSnapShotNr;  // we will construct trees composed of dumps 0 to LastSnapShotNr
+
+  typedef fof<simparticles>::treehalo_t treehalo_type;
+  treehalo_type *Halos; /* This will contain the subhalos making up the merger tree information */
+
+  struct treehalo_ids_type
+  {
+    MyIDType SubMostBoundID;
+    long long TreeID;
+  };
+  treehalo_ids_type *HaloIDdata;
+
+  static bool compare_HaloIDdata_ID(const treehalo_ids_type &a, const treehalo_ids_type &b)
+  {
+    return a.SubMostBoundID < b.SubMostBoundID;
+  }
+
+  struct desc_list
+  {
+    double MaxScore;
+    long long PrevSubhaloNr;
+    long long DescSubhaloNr;
+
+    long long FirstDescSubhaloNr;
+    long long NextProgSubhaloNr;
+
+    long long FileOffset;
+    char FirstProgFlag;
+  };
+  desc_list *Descendants;
+
+  struct prog_list
+  {
+    double MaxScore;
+    long long SubhaloNr;
+
+    long long ProgSubhaloNr;
+
+    long long FirstProgSubhaloNr;
+    long long NextDescSubhaloNr;
+
+    long long MainProgSubhaloNr;
+
+    long long FileOffset;
+
+    char FirstDescFlag;
+  };
+  prog_list *Progenitors;
+
+  struct mergertree_particle_data
+  {
+    int Type;
+    MyIDType ID;
+    long long FileOffset;
+
+    long long PrevGroupNr;
+    long long PrevSubhaloNr;
+
+    long long GroupNr;
+    long long SubhaloNr;
+
+    MyLenType SubhaloLen;
+    MyLenType PrevSubhaloLen;
+
+    unsigned short RankInSubhalo;
+    unsigned short PrevRankInSubhalo;
+  };
+  mergertree_particle_data *MtrP, *PrevMtrP;
+
+  halotrees_table *TreeTable;
+
+  struct treelink_data
+  {
+    long long TreeID;
+    int TreeIndex;
+  };
+  treelink_data *TreeLink;
+
+  times_catalogue *CatTimes;
+
+ private:
+  void mergertree_determine_descendants(int num);
+  void mergertree_determine_descendants_postproc(int num);
+  void mergertree_save_progenitors(int num);
+  void mergertree_read_progenitors(int num);
+  void desc_subfind_init_io_fields(void);
+  void prog_subfind_init_io_fields(void);
+  void treelink_subfind_init_io_fields(void);
+  void mergertree_read_snap_ids(int num);
+  void mergertree_match_ids_of_previous_snap(void);
+  void mrgtr_init_io_fields(void);
+  int halotrees_join_via_descendants(int num);
+  int halotrees_join_via_progenitors(int num);
+  void halotrees_assign_global_subhalonr_and_groupnr(void);
+  void halotrees_load_catalogues(fof<simparticles> *FoF);
+  void halotrees_initial_treeassignment(void);
+  void halotrees_assign_new_treeindices(void);
+  void halotrees_save_trees(void);
+  void halotrees_save_subhalo_treelinks(void);
+  void halotrees_collect_treehalos(void);
+  void halotrees_link_trees(void);
+  void halotrees_propagate_max_branch_length_descendants(int num);
+  void halotrees_propagate_max_branch_length_progenitors(int num);
+  void halotrees_determine_mainprogenitor(void);
+  void halotrees_reshuffle(char **ptr, size_t len, int ncurrent, int ntarget);
+  void halotrees_remap_treepointers(void);
+  void mergertree_assign_group_numbers(fof<simparticles> *FoF);
+  void trees_subfind_init_io_fields(void);
+  long long halotrees_join_trees_via_fof_or_mostboundid_bridges(int mode);
+  void mergertree_match_ids_of_current_snap(void);
+
+  void mergertree_chain_up_progenitors_with_same_descendant(void);
+  void mergertree_set_first_progenitor_with_same_descendant(void);
+  void mergertree_select_maximum_score_progenitors(int nmatch);
+  void mergertree_select_maximum_score_descendants(int nmatch);
+  int mergertree_find_matching_segments_and_scores(void);
+  void mergertree_chain_up_descendants_with_same_progenitor(void);
+  void mergertree_set_first_descendant_with_same_progenitor(void);
+
+  struct subhalo_extension
+  {
+    int TreeDescendant;
+    int TreeFirstProgenitor;
+    int TreeNextProgenitor;
+    int TreeFirstHaloInFOFgroup;
+    int TreeNextHaloInFOFgroup;
+    int TreeProgenitor;
+    int TreeFirstDescendant;
+    int TreeNextDescendant;
+    int TreeMainProgenitor;
+
+    long long MaxLenProgBranch;
+  };
+
+  /* PrevTotNsubhalos is the total number of subhalos in the previous catalogue. This list is split onto
+   * the processors, with 'PrevNsubhalos' being the number of these subhalos assigned to the local MPI task.
+   */
+
+  static bool compare_MtrP_FileOffset(const mergertree_particle_data &a, const mergertree_particle_data &b)
+  {
+    return a.FileOffset < b.FileOffset;
+  }
+
+  static bool compare_MtrP_SubhaloNr(const mergertree_particle_data &a, const mergertree_particle_data &b)
+  {
+    if(a.PrevSubhaloNr < b.PrevSubhaloNr)
+      return true;
+    if(a.PrevSubhaloNr > b.PrevSubhaloNr)
+      return false;
+
+    if(a.PrevRankInSubhalo < b.PrevRankInSubhalo)
+      return true;
+    if(a.PrevRankInSubhalo > b.PrevRankInSubhalo)
+      return false;
+
+    return a.ID < b.ID;
+  }
+
+  static bool compare_Group_FileOffset(const fof<simparticles>::group_properties &a, const fof<simparticles>::group_properties &b)
+  {
+    return a.FileOffset < b.FileOffset;
+  }
+
+  static bool compare_Subhalo_FileOffset(const fof<simparticles>::subhalo_properties &a,
+                                         const fof<simparticles>::subhalo_properties &b)
+  {
+    return a.FileOffset < b.FileOffset;
+  }
+
+  static bool compare_MtrP_ID(const mergertree_particle_data &a, const mergertree_particle_data &b) { return a.ID < b.ID; }
+
+  struct assign_particle_data
+  {
+    MyIDType ID;
+    long long PrevSubhaloNr;
+    MyLenType PrevSubhaloLen;
+    int OriginTask;
+    int OriginIndex;
+  };
+
+  static bool compare_AssignP_ID(const assign_particle_data &a, const assign_particle_data &b) { return a.ID < b.ID; }
+
+  static bool compare_AssignP_Origin(const assign_particle_data &a, const assign_particle_data &b)
+  {
+    if(a.OriginTask < b.OriginTask)
+      return true;
+    if(a.OriginTask > b.OriginTask)
+      return false;
+
+    return a.OriginIndex < b.OriginIndex;
+  }
+
+  /* This structure is used for the matching/identification of descendants. Every particle is listed here
+   * with its subhalo number and rank in the previous subhalo catalogue, while GroupNr/SubRankInGr (or equivalently the combined
+   * SubhaloNr) give the number of the subhalo it belongs to in the new catalogue. By sorting first according to PrevSubhaloNr
+   * and then by SubhaloNr, we obtain all descendant candidates a previous subhalo can have. We then need to figure out
+   * which one to pick, which we do via the Score variable.
+   */
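+  /* Illustrative (hypothetical) example: if the particles of previous subhalo 7 end up in the current
+   * subhalos 3, 3, 3 and 5, the sort yields the candidate descendants {3, 5} for subhalo 7, and the
+   * candidate that accumulates the largest score is recorded as the descendant (cf. MaxScore in desc_list). */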
+  struct desc_partdata
+  {
+    MyHaloNrType PrevSubhaloNr;  // Global subhalo number of the particle in the previous group catalogue
+    MyHaloNrType CurrSubhaloNr;  // New global subhalo number in the current group catalogue
+
+    float DescScore;  // auxiliary variable to score the different possible descendants
+    float ProgScore;  // auxiliary variable to score the different possible progenitors
+
+    compactrank_t PrevRankInSubhalo;  // Rank of particle within previous subhalo
+    compactrank_t CurrRankInSubhalo;  // Rank of particle within current subhalo
+  };
+  desc_partdata *desc;
+
+  /* sort kernel, first by PrevSubhaloNr, then by SubhaloNr */
+  static bool mergertree_compare_PrevSubNr_NewSubNr(const desc_partdata &a, const desc_partdata &b)
+  {
+    if(a.PrevSubhaloNr < b.PrevSubhaloNr)
+      return true;
+    if(a.PrevSubhaloNr > b.PrevSubhaloNr)
+      return false;
+
+    return a.CurrSubhaloNr < b.CurrSubhaloNr;
+  }
+
+  static bool mergertree_compare_NewSubNr_PrevSubNr(const desc_partdata &a, const desc_partdata &b)
+  {
+    if(a.CurrSubhaloNr < b.CurrSubhaloNr)
+      return true;
+    if(a.CurrSubhaloNr > b.CurrSubhaloNr)
+      return false;
+
+    return a.PrevSubhaloNr < b.PrevSubhaloNr;
+  }
+
+  /* sort kernel */
+  static bool mergertree_compare_DescSubhaloNr(const desc_list &a, const desc_list &b)
+  {
+    if(a.DescSubhaloNr < b.DescSubhaloNr)
+      return true;
+    if(a.DescSubhaloNr > b.DescSubhaloNr)
+      return false;
+
+    return a.PrevSubhaloNr < b.PrevSubhaloNr;
+  }
+
+  /* sort kernel */
+  static bool mergertree_compare_ProgSubhaloNr(const prog_list &a, const prog_list &b)
+  {
+    if(a.ProgSubhaloNr < b.ProgSubhaloNr)
+      return true;
+    if(a.ProgSubhaloNr > b.ProgSubhaloNr)
+      return false;
+
+    return a.SubhaloNr < b.SubhaloNr;
+  }
+
+  static bool mergertree_compare_SubhaloNr(const prog_list &a, const prog_list &b) { return a.SubhaloNr < b.SubhaloNr; }
+
+  /* sort kernel */
+  static bool mergertree_compare_PrevSubhaloNr(const desc_list &a, const desc_list &b) { return a.PrevSubhaloNr < b.PrevSubhaloNr; }
+
+  /* This structure is needed to organize the information of all the group and subhalo catalogs that are read in.
+   * The array Cats[] holds a table with all the group catalogs, indexed by the snapshot number.
+   */
+  struct halo_catalogue
+  {
+    fof<simparticles>::group_properties *Group;  // table of local groups
+    long long FirstGroup;                        // first group number on this processor
+    long long TotNgroups;                        // total number of groups in this catalog
+    int Ngroups;                                 // number of groups stored on local processor
+    int *TabNgroups;                             // used to store a table with the group numbers on all processors
+
+    fof<simparticles>::subhalo_properties *Subhalo;  // table of local subhalos
+    long long FirstSubhalo;                          // first subhalo number on this processor
+    long long TotNsubhalos;                          // total number of subhalos in catalog
+    int Nsubhalos;                                   // local number of subhalos on this processor
+    int *TabNsubhalos;                               // used to store a table with the subhalo numbers on all processors
+
+    subhalo_extension *SubExt;  // additional subhalo properties that we determine as part of the tree building
+
+    desc_list *Descendants;  // stores descendant information
+    prog_list *Progenitors;  // stores progenitor information
+  };
+  halo_catalogue *Cats;
+
+  struct tlink
+  {
+    long long UniqueGroupNr;
+    long long OrderIndex;
+
+    long long TreeID;
+    long long NewTreeID;
+
+    int TreeTask;
+    int NewTreeTask;
+
+    int OrigTask;
+  };
+
+  static bool compare_tlink_GroupNr(const tlink &a, const tlink &b) { return a.UniqueGroupNr < b.UniqueGroupNr; }
+
+  static bool compare_tlink_TreeID(const tlink &a, const tlink &b) { return a.TreeID < b.TreeID; }
+
+  static bool compare_tlink_OrigTask_OrderIndex(const tlink &a, const tlink &b)
+  {
+    if(a.OrigTask < b.OrigTask)
+      return true;
+    if(a.OrigTask > b.OrigTask)
+      return false;
+
+    return a.OrderIndex < b.OrderIndex;
+  }
+
+  static bool compare_Halos_TreeID_TreeIndex(const treehalo_type &a, const treehalo_type &b)
+  {
+    if(a.TreeID < b.TreeID)
+      return true;
+    if(a.TreeID > b.TreeID)
+      return false;
+
+    return a.TreeIndex < b.TreeIndex;
+  }
+
+  static bool compare_Halos_UniqueGroupNr_SubhaloNr(const treehalo_type &a, const treehalo_type &b)
+  {
+    if(a.UniqueGroupNr < b.UniqueGroupNr)
+      return true;
+    if(a.UniqueGroupNr > b.UniqueGroupNr)
+      return false;
+
+    return a.SubhaloNr < b.SubhaloNr;
+  }
+
+  static bool compare_Desc_FileOffset(const desc_list &a, const desc_list &b) { return a.FileOffset < b.FileOffset; }
+
+  static bool compare_Prog_FileOffset(const prog_list &a, const prog_list &b) { return a.FileOffset < b.FileOffset; }
+
+  /*--------------------------------------------------------------------------------------------------------------*/
+
+  struct data_list
+  {
+    long long targetsubhalonr;
+    long long intreeid;
+    long long originsubhalonr;
+    int origin;
+  };
+
+  static bool compare_data_list_subhalonnr(const data_list &a, const data_list &b) { return a.targetsubhalonr < b.targetsubhalonr; }
+
+  struct remap_data
+  {
+    int loc_index;
+    int new_treeindexptr;
+    long long targetsubhalonr;
+    long long originsubhalonr;
+    long long treeid;
+    long long intreeid;
+    int orig_index;
+  };
+
+  static bool compare_remap_data_subhalonr(const remap_data &a, const remap_data &b) { return a.targetsubhalonr < b.targetsubhalonr; }
+
+  static bool compare_remap_data_orig_index(const remap_data &a, const remap_data &b) { return a.orig_index < b.orig_index; }
+
+  /*--------------------------------------------------------------------------------------------------------------*/
+
+  /* the following one is used to assign consecutive indices within each tree */
+  struct assign_data
+  {
+    int origin_task;
+    int origin_num;
+    int origin_index;
+
+    long long treeid;
+    long long newtreeid;
+    long long treeindex;
+  };
+
+  /* some sort kernels */
+  static bool compare_assign_data_treeid_origin_num_origin_task_origin_index(const assign_data &a, const assign_data &b)
+  {
+    if(a.treeid < b.treeid)
+      return true;
+    if(a.treeid > b.treeid)
+      return false;
+
+    if(a.origin_num > b.origin_num)
+      return true;
+    if(a.origin_num < b.origin_num)
+      return false;
+
+    if(a.origin_task < b.origin_task)
+      return true;
+    if(a.origin_task > b.origin_task)
+      return false;
+
+    return a.origin_index < b.origin_index;
+  }
+
+  static bool compare_assign_data_origin_task_origin_num_origin_index(const assign_data &a, const assign_data &b)
+  {
+    if(a.origin_task < b.origin_task)
+      return true;
+    if(a.origin_task > b.origin_task)
+      return false;
+
+    if(a.origin_num < b.origin_num)
+      return true;
+    if(a.origin_num > b.origin_num)
+      return false;
+
+    return a.origin_index < b.origin_index;
+  }
+
+  /*--------------------------------------------------------------------------------------------------------------*/
+
+  struct halotrees_data
+  {
+    long long descendantnr;
+    long long progenitornr;
+    int loc_index;
+    int orig_order;
+
+    long long treeid;
+    int treetask;
+  };
+
+  static bool compare_halotrees_data_descendantnr(const halotrees_data &a, const halotrees_data &b)
+  {
+    return a.descendantnr < b.descendantnr;
+  }
+
+  static bool compare_halotrees_data_progenitornr(const halotrees_data &a, const halotrees_data &b)
+  {
+    return a.progenitornr < b.progenitornr;
+  }
+
+  static bool compare_halotrees_data_orig_order(const halotrees_data &a, const halotrees_data &b)
+  {
+    return a.orig_order < b.orig_order;
+  }
+
+  struct halotrees_propagate_data
+  {
+    long long DescSubhaloNr;
+    long long ProgSubhaloNr;
+    long long SubhaloNr;
+    long long MaxLenProgBranch;
+    int index;
+    int orig_order;
+  };
+
+  static bool compare_halotrees_propagate_data_orig_order(const halotrees_propagate_data &a, const halotrees_propagate_data &b)
+  {
+    return a.orig_order < b.orig_order;
+  }
+
+  static bool compare_halotrees_propagate_data_DescSubhaloNr(const halotrees_propagate_data &a, const halotrees_propagate_data &b)
+  {
+    return a.DescSubhaloNr < b.DescSubhaloNr;
+  }
+
+  static bool compare_halotrees_propagate_data_ProgSubhaloNr(const halotrees_propagate_data &a, const halotrees_propagate_data &b)
+  {
+    return a.ProgSubhaloNr < b.ProgSubhaloNr;
+  }
+
+  struct halotrees_firstprog_data
+  {
+    long long DescSubhaloNr;
+    long long SubhaloNr;
+  };
+
+  static bool compare_halotrees_firstprog_data_DescSubhaloNr(const halotrees_firstprog_data &a, const halotrees_firstprog_data &b)
+  {
+    return a.DescSubhaloNr < b.DescSubhaloNr;
+  }
+
+  struct descnr_data
+  {
+    long long DescSubhaloNr;
+    long long SubhaloNr;
+    long long CumulLen;
+    long long TreeID;
+    int TreeTask;
+    int orig_index;
+  };
+
+  static bool compare_sorted_list_descsubhalonr(const descnr_data &a, const descnr_data &b)
+  {
+    return a.DescSubhaloNr < b.DescSubhaloNr;
+  }
+
+  struct prognr_data
+  {
+    long long ProgSubhaloNr;
+    long long SubhaloNr;
+    long long TreeID;
+    int TreeTask;
+    int orig_index;
+  };
+
+  static bool compare_sorted_list_progsubhalonr(const prognr_data &a, const prognr_data &b)
+  {
+    return a.ProgSubhaloNr < b.ProgSubhaloNr;
+  }
+
+  void halotrees_select_interior_min_newtreeid(int mode, tlink *treehalos, long long totnsubs);
+};
+
+/* In the desc_list structure defined above, we accumulate the main result of this routine, namely for each previous
+ * subhalo (identified through PrevSubhaloNr), the number of the descendant subhalo (DescSubhaloNr) in
+ * the present, newly calculated subhalo catalogue. MaxScore is a helper variable that gives the score
+ * of the presently associated descendant. If there is no descendant, DescSubhaloNr is expected to hold the value -1 (still to be verified). */
+
+#endif
+#endif
diff --git a/src/mergertree/postproc_descendants.cc b/src/mergertree/postproc_descendants.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8f4232130db026609e809f2eef44a0fe606162df
--- /dev/null
+++ b/src/mergertree/postproc_descendants.cc
@@ -0,0 +1,713 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file postproc_descendants.cc
+ *
+ *  \brief code to determine the descendant subhalo of all subhalos in one catalogue in another one
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef MERGERTREE
+
+#include <gsl/gsl_rng.h>
+#include <hdf5.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../fof/fof.h"
+#include "../fof/fof_io.h"
+#include "../io/hdf5_util.h"
+#include "../io/io.h"
+#include "../logs/timer.h"
+#include "../main/main.h"
+#include "../main/simulation.h"
+#include "../mergertree/io_readsnap.h"
+#include "../mergertree/mergertree.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/parallel_sort.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+
+// this is used in FOF postprocessing to assign the previous subhalo length to particles
+void mergertree::get_previous_size_of_subhalo_for_each_particle(int num)
+{
+  if(num >= 0)
+    {
+      mpi_printf(
+          "SUBFIND / MERGERTREE: We are loading the previous group catalogue to set the size of previous subhalos for each "
+          "particle\n");
+
+      domain<simparticles> Domain{Communicator, Sp};
+      fof<simparticles> FoF{Communicator, Sp, &Domain};
+
+      /* load previous snapshot and group/subhalo catalogues */
+      fof_io<simparticles> FoF_IO{&FoF, Communicator, All.SnapFormat};
+      FoF_IO.fof_subfind_load_groups(num);
+
+      readsnap_io Snap_IO{this, this->Communicator, All.SnapFormat};
+      Snap_IO.mergertree_read_snap_ids(num);
+
+      /* make sure that group catalog and snapshot are in file order */
+      mycxxsort_parallel(MtrP, MtrP + MtrP_NumPart, compare_MtrP_FileOffset, Communicator);
+      mycxxsort_parallel(FoF.Group, FoF.Group + FoF.Ngroups, compare_Group_FileOffset, Communicator);
+      mycxxsort_parallel(FoF.Subhalo, FoF.Subhalo + FoF.Nsubhalos, compare_Subhalo_FileOffset, Communicator);
+
+      /* now assign (global) group and subhalo numbers to each particle belonging to the particular structure */
+      mergertree_assign_group_numbers(&FoF);
+
+      Mem.myfree_movable(FoF.Subhalo);
+      Mem.myfree_movable(FoF.Group);
+
+      /* get PrevSizeOfSubhalo from PrevMtrP by matching IDs */
+      mergertree_match_ids_of_current_snap();
+    }
+  else
+    {
+      for(int i = 0; i < Sp->NumPart; i++)
+        {
+          Sp->P[i].PrevSubhaloNr.set(HALONR_MAX);
+          Sp->P[i].PrevSizeOfSubhalo.set(0);
+        }
+    }
+}
+
+void mergertree::descendants_in_postprocessing(int num)
+{
+  if(num - 1 < 0)
+    Terminate("cannot execute for the given snapnum");
+
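+  /* overall flow: load the group catalogue and snapshot IDs of output num-1, assign global group/subhalo
+   * numbers to its particles and stash them as PrevMtrP; then load output num, assign the new numbers,
+   * carry the old membership over by matching particle IDs, and finally determine the descendants */
+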
+  domain<simparticles> Domain{Communicator, Sp};
+  fof<simparticles> FoF{Communicator, Sp, &Domain};
+
+  /* load previous snapshot and group/subhalo catalogues */
+
+  fof_io<simparticles> FoF_IO{&FoF, Communicator, All.SnapFormat};
+  FoF_IO.fof_subfind_load_groups(num - 1);
+
+  readsnap_io Snap_IO{this, this->Communicator, All.SnapFormat};
+  Snap_IO.mergertree_read_snap_ids(num - 1);
+
+  /* make sure that group catalog and snapshot are in file order */
+  mycxxsort_parallel(MtrP, MtrP + MtrP_NumPart, compare_MtrP_FileOffset, Communicator);
+  mycxxsort_parallel(FoF.Group, FoF.Group + FoF.Ngroups, compare_Group_FileOffset, Communicator);
+  mycxxsort_parallel(FoF.Subhalo, FoF.Subhalo + FoF.Nsubhalos, compare_Subhalo_FileOffset, Communicator);
+
+  if(FoF_IO.LegacyFormat)
+    {
+      mpi_printf("\nFOF/SUBFIND: Legacy format from Arepo detected, trying to adjust for this.\n");
+      FoF.subfind_redetermine_groupnr();
+    }
+
+  /* now assign (global) group and subhalo numbers to each particle belonging to the particular structure */
+  mergertree_assign_group_numbers(&FoF);
+
+  Mem.myfree_movable(FoF.Subhalo);
+  Mem.myfree_movable(FoF.Group);
+
+  /* save previous data */
+  PrevTotNsubhalos = FoF.TotNsubhalos;
+  PrevNsubhalos    = FoF.Nsubhalos;
+
+  PrevMtrP_NumPart = MtrP_NumPart;
+
+  PrevMtrP =
+      (mergertree_particle_data *)Mem.mymalloc_movable(&PrevMtrP, "PrevMtrP", PrevMtrP_NumPart * sizeof(mergertree_particle_data));
+  memcpy(PrevMtrP, MtrP, PrevMtrP_NumPart * sizeof(mergertree_particle_data));
+  Mem.myfree_movable(MtrP);
+
+  /* load new snapshot and group/subhalo catalogues */
+  FoF_IO.fof_subfind_load_groups(num);
+  CurrTotNsubhalos = FoF.TotNsubhalos;
+  CurrNsubhalos    = FoF.Nsubhalos;
+
+  Snap_IO.mergertree_read_snap_ids(num);
+
+  /* make sure that group catalog and snapshot are in file order */
+  mycxxsort_parallel(MtrP, MtrP + MtrP_NumPart, compare_MtrP_FileOffset, Communicator);
+  mycxxsort_parallel(FoF.Group, FoF.Group + FoF.Ngroups, compare_Group_FileOffset, Communicator);
+  mycxxsort_parallel(FoF.Subhalo, FoF.Subhalo + FoF.Nsubhalos, compare_Subhalo_FileOffset, Communicator);
+
+  if(FoF_IO.LegacyFormat)
+    {
+      mpi_printf("\nFOF/SUBFIND: Legacy format from Arepo detected, trying to adjust for this.\n");
+      FoF.subfind_redetermine_groupnr();
+    }
+
+  /* now assign (global) group and subhalo numbers to each particle belonging to the particular structure */
+  mergertree_assign_group_numbers(&FoF);
+
+  Mem.myfree_movable(FoF.Subhalo);
+  Mem.myfree_movable(FoF.Group);
+
+  /* assign the determined new subhalonr */
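+  /* the group/subhalo numbers just determined are held in the Prev* fields; since they refer to the
+   * current snapshot here, copy them over to the current fields and reset the Prev* fields, which are
+   * filled from the genuinely previous snapshot by the ID matching below */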
+  for(int i = 0; i < MtrP_NumPart; i++)
+    {
+      MtrP[i].SubhaloNr         = MtrP[i].PrevSubhaloNr;
+      MtrP[i].SubhaloLen        = MtrP[i].PrevSubhaloLen;
+      MtrP[i].GroupNr           = MtrP[i].PrevGroupNr;
+      MtrP[i].RankInSubhalo     = MtrP[i].PrevRankInSubhalo;
+      MtrP[i].PrevSubhaloNr     = HALONR_MAX;
+      MtrP[i].PrevGroupNr       = HALONR_MAX;
+      MtrP[i].PrevSubhaloLen    = 0;
+      MtrP[i].PrevRankInSubhalo = 0;
+    }
+
+  /* get PrevSubhaloNr from PrevMtrP by matching IDs */
+  mergertree_match_ids_of_previous_snap();
+
+  mergertree_determine_descendants_postproc(num);
+}
+
+void mergertree::mergertree_match_ids_of_previous_snap(void)
+{
+  int *Send_count  = (int *)Mem.mymalloc("Send_count", sizeof(int) * NTask);
+  int *Send_offset = (int *)Mem.mymalloc("Send_offset", sizeof(int) * NTask);
+  int *Recv_count  = (int *)Mem.mymalloc("Recv_count", sizeof(int) * NTask);
+  int *Recv_offset = (int *)Mem.mymalloc("Recv_offset", sizeof(int) * NTask);
+
+  mycxxsort_parallel(MtrP, MtrP + MtrP_NumPart, compare_MtrP_ID, Communicator);
+  mycxxsort_parallel(PrevMtrP, PrevMtrP + PrevMtrP_NumPart, compare_MtrP_ID, Communicator);
+
+  MyIDType *list_min_id = (MyIDType *)Mem.mymalloc("list_min_id", NTask * sizeof(MyIDType));
+  MyIDType *list_max_id = (MyIDType *)Mem.mymalloc("list_max_id", NTask * sizeof(MyIDType));
+  int *list_numpart     = (int *)Mem.mymalloc("list_numpart", NTask * sizeof(int));
+
+  MPI_Allgather(&MtrP[0].ID, sizeof(MyIDType), MPI_BYTE, list_min_id, sizeof(MyIDType), MPI_BYTE, Communicator);
+  MPI_Allgather(&MtrP[MtrP_NumPart > 0 ? MtrP_NumPart - 1 : 0].ID, sizeof(MyIDType), MPI_BYTE, list_max_id, sizeof(MyIDType), MPI_BYTE,
+                Communicator);
+  MPI_Allgather(&MtrP_NumPart, sizeof(int), MPI_BYTE, list_numpart, sizeof(int), MPI_BYTE, Communicator);
+
+  int nexport = 0, nimport = 0;
+  mergertree_particle_data *import_data = NULL, *export_data = NULL;
+
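+  /* two-pass exchange: each previous-snapshot entry is routed to the task whose current ID range
+   * [list_min_id, list_max_id] contains its ID; mode=0 only counts the entries per target task, and
+   * mode=1 fills export_data once the send/receive offsets are known */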
+  for(int mode = 0; mode < 2; mode++)
+    {
+      for(int i = 0; i < NTask; i++)
+        Send_count[i] = 0;
+
+      int target = 0;
+
+      for(int i = 0; i < PrevMtrP_NumPart; i++)
+        {
+          while(target < NTask - 1 && (list_numpart[target] == 0 || PrevMtrP[i].ID > list_max_id[target]))
+            target++;
+
+          if(list_numpart[target] != 0)
+            if(PrevMtrP[i].ID >= list_min_id[target] && PrevMtrP[i].ID <= list_max_id[target])
+              {
+                if(mode == 0)
+                  Send_count[target]++;
+                else
+                  {
+                    int off          = Send_offset[target] + Send_count[target]++;
+                    export_data[off] = PrevMtrP[i];
+                  }
+              }
+        }
+
+      if(mode == 0)
+        {
+          MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
+          Recv_offset[0] = Send_offset[0] = 0;
+          for(int j = 0; j < NTask; j++)
+            {
+              nimport += Recv_count[j];
+              nexport += Send_count[j];
+              if(j > 0)
+                {
+                  Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
+                  Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
+                }
+            }
+
+          export_data = (mergertree_particle_data *)Mem.mymalloc("export_data", nexport * sizeof(mergertree_particle_data));
+          import_data = (mergertree_particle_data *)Mem.mymalloc("import_data", nimport * sizeof(mergertree_particle_data));
+        }
+    }
+
+  for(int ngrp = 0; ngrp < (1 << PTask); ngrp++) /* note: here we also have a transfer from each task to itself (for ngrp=0) */
+    {
+      int recvTask = ThisTask ^ ngrp;
+      if(recvTask < NTask)
+        if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+          MPI_Sendrecv(&export_data[Send_offset[recvTask]], Send_count[recvTask] * sizeof(mergertree_particle_data), MPI_BYTE,
+                       recvTask, TAG_DENS_B, &import_data[Recv_offset[recvTask]],
+                       Recv_count[recvTask] * sizeof(mergertree_particle_data), MPI_BYTE, recvTask, TAG_DENS_B, Communicator,
+                       MPI_STATUS_IGNORE);
+    }
+
+  /* incoming data should already be sorted, so now do the match */
+
+  for(int i = 0, j = 0; i < MtrP_NumPart && j < nimport;)
+    {
+      if(MtrP[i].ID < import_data[j].ID)
+        i++;
+      else if(MtrP[i].ID > import_data[j].ID)
+        j++;
+      else
+        {
+          MtrP[i].PrevSubhaloNr     = import_data[j].PrevSubhaloNr;
+          MtrP[i].PrevSubhaloLen    = import_data[j].PrevSubhaloLen;
+          MtrP[i].PrevGroupNr       = import_data[j].PrevGroupNr;
+          MtrP[i].PrevRankInSubhalo = import_data[j].PrevRankInSubhalo;
+          i++;
+          j++;
+        }
+    }
+
+  Mem.myfree(import_data);
+  Mem.myfree(export_data);
+  Mem.myfree(list_numpart);
+  Mem.myfree(list_max_id);
+  Mem.myfree(list_min_id);
+
+  Mem.myfree(Recv_offset);
+  Mem.myfree(Recv_count);
+  Mem.myfree(Send_offset);
+  Mem.myfree(Send_count);
+}
+
+void mergertree::mergertree_assign_group_numbers(fof<simparticles> *FoF)
+{
+  int *Send_count  = (int *)Mem.mymalloc("Send_count", sizeof(int) * NTask);
+  int *Send_offset = (int *)Mem.mymalloc("Send_offset", sizeof(int) * NTask);
+  int *Recv_count  = (int *)Mem.mymalloc("Recv_count", sizeof(int) * NTask);
+  int *Recv_offset = (int *)Mem.mymalloc("Recv_offset", sizeof(int) * NTask);
+
+  /* Tell everybody how many particles are stored by each processor */
+
+  FoF->fof_assign_group_offset();
+  FoF->subfind_assign_subhalo_offsettype();
+
+  int ntype_loc[NTYPES];            /* local particle number of each type */
+  long long ntype_tot[NTYPES];      /* total particle numbers of each type */
+  long long ntype_previous[NTYPES]; /* cumulative number of particles of each type on previous processors */
+
+  for(int i = 0; i < NTYPES; i++)
+    ntype_loc[i] = 0;
+
+  for(int i = 0; i < MtrP_NumPart; i++)
+    ntype_loc[MtrP[i].Type]++;
+
+  /* collect a table with the particle numbers of each type on each processor */
+  int *ntype_all = (int *)Mem.mymalloc("ntype_all", NTYPES * NTask * sizeof(int));
+  MPI_Allgather(ntype_loc, NTYPES, MPI_INT, ntype_all, NTYPES, MPI_INT, Communicator);
+
+  for(int i = 0; i < NTYPES; i++)
+    ntype_tot[i] = 0;
+
+  for(int i = 0; i < NTask; i++)
+    for(int j = 0; j < NTYPES; j++)
+      ntype_tot[j] += ntype_all[i * NTYPES + j];
+
+  for(int i = 0; i < NTYPES; i++)
+    ntype_previous[i] = 0;
+
+  for(int i = 0; i < ThisTask; i++)
+    for(int j = 0; j < NTYPES; j++)
+      ntype_previous[j] += ntype_all[i * NTYPES + j];
+
+  /* tell everybody how many groups each processor holds */
+
+  int *gcount = (int *)Mem.mymalloc("gcount", NTask * sizeof(int));
+  MPI_Allgather(&FoF->Ngroups, 1, MPI_INT, gcount, 1, MPI_INT, Communicator);
+
+  /* determine the number of the first group we hold */
+  long long first_groupnr = 0;
+  for(int i = 0; i < ThisTask; i++)
+    first_groupnr += gcount[i];
+
+  /* tell everybody how many subhalos each processor holds */
+
+  int *scount = (int *)Mem.mymalloc("scount", NTask * sizeof(int));
+  MPI_Allgather(&FoF->Nsubhalos, 1, MPI_INT, scount, 1, MPI_INT, Communicator);
+
+  /* determine the number of the first subhalo we hold */
+  long long first_subhalonr = 0;
+  for(int i = 0; i < ThisTask; i++)
+    first_subhalonr += scount[i];
+
+  /* let's now figure out which groups are needed by different processors to assign the group number information */
+
+  struct group_info
+  {
+    long long GroupNr;
+    long long First;
+    MyLenType Len;
+  };
+  group_info *export_group_data = NULL, *import_group_data = NULL;
+
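+  /* for each particle type, find the groups whose particle range (OffsetType .. OffsetType+LenType)
+   * overlaps the slab of particles of that type stored on a given task, and send the group number,
+   * offset and length there so that the receiving task can label its particles */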
+  for(int type = 0; type < NTYPES; type++)
+    {
+      int nexport = 0, nimport = 0;
+
+      for(int mode = 0; mode < 2; mode++)
+        {
+          for(int i = 0; i < NTask; i++)
+            Send_count[i] = 0;
+
+          int target                = 0;
+          long long first_in_target = 0; /* this is the first particle (of this type) on the current target processor */
+
+          for(int i = 0; i < FoF->Ngroups && target < NTask;)
+            {
+              int flag = 0;
+
+              /* check whether we have an overlap */
+              if(FoF->Group[i].OffsetType[type] + FoF->Group[i].LenType[type] > first_in_target &&
+                 FoF->Group[i].OffsetType[type] < (first_in_target + ntype_all[target * NTYPES + type]))
+                {
+                  flag = 1;
+
+                  if(mode == 0)
+                    Send_count[target]++;
+                  else
+                    {
+                      int off                        = Send_offset[target] + Send_count[target]++;
+                      export_group_data[off].GroupNr = first_groupnr + i;
+                      export_group_data[off].First   = FoF->Group[i].OffsetType[type];
+                      export_group_data[off].Len     = FoF->Group[i].LenType[type];
+                    }
+                }
+
+              if(FoF->Group[i].OffsetType[type] + FoF->Group[i].LenType[type] > first_in_target + ntype_all[target * NTYPES + type])
+                {
+                  first_in_target += ntype_all[target * NTYPES + type];
+                  target++;
+                }
+              else
+                {
+                  if(i < FoF->Ngroups && flag == 0 && FoF->Group[i].LenType[type] > 0)
+                    {
+                      Terminate(
+                          "strange: type=%d mode=%d  i=%d  first_in_target=%lld target=%d   FoF->Group[i].LenType[type]=%lld  "
+                          "Ngroups=%d",
+                          type, mode, i, first_in_target, target, (long long)FoF->Group[i].LenType[type], FoF->Ngroups);
+                    }
+
+                  i++;
+                }
+            }
+
+          if(mode == 0)
+            {
+              MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
+              Recv_offset[0] = Send_offset[0] = 0;
+              for(int j = 0; j < NTask; j++)
+                {
+                  nimport += Recv_count[j];
+                  nexport += Send_count[j];
+                  if(j > 0)
+                    {
+                      Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
+                      Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
+                    }
+                }
+
+              export_group_data = (group_info *)Mem.mymalloc("export_group_data", nexport * sizeof(group_info));
+              import_group_data = (group_info *)Mem.mymalloc("import_group_data", nimport * sizeof(group_info));
+            }
+        }
+
+      for(int ngrp = 0; ngrp < (1 << PTask); ngrp++) /* note: here we also have a transfer from each task to itself (for ngrp=0) */
+        {
+          int recvTask = ThisTask ^ ngrp;
+          if(recvTask < NTask)
+            if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+              MPI_Sendrecv(&export_group_data[Send_offset[recvTask]], Send_count[recvTask] * sizeof(group_info), MPI_BYTE, recvTask,
+                           TAG_DENS_B, &import_group_data[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(group_info), MPI_BYTE,
+                           recvTask, TAG_DENS_B, Communicator, MPI_STATUS_IGNORE);
+        }
+
+      /* now let's go through the local particles and assign the group numbers */
+
+      int p  = ntype_previous[type];
+      int gr = 0;
+
+      for(int i = 0; i < MtrP_NumPart; i++)
+        {
+          if(MtrP[i].Type == type)
+            {
+              MtrP[i].PrevGroupNr = HALONR_MAX; /* default is not in a group */
+
+              while(gr < nimport && p > import_group_data[gr].First + import_group_data[gr].Len - 1)
+                gr++;
+
+              if(gr < nimport && p >= import_group_data[gr].First && p < import_group_data[gr].First + import_group_data[gr].Len)
+                MtrP[i].PrevGroupNr = import_group_data[gr].GroupNr;
+
+              p++;
+            }
+        }
+
+      Mem.myfree(import_group_data);
+      Mem.myfree(export_group_data);
+    }
+
+  /* let's now figure out which subhalos are needed by different processors to assign the subhalo number information */
+
+  struct subhalo_info
+  {
+    long long SubhaloNr;
+    long long First;
+    MyLenType Len;
+    MyLenType SubhaloLen;
+  };
+  subhalo_info *export_subhalo_data = NULL, *import_subhalo_data = NULL;
+
+  for(int type = 0; type < NTYPES; type++)
+    {
+      int nexport = 0, nimport = 0;
+
+      for(int mode = 0; mode < 2; mode++)
+        {
+          for(int i = 0; i < NTask; i++)
+            Send_count[i] = 0;
+
+          int target                = 0;
+          long long first_in_target = 0; /* this is the first particle (of this type) on the current target processor */
+
+          for(int i = 0; i < FoF->Nsubhalos && target < NTask;)
+            {
+              /* check whether we have an overlap */
+              if(FoF->Subhalo[i].OffsetType[type] + FoF->Subhalo[i].LenType[type] > first_in_target &&
+                 FoF->Subhalo[i].OffsetType[type] < (first_in_target + ntype_all[target * NTYPES + type]))
+                {
+                  if(mode == 0)
+                    Send_count[target]++;
+                  else
+                    {
+                      int off                             = Send_offset[target] + Send_count[target]++;
+                      export_subhalo_data[off].SubhaloNr  = first_subhalonr + i;
+                      export_subhalo_data[off].First      = FoF->Subhalo[i].OffsetType[type];
+                      export_subhalo_data[off].Len        = FoF->Subhalo[i].LenType[type];
+                      export_subhalo_data[off].SubhaloLen = FoF->Subhalo[i].Len;
+                    }
+                }
+
+              if(FoF->Subhalo[i].OffsetType[type] + FoF->Subhalo[i].LenType[type] >
+                 first_in_target + ntype_all[target * NTYPES + type])
+                {
+                  first_in_target += ntype_all[target * NTYPES + type];
+                  target++;
+                }
+              else
+                i++;
+            }
+
+          if(mode == 0)
+            {
+              MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
+              Recv_offset[0] = Send_offset[0] = 0;
+              for(int j = 0; j < NTask; j++)
+                {
+                  nimport += Recv_count[j];
+                  nexport += Send_count[j];
+                  if(j > 0)
+                    {
+                      Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
+                      Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
+                    }
+                }
+
+              export_subhalo_data = (subhalo_info *)Mem.mymalloc("export_subhalo_data", nexport * sizeof(subhalo_info));
+              import_subhalo_data = (subhalo_info *)Mem.mymalloc("import_subhalo_data", nimport * sizeof(subhalo_info));
+            }
+        }
+
+      for(int ngrp = 0; ngrp < (1 << PTask); ngrp++) /* note: here we also have a transfer from each task to itself (for ngrp=0) */
+        {
+          int recvTask = ThisTask ^ ngrp;
+          if(recvTask < NTask)
+            if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+              MPI_Sendrecv(&export_subhalo_data[Send_offset[recvTask]], Send_count[recvTask] * sizeof(subhalo_info), MPI_BYTE,
+                           recvTask, TAG_DENS_B, &import_subhalo_data[Recv_offset[recvTask]],
+                           Recv_count[recvTask] * sizeof(subhalo_info), MPI_BYTE, recvTask, TAG_DENS_B, Communicator,
+                           MPI_STATUS_IGNORE);
+        }
+
+      /* now let's go through the local particles and assign group numbers */
+
+      int p  = ntype_previous[type];
+      int gr = 0;
+
+      for(int i = 0; i < MtrP_NumPart; i++)
+        {
+          if(MtrP[i].Type == type)
+            {
+              MtrP[i].PrevSubhaloNr  = HALONR_MAX; /* default is not in a subhalo */
+              MtrP[i].PrevSubhaloLen = 0;
+
+              while(gr < nimport && p > import_subhalo_data[gr].First + import_subhalo_data[gr].Len - 1)
+                gr++;
+
+              if(gr < nimport && p >= import_subhalo_data[gr].First && p < import_subhalo_data[gr].First + import_subhalo_data[gr].Len)
+                {
+                  MtrP[i].PrevSubhaloNr  = import_subhalo_data[gr].SubhaloNr;
+                  MtrP[i].PrevSubhaloLen = import_subhalo_data[gr].SubhaloLen;
+
+                  int rank = p - import_subhalo_data[gr].First;  // Note: This is the rank within particles of the same type
+
+                  if(rank > UCHAR_MAX)  // restrict this to 1 byte (which is the storage we have reserved for this)
+                    rank = UCHAR_MAX;
+
+                  MtrP[i].PrevRankInSubhalo = static_cast<unsigned short>(rank);
+                }
+              p++;
+            }
+        }
+
+      Mem.myfree(import_subhalo_data);
+      Mem.myfree(export_subhalo_data);
+    }
+
+  Mem.myfree(scount);
+  Mem.myfree(gcount);
+  Mem.myfree(ntype_all);
+
+  Mem.myfree(Recv_offset);
+  Mem.myfree(Recv_count);
+  Mem.myfree(Send_offset);
+  Mem.myfree(Send_count);
+}
+
+void mergertree::mergertree_match_ids_of_current_snap(void)
+{
+  int *Send_count  = (int *)Mem.mymalloc("Send_count", sizeof(int) * NTask);
+  int *Send_offset = (int *)Mem.mymalloc("Send_offset", sizeof(int) * NTask);
+  int *Recv_count  = (int *)Mem.mymalloc("Recv_count", sizeof(int) * NTask);
+  int *Recv_offset = (int *)Mem.mymalloc("Recv_offset", sizeof(int) * NTask);
+
+  assign_particle_data *AsP = (assign_particle_data *)Mem.mymalloc("AsP", sizeof(assign_particle_data) * (Sp->NumPart + 1));
+
+  for(int i = 0; i < Sp->NumPart; i++)
+    {
+      AsP[i].OriginTask  = ThisTask;
+      AsP[i].OriginIndex = i;
+      AsP[i].ID          = Sp->P[i].ID.get();
+    }
+
+  mycxxsort_parallel(AsP, AsP + Sp->NumPart, compare_AssignP_ID, Communicator);
+  mycxxsort_parallel(MtrP, MtrP + MtrP_NumPart, compare_MtrP_ID, Communicator);
+
+  MyIDType *list_min_id = (MyIDType *)Mem.mymalloc("list_min_id", NTask * sizeof(MyIDType));
+  MyIDType *list_max_id = (MyIDType *)Mem.mymalloc("list_max_id", NTask * sizeof(MyIDType));
+  int *list_numpart     = (int *)Mem.mymalloc("list_numpart", NTask * sizeof(int));
+
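+  /* after the parallel sorts above, AsP is globally ordered by ID with each task holding a
+   * contiguous slice; the three Allgather calls below record for every task its smallest ID, its
+   * largest ID and its number of entries, which is enough to route each merger-tree particle in
+   * MtrP to the task that holds the matching snapshot particle. */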
+  MPI_Allgather(&AsP[0].ID, sizeof(MyIDType), MPI_BYTE, list_min_id, sizeof(MyIDType), MPI_BYTE, Communicator);
+  MPI_Allgather(&AsP[Sp->NumPart > 0 ? Sp->NumPart - 1 : 0].ID, sizeof(MyIDType), MPI_BYTE, list_max_id, sizeof(MyIDType), MPI_BYTE,
+                Communicator);
+  MPI_Allgather(&Sp->NumPart, sizeof(int), MPI_BYTE, list_numpart, sizeof(int), MPI_BYTE, Communicator);
+
+  int nexport = 0, nimport = 0;
+  mergertree_particle_data *import_data = NULL, *export_data = NULL;
+
+  for(int mode = 0; mode < 2; mode++)
+    {
+      for(int i = 0; i < NTask; i++)
+        Send_count[i] = 0;
+
+      int target = 0;
+
+      for(int i = 0; i < MtrP_NumPart; i++)
+        {
+          while(target < NTask - 1 && (list_numpart[target] == 0 || MtrP[i].ID > list_max_id[target]))
+            target++;
+
+          if(list_numpart[target] != 0)
+            if(MtrP[i].ID >= list_min_id[target] && MtrP[i].ID <= list_max_id[target])
+              {
+                if(mode == 0)
+                  Send_count[target]++;
+                else
+                  {
+                    int off          = Send_offset[target] + Send_count[target]++;
+                    export_data[off] = MtrP[i];
+                  }
+              }
+        }
+
+      if(mode == 0)
+        {
+          MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
+          Recv_offset[0] = Send_offset[0] = 0;
+          for(int j = 0; j < NTask; j++)
+            {
+              nimport += Recv_count[j];
+              nexport += Send_count[j];
+              if(j > 0)
+                {
+                  Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
+                  Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
+                }
+            }
+
+          export_data = (mergertree_particle_data *)Mem.mymalloc("export_data", nexport * sizeof(mergertree_particle_data));
+          import_data = (mergertree_particle_data *)Mem.mymalloc("import_data", nimport * sizeof(mergertree_particle_data));
+        }
+    }
+
+  for(int ngrp = 0; ngrp < (1 << PTask); ngrp++) /* note: here we also have a transfer from each task to itself (for ngrp=0) */
+    {
+      int recvTask = ThisTask ^ ngrp;
+      if(recvTask < NTask)
+        if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+          MPI_Sendrecv(&export_data[Send_offset[recvTask]], Send_count[recvTask] * sizeof(mergertree_particle_data), MPI_BYTE,
+                       recvTask, TAG_DENS_B, &import_data[Recv_offset[recvTask]],
+                       Recv_count[recvTask] * sizeof(mergertree_particle_data), MPI_BYTE, recvTask, TAG_DENS_B, Communicator,
+                       MPI_STATUS_IGNORE);
+    }
+
+  /* incoming data should already be sorted, so now do the match */
+
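+  /* both lists are sorted by ID: AsP through the parallel sort above, and import_data because
+   * MtrP was parallel-sorted by ID and the chunks received from successive tasks cover
+   * non-decreasing ID ranges; a two-pointer merge therefore finds all matches in linear time. */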
+  for(int i = 0, j = 0; i < Sp->NumPart && j < nimport;)
+    {
+      if(AsP[i].ID < import_data[j].ID)
+        i++;
+      else if(AsP[i].ID > import_data[j].ID)
+        j++;
+      else
+        {
+          AsP[i].PrevSubhaloNr  = import_data[j].PrevSubhaloNr;
+          AsP[i].PrevSubhaloLen = import_data[j].PrevSubhaloLen;
+          i++;
+          j++;
+        }
+    }
+
+  mycxxsort_parallel(AsP, AsP + Sp->NumPart, compare_AssignP_Origin, Communicator);
+
+  for(int i = 0; i < Sp->NumPart; i++)
+    {
+      Sp->P[i].PrevSizeOfSubhalo.set(AsP[i].PrevSubhaloLen);
+      Sp->P[i].PrevSubhaloNr.set(AsP[i].PrevSubhaloNr);
+    }
+
+  Mem.myfree(import_data);
+  Mem.myfree(export_data);
+  Mem.myfree(list_numpart);
+  Mem.myfree(list_max_id);
+  Mem.myfree(list_min_id);
+
+  Mem.myfree(AsP);
+
+  Mem.myfree(Recv_offset);
+  Mem.myfree(Recv_count);
+  Mem.myfree(Send_offset);
+  Mem.myfree(Send_count);
+}
+
+#endif
diff --git a/src/mergertree/rearrange.cc b/src/mergertree/rearrange.cc
new file mode 100644
index 0000000000000000000000000000000000000000..caa44c884dddfcaa995bf09f4a3ea381e7d8cf65
--- /dev/null
+++ b/src/mergertree/rearrange.cc
@@ -0,0 +1,365 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  rearrange.cc
+ *
+ *  \brief routines to rearrange some of the lightcone data on disk to allow easier processing later on
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef MERGERTREE
+
+#include <gsl/gsl_rng.h>
+#include <hdf5.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../io/hdf5_util.h"
+#include "../io/snap_io.h"
+#include "../lightcone/lightcone.h"
+#include "../lightcone/lightcone_particle_io.h"
+#include "../main/main.h"
+#include "../main/simulation.h"
+#include "../mergertree/io_readtrees_mbound.h"
+#include "../sort/cxxsort.h"
+#include "../sort/parallel_sort.h"
+#include "../system/system.h"
+#include "../time_integration/driftfac.h"
+
+#if defined(REARRANGE_OPTION) && defined(MERGERTREE)
+void sim::rearrange_snapshot(int argc, char **argv)
+{
+  if(argc < 5)
+    Terminate("too few arguments: Snapshot rearrange requires  <firstnum>  <lastnum>");
+
+  int conenr   = 0;  // not needed here
+  int firstnum = atoi(argv[3]);
+  int lastnum  = atoi(argv[4]);
+
+  All.Timebase_interval = (log(All.TimeMax) - log(All.TimeBegin)) / TIMEBASE;
+  Driftfac.init_drift_table();
+
+  rearrange_generic<simparticles>(Sp, conenr, firstnum, lastnum);
+}
+
+/* specialization */
+template <>
+void sim::rearrange_read<simparticles>(simparticles &Tp, int num, int conenr)
+{
+  /* read the snapshot  */
+  snap_io Snap{&Tp, Communicator, All.SnapFormat}; /* get an I/O object */
+  Snap.read_snapshot(num, MOST_BOUND_PARTICLE_SNAPHOT);
+}
+
+/* specialization */
+template <>
+void sim::rearrange_write<simparticles>(simparticles &Tp, int num, int conenr)
+{
+  /* write the snapshot  */
+  snap_io Snap{&Tp, &MergerTree, Communicator, All.SnapFormat};     /* get an I/O object */
+  Snap.write_snapshot(num, MOST_BOUND_PARTICLE_SNAPHOT_REORDERED);  // true?
+}
+#endif
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES) && defined(REARRANGE_OPTION)
+
+void sim::rearrange_lightcone(int argc, char **argv)
+{
+  if(argc < 6)
+    Terminate("too few arguments: Lightcone rearrange requires  <conenr>  <firstnum>  <lastnum>");
+
+  int conenr   = atoi(argv[3]);
+  int firstnum = atoi(argv[4]);
+  int lastnum  = atoi(argv[5]);
+
+  All.Timebase_interval = (log(All.TimeMax) - log(All.TimeBegin)) / TIMEBASE;
+  Driftfac.init_drift_table();
+
+  if(LightCone.lightcone_init_times())
+    endrun();
+
+#ifdef LIGHTCONE_MASSMAPS
+  LightCone.lightcone_init_massmaps();
+  if(LightCone.lightcone_massmap_report_boundaries())
+    endrun();
+#endif
+
+  double linklength = 0;
+  LightCone.lightcone_init_intposconverter(linklength);
+
+  rearrange_generic<lcparticles>(Lp, conenr, firstnum, lastnum);
+}
+
+/* specialization */
+template <>
+void sim::rearrange_read<lcparticles>(lcparticles &Tp, int num, int conenr)
+{
+  /* read the lightcone  */
+  lightcone_particle_io LcIO{&Tp, &LightCone, &MergerTree, Communicator, All.SnapFormat}; /* get an I/O object */
+  LcIO.lightcone_read(num, conenr);
+}
+
+/* specialization */
+template <>
+void sim::rearrange_write<lcparticles>(lcparticles &Tp, int num, int conenr)
+{
+  /* write the lightcone  */
+  lightcone_particle_io LcIO{&Tp, &LightCone, &MergerTree, Communicator, All.SnapFormat}; /* get an I/O object */
+  LcIO.lightcone_save(num, conenr, true);
+}
+
+#endif
+
+template <typename partset>
+void sim::rearrange_generic(partset &Tp, int conenr, int firstnum, int lastnum)
+{
+  /* read the merger tree mostbound halo IDs  */
+  {
+    readtrees_mbound_io MtreeIO{&MergerTree, Communicator, All.SnapFormat}; /* get an I/O object */
+    MtreeIO.read_trees_mostbound();                                         // optimize this here to only read most-bound ID
+  }
+
+  /* let's now sort the tree data according to the IDs */
+  mycxxsort_parallel(MergerTree.HaloIDdata, MergerTree.HaloIDdata + MergerTree.Nhalos, MergerTree.compare_HaloIDdata_ID, Communicator);
+
+  /* establish some lists with the minimum and maximum IDs stored on each processor */
+  long long halominID = (MergerTree.Nhalos > 0) ? MergerTree.HaloIDdata[0].SubMostBoundID : 0;
+  long long halomaxID = (MergerTree.Nhalos > 0) ? MergerTree.HaloIDdata[MergerTree.Nhalos - 1].SubMostBoundID : 0;
+
+  long long *list_halominID = (long long *)Mem.mymalloc("list_halominID", NTask * sizeof(long long));
+  long long *list_halomaxID = (long long *)Mem.mymalloc("list_halomaxID", NTask * sizeof(long long));
+  int *list_Nhalos          = (int *)Mem.mymalloc("list_Nhalos", NTask * sizeof(int));
+
+  MPI_Allgather(&halominID, 1, MPI_LONG_LONG, list_halominID, 1, MPI_LONG_LONG, Communicator);
+  MPI_Allgather(&halomaxID, 1, MPI_LONG_LONG, list_halomaxID, 1, MPI_LONG_LONG, Communicator);
+  MPI_Allgather(&MergerTree.Nhalos, 1, MPI_INT, list_Nhalos, 1, MPI_INT, Communicator);
+
+  typedef mergertree::treehalo_ids_type treehalo_ids_type;
+
+  for(int num = firstnum; num <= lastnum; num++)
+    {
+      rearrange_read(Tp, num, conenr);
+
+      mpi_printf("REARRANGE: On Task=%d, %d particles\n", ThisTask, Tp.NumPart);
+
+      /* let's now sort the lightcone_particle_data according to ID */
+      mycxxsort_parallel(Tp.P, Tp.P + Tp.NumPart, Tp.compare_ID, Communicator);
+
+      /* establish some lists with the minimum and maximum IDs stored on each processor for the lightcone particles */
+      long long coneminID = (Tp.NumPart > 0) ? Tp.P[0].ID.get() : 0;
+      long long conemaxID = (Tp.NumPart > 0) ? Tp.P[Tp.NumPart - 1].ID.get() : 0;
+
+      long long *list_coneminID = (long long *)Mem.mymalloc("list_coneminID", NTask * sizeof(long long));
+      long long *list_conemaxID = (long long *)Mem.mymalloc("list_conemaxID", NTask * sizeof(long long));
+      int *list_NumPart         = (int *)Mem.mymalloc("list_NumPart", NTask * sizeof(int));
+
+      MPI_Allgather(&coneminID, 1, MPI_LONG_LONG, list_coneminID, 1, MPI_LONG_LONG, Communicator);
+      MPI_Allgather(&conemaxID, 1, MPI_LONG_LONG, list_conemaxID, 1, MPI_LONG_LONG, Communicator);
+      MPI_Allgather(&Tp.NumPart, 1, MPI_INT, list_NumPart, 1, MPI_INT, Communicator);
+
+      // let's now match IDs where possible and assign TreeIDs accordingly
+
+      // default is no matching Tree
+      for(int n = 0; n < Tp.NumPart; n++)
+        Tp.P[n].TreeID = -1;
+
+      MPI_Request *requests = (MPI_Request *)Mem.mymalloc("requests", NTask * sizeof(MPI_Request));
+
+      int nreq = 0;
+
+      for(int task = 0; task < NTask; task++)
+        {
+          if(MergerTree.Nhalos > 0 && list_NumPart[task] > 0)
+            if(!(halomaxID < list_coneminID[task] || halominID > list_conemaxID[task]))
+              {
+                MPI_Issend(MergerTree.HaloIDdata, MergerTree.Nhalos * sizeof(mergertree::treehalo_ids_type), MPI_BYTE, task, TAG_N,
+                           Communicator, &requests[nreq++]);
+              }
+        }
+
+      for(int task = 0; task < NTask; task++)
+        {
+          if(list_Nhalos[task] > 0 && Tp.NumPart > 0)
+            if(!(list_halomaxID[task] < coneminID || list_halominID[task] > conemaxID))
+              {
+                treehalo_ids_type *halos = (treehalo_ids_type *)Mem.mymalloc("halos", list_Nhalos[task] * sizeof(treehalo_ids_type));
+
+                MPI_Recv(halos, list_Nhalos[task] * sizeof(treehalo_ids_type), MPI_BYTE, task, TAG_N, Communicator, MPI_STATUS_IGNORE);
+
+                int i_halo = 0;
+                int i_cone = 0;
+
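+                /* the received halos are sorted by SubMostBoundID (HaloIDdata was parallel-sorted
+                 * above) and the local particles are sorted by ID, so a linear two-pointer sweep
+                 * is sufficient to find all matching pairs */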
+                while(i_halo < list_Nhalos[task] && i_cone < Tp.NumPart)
+                  {
+                    if(halos[i_halo].SubMostBoundID == Tp.P[i_cone].ID.get())
+                      {
+                        Tp.P[i_cone++].TreeID = halos[i_halo++].TreeID;
+                      }
+                    else if(halos[i_halo].SubMostBoundID < Tp.P[i_cone].ID.get())
+                      i_halo++;
+                    else
+                      i_cone++;
+                  }
+
+                Mem.myfree(halos);
+              }
+        }
+
+      if(nreq)
+        MPI_Waitall(nreq, requests, MPI_STATUSES_IGNORE);
+
+      Mem.myfree(requests);
+      Mem.myfree(list_NumPart);
+      Mem.myfree(list_conemaxID);
+      Mem.myfree(list_coneminID);
+
+      /* now we sort the lightcone_particle_data according to TreeID */
+      mycxxsort_parallel(Tp.P, Tp.P + Tp.NumPart, Tp.compare_TreeID_ID, Communicator);
+
+      rearrange_fill_treetable<partset>(Tp);
+
+      /* write the lightcone  */
+      rearrange_write(Tp, num, conenr);
+
+      /* free the storage again */
+      Tp.free_memory();
+    }
+
+  Mem.myfree(list_Nhalos);
+  Mem.myfree(list_halomaxID);
+  Mem.myfree(list_halominID);
+}
+
+template <typename partset>
+void sim::rearrange_fill_treetable(partset &Tp)
+{
+  /*
+     we use HaloCount just like ParticleCount, and FirstHalo becomes "ParticleFirst" */
+
+  int *Send_count  = (int *)Mem.mymalloc("Send_count", sizeof(int) * NTask);
+  int *Send_offset = (int *)Mem.mymalloc("Send_offset", sizeof(int) * NTask);
+  int *Recv_count  = (int *)Mem.mymalloc("Recv_count", sizeof(int) * NTask);
+
+  for(int i = 0; i < NTask; i++)
+    Send_count[i] = 0;
+
+  long long *TreeID_list = (long long *)Mem.mymalloc("TreeID_list", sizeof(long long) * Tp.NumPart);
+
+  long long maxTreeID = (MergerTree.Ntrees > 0) ? MergerTree.TreeTable[MergerTree.Ntrees - 1].TreeID : -1;
+
+  long long *maxTreeID_list = (long long *)Mem.mymalloc("maxTreeID_list", sizeof(long long) * NTask);
+  MPI_Allgather(&maxTreeID, sizeof(long long), MPI_BYTE, maxTreeID_list, sizeof(long long), MPI_BYTE, Communicator);
+
+  int target_task = 0;
+
+  for(int i = 0; i < Tp.NumPart; i++)
+    {
+      TreeID_list[i] = Tp.P[i].TreeID;
+
+      while(target_task < NTask - 1 && TreeID_list[i] > maxTreeID_list[target_task])
+        target_task++;
+
+      Send_count[target_task]++;
+    }
+
+  MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
+
+  int nexport = 0, nimport = 0;
+  Send_offset[0] = 0;
+
+  for(int j = 0; j < NTask; j++)
+    {
+      nexport += Send_count[j];
+      nimport += Recv_count[j];
+
+      if(j > 0)
+        Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
+    }
+
+  for(int i = 0; i < MergerTree.Ntrees; i++)
+    MergerTree.TreeTable[i].HaloCount = 0;
+
+  /* exchange data */
+  for(int ngrp = 0; ngrp < (1 << PTask); ngrp++)
+    {
+      int recvTask = ThisTask ^ ngrp;
+
+      if(recvTask < NTask)
+        {
+          if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+            {
+              long long *treeid_tmp = (long long *)Mem.mymalloc("treeid_tmp", sizeof(long long) * Recv_count[recvTask]);
+
+              MPI_Sendrecv(&TreeID_list[Send_offset[recvTask]], Send_count[recvTask] * sizeof(long long), MPI_BYTE, recvTask,
+                           TAG_DENS_A, treeid_tmp, Recv_count[recvTask] * sizeof(long long), MPI_BYTE, recvTask, TAG_DENS_A,
+                           Communicator, MPI_STATUS_IGNORE);
+
+              for(int i = 0; i < Recv_count[recvTask]; i++)
+                {
+                  if(treeid_tmp[i] != -1)
+                    {
+                      int off = treeid_tmp[i] - MergerTree.TreeTable[0].TreeID;
+
+                      if(off < 0 || off >= MergerTree.Ntrees)
+                        Terminate(
+                            "strange: off=%d  MergerTree.Ntrees=%d  treeid_tmp[i]=%lld  MergerTree.TreeTable[0].TreeID=%lld  i=%d  "
+                            "Recv_count[recvTask]=%d  ",
+                            off, MergerTree.Ntrees, treeid_tmp[i], MergerTree.TreeTable[0].TreeID, i, Recv_count[recvTask]);
+
+                      MergerTree.TreeTable[off].HaloCount++;
+                    }
+                }
+
+              Mem.myfree(treeid_tmp);
+            }
+        }
+    }
+
+  Mem.myfree(maxTreeID_list);
+  Mem.myfree(TreeID_list);
+
+  Mem.myfree(Recv_count);
+  Mem.myfree(Send_offset);
+  Mem.myfree(Send_count);
+
+  /* now also fix the cumulative count */
+
+  if(MergerTree.Ntrees > 0)
+    MergerTree.TreeTable[0].FirstHalo = 0;
+
+  for(int i = 1; i < MergerTree.Ntrees; i++)
+    MergerTree.TreeTable[i].FirstHalo = MergerTree.TreeTable[i - 1].FirstHalo + MergerTree.TreeTable[i - 1].HaloCount;
+
+  long long cumul = 0;
+  for(int i = 0; i < MergerTree.Ntrees; i++)
+    cumul += MergerTree.TreeTable[i].HaloCount;
+
+  long long *cumul_list = (long long *)Mem.mymalloc("cumul_list", sizeof(long long) * NTask);
+
+  MPI_Allgather(&cumul, sizeof(long long), MPI_BYTE, cumul_list, sizeof(long long), MPI_BYTE, Communicator);
+
+  cumul = 0;
+  for(int i = 0; i < ThisTask; i++)
+    cumul += cumul_list[i];
+
+  for(int i = 0; i < MergerTree.Ntrees; i++)
+    MergerTree.TreeTable[i].FirstHalo += cumul;
+
+  Mem.myfree(cumul_list);
+}
+
+#endif
diff --git a/src/mpi_utils/allreduce_debugcheck.cc b/src/mpi_utils/allreduce_debugcheck.cc
new file mode 100644
index 0000000000000000000000000000000000000000..997c8ad9835a794cca56d914e7e2d90ce2246050
--- /dev/null
+++ b/src/mpi_utils/allreduce_debugcheck.cc
@@ -0,0 +1,45 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  allreduce_debugcheck.cc
+ *
+ *  \brief some routines for cross-checking the use of collective MPI routines
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../mpi_utils/mpi_utils.h"
+
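+/* drop-in replacement for MPI_Allreduce that first cross-checks that every rank in the
+ * communicator passes the same element count, so that mismatched collective calls are caught
+ * with a clear diagnostic instead of a silent deadlock or data corruption. A call site uses it
+ * in place of MPI_Allreduce, e.g. (a sketch with hypothetical variable names):
+ *
+ *   myMPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, Communicator);
+ */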
+int myMPI_Allreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
+{
+  int mi, ma;
+
+  MPI_Allreduce(&count, &mi, 1, MPI_INT, MPI_MIN, comm);
+  MPI_Allreduce(&count, &ma, 1, MPI_INT, MPI_MAX, comm);
+
+  if(mi != ma)
+    {
+      int thistask, ntask;
+      MPI_Comm_rank(comm, &thistask);
+      MPI_Comm_size(comm, &ntask);
+
+      printf("Error in MPI_Allreduce:  task=%d out of %d  has size = %d\n", thistask, ntask, count);
+      fflush(stdout);
+      MPI_Barrier(comm);
+
+      Terminate("mi=%d ma=%d\n", mi, ma);
+    }
+
+  return MPI_Allreduce(sendbuf, recvbuf, count, datatype, op, comm);
+}
diff --git a/src/mpi_utils/allreduce_sparse_double_sum.cc b/src/mpi_utils/allreduce_sparse_double_sum.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c600ccce88f5c318aae12ce22002e9800603a5b7
--- /dev/null
+++ b/src/mpi_utils/allreduce_sparse_double_sum.cc
@@ -0,0 +1,169 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  allreduce_sparse_double_sum.cc
+ *
+ *  \brief implementation of a reduction operation for sparsely populated data
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../mpi_utils/mpi_utils.h"
+
+void allreduce_sparse_double_sum(double *loc, double *glob, int N, MPI_Comm Communicator)
+{
+  int ntask, thistask, ptask;
+  MPI_Comm_size(Communicator, &ntask);
+  MPI_Comm_rank(Communicator, &thistask);
+
+  for(ptask = 0; ntask > (1 << ptask); ptask++)
+    ;
+
+  int *send_count  = (int *)Mem.mymalloc("send_count", sizeof(int) * ntask);
+  int *recv_count  = (int *)Mem.mymalloc("recv_count", sizeof(int) * ntask);
+  int *send_offset = (int *)Mem.mymalloc("send_offset", sizeof(int) * ntask);
+  int *recv_offset = (int *)Mem.mymalloc("recv_offset", sizeof(int) * ntask);
+  int *blocksize   = (int *)Mem.mymalloc("blocksize", sizeof(int) * ntask);
+
+  int blk     = N / ntask;
+  int rmd     = N - blk * ntask; /* remainder */
+  int pivot_n = rmd * (blk + 1);
+
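+  /* the N slots are spread as evenly as possible over the ranks: the first rmd ranks receive
+   * blk+1 slots, the remaining ones blk slots. For example, N=10 on ntask=4 gives blk=2, rmd=2,
+   * pivot_n=6 and block sizes {3,3,2,2}; slot n then belongs to rank n/3 for n<6 and to rank
+   * 2+(n-6)/2 otherwise, which is the mapping used further below. */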
+  int loc_first_n = 0;
+  for(int task = 0; task < ntask; task++)
+    {
+      if(task < rmd)
+        blocksize[task] = blk + 1;
+      else
+        blocksize[task] = blk;
+
+      if(task < thistask)
+        loc_first_n += blocksize[task];
+    }
+
+  double *loc_data = (double *)Mem.mymalloc("loc_data", blocksize[thistask] * sizeof(double));
+  memset(loc_data, 0, blocksize[thistask] * sizeof(double));
+
+  for(int j = 0; j < ntask; j++)
+    send_count[j] = 0;
+
+  /* find for each non-zero element the processor where it should go for being summed */
+  for(int n = 0; n < N; n++)
+    {
+      if(loc[n] != 0)
+        {
+          int task;
+          if(n < pivot_n)
+            task = n / (blk + 1);
+          else
+            task = rmd + (n - pivot_n) / blk; /* note: if blk=0, then this case can not occur */
+
+          send_count[task]++;
+        }
+    }
+
+  MPI_Alltoall(send_count, 1, MPI_INT, recv_count, 1, MPI_INT, Communicator);
+
+  int nimport = 0, nexport = 0;
+
+  recv_offset[0] = 0, send_offset[0] = 0;
+
+  for(int j = 0; j < ntask; j++)
+    {
+      nexport += send_count[j];
+      nimport += recv_count[j];
+      if(j > 0)
+        {
+          send_offset[j] = send_offset[j - 1] + send_count[j - 1];
+          recv_offset[j] = recv_offset[j - 1] + recv_count[j - 1];
+        }
+    }
+
+  struct ind_data
+  {
+    int n;
+    double val;
+  };
+  ind_data *export_data, *import_data;
+
+  export_data = (ind_data *)Mem.mymalloc("export_data", nexport * sizeof(ind_data));
+  import_data = (ind_data *)Mem.mymalloc("import_data", nimport * sizeof(ind_data));
+
+  for(int j = 0; j < ntask; j++)
+    send_count[j] = 0;
+
+  for(int n = 0; n < N; n++)
+    {
+      if(loc[n] != 0)
+        {
+          int task;
+
+          if(n < pivot_n)
+            task = n / (blk + 1);
+          else
+            task = rmd + (n - pivot_n) / blk; /* note: if blk=0, then this case can not occur */
+
+          int index              = send_offset[task] + send_count[task]++;
+          export_data[index].n   = n;
+          export_data[index].val = loc[n];
+        }
+    }
+
+  for(int ngrp = 0; ngrp < (1 << ptask); ngrp++) /* note: here we also have a transfer from each task to itself (for ngrp=0) */
+    {
+      int recvTask = thistask ^ ngrp;
+      if(recvTask < ntask)
+        if(send_count[recvTask] > 0 || recv_count[recvTask] > 0)
+          MPI_Sendrecv(&export_data[send_offset[recvTask]], send_count[recvTask] * sizeof(ind_data), MPI_BYTE, recvTask, TAG_DENS_B,
+                       &import_data[recv_offset[recvTask]], recv_count[recvTask] * sizeof(ind_data), MPI_BYTE, recvTask, TAG_DENS_B,
+                       Communicator, MPI_STATUS_IGNORE);
+    }
+
+  for(int i = 0; i < nimport; i++)
+    {
+      int j = import_data[i].n - loc_first_n;
+
+      if(j < 0 || j >= blocksize[thistask])
+        Terminate("j=%d < 0 || j>= blocksize[thistask]=%d", j, blocksize[thistask]);
+
+      loc_data[j] += import_data[i].val;
+    }
+
+  Mem.myfree(import_data);
+  Mem.myfree(export_data);
+
+  /* now share the cost data across all processors */
+  int *bytecounts = (int *)Mem.mymalloc("bytecounts", sizeof(int) * ntask);
+  int *byteoffset = (int *)Mem.mymalloc("byteoffset", sizeof(int) * ntask);
+
+  for(int task = 0; task < ntask; task++)
+    bytecounts[task] = blocksize[task] * sizeof(double);
+
+  byteoffset[0] = 0;
+  for(int task = 1; task < ntask; task++)
+    byteoffset[task] = byteoffset[task - 1] + bytecounts[task - 1];
+
+  MPI_Allgatherv(loc_data, bytecounts[thistask], MPI_BYTE, glob, bytecounts, byteoffset, MPI_BYTE, Communicator);
+
+  Mem.myfree(byteoffset);
+  Mem.myfree(bytecounts);
+
+  Mem.myfree(loc_data);
+  Mem.myfree(blocksize);
+  Mem.myfree(recv_offset);
+  Mem.myfree(send_offset);
+  Mem.myfree(recv_count);
+  Mem.myfree(send_count);
+}
diff --git a/src/mpi_utils/generic_comm.h b/src/mpi_utils/generic_comm.h
new file mode 100644
index 0000000000000000000000000000000000000000..cc500ce5ea039622e9dc233820a949c691adcf9d
--- /dev/null
+++ b/src/mpi_utils/generic_comm.h
@@ -0,0 +1,713 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  generic_comm.h
+ *
+ *  \brief defines a class with a generic communication pattern for parallel tree walks
+ */
+
+#ifndef GENERIC_COMM_H
+#define GENERIC_COMM_H
+
+#include "../domain/domain.h"
+#include "../logs/logs.h"
+#include "../mpi_utils/mpi_utils.h"
+
+#define EXTRA_SPACE 16384
+
+struct data_in_generic
+{
+  int Firstnode;
+};
+
+template <typename T_in, typename T_out, typename T_tree, typename T_domain, typename T_partset>
+class generic_comm
+{
+ public:
+  virtual void particle2in(T_in *in, int i)              = 0;  // Pure virtual function, *must* be overridden
+  virtual void out2particle(T_out *out, int i, int mode) = 0;  // Pure virtual function, *must* be overridden
+  virtual int evaluate(int target, int mode, int thread_id, int action, T_in *in, int numnodes, node_info *firstnode,
+                       T_out &out)                       = 0;  // Pure virtual function, *must* be overridden
+
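+  /* A concrete communication pattern is obtained by deriving from this class and overriding the
+   * three pure virtual functions above. A minimal sketch (hypothetical names, not part of the
+   * code base) could look like this:
+   *
+   *   struct my_in : public data_in_generic { MyDouble Pos[3]; };
+   *   struct my_out { double Result; };
+   *
+   *   class my_comm : public generic_comm<my_in, my_out, mytree, mydomain, myparticles>
+   *   {
+   *    public:
+   *     using generic_comm<my_in, my_out, mytree, mydomain, myparticles>::generic_comm;
+   *     void particle2in(my_in *in, int i) override { ... fill *in from particle i ... }
+   *     void out2particle(my_out *out, int i, int mode) override { ... add *out to particle i ... }
+   *     int evaluate(int target, int mode, int thread_id, int action, my_in *in, int numnodes,
+   *                  node_info *firstnode, my_out &out) override { ... do the actual work ...; return 0; }
+   *   };
+   *
+   * The derived object is then driven through execute(), declared further below. */
+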
+  T_domain *D;
+  T_tree *Tree;
+  T_partset *Tp;
+
+  thread_data Thread;
+
+  generic_comm(T_domain *dptr, T_tree *tptr, T_partset *pptr)
+  {
+    D    = dptr;
+    Tree = tptr;
+    Tp   = pptr;
+
+    set_MaxNexport(); /* initialisation */
+  }
+
+  /* some public diagnostic info */
+
+  long long SumNexport;
+
+ private:
+  enum logs::timers cpu_primary   = logs::CPU_NONE;
+  enum logs::timers cpu_secondary = logs::CPU_NONE;
+  enum logs::timers cpu_imbalance = logs::CPU_NONE;
+
+  int Nactive;
+  int *Targets;
+  int actioncode;
+
+  T_in *DataIn, *DataGet;
+  T_out *DataResult, *DataOut;
+
+  struct send_recv_counts
+  {
+    int Count;
+    int CountNodes;
+  };
+  send_recv_counts *Send, *Recv;
+
+  /** Array of NTask size of the offset into the send array where the
+      objects to be sent to the specified task starts. */
+  int *Send_offset,
+      /** Array of NTask size of the number of objects to send to the
+          tasks. */
+      *Send_count,
+      /** Array of NTask size of the number of objects to receive from the
+          tasks. */
+      *Recv_count,
+      /** Array of NTask size of the offset into the receive array where the
+          objects from the specified task starts. */
+      *Recv_offset;
+
+  int *Send_offset_nodes, *Send_count_nodes, *Recv_count_nodes, *Recv_offset_nodes;
+
+  /** Buffer of size NTask used for flagging whether a particle needs to
+    be exported to the other tasks. */
+  int *Exportflag;
+  /** Buffer of size NTask used for counting how many nodes are to be
+      exported to the other tasks. */
+  int *Exportnodecount;
+  /** Buffer of size NTask used for holding the index into the
+      DataIndexTable. */
+  int *Exportindex;
+
+  size_t ExportSpace;
+  size_t MinSpace;
+  int NextParticle;
+  int Nexport, Nimport;
+  int NexportNodes, NimportNodes;
+
+  node_info *NodeInfoIn;
+  node_info *NodeInfoGet;
+
+  char callorigin[1000];
+
+  void primary_loop(int mode)
+  {
+    if(cpu_primary != logs::CPU_NONE)
+      Logs.timer_start(cpu_primary);
+
+    int idx;
+
+    int j;
+
+    for(j = 0; j < D->NTask; j++)
+      Thread.Exportflag[j] = -1;
+
+    while(1)
+      {
+        if(Thread.ExportSpace < MinSpace)
+          break;
+
+        idx = NextParticle++;
+
+        if(idx >= Nactive)
+          break;
+
+        int i = Targets[idx];
+        if(i < 0)
+          continue;
+
+        T_in local;
+        T_out out;
+        particle2in(&local, i);
+        local.Firstnode = 0;
+
+        int num = evaluate(i, mode, 0, actioncode, &local, 1, NULL, out);
+
+        out2particle(&out, i, mode);
+
+        Thread.Interactions += num;
+      }
+
+    if(cpu_primary != logs::CPU_NONE)
+      Logs.timer_stop(cpu_primary);
+  }
+
+  void secondary_loop(void)
+  {
+    if(cpu_secondary != logs::CPU_NONE)
+      Logs.timer_start(cpu_secondary);
+
+    /* now do the particles that were sent to us */
+    int i, cnt = 0;
+
+    {
+      while(1)
+        {
+          i = cnt++;
+
+          if(i >= Nimport)
+            break;
+
+          int numnodes;
+          node_info *firstnode;
+          generic_get_numnodes(i, &numnodes, &firstnode);
+
+          T_in *in = &DataGet[i];
+          T_out out;
+
+          int num = evaluate(i, MODE_IMPORTED_PARTICLES, 0, actioncode, in, numnodes, firstnode, out);
+
+          DataResult[i] = out;
+
+          Thread.Interactions += num;
+        }
+    }
+
+    if(cpu_secondary != logs::CPU_NONE)
+      Logs.timer_stop(cpu_secondary);
+  }
+
+  /* this function determines how much buffer space we may use based on the memory that is locally still free,
+   * and it computes how much memory may at most be needed to process a single particle. We will only continue with a particle
+   * if this can still be safely processed.
+   */
+
+  void set_MaxNexport(const char *file = __FILE__, int line = __LINE__)
+  {
+    ExportSpace = 0.5 * (Mem.FreeBytes); /* we just grab at most half of the still available memory here */
+
+    if(ExportSpace <= Tp->NumPart * sizeof(int))
+      {
+        Mem.dump_memory_table();
+        Terminate("It seems we have too little space left for properly sized ExportSpace... (%lld %lld)   Need more memory.\n",
+                  (long long)ExportSpace, (long long)Tp->NumPart * sizeof(int))
+      }
+
+    ExportSpace -= Tp->NumPart * sizeof(int); /* to account for the neighbor list buffer that the process allocated */
+
+    /* make the size a multiple both of data_partlist and data_nodelist */
+    ExportSpace /= (sizeof(data_partlist) * sizeof(data_nodelist));
+    ExportSpace *= (sizeof(data_partlist) * sizeof(data_nodelist));
+
+    MinSpace = (D->NTask - 1) * (sizeof(data_partlist) + sizeof(T_in) + sizeof(T_out)) +
+               D->NTopleaves * (sizeof(data_nodelist) + sizeof(int));
+
+    sprintf(callorigin, "%s|%d|", file, line);
+
+    /*
+     mpi_printf("GENERIC: file %s, line %d: MinSpace = %g MB  NTopleaves = %d  ExportSpace = %g MB\n", file, line,
+     MinSpace / (1024.0 * 1024.0), D->NTopleaves, ExportSpace / (1024.0 * 1024.0));
+     */
+
+    if(ExportSpace < MinSpace)
+      {
+        Mem.dump_memory_table();
+        Terminate(
+            "Bummer. Can't even safely process a single particle for the available memory. FreeBytes=%lld  ExportSpace=%lld  "
+            "MinSpace=%lld  D->NTask=%d  NTopleaves=%d",
+            (long long)Mem.FreeBytes, (long long)ExportSpace, (long long)MinSpace, D->NTask, D->NTopleaves);
+      }
+  }
+
+  /* this function does the memory allocation at the beginning of a loop over the remaining local particles.
+   * The fields PartList[] and NodeList[] share the buffer space of size "ExportSpace" (in bytes).
+   * Here PartList will be filled in from the beginning, while NodeList will be filled in from the end.
+   * Since we do not know a priori the relative share of these two fields, we can make optimum use of
+   * the available space in this way.
+   */
+  void generic_alloc_partlist_nodelist_ngblist_threadbufs(void)
+  {
+    Thread.Nexport      = 0;
+    Thread.NexportNodes = 0;
+    Thread.ExportSpace  = ExportSpace;
+    Thread.InitialSpace = ExportSpace;
+    Thread.ItemSize     = (sizeof(data_partlist) + sizeof(T_in) + sizeof(T_out));
+
+    Thread.PartList = (data_partlist *)Mem.mymalloc_movable_g(&Thread.PartList, "PartList", ExportSpace);
+    /* note: the NodeList array will be attached to the end of this buffer, growing backwards */
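+    /* layout of the shared buffer (ExportSpace bytes):
+     *   PartList[0], PartList[1], ... grows forward | free | grows backward ... NodeList[-2], NodeList[-1]
+     * the primary loop stops taking new particles once Thread.ExportSpace drops below MinSpace,
+     * so the two ends cannot collide */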
+    /* Thread[i].NodeList = (data_nodelist *) (((char *) Thread[i].PartList) + InitialSpace);
+     */
+    Thread.Ngblist     = (int *)Mem.mymalloc_movable_g(&Thread.Ngblist, "Ngblist", Tp->NumPart * sizeof(int));
+    Thread.Shmranklist = (int *)Mem.mymalloc_movable_g(&Thread.Shmranklist, "Shmranklist", Tp->NumPart * sizeof(int));
+    Thread.Exportflag  = Exportflag;
+  }
+
+  void generic_free_partlist_nodelist_ngblist_threadbufs(void)
+  {
+    Mem.myfree(Thread.Shmranklist);
+    Mem.myfree(Thread.Ngblist);
+    Mem.myfree(Thread.PartList);
+    Thread.Shmranklist = NULL;
+    Thread.Ngblist     = NULL;
+    Thread.PartList    = NULL;
+  }
+
+  void generic_prepare_export_counts(void)
+  {
+    for(int j = 0; j < D->NTask; j++)
+      {
+        Send[j].Count      = 0;
+        Send[j].CountNodes = 0;
+      }
+
+    Nexport      = 0;
+    NexportNodes = 0;
+
+    for(int j = 0; j < Thread.Nexport; j++)
+      Send[Thread.PartList[j].Task].Count++;
+
+    data_nodelist *nodelist = (data_nodelist *)(((char *)Thread.PartList) + Thread.InitialSpace);
+
+    for(int j = 0; j < Thread.NexportNodes; j++)
+      Send[nodelist[-1 - j].Task].CountNodes++;
+
+    Nexport += Thread.Nexport;
+    NexportNodes += Thread.NexportNodes;
+
+    SumNexport += Nexport;
+  }
+
+  /* establish the Recv counts from the Send counts (effectively a big transpose)
+   */
+  void generic_prepare_import_counts(void)
+  {
+    /* our standard approach for this is to use an all-to-all communication. For very large processor counts,
+     * this in principle becomes inefficient since mostly zeros need to be communicated.
+     * we also have two optional experimental communication routines that use a sparse communication pattern instead.
+     */
+    /* the default */
+    MPI_Alltoall(Send, sizeof(send_recv_counts), MPI_BYTE, Recv, sizeof(send_recv_counts), MPI_BYTE, D->Communicator);
+  }
+
+  /* initialize offset tables that we need for the communication
+   */
+  void generic_prepare_export_offsets(void)
+  {
+    Send_offset[0]       = 0;
+    Send_offset_nodes[0] = 0;
+
+    for(int j = 1; j < D->NTask; j++)
+      {
+        Send_offset[j]       = Send_offset[j - 1] + Send[j - 1].Count;
+        Send_offset_nodes[j] = Send_offset_nodes[j - 1] + Send[j - 1].CountNodes;
+      }
+  }
+
+  /* organize the particle and node data for export in contiguous memory regions
+   */
+  void generic_prepare_particle_data_for_export(void)
+  {
+    int *rel_node_index = (int *)Mem.mymalloc_g("rel_node_index", D->NTask * sizeof(int));
+
+    for(int j = 0; j < D->NTask; j++)
+      {
+        Send[j].Count      = 0;
+        Send[j].CountNodes = 0;
+        rel_node_index[j]  = 0;
+      }
+
+    data_nodelist *nodelist = (data_nodelist *)(((char *)Thread.PartList) + Thread.InitialSpace);
+
+    for(int j = 0, jj = 0; j < Thread.Nexport; j++)
+      {
+        int task = Thread.PartList[j].Task;
+        int off  = Send_offset[task] + Send[task].Count++;
+
+        int target = Thread.PartList[j].Index;
+
+        particle2in(&DataIn[off], target);
+        DataIn[off].Firstnode = rel_node_index[task];
+
+        if(j < Thread.Nexport - 1)
+          if(Thread.PartList[j].Index == Thread.PartList[j + 1].Index)
+            continue;
+
+        while(jj < Thread.NexportNodes && Thread.PartList[j].Index == nodelist[-1 - jj].Index)
+          {
+            int task = nodelist[-1 - jj].Task;
+            int off  = Send_offset_nodes[task] + Send[task].CountNodes++;
+
+            NodeInfoIn[off] = nodelist[-1 - jj].NodeInfo;
+
+            rel_node_index[task]++;
+            jj++;
+          }
+      }
+
+    Mem.myfree(rel_node_index);
+  }
+
+  /* driver routine to process the results that we obtained for a particle from a remote processor
+   * by working on it with the supplied out2particle() routine
+   */
+  void generic_add_results_to_local(void)
+  {
+    for(int j = 0; j < D->NTask; j++)
+      Send[j].Count = 0;
+
+    for(int j = 0; j < Thread.Nexport; j++)
+      {
+        int task = Thread.PartList[j].Task;
+        int off  = Send_offset[task] + Send[task].Count++;
+
+        int target = Thread.PartList[j].Index;
+
+        out2particle(&DataOut[off], target, MODE_IMPORTED_PARTICLES);
+      }
+  }
+
+#ifndef OMIT_GENERIC_GET_NUMNODES
+
+  /* this function is called in the actual tree walk routine to find out the number and
+   * starting index of the section in the node-list that needs to be processed for the imported particle
+   */
+  void generic_get_numnodes(int target, int *numnodes, node_info **firstnode)
+  {
+    if(target == Nimport - 1)
+      *numnodes = NimportNodes - DataGet[target].Firstnode;
+    else
+      *numnodes = DataGet[target + 1].Firstnode - DataGet[target].Firstnode;
+
+    if(*numnodes > Tree->NumNodes)
+      {
+        Terminate(
+            "odd: target=%d  Nimport=%d  NimportNodes=%d numnodes=%d Tree->NumNodes=%d Tree->MaxNodes=%d "
+            "DataGet[target].Firstnode=%d\n",
+            target, Nimport, NimportNodes, *numnodes, Tree->NumNodes, Tree->MaxNodes, DataGet[target].Firstnode);
+      }
+
+    *firstnode = &NodeInfoGet[DataGet[target].Firstnode];
+  }
+
+#endif
+
+  /* calculate how much space we need to allocate to safely process a certain number of
+   * nodes and particles that are imported.
+   */
+  size_t generic_calc_import_storage(int nimport, int nimportnodes)
+  {
+    size_t needed = nimport * sizeof(T_in) + nimportnodes * sizeof(int) + nimport * sizeof(T_out);
+
+    /* add some extra space to not go to the last byte */
+    needed += EXTRA_SPACE;
+
+    return needed;
+  }
+
+  /* this routine carries out the communication step in several phases if needed
+   */
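+  /* Starting from all remaining hypercube cycles, the number of cycles handled per phase is
+   * halved until the import buffers (DataGet, NodeInfoGet, DataResult) fit into the free memory
+   * on every task; if not even a single cycle fits, we have to abort. */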
+  void generic_multiple_phases(void)
+  {
+    int ncycles;
+
+    for(int ngrpstart = 1; ngrpstart < (1 << D->PTask); ngrpstart += ncycles)
+      {
+        /* now decide how many cycles we can process in this iteration */
+        ncycles = (1 << D->PTask) - ngrpstart;
+
+        do
+          {
+            Nimport      = 0;
+            NimportNodes = 0;
+
+            for(int ngrp = ngrpstart; ngrp < ngrpstart + ncycles; ngrp++)
+              {
+                int recvTask = D->ThisTask ^ ngrp;
+
+                if(recvTask < D->NTask)
+                  {
+                    if(Recv[recvTask].Count > 0)
+                      {
+                        Nimport += Recv[recvTask].Count;
+                        NimportNodes += Recv[recvTask].CountNodes;
+                      }
+                  }
+              }
+
+            int flag = 0, flagall;
+
+            if(generic_calc_import_storage(Nimport, NimportNodes) > Mem.FreeBytes)
+              flag = 1;
+
+            MPI_Allreduce(&flag, &flagall, 1, MPI_INT, MPI_MAX, D->Communicator);
+
+            if(flagall)
+              ncycles /= 2;
+            else
+              break;
+          }
+        while(ncycles > 0);
+
+        if(ncycles == 0)
+          Terminate(
+              "Seems like we can't even do one cycle: ncycles=%d  ngrpstart=%d  Nimport=%d  NimportNodes=%d  FreeBytes=%lld  needed "
+              "storage=%lld",
+              ncycles, ngrpstart, Nimport, NimportNodes, (long long)Mem.FreeBytes,
+              (long long)generic_calc_import_storage(Nimport, NimportNodes));
+
+        if(ngrpstart == 1 && ncycles != ((1 << D->PTask) - ngrpstart) && D->ThisTask == 0)
+          warn("need multiple import/export phases to avoid memory overflow");
+
+        /* now allocate the import and result buffers */
+
+        DataGet     = (T_in *)Mem.mymalloc_movable_g(&DataGet, "DataGet", Nimport * sizeof(T_in));
+        NodeInfoGet = (node_info *)Mem.mymalloc_movable_g(&NodeInfoGet, "NodeInfoGet", NimportNodes * sizeof(node_info));
+        DataResult  = (T_out *)Mem.mymalloc_movable_g(&DataResult, "DataResult", Nimport * sizeof(T_out));
+
+        Nimport      = 0;
+        NimportNodes = 0;
+
+        /* exchange particle data */
+        for(int ngrp = ngrpstart; ngrp < ngrpstart + ncycles; ngrp++)
+          {
+            int recvTask = D->ThisTask ^ ngrp;
+
+            if(recvTask < D->NTask)
+              {
+                if(Send[recvTask].Count > 0 || Recv[recvTask].Count > 0)
+                  {
+                    size_t len = sizeof(T_in);
+
+                    /* get the particles */
+                    MPI_Sendrecv(&DataIn[Send_offset[recvTask]], Send[recvTask].Count * len, MPI_BYTE, recvTask, TAG_HYDRO_A,
+                                 &DataGet[Nimport], Recv[recvTask].Count * len, MPI_BYTE, recvTask, TAG_HYDRO_A, D->Communicator,
+                                 MPI_STATUS_IGNORE);
+
+                    /* get the node info */
+                    MPI_Sendrecv(&NodeInfoIn[Send_offset_nodes[recvTask]], Send[recvTask].CountNodes * sizeof(node_info), MPI_BYTE,
+                                 recvTask, TAG_GRAV_B, &NodeInfoGet[NimportNodes], Recv[recvTask].CountNodes * sizeof(node_info),
+                                 MPI_BYTE, recvTask, TAG_GRAV_B, D->Communicator, MPI_STATUS_IGNORE);
+
+                    for(int k = 0; k < Recv[recvTask].Count; k++)
+                      DataGet[Nimport + k].Firstnode += NimportNodes;
+
+                    Nimport += Recv[recvTask].Count;
+                    NimportNodes += Recv[recvTask].CountNodes;
+                  }
+              }
+          }
+
+        /* now do the actual work for the imported points */
+        secondary_loop();
+
+        /* send the results */
+        Nimport      = 0;
+        NimportNodes = 0;
+
+        for(int ngrp = ngrpstart; ngrp < ngrpstart + ncycles; ngrp++)
+          {
+            int recvTask = D->ThisTask ^ ngrp;
+            if(recvTask < D->NTask)
+              {
+                if(Send[recvTask].Count > 0 || Recv[recvTask].Count > 0)
+                  {
+                    size_t len = sizeof(T_out);
+
+                    /* exchange the results */
+                    MPI_Sendrecv(&DataResult[Nimport], Recv[recvTask].Count * len, MPI_BYTE, recvTask, TAG_HYDRO_B,
+                                 &DataOut[Send_offset[recvTask]], Send[recvTask].Count * len, MPI_BYTE, recvTask, TAG_HYDRO_B,
+                                 D->Communicator, MPI_STATUS_IGNORE);
+
+                    Nimport += Recv[recvTask].Count;
+                    NimportNodes += Recv[recvTask].CountNodes;
+                  }
+              }
+          }
+
+        Mem.myfree(DataResult);
+        Mem.myfree(NodeInfoGet);
+        Mem.myfree(DataGet);
+      }
+  }
+
+  /* this function deals with the communication step, then processes the imported particles, and finally returns the results to the exporting tasks.
+   * if there is not enough memory available to hold all the data sent to us from other processors, we process the incoming data in
+   * multiple stages, which will always be possible.
+   */
+  void generic_exchange(void)
+  {
+    /* set up Sendcount table */
+    generic_prepare_export_counts();
+
+    /* do the all-to-all exchange so that we have the Recvcount table as well */
+    generic_prepare_import_counts();
+
+    /* prepare offsets in export tables */
+    generic_prepare_export_offsets();
+
+    /* allocate particle data buffers */
+    DataIn     = (T_in *)Mem.mymalloc_movable_g(&DataIn, "DataIn", Nexport * sizeof(T_in));
+    NodeInfoIn = (node_info *)Mem.mymalloc_movable_g(&NodeInfoIn, "NodeInfoIn", NexportNodes * sizeof(node_info));
+    DataOut    = (T_out *)Mem.mymalloc_movable_g(&DataOut, "DataOut", Nexport * sizeof(T_out));
+
+    /* prepare particle data for export */
+    generic_prepare_particle_data_for_export();
+
+    /* export particles and process them, if needed in several installments */
+    generic_multiple_phases();
+
+    /* add the results to the local particles */
+    generic_add_results_to_local();
+
+    Mem.myfree(DataOut);
+    Mem.myfree(NodeInfoIn);
+    Mem.myfree(DataIn);
+  }
+
+  void generic_allocate_comm_tables(void)
+  {
+    ptrdiff_t off;
+
+    off = (char *)Tree->Nodes - Mem.Base;
+    MPI_Allgather(&off, sizeof(ptrdiff_t), MPI_BYTE, Tree->TreeNodes_offsets, sizeof(ptrdiff_t), MPI_BYTE, Tree->TreeSharedMemComm);
+
+    off = (char *)Tree->Points - Mem.Base;
+    MPI_Allgather(&off, sizeof(ptrdiff_t), MPI_BYTE, Tree->TreePoints_offsets, sizeof(ptrdiff_t), MPI_BYTE, Tree->TreeSharedMemComm);
+
+    off = (char *)Tree->Nextnode - Mem.Base;
+    MPI_Allgather(&off, sizeof(ptrdiff_t), MPI_BYTE, Tree->TreeNextnode_offsets, sizeof(ptrdiff_t), MPI_BYTE, Tree->TreeSharedMemComm);
+
+    off = (char *)Tp->P - Mem.Base;
+    MPI_Allgather(&off, sizeof(ptrdiff_t), MPI_BYTE, Tree->TreeP_offsets, sizeof(ptrdiff_t), MPI_BYTE, Tree->TreeSharedMemComm);
+
+    off = (char *)Tp->PS - Mem.Base;
+    MPI_Allgather(&off, sizeof(ptrdiff_t), MPI_BYTE, Tree->TreePS_offsets, sizeof(ptrdiff_t), MPI_BYTE, Tree->TreeSharedMemComm);
+
+    Exportflag      = (int *)Mem.mymalloc("Exportflag", ((((D->NTask - 1) / 16) + 1) * 16) * sizeof(int));
+    Exportindex     = (int *)Mem.mymalloc("Exportindex", D->NTask * sizeof(int));
+    Exportnodecount = (int *)Mem.mymalloc("Exportnodecount", D->NTask * sizeof(int));
+
+    Send = (send_recv_counts *)Mem.mymalloc("Send", sizeof(send_recv_counts) * D->NTask);
+    Recv = (send_recv_counts *)Mem.mymalloc("Recv", sizeof(send_recv_counts) * D->NTask);
+
+    Send_count  = (int *)Mem.mymalloc("Send_count", sizeof(int) * D->NTask);
+    Send_offset = (int *)Mem.mymalloc("Send_offset", sizeof(int) * D->NTask);
+    Recv_count  = (int *)Mem.mymalloc("Recv_count", sizeof(int) * D->NTask);
+    Recv_offset = (int *)Mem.mymalloc("Recv_offset", sizeof(int) * D->NTask);
+
+    Send_count_nodes  = (int *)Mem.mymalloc("Send_count_nodes", sizeof(int) * D->NTask);
+    Send_offset_nodes = (int *)Mem.mymalloc("Send_offset_nodes", sizeof(int) * D->NTask);
+    Recv_count_nodes  = (int *)Mem.mymalloc("Recv_count_nodes", sizeof(int) * D->NTask);
+    Recv_offset_nodes = (int *)Mem.mymalloc("Recv_offset_nodes", sizeof(int) * D->NTask);
+  }
+
+  void generic_free_comm_tables(void)
+  {
+    Mem.myfree(Recv_offset_nodes);
+    Mem.myfree(Recv_count_nodes);
+    Mem.myfree(Send_offset_nodes);
+    Mem.myfree(Send_count_nodes);
+    Mem.myfree(Recv_offset);
+    Mem.myfree(Recv_count);
+    Mem.myfree(Send_offset);
+    Mem.myfree(Send_count);
+    Mem.myfree(Recv);
+    Mem.myfree(Send);
+    Mem.myfree(Exportnodecount);
+    Mem.myfree(Exportindex);
+    Mem.myfree(Exportflag);
+  }
+
+ public:
+  /* Implements a repeated loop over the local particles in the list, processing them with the local kernel function,
+   * until we're done or the export buffer is full. Then we exchange the data, and process the imported ones with the provided kernel.
+   * We repeat if needed until all processors are done.
+   */
+  int execute(int nactive, int *targetlist, int action)
+  {
+    generic_allocate_comm_tables();
+
+    Nactive    = nactive;
+    Targets    = targetlist;
+    actioncode = action;
+
+    Thread.Interactions = 0;
+
+    int ndone_flag, ndone, iter = 0;
+
+    SumNexport = 0; /* can be queried as a book-keeping variable */
+
+    NextParticle = 0; /* first particle index for this task */
+
+    if(cpu_imbalance != logs::CPU_NONE)
+      Logs.timer_start(cpu_imbalance);
+
+    do
+      {
+        iter++;
+
+        /* allocate buffers to arrange communication */
+        generic_alloc_partlist_nodelist_ngblist_threadbufs();
+
+        /* do local particles */
+        if(action == MODE_DEFAULT)
+          {
+            primary_loop(MODE_LOCAL_PARTICLES);
+          }
+        else if(action == MODE_LOCAL_NO_EXPORT)
+          {
+            primary_loop(MODE_LOCAL_NO_EXPORT);
+          }
+        else
+          {
+            Terminate("unknown action code (%d) for primary loop", action);
+          }
+
+        /* do all necessary bookkeeping, data exchange, and processing of imported particles */
+        generic_exchange();
+
+        /* free the rest of the buffers */
+        generic_free_partlist_nodelist_ngblist_threadbufs();
+
+        /* check whether we are done */
+        if(NextParticle >= nactive)
+          ndone_flag = 1;
+        else
+          ndone_flag = 0;
+
+        MPI_Allreduce(&ndone_flag, &ndone, 1, MPI_INT, MPI_SUM, D->Communicator);
+      }
+    while(ndone < D->NTask);
+
+    generic_free_comm_tables();
+
+    if(cpu_imbalance != logs::CPU_NONE)
+      Logs.timer_stop(cpu_imbalance);
+
+    return iter;
+  }
+
+  int execute(int nactive, int *targetlist, int action, enum logs::timers a, enum logs::timers b, enum logs::timers c)
+  {
+    cpu_primary   = a;
+    cpu_secondary = b;
+    cpu_imbalance = c;
+
+    return execute(nactive, targetlist, action);
+  }
+};
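+
+/* Illustrative call pattern for execute() (a sketch only; the concrete communication classes built on this
+ * template and their constructors are defined elsewhere in the code, and the names used here are placeholders):
+ *
+ *   mycomm commpattern(...);                                      // hypothetical instantiation
+ *   int iters = commpattern.execute(Nactive, TargetList, MODE_DEFAULT,
+ *                                   timer_a, timer_b, timer_c);   // hypothetical logs::timers values
+ *
+ * The return value is the number of communication iterations that were needed to process all targets.
+ */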
+
+#endif
diff --git a/src/mpi_utils/healthtest.cc b/src/mpi_utils/healthtest.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bd2a2f88404872da1f0f99adc989ceb46a196975
--- /dev/null
+++ b/src/mpi_utils/healthtest.cc
@@ -0,0 +1,300 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  healthtest.cc
+ *
+ *  \brief routines for testing whether all compute nodes yield the full CPU and communication performance
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../logs/logs.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../system/system.h"
+
+#define TEST_PACKET_SIZE_IN_MB 5
+#define WORK_LOOP_COUNTER 50000000
+#define WORK_NUMBER_OF_IPROBE_TESTS 1000000
+
+#ifndef MAX_VARIATION_TOLERANCE
+#define MAX_VARIATION_TOLERANCE 0.5
+#endif
+
+void sim::healthtest(void)
+{
+  mpi_printf("\n");
+
+  measure_cpu_performance(Communicator);
+
+  // Let's take a look at the communication speed in a global all-to-all data exchange realized through pairwise exchanges along a
+  // hypercube
+  if(NTask > 1)
+    measure_hyper_cube_speed("Full hypercube:", Communicator);
+
+  // Let's take a look at inter-node communication speed
+  if(NumNodes > 1)
+    {
+      int CommSplitColor;
+
+      if(RankInThisNode == 0)
+        CommSplitColor = 0;
+      else
+        CommSplitColor = 1;
+
+      MPI_Comm comm;
+      MPI_Comm_split(Communicator, CommSplitColor, ThisTask, &comm);
+
+      if(RankInThisNode == 0)
+        measure_hyper_cube_speed("Internode cube:", comm);
+
+      MPI_Comm_free(&comm);
+    }
+
+  // Now look at intra-node communication speed
+  if(NumNodes < NTask)
+    {
+      int CommSplitColor = ThisNode;
+      MPI_Comm comm;
+      MPI_Comm_split(Communicator, CommSplitColor, ThisTask, &comm);
+
+      measure_hyper_cube_speed("Intranode cube, 1st node:", comm);
+
+      MPI_Comm_free(&comm);
+    }
+
+  measure_iprobe_performance("Iprobe for any message:");
+
+  mpi_printf("\n");
+}
+
+double sim::measure_cpu_performance(MPI_Comm Communicator)
+{
+  int loc_ntask, loc_thistask, loc_ptask;
+
+  double ta = Logs.second();
+
+  MPI_Comm_rank(Communicator, &loc_thistask);
+  MPI_Comm_size(Communicator, &loc_ntask);
+
+  for(loc_ptask = 0; loc_ntask > (1 << loc_ptask); loc_ptask++)
+    ;
+
+  double sum = 0;
+
+  MPI_Barrier(Communicator);
+
+  double t0 = Logs.second();
+
+  // do some computationally intense (but useless) work for a while
+  for(int i = 0; i < WORK_LOOP_COUNTER; i++)
+    sum += sin((i + 0.1) / WORK_LOOP_COUNTER) / (2.0 + cos(i - 0.1) / WORK_LOOP_COUNTER);
+
+  double t1 = Logs.second();
+
+  double tperf = Logs.timediff(t0, t1), tperfsum;
+
+  MPI_Allreduce(&tperf, &tperfsum, 1, MPI_DOUBLE, MPI_SUM, Communicator);
+  double tavg = tperfsum / loc_ntask;
+
+  struct
+  {
+    double t;
+    int rank;
+  } local = {tperf, ThisTask}, localnode = {tperf, ThisNode}, min_time, max_time, min_timenode, max_timenode;
+
+  MPI_Allreduce(&local, &min_time, 1, MPI_DOUBLE_INT, MPI_MINLOC, Communicator);
+  MPI_Allreduce(&local, &max_time, 1, MPI_DOUBLE_INT, MPI_MAXLOC, Communicator);
+
+  MPI_Allreduce(&localnode, &min_timenode, 1, MPI_DOUBLE_INT, MPI_MINLOC, Communicator);
+  MPI_Allreduce(&localnode, &max_timenode, 1, MPI_DOUBLE_INT, MPI_MAXLOC, Communicator);
+
+  double variation = (max_time.t - min_time.t) / tavg;
+
+  double tb = Logs.second();
+
+  mpi_printf(
+      "HEALTHTEST: %25s  %8.3f sec            %7.3f%%  variation   | Best=%g on Task=%d/Node=%d, Worst=%g on Task=%d/Node=%d, test "
+      "took %g sec\n",
+      "CPU performance:", tavg, 100.0 * variation, min_time.t, min_time.rank, min_timenode.rank, max_time.t, max_time.rank,
+      max_timenode.rank, Logs.timediff(ta, tb));
+
+  if(variation >= MAX_VARIATION_TOLERANCE)
+    {
+      char name_maxnode[MPI_MAX_PROCESSOR_NAME];
+      int len;
+      if(ThisTask == max_time.rank)
+        MPI_Get_processor_name(name_maxnode, &len);
+
+      MPI_Bcast(name_maxnode, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, max_time.rank, Communicator);
+
+      char buf[1000 + MPI_MAX_PROCESSOR_NAME];
+      sprintf(buf, "processes_%s.txt", name_maxnode);
+
+      mpi_printf("HEALTHTEST: We are dumping a process list to the file '%s'\n", buf);
+
+      if(ThisTask == max_time.rank)
+        {
+          char cmd[10000 + MPI_MAX_PROCESSOR_NAME];
+          sprintf(cmd, "ps -ef >& %s", buf);
+          system(cmd);
+        }
+
+      MPI_Barrier(Communicator);
+
+      /*
+      mpi_printf(
+          "\n\nHEALTHTEST: We are stopping because the performance variation=%g of the CPUs lies above the prescribed tolerance "
+          "MAX_VARIATION_TOLERANCE=%g, possibly indicating a machine problem. (sum=%g)\n",
+          variation, MAX_VARIATION_TOLERANCE, sum);
+       */
+      warn(
+          "\n\nHEALTHTEST: We issue a warning because the performance variation=%g of the CPUs lies above the prescribed tolerance "
+          "MAX_VARIATION_TOLERANCE=%g, possibly indicating a machine problem. (sum=%g)\n",
+          variation, MAX_VARIATION_TOLERANCE, sum);
+
+      // endrun();    // only issue a warning for now
+    }
+
+  return sum;
+}
+
+double sim::measure_hyper_cube_speed(const char *tag, MPI_Comm Communicator)
+{
+  double ta = Logs.second();
+
+  int loc_ntask, loc_thistask, loc_ptask;
+
+  MPI_Comm_rank(Communicator, &loc_thistask);
+  MPI_Comm_size(Communicator, &loc_ntask);
+
+  for(loc_ptask = 0; loc_ntask > (1 << loc_ptask); loc_ptask++)
+    ;
+
+  int bytecount = (TEST_PACKET_SIZE_IN_MB * 1024L * 1024L) / loc_ntask;
+
+  double tall = 0;
+  int count   = 0;
+
+  char *sendbuf = (char *)Mem.mymalloc_clear("send", bytecount * sizeof(char));
+  char *recvbuf = (char *)Mem.mymalloc_clear("recv", bytecount * sizeof(char));
+
+  /* exchange the test data */
+  for(int ngrp = 1; ngrp < (1 << loc_ptask); ngrp++)
+    {
+      int recvTask = loc_thistask ^ ngrp;
+
+      MPI_Barrier(Communicator);
+
+      if(recvTask < loc_ntask)
+        {
+          double t0 = Logs.second();
+          MPI_Sendrecv(sendbuf, bytecount, MPI_BYTE, recvTask, TAG_DENS_A, recvbuf, bytecount, MPI_BYTE, recvTask, TAG_DENS_A,
+                       Communicator, MPI_STATUS_IGNORE);
+          double t1 = Logs.second();
+
+          tall += Logs.timediff(t0, t1);
+          count++;
+        }
+    }
+
+  Mem.myfree(recvbuf);
+  Mem.myfree(sendbuf);
+
+  double tperf = 0.5 * tall / count, tperfsum;
+
+  MPI_Allreduce(&tperf, &tperfsum, 1, MPI_DOUBLE, MPI_SUM, Communicator);
+  double tavg = tperfsum / loc_ntask;
+
+  struct
+  {
+    double t;
+    int rank;
+  } local = {tperf, ThisTask}, localnode = {tperf, ThisNode}, min_time, max_time, min_timenode, max_timenode;
+
+  MPI_Allreduce(&local, &min_time, 1, MPI_DOUBLE_INT, MPI_MINLOC, Communicator);
+  MPI_Allreduce(&local, &max_time, 1, MPI_DOUBLE_INT, MPI_MAXLOC, Communicator);
+
+  MPI_Allreduce(&localnode, &min_timenode, 1, MPI_DOUBLE_INT, MPI_MINLOC, Communicator);
+  MPI_Allreduce(&localnode, &max_timenode, 1, MPI_DOUBLE_INT, MPI_MAXLOC, Communicator);
+
+  double tb = Logs.second();
+
+  double variation = (bytecount / min_time.t - bytecount / max_time.t) / (bytecount / tavg);
+
+  mpi_printf(
+      "HEALTHTEST: %25s  %8.1f MB/s per pair  %7.3f%%  variation   | Best=%g on Task=%d/Node=%d, Worst=%g on Task=%d/Node=%d, test "
+      "took %g sec\n",
+      tag, bytecount / tavg * TO_MBYTE_FAC, 100.0 * variation, bytecount / min_time.t * TO_MBYTE_FAC, min_time.rank, min_timenode.rank,
+      bytecount / max_time.t * TO_MBYTE_FAC, max_time.rank, max_timenode.rank, Logs.timediff(ta, tb));
+
+  if(variation > MAX_VARIATION_TOLERANCE && ThisTask == 0)
+    warn(
+        "\nThe performance variation=%g of the communication speed lies above the prescribed tolerance MAX_VARIATION_TOLERANCE=%g, "
+        "possibly indicating a machine problem.\n",
+        variation, MAX_VARIATION_TOLERANCE);
+
+  return tavg;
+}
+
+void sim::measure_iprobe_performance(const char *tag)
+{
+  double ta = Logs.second();
+
+  for(int i = 0; i < WORK_NUMBER_OF_IPROBE_TESTS; i++)
+    {
+      int flag;
+      MPI_Status status;
+
+      MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, Communicator, &flag, &status);
+    }
+
+  double tb = Logs.second();
+
+  double tperf = Logs.timediff(ta, tb) / WORK_NUMBER_OF_IPROBE_TESTS;
+
+  struct
+  {
+    double t;
+    int rank;
+  } local = {tperf, ThisTask}, min_time, max_time;
+
+  MPI_Allreduce(&local, &min_time, 1, MPI_DOUBLE_INT, MPI_MINLOC, Communicator);
+  MPI_Allreduce(&local, &max_time, 1, MPI_DOUBLE_INT, MPI_MAXLOC, Communicator);
+
+  double tperfsum;
+  MPI_Allreduce(&tperf, &tperfsum, 1, MPI_DOUBLE, MPI_SUM, Communicator);
+  double tavg = tperfsum / NTask;
+
+  char name_minnode[MPI_MAX_PROCESSOR_NAME];
+  char name_maxnode[MPI_MAX_PROCESSOR_NAME];
+
+  int len;
+  if(ThisTask == min_time.rank)
+    MPI_Get_processor_name(name_minnode, &len);
+  if(ThisTask == max_time.rank)
+    MPI_Get_processor_name(name_maxnode, &len);
+
+  MPI_Bcast(name_minnode, MPI_MAX_PROCESSOR_NAME, MPI_BYTE, min_time.rank, Communicator);
+  MPI_Bcast(name_maxnode, MPI_MAX_PROCESSOR_NAME, MPI_BYTE, max_time.rank, Communicator);
+
+  double variation = (max_time.t - min_time.t) / tavg;
+
+  mpi_printf(
+      "HEALTHTEST: %25s  %g s per MPI_Ip%7.3f%%  variation   | Best=%g on Task=%d/Node=%s, Worst=%g on Task=%d/Node=%s, test took %g "
+      "sec\n",
+      tag, tavg, 100.0 * variation, min_time.t, min_time.rank, name_minnode, max_time.t, max_time.rank, name_maxnode,
+      Logs.timediff(ta, tb));
+}
diff --git a/src/mpi_utils/hypercube_allgatherv.cc b/src/mpi_utils/hypercube_allgatherv.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f13b9b6dd1da8905e2c3c0c7e7810e2b174ec358
--- /dev/null
+++ b/src/mpi_utils/hypercube_allgatherv.cc
@@ -0,0 +1,57 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file hypercube_allgatherv.cc
+ *
+ *  \brief a simple version of an Allgatherv implemented with a hypercube communication model
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+
+#ifdef MPI_HYPERCUBE_ALLGATHERV
+
+#define TAG 100
+
+int MPI_hypercube_Allgatherv(void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int *recvcount, int *displs,
+                             MPI_Datatype recvtype, MPI_Comm comm)
+{
+  int ntask, thistask, ptask, ngrp, size_sendtype, size_recvtype;
+  MPI_Status status;
+
+  MPI_Comm_rank(comm, &thistask);
+  MPI_Comm_size(comm, &ntask);
+
+  MPI_Type_size(sendtype, &size_sendtype);
+  MPI_Type_size(recvtype, &size_recvtype);
+
+  for(ptask = 0; ntask > (1 << ptask); ptask++)
+    ;
+
+  for(ngrp = 1; ngrp < (1 << ptask); ngrp++)
+    {
+      int recvtask = thistask ^ ngrp;
+
+      if(recvtask < ntask)
+        MPI_Sendrecv(sendbuf, sendcount, sendtype, recvtask, TAG, (char *)recvbuf + displs[recvtask] * size_recvtype,
+                     recvcount[recvtask], recvtype, recvtask, TAG, comm, &status);
+    }
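+
+  /* Note on the exchange pattern above: the ranks are paired round by round via recvtask = thistask ^ ngrp.
+   * Illustrative pairing for ntask = 4 (ptask = 2): ngrp = 1 exchanges 0<->1 and 2<->3, ngrp = 2 exchanges
+   * 0<->2 and 1<->3, ngrp = 3 exchanges 0<->3 and 1<->2, so after 2^ptask - 1 rounds every rank has received
+   * the contribution of every other rank. */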
+
+  if((char *)sendbuf != (char *)recvbuf + displs[thistask] * size_recvtype)
+    memcpy((char *)recvbuf + displs[thistask] * size_recvtype, sendbuf, sendcount * size_sendtype);
+
+  return 0;
+}
+
+#endif
diff --git a/src/mpi_utils/mpi_types.cc b/src/mpi_utils/mpi_types.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6bc02e007f48953785e24dcf744225220a1878a3
--- /dev/null
+++ b/src/mpi_utils/mpi_types.cc
@@ -0,0 +1,83 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  mpi_types.cc
+ *
+ *  \brief implements some user defined MPI types for collectives
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../mpi_utils/mpi_utils.h"
+
+static void min_MPI_MyIntPosType(void *in, void *inout, int *len, MPI_Datatype *type)
+{
+  /* we here trust that this is called only for the correct type  */
+
+  MyIntPosType *invalues    = (MyIntPosType *)in;
+  MyIntPosType *inoutvalues = (MyIntPosType *)inout;
+
+  for(int i = 0; i < *len; i++)
+    if(invalues[i] < inoutvalues[i])
+      inoutvalues[i] = invalues[i];
+}
+
+static void max_MPI_MyIntPosType(void *in, void *inout, int *len, MPI_Datatype *type)
+{
+  /* we here trust that this is called only for the correct type  */
+
+  MyIntPosType *invalues    = (MyIntPosType *)in;
+  MyIntPosType *inoutvalues = (MyIntPosType *)inout;
+
+  for(int i = 0; i < *len; i++)
+    if(invalues[i] > inoutvalues[i])
+      inoutvalues[i] = invalues[i];
+}
+
+static void min_MPI_MySignedIntPosType(void *in, void *inout, int *len, MPI_Datatype *type)
+{
+  /* we here trust that this is called only for the correct type  */
+
+  MySignedIntPosType *invalues    = (MySignedIntPosType *)in;
+  MySignedIntPosType *inoutvalues = (MySignedIntPosType *)inout;
+
+  for(int i = 0; i < *len; i++)
+    if(invalues[i] < inoutvalues[i])
+      inoutvalues[i] = invalues[i];
+}
+
+static void max_MPI_MySignedIntPosType(void *in, void *inout, int *len, MPI_Datatype *type)
+{
+  /* we here trust that this is called only for the correct type  */
+
+  MySignedIntPosType *invalues    = (MySignedIntPosType *)in;
+  MySignedIntPosType *inoutvalues = (MySignedIntPosType *)inout;
+
+  for(int i = 0; i < *len; i++)
+    if(invalues[i] > inoutvalues[i])
+      inoutvalues[i] = invalues[i];
+}
+
+void my_mpi_types_init(void)
+{
+  /* create our new data type */
+  MPI_Type_contiguous(sizeof(MyIntPosType), MPI_BYTE, &MPI_MyIntPosType);
+  MPI_Type_commit(&MPI_MyIntPosType);
+
+  /* create our operators */
+  MPI_Op_create(min_MPI_MyIntPosType, 1, &MPI_MIN_MyIntPosType);
+  MPI_Op_create(max_MPI_MyIntPosType, 1, &MPI_MAX_MyIntPosType);
+  MPI_Op_create(min_MPI_MySignedIntPosType, 1, &MPI_MIN_MySignedIntPosType);
+  MPI_Op_create(max_MPI_MySignedIntPosType, 1, &MPI_MAX_MySignedIntPosType);
+}
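+
+/* Illustrative use of the committed type and operators (a sketch; the communicator and values are placeholders):
+ *
+ *   MyIntPosType loc_min = ..., glob_min;
+ *   MPI_Allreduce(&loc_min, &glob_min, 1, MPI_MyIntPosType, MPI_MIN_MyIntPosType, Communicator);
+ *
+ * Because MPI_MyIntPosType is declared as a contiguous block of bytes, the element-wise min/max reductions
+ * work independently of the actual width chosen for the position integer type. */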
diff --git a/src/mpi_utils/mpi_utils.h b/src/mpi_utils/mpi_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..f62434cbba373cc094be7259984e94485485f4cb
--- /dev/null
+++ b/src/mpi_utils/mpi_utils.h
@@ -0,0 +1,289 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  mpi_utils.h
+ *  \brief declares some numerical values for MPI tags and function prototypes for MPI helper functions
+ */
+
+#ifndef MPI_UTILS_H
+#define MPI_UTILS_H
+
+#include <mpi.h>
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+
+/*! Various tags used for labeling MPI messages */
+#define TAG_TOPNODE_FREE 4
+#define TAG_TOPNODE_OFFSET 5
+#define TAG_TOPNODE_ALLOC 6
+#define TAG_EWALD_ALLOC 7
+#define TAG_TABLE_ALLOC 8
+#define TAG_TABLE_FREE 9
+#define TAG_N 10
+#define TAG_HEADER 11
+#define TAG_PDATA 12
+#define TAG_SPHDATA 13
+#define TAG_KEY 14
+#define TAG_DMOM 15
+#define TAG_NODELEN 16
+#define TAG_HMAX 17
+#define TAG_GRAV_A 18
+#define TAG_GRAV_B 19
+#define TAG_DIRECT_A 20
+#define TAG_DIRECT_B 21
+#define TAG_HYDRO_A 22
+#define TAG_HYDRO_B 23
+#define TAG_NFORTHISTASK 24
+#define TAG_PERIODIC_A 25
+#define TAG_PERIODIC_B 26
+#define TAG_PERIODIC_C 27
+#define TAG_PERIODIC_D 28
+#define TAG_NONPERIOD_A 29
+#define TAG_NONPERIOD_B 30
+#define TAG_NONPERIOD_C 31
+#define TAG_NONPERIOD_D 32
+#define TAG_POTENTIAL_A 33
+#define TAG_POTENTIAL_B 34
+#define TAG_DENS_A 35
+#define TAG_DENS_B 36
+#define TAG_LOCALN 37
+#define TAG_BH_A 38
+#define TAG_BH_B 39
+#define TAG_SMOOTH_A 40
+#define TAG_SMOOTH_B 41
+#define TAG_ENRICH_A 42
+#define TAG_CONDUCT_A 43
+#define TAG_CONDUCT_B 44
+#define TAG_FOF_A 45
+#define TAG_FOF_B 46
+#define TAG_FOF_C 47
+#define TAG_FOF_D 48
+#define TAG_FOF_E 49
+#define TAG_FOF_F 50
+#define TAG_FOF_G 51
+#define TAG_HOTNGB_A 52
+#define TAG_HOTNGB_B 53
+#define TAG_GRAD_A 54
+#define TAG_GRAD_B 55
+
+#define TAG_SE 56
+
+#define TAG_SEARCH_A 58
+#define TAG_SEARCH_B 59
+
+#define TAG_INJECT_A 61
+
+#define TAG_PDATA_SPH 70
+#define TAG_KEY_SPH 71
+
+#define TAG_PDATA_STAR 72
+#define TAG_STARDATA 73
+#define TAG_KEY_STAR 74
+
+#define TAG_PDATA_BH 75
+#define TAG_BHDATA 76
+#define TAG_KEY_BH 77
+
+#define TAG_GRAVCOST_A 79
+#define TAG_GRAVCOST_B 80
+
+#define TAG_PM_FOLD 81
+
+#define TAG_BARRIER 82
+#define TAG_PART_DATA 83
+#define TAG_NODE_DATA 84
+#define TAG_RESULTS 85
+#define TAG_DRIFT_INIT 86
+#define TAG_METDATA 500
+#define TAG_FETCH_GRAVTREE 1000
+#define TAG_FETCH_SPH_DENSITY 2000
+#define TAG_FETCH_SPH_HYDRO 3000
+#define TAG_FETCH_SPH_TREETIMESTEP 4000
+
+void my_mpi_types_init(void);
+
+int myMPI_Sendrecv(void *sendbuf, size_t sendcount, MPI_Datatype sendtype, int dest, int sendtag, void *recvbuf, size_t recvcount,
+                   MPI_Datatype recvtype, int source, int recvtag, MPI_Comm comm, MPI_Status *status);
+
+int myMPI_Alltoallv_new_prep(int *sendcnt, int *recvcnt, int *rdispls, MPI_Comm comm, int method);
+
+void myMPI_Alltoallv_new(void *sendb, int *sendcounts, int *sdispls, MPI_Datatype sendtype, void *recvb, int *recvcounts, int *rdispls,
+                         MPI_Datatype recvtype, MPI_Comm comm, int method);
+
+void myMPI_Alltoallv(void *sendbuf, size_t *sendcounts, size_t *sdispls, void *recvbuf, size_t *recvcounts, size_t *rdispls, int len,
+                     int big_flag, MPI_Comm comm);
+
+void my_int_MPI_Alltoallv(void *sendb, int *sendcounts, int *sdispls, void *recvb, int *recvcounts, int *rdispls, int len,
+                          int big_flag, MPI_Comm comm);
+
+int myMPI_Allreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm);
+
+int MPI_hypercube_Allgatherv(void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int *recvcount, int *displs,
+                             MPI_Datatype recvtype, MPI_Comm comm);
+
+void allreduce_sparse_double_sum(double *loc, double *glob, int N, MPI_Comm comm);
+
+void minimum_large_ints(int n, long long *src, long long *res, MPI_Comm comm);
+void sumup_longs(int n, long long *src, long long *res, MPI_Comm comm);
+void sumup_large_ints(int n, int *src, long long *res, MPI_Comm comm);
+
+extern MPI_Datatype MPI_MyIntPosType;
+
+extern MPI_Op MPI_MIN_MyIntPosType;
+extern MPI_Op MPI_MAX_MyIntPosType;
+extern MPI_Op MPI_MIN_MySignedIntPosType;
+extern MPI_Op MPI_MAX_MySignedIntPosType;
+
+template <typename T>
+void allreduce_sum(T *glob, int N, MPI_Comm Communicator)
+{
+  int ntask, thistask, ptask;
+  MPI_Comm_size(Communicator, &ntask);
+  MPI_Comm_rank(Communicator, &thistask);
+
+  for(ptask = 0; ntask > (1 << ptask); ptask++)
+    ;
+
+  // we are responsible for a certain stretch of the result, namely the one starting at loc_first_n, of length blocksize[thistask]
+
+  int *blocksize  = (int *)Mem.mymalloc("blocksize", sizeof(int) * ntask);
+  int *blockstart = (int *)Mem.mymalloc("blockstart", sizeof(int) * ntask);
+
+  int blk     = N / ntask;
+  int rmd     = N - blk * ntask; /* remainder */
+  int pivot_n = rmd * (blk + 1);
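+
+  /* Illustrative decomposition (example values, not from the original source): for N = 10 and ntask = 4 we get
+     blk = 2, rmd = 2 and pivot_n = 6, so the block sizes are {3, 3, 2, 2}; element n = 4 (< pivot_n) is summed
+     on task 4 / 3 = 1, while element n = 7 ends up on task rmd + (7 - pivot_n) / blk = 2. */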
+
+  int loc_first_n = 0;
+  blockstart[0]   = 0;
+
+  for(int task = 0; task < ntask; task++)
+    {
+      if(task < rmd)
+        blocksize[task] = blk + 1;
+      else
+        blocksize[task] = blk;
+
+      if(task < thistask)
+        loc_first_n += blocksize[task];
+
+      if(task > 0)
+        blockstart[task] = blockstart[task - 1] + blocksize[task - 1];
+    }
+
+  /* here we store the local result */
+  T *loc_data = (T *)Mem.mymalloc_clear("loc_data", blocksize[thistask] * sizeof(T));
+
+  int *send_count = (int *)Mem.mymalloc("send_count", sizeof(int) * ntask);
+  int *recv_count = (int *)Mem.mymalloc("recv_count", sizeof(int) * ntask);
+
+  int *send_offset = (int *)Mem.mymalloc("send_offset", sizeof(int) * ntask);
+
+  struct ind_data
+  {
+    int n;
+    T val;
+  };
+
+  ind_data *export_data = NULL;
+  int nexport           = 0;
+
+  for(int rep = 0; rep < 2; rep++)
+    {
+      for(int j = 0; j < ntask; j++)
+        send_count[j] = 0;
+
+      /* find for each non-zero element the processor where it should go for being summed */
+      for(int n = 0; n < N; n++)
+        {
+          if(glob[n] != 0)
+            {
+              int task;
+              if(n < pivot_n)
+                task = n / (blk + 1);
+              else
+                task = rmd + (n - pivot_n) / blk; /* note: if blk=0, then this case can not occur */
+
+              if(rep == 0)
+                send_count[task]++;
+              else
+                {
+                  int index              = send_offset[task] + send_count[task]++;
+                  export_data[index].n   = n;
+                  export_data[index].val = glob[n];
+                }
+            }
+        }
+
+      if(rep == 0)
+        {
+          MPI_Alltoall(send_count, 1, MPI_INT, recv_count, 1, MPI_INT, Communicator);
+
+          send_offset[0] = 0;
+
+          for(int j = 0; j < ntask; j++)
+            {
+              nexport += send_count[j];
+
+              if(j > 0)
+                send_offset[j] = send_offset[j - 1] + send_count[j - 1];
+            }
+
+          export_data = (ind_data *)Mem.mymalloc("export_data", nexport * sizeof(ind_data));
+        }
+      else
+        {
+          for(int ngrp = 0; ngrp < (1 << ptask); ngrp++) /* note: here we also have a transfer from each task to itself (for ngrp=0) */
+            {
+              int recvTask = thistask ^ ngrp;
+              if(recvTask < ntask)
+                if(send_count[recvTask] > 0 || recv_count[recvTask] > 0)
+                  {
+                    int nimport = recv_count[recvTask];
+
+                    ind_data *import_data = (ind_data *)Mem.mymalloc("import_data", nimport * sizeof(ind_data));
+
+                    MPI_Sendrecv(&export_data[send_offset[recvTask]], send_count[recvTask] * sizeof(ind_data), MPI_BYTE, recvTask,
+                                 TAG_DENS_B, import_data, recv_count[recvTask] * sizeof(ind_data), MPI_BYTE, recvTask, TAG_DENS_B,
+                                 Communicator, MPI_STATUS_IGNORE);
+
+                    for(int i = 0; i < nimport; i++)
+                      {
+                        int j = import_data[i].n - loc_first_n;
+
+                        if(j < 0 || j >= blocksize[thistask])
+                          Terminate("j=%d < 0 || j>= blocksize[thistask]=%d", j, blocksize[thistask]);
+
+                        loc_data[j] += import_data[i].val;
+                      }
+
+                    Mem.myfree(import_data);
+                  }
+            }
+
+          Mem.myfree(export_data);
+        }
+    }
+
+  Mem.myfree(send_offset);
+  Mem.myfree(recv_count);
+  Mem.myfree(send_count);
+
+  /* now share the result across all processors */
+  for(int ngrp = 0; ngrp < (1 << ptask); ngrp++) /* note: here we also have a transfer from each task to itself (for ngrp=0) */
+    {
+      int recvTask = thistask ^ ngrp;
+      if(recvTask < ntask)
+        if(blocksize[thistask] > 0 || blocksize[recvTask] > 0)
+          MPI_Sendrecv(loc_data, blocksize[thistask] * sizeof(T), MPI_BYTE, recvTask, TAG_DENS_A, &glob[blockstart[recvTask]],
+                       blocksize[recvTask] * sizeof(T), MPI_BYTE, recvTask, TAG_DENS_A, Communicator, MPI_STATUS_IGNORE);
+    }
+
+  Mem.myfree(loc_data);
+  Mem.myfree(blockstart);
+  Mem.myfree(blocksize);
+}
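+
+/* Illustrative usage (a sketch; the array name and length are placeholders): each rank holds a sparsely
+ * populated partial array and needs the element-wise global sum on all ranks:
+ *
+ *   allreduce_sum<double>(partial_histogram, nbins, Communicator);
+ *
+ * On return, the array on every rank contains the summed result; only the non-zero entries are communicated
+ * in the summation phase, which is the point of this routine compared to a plain MPI_Allreduce. */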
+
+#endif
diff --git a/src/mpi_utils/mpi_vars.cc b/src/mpi_utils/mpi_vars.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d8c7f5fc691a5e79ca85e0468be14bf045c46077
--- /dev/null
+++ b/src/mpi_utils/mpi_vars.cc
@@ -0,0 +1,29 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  mpi_vars.cc
+ *
+ *  \brief contains the global variables defined in the MPI helper functions
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../mpi_utils/mpi_utils.h"
+
+MPI_Datatype MPI_MyIntPosType;
+
+MPI_Op MPI_MIN_MyIntPosType;
+MPI_Op MPI_MAX_MyIntPosType;
+MPI_Op MPI_MIN_MySignedIntPosType;
+MPI_Op MPI_MAX_MySignedIntPosType;
diff --git a/src/mpi_utils/myalltoall.cc b/src/mpi_utils/myalltoall.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b8148667e5831fb063c05da3034d2f6dbc4b710d
--- /dev/null
+++ b/src/mpi_utils/myalltoall.cc
@@ -0,0 +1,292 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ ******************************************************************************/
+
+/*! \file  myalltoall.cc
+ *
+ *  \brief a simple wrapper around MPI_Alltoallv that can deal with individual messages that are very large
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../mpi_utils/mpi_utils.h"
+
+#define PCHAR(a) ((char *)a)
+
+/* This method prepares an Alltoallv computation.
+   sendcnt: must have as many entries as there are Tasks in comm
+            must be set
+   recvcnt: must have as many entries as there are Tasks in comm
+            will be set on return
+   rdispls: must have as many entries as there are Tasks in comm, or be NULL
+            if not NULL, will be set on return
+   method:  0 or 1 selects the standard Alltoall() approach, 10 the one-sided approach
+   returns: number of entries needed in the recvbuf */
+int myMPI_Alltoallv_new_prep(int *sendcnt, int *recvcnt, int *rdispls, MPI_Comm comm, int method)
+{
+  int rank, nranks;
+  MPI_Comm_size(comm, &nranks);
+  MPI_Comm_rank(comm, &rank);
+
+  if(method == 0 || method == 1)
+    MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, comm);
+  else if(method == 10)
+    {
+      for(int i = 0; i < nranks; ++i)
+        recvcnt[i] = 0;
+      recvcnt[rank] = sendcnt[rank];  // local communication
+      MPI_Win win;
+      MPI_Win_create(recvcnt, nranks * sizeof(int), sizeof(int), MPI_INFO_NULL, comm, &win); /* the window holds plain ints */
+      MPI_Win_fence(0, win);
+      for(int i = 1; i < nranks; ++i)  // remote communication
+        {
+          int tgt = (rank + i) % nranks;
+          if(sendcnt[tgt] != 0)
+            MPI_Put(&sendcnt[tgt], 1, MPI_INT, tgt, rank, 1, MPI_INT, win);
+        }
+      MPI_Win_fence(0, win);
+      MPI_Win_free(&win);
+    }
+  else
+    Terminate("bad communication method");
+
+  int total = 0;
+  for(int i = 0; i < nranks; ++i)
+    {
+      if(rdispls)
+        rdispls[i] = total;
+      total += recvcnt[i];
+    }
+  return total;
+}
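+
+/* Illustrative pairing of the prep and exchange routines (a sketch; the data type, buffers and 'NTask' are
+ * placeholders for whatever the caller uses):
+ *
+ *   int *recvcnt = (int *)Mem.mymalloc("recvcnt", NTask * sizeof(int));
+ *   int *rdispls = (int *)Mem.mymalloc("rdispls", NTask * sizeof(int));
+ *   int nimport  = myMPI_Alltoallv_new_prep(sendcnt, recvcnt, rdispls, comm, 1);
+ *   mydata *recvbuf = (mydata *)Mem.mymalloc("recvbuf", nimport * sizeof(mydata));
+ *   myMPI_Alltoallv_new(sendbuf, sendcnt, sdispls, mpi_mydata_type, recvbuf, recvcnt, rdispls,
+ *                       mpi_mydata_type, comm, 1);   // method 1 = pairwise blocking sendrecv
+ */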
+
+void myMPI_Alltoallv_new(void *sendbuf, int *sendcnt, int *sdispls, MPI_Datatype sendtype, void *recvbuf, int *recvcnt, int *rdispls,
+                         MPI_Datatype recvtype, MPI_Comm comm, int method)
+{
+  int rank, nranks, itsz;
+  MPI_Comm_size(comm, &nranks);
+  MPI_Comm_rank(comm, &rank);
+  MPI_Type_size(sendtype, &itsz);
+  size_t tsz = itsz;  // to enforce size_t data type in later computations
+
+  if(method == 0)  // standard Alltoallv
+    MPI_Alltoallv(sendbuf, sendcnt, sdispls, sendtype, recvbuf, recvcnt, rdispls, recvtype, comm);
+  else if(method == 1)  // blocking sendrecv
+    {
+      if(sendtype != recvtype)
+        Terminate("bad MPI communication types");
+      int lptask = 1;
+      while(lptask < nranks)
+        lptask <<= 1;
+      int tag = 42;
+      MPI_Status status;
+
+      if(recvcnt[rank] > 0)  // local communication
+        memcpy(PCHAR(recvbuf) + tsz * rdispls[rank], PCHAR(sendbuf) + tsz * sdispls[rank], tsz * recvcnt[rank]);
+
+      for(int ngrp = 1; ngrp < lptask; ngrp++)
+        {
+          int otask = rank ^ ngrp;
+          if(otask < nranks)
+            if(sendcnt[otask] > 0 || recvcnt[otask] > 0)
+              MPI_Sendrecv(PCHAR(sendbuf) + tsz * sdispls[otask], sendcnt[otask], sendtype, otask, tag,
+                           PCHAR(recvbuf) + tsz * rdispls[otask], recvcnt[otask], recvtype, otask, tag, comm, &status);
+        }
+    }
+  else if(method == 2)  // asynchronous communication
+    {
+      if(sendtype != recvtype)
+        Terminate("bad MPI communication types");
+      int lptask = 1;
+      while(lptask < nranks)
+        lptask <<= 1;
+      int tag = 42;
+
+      MPI_Request *requests = (MPI_Request *)Mem.mymalloc("requests", 2 * nranks * sizeof(MPI_Request));
+      int n_requests        = 0;
+
+      if(recvcnt[rank] > 0)  // local communication
+        memcpy(PCHAR(recvbuf) + tsz * rdispls[rank], PCHAR(sendbuf) + tsz * sdispls[rank], tsz * recvcnt[rank]);
+
+      for(int ngrp = 1; ngrp < lptask; ngrp++)
+        {
+          int otask = rank ^ ngrp;
+          if(otask < nranks)
+            if(recvcnt[otask] > 0)
+              MPI_Irecv(PCHAR(recvbuf) + tsz * rdispls[otask], recvcnt[otask], recvtype, otask, tag, comm, &requests[n_requests++]);
+        }
+
+      for(int ngrp = 1; ngrp < lptask; ngrp++)
+        {
+          int otask = rank ^ ngrp;
+          if(otask < nranks)
+            if(sendcnt[otask] > 0)
+              MPI_Issend(PCHAR(sendbuf) + tsz * sdispls[otask], sendcnt[otask], sendtype, otask, tag, comm, &requests[n_requests++]);
+        }
+
+      MPI_Waitall(n_requests, requests, MPI_STATUSES_IGNORE);
+      Mem.myfree(requests);
+    }
+  else if(method == 10)
+    {
+      if(sendtype != recvtype)
+        Terminate("bad MPI communication types");
+      int *disp_at_sender  = (int *)Mem.mymalloc("disp_at_sender", nranks * sizeof(int));
+      disp_at_sender[rank] = sdispls[rank];
+      MPI_Win win;
+      // TODO: supply info object with "no_lock"
+      MPI_Win_create(sdispls, nranks * sizeof(int), sizeof(int), MPI_INFO_NULL, comm, &win); /* the window holds plain ints */
+      MPI_Win_fence(0, win);
+      for(int i = 1; i < nranks; ++i)
+        {
+          int tgt = (rank + i) % nranks;
+          if(recvcnt[tgt] != 0)
+            MPI_Get(&disp_at_sender[tgt], 1, MPI_INT, tgt, rank, 1, MPI_INT, win);
+        }
+      MPI_Win_fence(0, win);
+      MPI_Win_free(&win);
+      if(recvcnt[rank] > 0)  // first take care of local communication
+        memcpy(PCHAR(recvbuf) + tsz * rdispls[rank], PCHAR(sendbuf) + tsz * sdispls[rank], tsz * recvcnt[rank]);
+      MPI_Win_create(sendbuf, (sdispls[nranks - 1] + sendcnt[nranks - 1]) * tsz, tsz, MPI_INFO_NULL, comm, &win);
+      MPI_Win_fence(0, win);
+      for(int i = 1; i < nranks; ++i)  // now the rest, start with right neighbour
+        {
+          int tgt = (rank + i) % nranks;
+          if(recvcnt[tgt] != 0)
+            MPI_Get(PCHAR(recvbuf) + tsz * rdispls[tgt], recvcnt[tgt], sendtype, tgt, disp_at_sender[tgt], recvcnt[tgt], sendtype,
+                    win);
+        }
+      MPI_Win_fence(0, win);
+      MPI_Win_free(&win);
+      Mem.myfree(disp_at_sender);
+    }
+  else
+    Terminate("bad communication method");
+}
+
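+/* A variant of Alltoallv in which counts and displacements are passed as size_t values in units of elements
+ * of 'len' bytes each. For big_flag == 0 the exchange is done with a single MPI_Alltoallv call (after
+ * converting to int byte counts); otherwise a pairwise hypercube exchange through myMPI_Sendrecv is used,
+ * which passes size_t byte counts and thus avoids the int-range limit of MPI_Alltoallv's counts.
+ * Illustrative call (buffer and element names are placeholders):
+ *
+ *   myMPI_Alltoallv(sendbuf, send_count, send_offset, recvbuf, recv_count, recv_offset,
+ *                   sizeof(struct mydata), big_flag, Communicator);
+ */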
+void myMPI_Alltoallv(void *sendb, size_t *sendcounts, size_t *sdispls, void *recvb, size_t *recvcounts, size_t *rdispls, int len,
+                     int big_flag, MPI_Comm comm)
+{
+  char *sendbuf = (char *)sendb;
+  char *recvbuf = (char *)recvb;
+
+  if(big_flag == 0)
+    {
+      int ntask;
+      MPI_Comm_size(comm, &ntask);
+
+      int *scount = (int *)Mem.mymalloc("scount", ntask * sizeof(int));
+      int *rcount = (int *)Mem.mymalloc("rcount", ntask * sizeof(int));
+      int *soff   = (int *)Mem.mymalloc("soff", ntask * sizeof(int));
+      int *roff   = (int *)Mem.mymalloc("roff", ntask * sizeof(int));
+
+      for(int i = 0; i < ntask; i++)
+        {
+          scount[i] = sendcounts[i] * len;
+          rcount[i] = recvcounts[i] * len;
+          soff[i]   = sdispls[i] * len;
+          roff[i]   = rdispls[i] * len;
+        }
+
+      MPI_Alltoallv(sendbuf, scount, soff, MPI_BYTE, recvbuf, rcount, roff, MPI_BYTE, comm);
+
+      Mem.myfree(roff);
+      Mem.myfree(soff);
+      Mem.myfree(rcount);
+      Mem.myfree(scount);
+    }
+  else
+    {
+      /* here we definitely have some large messages. We default to the
+       * pair-wise protocol, which should be most robust anyway.
+       */
+      int ntask, thistask, ptask;
+      MPI_Comm_size(comm, &ntask);
+      MPI_Comm_rank(comm, &thistask);
+
+      for(ptask = 0; ntask > (1 << ptask); ptask++)
+        ;
+
+      for(int ngrp = 0; ngrp < (1 << ptask); ngrp++)
+        {
+          int target = thistask ^ ngrp;
+
+          if(target < ntask)
+            {
+              if(sendcounts[target] > 0 || recvcounts[target] > 0)
+                myMPI_Sendrecv(sendbuf + sdispls[target] * len, sendcounts[target] * len, MPI_BYTE, target, TAG_PDATA + ngrp,
+                               recvbuf + rdispls[target] * len, recvcounts[target] * len, MPI_BYTE, target, TAG_PDATA + ngrp, comm,
+                               MPI_STATUS_IGNORE);
+            }
+        }
+    }
+}
+
+void my_int_MPI_Alltoallv(void *sendb, int *sendcounts, int *sdispls, void *recvb, int *recvcounts, int *rdispls, int len,
+                          int big_flag, MPI_Comm comm)
+{
+  char *sendbuf = (char *)sendb;
+  char *recvbuf = (char *)recvb;
+
+  if(big_flag == 0)
+    {
+      int ntask;
+      MPI_Comm_size(comm, &ntask);
+
+      int *scount = (int *)Mem.mymalloc("scount", ntask * sizeof(int));
+      int *rcount = (int *)Mem.mymalloc("rcount", ntask * sizeof(int));
+      int *soff   = (int *)Mem.mymalloc("soff", ntask * sizeof(int));
+      int *roff   = (int *)Mem.mymalloc("roff", ntask * sizeof(int));
+
+      for(int i = 0; i < ntask; i++)
+        {
+          scount[i] = sendcounts[i] * len;
+          rcount[i] = recvcounts[i] * len;
+          soff[i]   = sdispls[i] * len;
+          roff[i]   = rdispls[i] * len;
+        }
+
+      MPI_Alltoallv(sendbuf, scount, soff, MPI_BYTE, recvbuf, rcount, roff, MPI_BYTE, comm);
+
+      Mem.myfree(roff);
+      Mem.myfree(soff);
+      Mem.myfree(rcount);
+      Mem.myfree(scount);
+    }
+  else
+    {
+      /* here we definitely have some large messages. We default to the
+       * pair-wise protocol, which should be most robust anyway.
+       */
+      int ntask, thistask, ptask;
+      MPI_Comm_size(comm, &ntask);
+      MPI_Comm_rank(comm, &thistask);
+
+      for(ptask = 0; ntask > (1 << ptask); ptask++)
+        ;
+
+      for(int ngrp = 0; ngrp < (1 << ptask); ngrp++)
+        {
+          int target = thistask ^ ngrp;
+
+          if(target < ntask)
+            {
+              if(sendcounts[target] > 0 || recvcounts[target] > 0)
+                myMPI_Sendrecv(sendbuf + sdispls[target] * len, sendcounts[target] * len, MPI_BYTE, target, TAG_PDATA + ngrp,
+                               recvbuf + rdispls[target] * len, recvcounts[target] * len, MPI_BYTE, target, TAG_PDATA + ngrp, comm,
+                               MPI_STATUS_IGNORE);
+            }
+        }
+    }
+}
diff --git a/src/mpi_utils/setcomm.h b/src/mpi_utils/setcomm.h
new file mode 100644
index 0000000000000000000000000000000000000000..17ae1a39dad657e050e4118ac85f39553de610a6
--- /dev/null
+++ b/src/mpi_utils/setcomm.h
@@ -0,0 +1,173 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  setcomm.h
+ *
+ *  \brief implements a class providing basic information about the local MPI communicator
+ */
+
+#ifndef SETCOMM_H
+#define SETCOMM_H
+
+#include <mpi.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <algorithm>
+
+class setcomm
+{
+ public:
+  setcomm(MPI_Comm Comm) { initcomm(Comm); }
+  setcomm(const char *str)
+  {
+    /* do nothing in this case, because we need to delay the initialization until MPI_Init has been executed */
+  }
+
+  MPI_Comm Communicator;
+  int NTask;
+  int ThisTask;
+  int PTask;
+
+  int ThisNode;        /**< the rank of the current compute node  */
+  int NumNodes = 0;    /**< the number of compute nodes used  */
+  int TasksInThisNode; /**< number of MPI tasks on  current compute node */
+  int RankInThisNode;  /**< rank of the MPI task on the current compute node */
+  int MinTasksPerNode; /**< the minimum number of MPI tasks that is found on any of the nodes  */
+  int MaxTasksPerNode; /**< the maximum number of MPI tasks that is found on any of the nodes  */
+  long long MemoryOnNode;
+  long long SharedMemoryOnNode;
+
+  void initcomm(MPI_Comm Comm)
+  {
+    Communicator = Comm;
+    MPI_Comm_rank(Communicator, &ThisTask);
+    MPI_Comm_size(Communicator, &NTask);
+
+    for(PTask = 0; NTask > (1 << PTask); PTask++)
+      ;
+  }
+
+  void mpi_printf(const char *fmt, ...)
+  {
+    if(ThisTask == 0)
+      {
+        va_list l;
+        va_start(l, fmt);
+        vprintf(fmt, l);
+        //        myflush(stdout);
+        va_end(l);
+      }
+  }
+
+ private:
+  struct node_data
+  {
+    int task, this_node, first_task_in_this_node;
+    int first_index, rank_in_node, tasks_in_node;
+    char name[MPI_MAX_PROCESSOR_NAME];
+  };
+  node_data loc_node, *list_of_nodes;
+
+  static bool system_compare_hostname(const node_data &a, const node_data &b)
+  {
+    int res = strcmp(a.name, b.name);
+    if(res < 0)
+      return true;
+    if(res > 0)
+      return false;
+    return a.task < b.task;
+  }
+
+  static bool system_compare_first_task(const node_data &a, const node_data &b)
+  {
+    if(a.first_task_in_this_node < b.first_task_in_this_node)
+      return true;
+    if(a.first_task_in_this_node > b.first_task_in_this_node)
+      return false;
+    return a.task < b.task;
+  }
+
+  static bool system_compare_task(const node_data &a, const node_data &b) { return a.task < b.task; }
+
+ public:
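+  /* Groups the MPI ranks of the communicator into physical compute nodes: the processor names of all ranks are
+   * gathered, sorted by hostname, and consecutive node numbers are assigned. Illustrative outcome (host names
+   * are examples only): with ranks 0-3 on host "n01" and ranks 4-7 on host "n02", the former get ThisNode = 0,
+   * the latter ThisNode = 1, and RankInThisNode counts from 0 within each host. */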
+  void determine_compute_nodes(void)
+  {
+    int len, nodes, i, no, rank, first_index;
+
+    MPI_Get_processor_name(loc_node.name, &len);
+    loc_node.task = ThisTask;
+
+    list_of_nodes = (node_data *)malloc(
+        sizeof(node_data) * NTask); /* Note: Internal memory allocation routines are not yet available when this function is called */
+
+    MPI_Allgather(&loc_node, sizeof(node_data), MPI_BYTE, list_of_nodes, sizeof(node_data), MPI_BYTE, Communicator);
+
+    std::sort(list_of_nodes, list_of_nodes + NTask, system_compare_hostname);
+
+    list_of_nodes[0].first_task_in_this_node = list_of_nodes[0].task;
+
+    for(i = 1, nodes = 1; i < NTask; i++)
+      {
+        if(strcmp(list_of_nodes[i].name, list_of_nodes[i - 1].name) != 0)
+          {
+            list_of_nodes[i].first_task_in_this_node = list_of_nodes[i].task;
+            nodes++;
+          }
+        else
+          list_of_nodes[i].first_task_in_this_node = list_of_nodes[i - 1].first_task_in_this_node;
+      }
+
+    std::sort(list_of_nodes, list_of_nodes + NTask, system_compare_first_task);
+
+    for(i = 0; i < NTask; i++)
+      list_of_nodes[i].tasks_in_node = 0;
+
+    for(i = 0, no = 0, rank = 0, first_index = 0; i < NTask; i++)
+      {
+        if(i > 0 && list_of_nodes[i].first_task_in_this_node != list_of_nodes[i - 1].first_task_in_this_node)
+          {
+            no++;
+            rank        = 0;
+            first_index = i;
+          }
+
+        list_of_nodes[i].first_index  = first_index;
+        list_of_nodes[i].this_node    = no;
+        list_of_nodes[i].rank_in_node = rank++;
+        list_of_nodes[first_index].tasks_in_node++;
+      }
+
+    int max_count = 0;
+    int min_count = (1 << 30);
+
+    for(i = 0; i < NTask; i++)
+      {
+        list_of_nodes[i].tasks_in_node = list_of_nodes[list_of_nodes[i].first_index].tasks_in_node;
+
+        if(list_of_nodes[i].tasks_in_node > max_count)
+          max_count = list_of_nodes[i].tasks_in_node;
+        if(list_of_nodes[i].tasks_in_node < min_count)
+          min_count = list_of_nodes[i].tasks_in_node;
+      }
+
+    std::sort(list_of_nodes, list_of_nodes + NTask, system_compare_task);
+
+    TasksInThisNode = list_of_nodes[ThisTask].tasks_in_node;
+    RankInThisNode  = list_of_nodes[ThisTask].rank_in_node;
+
+    ThisNode = list_of_nodes[ThisTask].this_node;
+
+    NumNodes        = nodes;
+    MinTasksPerNode = min_count;
+    MaxTasksPerNode = max_count;
+
+    free(list_of_nodes);
+  }
+};
+
+#endif
diff --git a/src/mpi_utils/shared_mem_handler.cc b/src/mpi_utils/shared_mem_handler.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ee9edfd377d3a88cbb9a1ff912bf83f4886b63ef
--- /dev/null
+++ b/src/mpi_utils/shared_mem_handler.cc
@@ -0,0 +1,563 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  shared_mem_handler.cc
+ *
+ *  \brief implements code for the shared-memory fetching of remote data through designated MPI handler ranks
+ */
+
+#include <hdf5.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <algorithm>
+#include <cstring>
+
+#include "../gravtree/gravtree.h"
+#include "../ngbtree/ngbtree.h"
+#include "../time_integration/driftfac.h"
+
+#include "../mpi_utils/shared_mem_handler.h"
+
+typedef gravtree<simparticles> gtree;
+typedef ngbtree ntree;
+
+void shmem::prepare_offset_table(void *p, ptrdiff_t *&offset_tab)  // called by ghost task
+{
+  ptrdiff_t off = ((char *)p - Mem.Base);
+
+  offset_tab = (ptrdiff_t *)Mem.mymalloc("offset_tab", Island_NTask * sizeof(ptrdiff_t));
+
+  MPI_Gather(&off, sizeof(ptrdiff_t), MPI_BYTE, offset_tab, sizeof(ptrdiff_t), MPI_BYTE, Island_NTask - 1, SharedMemComm);
+}
+
+void shmem::inform_offset_table(void *p)  // called by worker tasks
+{
+  ptrdiff_t off = ((char *)p - Mem.Base);
+
+  MPI_Gather(&off, sizeof(ptrdiff_t), MPI_BYTE, NULL, sizeof(ptrdiff_t), MPI_BYTE, Island_NTask - 1, SharedMemComm);
+}
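+
+/* Note: prepare_offset_table() (executed by the ghost rank, which acts as root of the gather) and
+ * inform_offset_table() (executed by the worker ranks, which pass a NULL receive buffer) form the two sides of
+ * the same collective MPI_Gather on SharedMemComm and therefore have to be called together. */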
+
+void shmem::free_offset_table(ptrdiff_t *&offset_tab) { Mem.myfree(offset_tab); }
+
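+/* Event loop executed by the dedicated shared-memory handler ("ghost") rank of each shared-memory island:
+ * after receiving the global parameters and initializing the memory manager, it blocks in MPI_Probe and
+ * serves incoming requests (tree-node fetches, table/top-node allocations, drift-table initialization,
+ * graceful termination) on behalf of the worker ranks in its island. */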
+void shmem::shared_memory_handler(void)
+{
+  simparticles Dp{MPI_COMM_WORLD}; /* dummy needed to access drift functions for ngbtree */
+
+  /* first, we wait for the parameter All.MaxMemSize, so that we can initialize the memory handler */
+  MPI_Bcast(All.get_data_ptr(), All.get_data_size(), MPI_BYTE, 0, MPI_COMM_WORLD);
+  Mem.mymalloc_init(All.MaxMemSize, RST_BEGIN);
+
+  while(true)
+    {
+      /* wait for an incoming message */
+      MPI_Status status;
+      MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
+
+      int source = status.MPI_SOURCE;
+      int tag    = status.MPI_TAG;
+
+      int length;
+      MPI_Get_count(&status, MPI_BYTE, &length);
+
+      /* now pick it up */
+      char *message = (char *)Mem.mymalloc("message", length);
+      MPI_Recv(message, length, MPI_BYTE, source, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+      if(tag == TAG_METDATA)  // signals that we are synchronizing addresses and values for tree access
+        {
+          int handle = *((int *)message);
+          Mem.myfree(message);
+
+          MPI_Recv(&All.Ti_Current, sizeof(All.Ti_Current), MPI_BYTE, source, TAG_METDATA + 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+          MPI_Recv(&tree_info[handle].Bd, sizeof(bookkeeping_data), MPI_BYTE, source, TAG_METDATA + 2, MPI_COMM_WORLD,
+                   MPI_STATUS_IGNORE);
+
+          intposconvert *convfac = &Dp;
+          MPI_Recv(convfac, sizeof(intposconvert), MPI_BYTE, source, TAG_METDATA + 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+          prepare_offset_table(NULL, tree_info[handle].TopNodes_offsets);
+          prepare_offset_table(NULL, tree_info[handle].Nodes_offsets);
+          prepare_offset_table(NULL, tree_info[handle].Nextnode_offsets);
+          prepare_offset_table(NULL, tree_info[handle].Points_offsets);
+          prepare_offset_table(NULL, tree_info[handle].P_offsets);
+          prepare_offset_table(NULL, tree_info[handle].SphP_offsets);
+          prepare_offset_table(NULL, tree_info[handle].Foreign_Nodes_offsets);
+          prepare_offset_table(NULL, tree_info[handle].Foreign_Points_offsets);
+        }
+      else if(tag == TAG_HEADER)  // signals that we are freeing addresses we stored for tree access
+        {
+          int handle = *((int *)message);
+          Mem.myfree(message);
+
+          free_offset_table(tree_info[handle].Foreign_Points_offsets);
+          free_offset_table(tree_info[handle].Foreign_Nodes_offsets);
+          free_offset_table(tree_info[handle].SphP_offsets);
+          free_offset_table(tree_info[handle].P_offsets);
+          free_offset_table(tree_info[handle].Points_offsets);
+          free_offset_table(tree_info[handle].Nextnode_offsets);
+          free_offset_table(tree_info[handle].Nodes_offsets);
+          free_offset_table(tree_info[handle].TopNodes_offsets);
+        }
+      else if(tag >= TAG_FETCH_SPH_DENSITY && tag < TAG_FETCH_SPH_DENSITY + MAX_TREE_INFOS)  // fetch from SPH density tree
+        {
+          int handle = tag - TAG_FETCH_SPH_DENSITY;
+          deal_with_sph_node_request(message, length, source, handle, &Dp);
+          Mem.myfree(message);
+        }
+      else if(tag >= TAG_FETCH_SPH_HYDRO && tag < TAG_FETCH_SPH_HYDRO + MAX_TREE_INFOS)  // fetch from SPH hydro tree
+        {
+          int handle = tag - TAG_FETCH_SPH_HYDRO;
+          deal_with_sph_node_request(message, length, source, handle, &Dp);
+          Mem.myfree(message);
+        }
+      else if(tag >= TAG_FETCH_SPH_TREETIMESTEP && tag < TAG_FETCH_SPH_TREETIMESTEP + MAX_TREE_INFOS)  // fetch from SPH timestep tree
+        {
+          int handle = tag - TAG_FETCH_SPH_TREETIMESTEP;
+          deal_with_sph_node_request(message, length, source, handle, &Dp);
+          Mem.myfree(message);
+        }
+      else if(tag >= TAG_FETCH_GRAVTREE && tag < TAG_FETCH_GRAVTREE + MAX_TREE_INFOS)  // fetch from gravity tree
+        {
+          int handle = tag - TAG_FETCH_GRAVTREE;
+          deal_with_gravity_node_request(message, length, source, handle);
+          Mem.myfree(message);
+        }
+      else if(tag == TAG_KEY)  // request to terminate gracefully
+        {
+          H5Eset_auto(H5E_DEFAULT, NULL, NULL);
+          MPI_Finalize();
+          exit(0);
+        }
+      else if(tag == TAG_TABLE_ALLOC)  // take over storage for TableData
+        {
+          size_t tab_len = *((size_t *)message);
+          Mem.myfree(message);
+
+          TableData = (char *)Mem.mymalloc("table", tab_len);
+          MPI_Recv(TableData, tab_len, MPI_BYTE, source, TAG_DMOM, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+          ptrdiff_t off = ((char *)TableData - Mem.Base);
+          MPI_Bcast(&off, sizeof(ptrdiff_t), MPI_BYTE, Island_ThisTask, SharedMemComm);
+        }
+      else if(tag == TAG_TABLE_FREE)
+        {
+          Mem.myfree(message);
+          Mem.myfree(TableData);
+        }
+      else if(tag == TAG_EWALD_ALLOC)  // take over storage for EwaldData
+        {
+          size_t tab_len = *((size_t *)message);
+          Mem.myfree(message);
+
+          EwaldData = (char *)Mem.mymalloc("table", tab_len);
+          MPI_Recv(EwaldData, tab_len, MPI_BYTE, source, TAG_DMOM, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+          ptrdiff_t off = ((char *)EwaldData - Mem.Base);
+          MPI_Bcast(&off, sizeof(ptrdiff_t), MPI_BYTE, Island_ThisTask, SharedMemComm);
+        }
+      else if(tag == TAG_TOPNODE_ALLOC)  // take over the storage of common top-level tree data
+        {
+          int handle = num_tree_info++;
+          if(num_tree_info > MAX_TREE_INFOS)
+            Terminate("num_tree_info > MAX_TREE_INFOS");
+
+          size_t *sizep   = ((size_t *)message);
+          size_t sizes[4] = {sizep[0], sizep[1], sizep[2], sizep[3]};
+          Mem.myfree(message);
+
+          tree_info[handle].NodeLevel_storage   = (char *)Mem.mymalloc("NodeLevel_storage", sizes[0]);
+          tree_info[handle].NodeSibling_storage = (char *)Mem.mymalloc("NodeSibling_storage", sizes[1]);
+          tree_info[handle].NodeIndex_storage   = (char *)Mem.mymalloc("NodeIndex_storage", sizes[2]);
+          tree_info[handle].TopNodes_storage    = (char *)Mem.mymalloc("TopNodes_storage", sizes[3]);
+
+          ptrdiff_t off[4] = {
+              ((char *)tree_info[handle].NodeLevel_storage - Mem.Base), ((char *)tree_info[handle].NodeSibling_storage - Mem.Base),
+              ((char *)tree_info[handle].NodeIndex_storage - Mem.Base), ((char *)tree_info[handle].TopNodes_storage - Mem.Base)};
+
+          MPI_Send(off, 4 * sizeof(ptrdiff_t), MPI_BYTE, source, TAG_TOPNODE_OFFSET, MPI_COMM_WORLD);
+
+          MPI_Send(&handle, 1, MPI_INT, source, TAG_N, MPI_COMM_WORLD);
+        }
+      else if(tag == TAG_TOPNODE_FREE)  // free the top-level storage for a tree again
+        {
+          int handle = *((int *)message);
+          Mem.myfree(message);
+
+          num_tree_info--;
+          if(handle != num_tree_info)
+            Terminate("unexpected handle");
+
+          Mem.myfree(tree_info[handle].TopNodes_storage);
+          Mem.myfree(tree_info[handle].NodeIndex_storage);
+          Mem.myfree(tree_info[handle].NodeSibling_storage);
+          Mem.myfree(tree_info[handle].NodeLevel_storage);
+        }
+      else if(tag == TAG_DRIFT_INIT)  // make the shared memory handler update All and init the local drift tables
+        {
+          memcpy(All.get_data_ptr(), message, All.get_data_size());
+          Mem.myfree(message);
+          Driftfac.init_drift_table();
+        }
+    }
+}
+
+void shmem::deal_with_sph_node_request(char *message, int length, int source, int handle, simparticles *Sp)
+{
+#ifndef LEAN
+  bookkeeping_data &Bdat = tree_info[handle].Bd;
+
+  // we got the list of requested nodes
+  ntree::node_req *node_req_recv = (ntree::node_req *)message;
+  int nrecv                      = length / sizeof(ntree::node_req);
+
+  /* as part of this message, we get the rank that is actually targeted in
+   * the simulation communicator. We will translate this to the rank in our shared memory
+   * block
+   */
+
+  /* now prepare answer message by reading from shared memory */
+  /******************* prepare tree answer ********************/
+
+  ntree::node_count_info *node_info_recv =
+      (ntree::node_count_info *)Mem.mymalloc("node_info_recv", nrecv * sizeof(ntree::node_count_info));
+
+  /* first let's count how many nodes and particles are hanging below in each case */
+  int n_recvpoints = 0;
+  int n_recvnodes  = 0;
+
+  for(int i = 0; i < nrecv; i++)
+    {
+      node_info_recv[i].count_nodes = 0;
+      node_info_recv[i].count_parts = 0;
+
+      int no      = node_req_recv[i].foreignnode;
+      int task    = node_req_recv[i].foreigntask;
+      int shmrank = GetShmRankForSimulCommRank[task];  // corresponding local shared memory rank, stays fixed below
+
+      if(no < Bdat.MaxPart || no >= Bdat.MaxPart + Bdat.MaxNodes)
+        Terminate("not an internal node");
+
+      ngbnode *nop = ((ngbnode *)get_basenodep(no, shmrank, handle)) + no;
+
+      int p = nop->nextnode;
+
+      while(p != nop->sibling)
+        {
+          if(p < 0)
+            Terminate("p=%d < 0", p);
+
+          if(p < Bdat.MaxPart) /* a local particle */
+            {
+              node_info_recv[i].count_parts++;
+              p = get_nextnodep(shmrank, handle)[p];
+            }
+          else if(p < Bdat.MaxPart + Bdat.MaxNodes) /* an internal node  */
+            {
+              node_info_recv[i].count_nodes++;
+              p = (((ngbnode *)get_basenodep(p, shmrank, handle)) + p)->sibling;
+            }
+          else if(p >= Bdat.ImportedNodeOffset && p < Bdat.EndOfTreePoints) /* an imported tree point */
+            {
+              node_info_recv[i].count_parts++;
+              p = get_nextnodep(shmrank, handle)[p - Bdat.MaxNodes];
+            }
+          else
+            Terminate("p=%d MaxPart=%d MaxNodes=%d", p, Bdat.MaxPart, Bdat.MaxNodes);
+        }
+
+      if(node_info_recv[i].count_parts == 0 && node_info_recv[i].count_nodes == 0)
+        Terminate("strange: have we requested an empty node?\n");
+
+      n_recvpoints += node_info_recv[i].count_parts;
+      n_recvnodes += node_info_recv[i].count_nodes;
+    }
+
+  foreign_sphpoint_data *exportbuf_points = (foreign_sphpoint_data *)Mem.mymalloc_movable(
+      &exportbuf_points, "exportbuf_points", n_recvpoints * sizeof(foreign_sphpoint_data));
+  ngbnode *exportbuf_nodes = (ngbnode *)Mem.mymalloc_movable(&exportbuf_nodes, "exportbuf_nodes", n_recvnodes * sizeof(ngbnode));
+
+  n_recvpoints = 0;
+  n_recvnodes  = 0;
+
+  for(int i = 0; i < nrecv; i++)
+    {
+      int no      = node_req_recv[i].foreignnode;
+      int task    = node_req_recv[i].foreigntask;
+      int shmrank = GetShmRankForSimulCommRank[task];  // corresponding local shared memory rank, stays fixed below
+
+      ngbnode *nop = ((ngbnode *)get_basenodep(no, shmrank, handle)) + no;
+
+      int p = nop->nextnode;
+
+      while(p != nop->sibling)
+        {
+          if(p < Bdat.MaxPart) /* a local particle */
+            {
+              int off = n_recvpoints++;
+
+              foreign_sphpoint_data *expoints = &exportbuf_points[off];
+
+              particle_data *ptr         = (particle_data *)get_Pp(shmrank, handle) + p;
+              sph_particle_data *sph_ptr = (sph_particle_data *)get_SphPp(shmrank, handle) + p;
+
+              particle_data p_copy;
+              sph_particle_data sphp_copy;
+
+              if(ptr->get_Ti_Current() != All.Ti_Current)
+                {
+                  /* Because of possible lightcone output, the shared-memory fetch is not allowed to drift the original
+                   * particle, because this rank doesn't have a lightcone buffer. The original node needs to do it.
+                   * We thus drift a copy of the particle without allowing lightcone access.
+                   */
+#ifndef LEAN
+                  while(ptr->access.test_and_set(std::memory_order_acquire))
+                    ;  // acquire spin lock
+#endif
+                  p_copy    = *ptr;
+                  sphp_copy = *sph_ptr;
+
+#ifndef LEAN
+                  ptr->access.clear(std::memory_order_release);  // release spin lock
+#endif
+                  p_copy.access.clear(std::memory_order_release);  // clear spin lock in copy
+
+                  /* use the copy from now on */
+
+                  ptr     = &p_copy;
+                  sph_ptr = &sphp_copy;
+
+                  // the final flag tells the drift to not consider lightcone crossings
+                  Sp->drift_particle(ptr, sph_ptr, All.Ti_Current, true);
+                }
+
+              expoints->IntPos[0]    = ptr->IntPos[0];
+              expoints->IntPos[1]    = ptr->IntPos[1];
+              expoints->IntPos[2]    = ptr->IntPos[2];
+              expoints->Mass         = ptr->getMass();
+              expoints->TimeBinHydro = ptr->TimeBinHydro;
+              expoints->SphCore      = *sph_ptr;
+
+              expoints->Nextnode = -1;
+
+              p = get_nextnodep(shmrank, handle)[p];
+            }
+          else if(p < Bdat.MaxPart + Bdat.MaxNodes) /* an internal node  */
+            {
+              int off = n_recvnodes++;
+
+              ngbnode *sourcep = (((ngbnode *)get_basenodep(p, shmrank, handle)) + p);
+              ngbnode *exnodes = &exportbuf_nodes[off];
+
+              if(sourcep->Ti_Current != All.Ti_Current)
+                sourcep->drift_node(All.Ti_Current, Sp);
+
+              *exnodes = *sourcep;
+
+              exnodes->cannot_be_opened_locally = 1;
+              exnodes->nextnode                 = -1;
+              exnodes->sibling                  = -1;
+              exnodes->OriginTask               = task;
+              exnodes->OriginNode               = p;
+
+              p = sourcep->sibling;
+            }
+          else if(p >= Bdat.ImportedNodeOffset) /* an imported treepoint particle  */
+            {
+              Terminate("not expected here");
+            }
+        }
+    }
+
+  /************************************************************/
+
+  MPI_Send(node_info_recv, nrecv * sizeof(ntree::node_count_info), MPI_BYTE, source, TAG_N, MPI_COMM_WORLD);
+
+  /* now transfer the points and nodes */
+  if(n_recvpoints > 0)
+    MPI_Send(exportbuf_points, n_recvpoints * sizeof(foreign_sphpoint_data), MPI_BYTE, source, TAG_PDATA, MPI_COMM_WORLD);
+
+  if(n_recvnodes > 0)
+    MPI_Send(exportbuf_nodes, n_recvnodes * sizeof(ngbnode), MPI_BYTE, source, TAG_SPHDATA, MPI_COMM_WORLD);
+
+  Mem.myfree(exportbuf_nodes);
+  Mem.myfree(exportbuf_points);
+  Mem.myfree(node_info_recv);
+#endif
+}
+
+void shmem::deal_with_gravity_node_request(char *message, int length, int source, int handle)
+{
+  bookkeeping_data &Bdat = tree_info[handle].Bd;
+
+  // we got the list of requested nodes
+  gtree::node_req *node_req_recv = (gtree::node_req *)message;
+  int nrecv                      = length / sizeof(gtree::node_req);
+
+  /* as part of this message, we get the actually targeted rank in
+   * the simulation communicator; we will translate this to the corresponding rank
+   * in our shared memory block
+   */
+
+  /* now prepare answer message by reading from shared memory */
+  /******************* prepare tree answer ********************/
+
+  gtree::node_count_info *node_info_recv =
+      (gtree::node_count_info *)Mem.mymalloc("node_info_recv", nrecv * sizeof(gtree::node_count_info));
+
+  /* first let's count how many nodes and particles are hanging below in each case */
+  int n_recvpoints = 0;
+  int n_recvnodes  = 0;
+
+  for(int i = 0; i < nrecv; i++)
+    {
+      node_info_recv[i].count_nodes = 0;
+      node_info_recv[i].count_parts = 0;
+
+      int no      = node_req_recv[i].foreignnode;
+      int task    = node_req_recv[i].foreigntask;
+      int shmrank = GetShmRankForSimulCommRank[task];  // corresponding local shared memory rank, stays fixed below
+
+      if(no < Bdat.MaxPart || no >= Bdat.MaxPart + Bdat.MaxNodes)
+        Terminate("not an internal node");
+
+      gravnode *nop = ((gravnode *)get_basenodep(no, shmrank, handle)) + no;
+
+      int p = nop->nextnode;
+
+      while(p != nop->sibling)
+        {
+          if(p < 0)
+            Terminate("p=%d < 0", p);
+
+          if(p < Bdat.MaxPart) /* a local particle */
+            {
+              node_info_recv[i].count_parts++;
+              p = get_nextnodep(shmrank, handle)[p];
+            }
+          else if(p < Bdat.MaxPart + Bdat.MaxNodes) /* an internal node  */
+            {
+              node_info_recv[i].count_nodes++;
+              p = (((gravnode *)get_basenodep(p, shmrank, handle)) + p)->sibling;
+            }
+          else if(p >= Bdat.ImportedNodeOffset && p < Bdat.EndOfTreePoints) /* an imported tree point */
+            {
+              node_info_recv[i].count_parts++;
+              p = get_nextnodep(shmrank, handle)[p - Bdat.MaxNodes];
+            }
+          else
+            Terminate("p=%d MaxPart=%d MaxNodes=%d", p, Bdat.MaxPart, Bdat.MaxNodes);
+        }
+
+      if(node_info_recv[i].count_parts == 0 && node_info_recv[i].count_nodes == 0)
+        Terminate("strange: have we requested an empty node?\n");
+
+      n_recvpoints += node_info_recv[i].count_parts;
+      n_recvnodes += node_info_recv[i].count_nodes;
+    }
+
+  foreign_gravpoint_data *exportbuf_points = (foreign_gravpoint_data *)Mem.mymalloc_movable(
+      &exportbuf_points, "exportbuf_points", n_recvpoints * sizeof(foreign_gravpoint_data));
+  gravnode *exportbuf_nodes = (gravnode *)Mem.mymalloc_movable(&exportbuf_nodes, "exportbuf_nodes", n_recvnodes * sizeof(gravnode));
+
+  n_recvpoints = 0;
+  n_recvnodes  = 0;
+
+  for(int i = 0; i < nrecv; i++)
+    {
+      int no      = node_req_recv[i].foreignnode;
+      int task    = node_req_recv[i].foreigntask;
+      int shmrank = GetShmRankForSimulCommRank[task];  // corresponding local shared memory rank, stays fixed below
+
+      gravnode *nop = ((gravnode *)get_basenodep(no, shmrank, handle)) + no;
+
+      int p = nop->nextnode;
+
+      while(p != nop->sibling)
+        {
+          if(p < Bdat.MaxPart) /* a local particle */
+            {
+              int off = n_recvpoints++;
+
+              foreign_gravpoint_data *expoints = &exportbuf_points[off];
+
+              particle_data *ptr = (particle_data *)get_Pp(shmrank, handle) + p;
+
+              expoints->IntPos[0] = ptr->IntPos[0];
+              expoints->IntPos[1] = ptr->IntPos[1];
+              expoints->IntPos[2] = ptr->IntPos[2];
+              expoints->Mass      = ptr->getMass();
+              expoints->Type      = ptr->getType();
+              expoints->OldAcc    = ptr->OldAcc;
+#if NSOFTCLASSES > 1
+              expoints->SofteningClass = ptr->getSofteningClass();
+#endif
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+              expoints->InsideOutsideFlag = ptr->InsideOutsideFlag;
+#endif
+              expoints->Nextnode = -1;
+
+              p = get_nextnodep(shmrank, handle)[p];
+            }
+          else if(p < Bdat.MaxPart + Bdat.MaxNodes) /* an internal node  */
+            {
+              int off = n_recvnodes++;
+
+              gravnode *exnodes = &exportbuf_nodes[off];
+              gravnode *sourcep = (((gravnode *)get_basenodep(p, shmrank, handle)) + p);
+
+              memcpy(static_cast<void *>(exnodes), static_cast<void *>(sourcep),
+                     sizeof(gravnode));  //  cannot do a  *exnodes = *sourcep;  because our std::atomic_flag
+                                         //  member has a deleted default copy operator
+
+              exnodes->cannot_be_opened_locally = 1;
+              exnodes->nextnode                 = -1;
+              exnodes->sibling                  = -1;
+              exnodes->OriginTask               = task;
+              exnodes->OriginNode               = p;
+
+              p = sourcep->sibling;
+            }
+          else if(p >= Bdat.ImportedNodeOffset) /* an imported Treepoint particle  */
+            {
+              int off = n_recvpoints++;
+
+              foreign_gravpoint_data *expoints = &exportbuf_points[off];
+              int n                            = p - Bdat.ImportedNodeOffset;
+              gravpoint_data *pointsp          = ((gravpoint_data *)get_pointsp(shmrank, handle)) + n;
+
+              expoints->IntPos[0] = pointsp->IntPos[0];
+              expoints->IntPos[1] = pointsp->IntPos[1];
+              expoints->IntPos[2] = pointsp->IntPos[2];
+              expoints->Mass      = pointsp->Mass;
+              expoints->Type      = pointsp->Type;
+              expoints->OldAcc    = pointsp->OldAcc;
+#if NSOFTCLASSES > 1
+              expoints->SofteningClass = pointsp->SofteningClass;
+#endif
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+              expoints->InsideOutsideFlag = pointsp->InsideOutsideFlag;
+#endif
+              expoints->Nextnode = -1;
+
+              p = get_nextnodep(shmrank, handle)[p - Bdat.MaxNodes];
+            }
+        }
+    }
+
+  /************************************************************/
+
+  MPI_Send(node_info_recv, nrecv * sizeof(gtree::node_count_info), MPI_BYTE, source, TAG_N, MPI_COMM_WORLD);
+
+  /* now transfer the points and nodes */
+  if(n_recvpoints > 0)
+    MPI_Send(exportbuf_points, n_recvpoints * sizeof(foreign_gravpoint_data), MPI_BYTE, source, TAG_PDATA, MPI_COMM_WORLD);
+
+  if(n_recvnodes > 0)
+    MPI_Send(exportbuf_nodes, n_recvnodes * sizeof(gravnode), MPI_BYTE, source, TAG_SPHDATA, MPI_COMM_WORLD);
+
+  Mem.myfree(exportbuf_nodes);
+  Mem.myfree(exportbuf_points);
+  Mem.myfree(node_info_recv);
+}
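+
+/* For orientation, a summary of the reply protocol used by the two node-request handlers above
+ * (a restatement of the code, not additional behaviour):
+ *
+ *   1. TAG_N:       per-request counts of child nodes/particles (node_count_info[])
+ *   2. TAG_PDATA:   the flattened particle/point data, sent only if n_recvpoints > 0
+ *   3. TAG_SPHDATA: the flattened node data, sent only if n_recvnodes > 0
+ *
+ * The requesting side presumably posts matching receives in this order; that code is not part of
+ * this file.
+ */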
diff --git a/src/mpi_utils/shared_mem_handler.h b/src/mpi_utils/shared_mem_handler.h
new file mode 100644
index 0000000000000000000000000000000000000000..befcf6154d5f8921667be889e01fd2ecca5c8b93
--- /dev/null
+++ b/src/mpi_utils/shared_mem_handler.h
@@ -0,0 +1,142 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  shared_mem_handler.h
+ *
+ *  \brief provides a class for accessing data of other MPI ranks via shared memory and designated MPI handler ranks
+ */
+
+#ifndef SHAREDMEM_H
+#define SHAREDMEM_H
+
+#include <hdf5.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <algorithm>
+#include <atomic>
+#include <cstring>
+
+#include "../data/simparticles.h"
+
+#define MAX_TREE_INFOS 10
+
+class shmem
+{
+ public:
+  MPI_Comm SharedMemComm;   // the communicator linking the processes that have mutual shared memory access in the same node
+  MPI_Comm SimulationComm;  // the communicator containing all the compute processors (or all the ghost processors)
+
+  int World_ThisTask;  // rank
+  int World_NTask;     // total number of MPI processes
+
+  int Island_ThisTask;  // rank in current shared memory region
+  int Island_NTask;     // number of MPI tasks in shared memory region
+
+  int Sim_ThisTask;  // rank in simulation partition
+  int Sim_NTask;     // number of MPI tasks in simulation partition
+
+  int GhostRank;  // equal to 1 if we are a ghost rank, otherwise zero
+
+  int Island_Smallest_WorldTask;  // this is the smallest global rank in the shared memory node
+
+  // we need a table that maps the rank of a destination processor in the
+  // simulation communicator to the rank of the responsible ghost processor in
+  // the global communicator
+  int *GetGhostRankForSimulCommRank;
+
+  // we need a table that maps the rank of a destination processor in the
+  // simulation communicator to the rank in the shared memory communicator
+  int *GetShmRankForSimulCommRank;
+
+  // we need a table that maps the rank of a simulation processor to the
+  // smallest world rank in its shared memory node. With this we can decide
+  // whether two ranks are on the same node
+  int *GetNodeIDForSimulCommRank;
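+
+  /* Illustrative lookup pattern (a sketch only; the actual request path lives in the tree-walk
+   * code): given a simulation-communicator rank 'task' that owns some data,
+   *
+   *   int ghost   = GetGhostRankForSimulCommRank[task];  // global rank of the responsible ghost
+   *   int shmrank = GetShmRankForSimulCommRank[task];    // rank within the shared-memory node
+   *
+   *   if(GetNodeIDForSimulCommRank[task] == GetNodeIDForSimulCommRank[Sim_ThisTask])
+   *     ;  // same node: the data can be read directly through SharedMemBaseAddr[shmrank]
+   *   else
+   *     ;  // different node: a request message has to be sent to 'ghost'
+   */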
+
+  // the rank in the global communicator that a processor should turn to for a shared memory request
+  int MyShmRankInGlobal;
+
+  MPI_Win SharedMemWin;
+
+  void **SharedMemBaseAddr;
+
+  char *TableData;
+  char *EwaldData;
+
+  struct bookkeeping_data
+  {
+    int MaxPart;
+    int MaxNodes;
+    int NTopnodes;
+    int ImportedNodeOffset;
+    int EndOfTreePoints;
+    int EndOfForeignNodes;
+  };
+
+  struct tree_storage_info
+  {
+    bookkeeping_data Bd;
+
+    ptrdiff_t *TopNodes_offsets;
+    ptrdiff_t *Nodes_offsets;
+    ptrdiff_t *Nextnode_offsets;
+    ptrdiff_t *Points_offsets;
+    ptrdiff_t *P_offsets;
+    ptrdiff_t *SphP_offsets;
+    ptrdiff_t *Foreign_Points_offsets;
+    ptrdiff_t *Foreign_Nodes_offsets;
+
+    char *TopNodes_storage;
+    char *NodeLevel_storage;
+    char *NodeSibling_storage;
+    char *NodeIndex_storage;
+  };
+
+  tree_storage_info tree_info[MAX_TREE_INFOS];
+  int num_tree_info = 0;
+
+  inline char *get_basenodep(int no, unsigned char shmrank, int handle)
+  {
+    if(no < tree_info[handle].Bd.MaxPart + tree_info[handle].Bd.NTopnodes)
+      return ((char *)SharedMemBaseAddr[shmrank] + tree_info[handle].TopNodes_offsets[shmrank]);
+    else
+      return ((char *)SharedMemBaseAddr[shmrank] + tree_info[handle].Nodes_offsets[shmrank]);
+  }
+
+  inline int *get_nextnodep(unsigned char shmrank, int handle)
+  {
+    return (int *)((char *)SharedMemBaseAddr[shmrank] + tree_info[handle].Nextnode_offsets[shmrank]);
+  }
+
+  inline char *get_pointsp(unsigned char shmrank, int handle)
+  {
+    return ((char *)SharedMemBaseAddr[shmrank] + tree_info[handle].Points_offsets[shmrank]);
+  }
+
+  inline char *get_Pp(unsigned char shmrank, int handle)
+  {
+    return ((char *)SharedMemBaseAddr[shmrank] + tree_info[handle].P_offsets[shmrank]);
+  }
+
+  inline char *get_SphPp(unsigned char shmrank, int handle)
+  {
+    return ((char *)SharedMemBaseAddr[shmrank] + tree_info[handle].SphP_offsets[shmrank]);
+  }
+
+  void deal_with_gravity_node_request(char *message, int length, int source, int handle);
+  void deal_with_sph_node_request(char *message, int length, int source, int handle, simparticles *Sp);
+
+  void prepare_offset_table(void *p, ptrdiff_t *&offset_tab);
+  void inform_offset_table(void *p);
+  void free_offset_table(ptrdiff_t *&offset_tab);
+
+  void shared_memory_handler(void);
+};
+
+extern shmem Shmem;
+
+#endif
diff --git a/src/mpi_utils/sizelimited_sendrecv.cc b/src/mpi_utils/sizelimited_sendrecv.cc
new file mode 100644
index 0000000000000000000000000000000000000000..acc163df42077b01b41d26941542b836e952b779
--- /dev/null
+++ b/src/mpi_utils/sizelimited_sendrecv.cc
@@ -0,0 +1,80 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  sizelimited_sendrecv.cc
+ *
+ *  \brief implements a wrapper around MPI_Sendrecv that, if needed, transmits the data in pieces no larger than a prescribed maximum size
+ */
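+
+/* A worked example of the chunking (numbers are illustrative only): with a size limit of 2^31
+ * bytes and a 4-byte datatype, count_limit is 2^29 elements. A send of 1.5e9 elements then goes
+ * out in three MPI_Sendrecv calls (two full chunks of 2^29 elements plus the remainder), with the
+ * receive count reduced chunk by chunk in the same loop.
+ */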
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../mpi_utils/mpi_utils.h"
+
+int myMPI_Sendrecv(void *sendb, size_t sendcount, MPI_Datatype sendtype, int dest, int sendtag, void *recvb, size_t recvcount,
+                   MPI_Datatype recvtype, int source, int recvtag, MPI_Comm comm, MPI_Status *status)
+{
+  int iter      = 0, size_sendtype, size_recvtype, send_now, recv_now;
+  char *sendbuf = (char *)sendb;
+  char *recvbuf = (char *)recvb;
+
+  if(dest != source)
+    Terminate("dest != source");
+
+  MPI_Type_size(sendtype, &size_sendtype);
+  MPI_Type_size(recvtype, &size_recvtype);
+
+  int thistask;
+  MPI_Comm_rank(comm, &thistask);
+
+  if(dest == thistask)
+    {
+      memcpy(recvbuf, sendbuf, recvcount * size_recvtype);
+      return 0;
+    }
+
+  size_t count_limit = MPI_MESSAGE_SIZELIMIT_IN_BYTES / size_sendtype;
+
+  while(sendcount > 0 || recvcount > 0)
+    {
+      if(sendcount > count_limit)
+        {
+          send_now = count_limit;
+          /*
+             if(iter == 0)
+             {
+             printf("Imposing size limit on MPI_Sendrecv() on task=%d (send of size=%lld)\n", ThisTask, (long long) sendcount *
+             size_sendtype); myflush(stdout);
+             }
+           */
+          iter++;
+        }
+      else
+        send_now = sendcount;
+
+      if(recvcount > count_limit)
+        recv_now = count_limit;
+      else
+        recv_now = recvcount;
+
+      MPI_Sendrecv(sendbuf, send_now, sendtype, dest, sendtag, recvbuf, recv_now, recvtype, source, recvtag, comm, status);
+
+      sendcount -= send_now;
+      recvcount -= recv_now;
+
+      sendbuf += send_now * size_sendtype;
+      recvbuf += recv_now * size_recvtype;
+    }
+
+  return 0;
+}
diff --git a/src/mpi_utils/sums_and_minmax.cc b/src/mpi_utils/sums_and_minmax.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d0c3b1e002d97239e52047271867a09a8b0cd321
--- /dev/null
+++ b/src/mpi_utils/sums_and_minmax.cc
@@ -0,0 +1,50 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  sums_and_minmax.cc
+ *
+ *  \brief some simple extensions of MPI-collectives
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../mpi_utils/mpi_utils.h"
+
+void minimum_large_ints(int n, long long *src, long long *res, MPI_Comm comm)
+{
+  if(src == res)
+    MPI_Allreduce(MPI_IN_PLACE, res, n, MPI_LONG_LONG, MPI_MIN, comm);
+  else
+    MPI_Allreduce(src, res, n, MPI_LONG_LONG, MPI_MIN, comm);
+}
+
+void sumup_large_ints(int n, int *src, long long *res, MPI_Comm comm)
+{
+  long long *numlist = (long long *)Mem.mymalloc("numlist", n * sizeof(long long));
+
+  for(int j = 0; j < n; j++)
+    numlist[j] = src[j];
+  MPI_Allreduce(numlist, res, n, MPI_LONG_LONG, MPI_SUM, comm);
+
+  Mem.myfree(numlist);
+}
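+
+/* Illustrative use (variable names are hypothetical): reduce a per-task particle count that
+ * fits in an int into a global total that needs 64 bits:
+ *
+ *   int local_n = NumPart;
+ *   long long tot_n;
+ *   sumup_large_ints(1, &local_n, &tot_n, Communicator);
+ */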
+
+void sumup_longs(int n, long long *src, long long *res, MPI_Comm comm)
+{
+  if(src == res)
+    MPI_Allreduce(MPI_IN_PLACE, res, n, MPI_LONG_LONG, MPI_SUM, comm);
+  else
+    MPI_Allreduce(src, res, n, MPI_LONG_LONG, MPI_SUM, comm);
+}
diff --git a/src/ngbtree/ngbtree.h b/src/ngbtree/ngbtree.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d4402478bf17de434c837984acc7284b1546cb6
--- /dev/null
+++ b/src/ngbtree/ngbtree.h
@@ -0,0 +1,161 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  ngbtree.h
+ *
+ *  \brief declaration of the class providing the neighbor tree routines
+ */
+
+#ifndef NGBTREE_H_
+#define NGBTREE_H_
+
+#include "../data/simparticles.h"
+#include "../time_integration/driftfac.h"
+#include "../tree/tree.h"
+
+/** The ngb-tree data structure
+ */
+
+struct ngbpoint_data
+{
+  MyIntPosType IntPos[3];
+  int no;
+};
+
+struct ngbnode : public basenode
+{
+  // We use the ugly trick of implementing our own copy constructor and assignment operator via memcpy,
+  // because the std::atomic members have an implicitly deleted copy operator, so that the compiler-generated
+  // functions are unavailable. We know what we are doing here, and wrapping it in this hack is at the moment
+  // the easiest way to work around a protection that is unnecessary in our case.
+
+  ngbnode() {}
+
+  // declare our own copy constructor
+  ngbnode(ngbnode &other) { memcpy(static_cast<void *>(this), static_cast<void *>(&other), sizeof(ngbnode)); }
+
+  // declare our own assignment operator
+  ngbnode &operator=(ngbnode &other)
+  {
+    memcpy(static_cast<void *>(this), static_cast<void *>(&other), sizeof(ngbnode));
+    return *this;
+  }
+
+  MySignedIntPosType center_offset_min[3];
+  MySignedIntPosType center_offset_max[3];
+
+  MyNgbTreeFloat vmin[3];
+  MyNgbTreeFloat vmax[3];
+  MyNgbTreeFloat MaxHsml;
+  MyNgbTreeFloat MaxDtHsml;
+  MyNgbTreeFloat MaxCsnd;
+
+  std::atomic<integertime> Ti_Current;
+
+  void drift_node(integertime time1, simparticles *Sp)
+  {
+    while(access.test_and_set(std::memory_order_acquire))
+      ;  // acquire spin lock
+
+    if(Ti_Current != time1)
+      {
+        double dt_drift;
+
+        if(All.ComovingIntegrationOn)
+          dt_drift = Driftfac.get_drift_factor(Ti_Current, time1);
+        else
+          dt_drift = (time1 - Ti_Current) * All.Timebase_interval;
+
+        /* get the shift in the enclosing box in coordinate space */
+        double posdiff_min[3];
+        double posdiff_max[3];
+        for(int j = 0; j < 3; j++)
+          {
+            posdiff_min[j] = vmin[j] * dt_drift;
+            posdiff_max[j] = vmax[j] * dt_drift;
+          }
+
+        /* convert to integer coordinate shifts */
+        MySignedIntPosType delta_min[3];
+        MySignedIntPosType delta_max[3];
+        Sp->pos_to_signedintpos(posdiff_min, delta_min);
+        Sp->pos_to_signedintpos(posdiff_max, delta_max);
+
+        /* now adjust the bounding box in integer coordinates */
+        for(int j = 0; j < 3; j++)
+          {
+            center_offset_min[j] += delta_min[j];
+            center_offset_max[j] += delta_max[j];
+          }
+
+        MaxHsml += MaxDtHsml * dt_drift;
+
+        Ti_Current = time1;
+      }
+
+    access.clear(std::memory_order_release);
+  }
+};
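+
+/* Typical call pattern seen elsewhere in the tree code: a node is only drifted on demand, guarded
+ * by its time stamp (sketch, using a node pointer 'nop' and the particle set 'Sp'):
+ *
+ *   if(nop->Ti_Current != All.Ti_Current)
+ *     nop->drift_node(All.Ti_Current, Sp);
+ *
+ * drift_node() itself acquires the spin lock, so concurrent callers are serialized.
+ */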
+
+struct foreign_sphpoint_data
+{
+  sph_particle_data_hydrocore SphCore;
+
+  MyDouble Mass;
+  MyIntPosType IntPos[3];
+
+  int Nextnode;
+  unsigned char Nextnode_shmrank;
+
+  signed char TimeBinHydro;
+};
+
+class ngbtree : public tree<ngbnode, simparticles, ngbpoint_data, foreign_sphpoint_data>
+{
+ public:
+  typedef tree<ngbnode, simparticles, ngbpoint_data, foreign_sphpoint_data> basetree;
+
+  // The various using statements make sure that we can access the elements from the base class without having to use "this"
+  using basetree::Buildtime;
+  using basetree::D;
+  using basetree::Father;
+  using basetree::FirstNonTopLevelNode;
+  using basetree::get_nextnodep;
+  using basetree::get_nodep;
+  using basetree::get_Pp;
+  using basetree::get_SphPp;
+  using basetree::ImportedNodeOffset;
+  using basetree::MaxNodes;
+  using basetree::MaxPart;
+  using basetree::Nextnode;
+  using basetree::NodeIndex;
+  using basetree::Nodes;
+  using basetree::NumNodes;
+  using basetree::Recv_count;
+  using basetree::Recv_offset;
+  using basetree::Send_count;
+  using basetree::Send_offset;
+  using basetree::TopNodes;
+  using basetree::Tp;
+  using basetree::tree_export_node_threads;
+  using basetree::TreeSharedMem_ThisTask;
+
+  void update_velocities(void);
+  void update_maxhsml(void);
+  void update_vbounds(int i, int *nchanged, int *nodelist, char *flag_changed);
+  void check_bounds(void);
+
+  void update_node_recursive(int no, int sib, int mode) override;
+  void exchange_topleafdata(void) override;
+  void fill_in_export_points(ngbpoint_data *exp_point, int i, int no) override;
+  void report_log_message(void) override;
+
+ private: /* private member functions */
+  void finish_vounds_update(int nchanged, int *nodelist);
+  void finish_maxhsml_update(int nchanged, int *nodelist);
+};
+
+#endif
diff --git a/src/ngbtree/ngbtree_build.cc b/src/ngbtree/ngbtree_build.cc
new file mode 100644
index 0000000000000000000000000000000000000000..126132bd89d395640cc774b01afc1b679563e08f
--- /dev/null
+++ b/src/ngbtree/ngbtree_build.cc
@@ -0,0 +1,757 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  ngbtree_build.cc
+ *
+ *  \brief contains the code for the neighbor tree construction
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../gravtree/gravtree.h"
+#include "../io/io.h"
+#include "../logs/logs.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../ngbtree/ngbtree.h"
+#include "../sort/peano.h"
+#include "../system/system.h"
+#include "../time_integration/driftfac.h"
+#include "../time_integration/timestep.h"
+
+void ngbtree::report_log_message(void)
+{
+  double numnodes = NumNodes, tot_numnodes;
+  MPI_Reduce(&numnodes, &tot_numnodes, 1, MPI_DOUBLE, MPI_SUM, 0, D->Communicator);
+
+  D->mpi_printf("NGBTREE: Ngb-tree construction done. took %g sec  <numnodes>=%g  NTopnodes=%d NTopleaves=%d\n", Buildtime,
+                tot_numnodes / D->NTask, D->NTopnodes, D->NTopleaves);
+}
+
+void ngbtree::fill_in_export_points(ngbpoint_data *exp_point, int i, int no) { Terminate("we don't expect to get here"); }
+
+void ngbtree::exchange_topleafdata(void)
+{
+  struct leafnode_data
+  {
+    MySignedIntPosType center_offset_min[3];
+    MySignedIntPosType center_offset_max[3];
+    MyNgbTreeFloat vmin[3];
+    MyNgbTreeFloat vmax[3];
+    MyNgbTreeFloat MaxCsnd;
+    MyNgbTreeFloat MaxHsml;
+    MyNgbTreeFloat MaxDtHsml;
+    unsigned char not_empty;
+  };
+  leafnode_data *glob_leaf_node_data, *loc_leaf_node_data;
+
+  glob_leaf_node_data = (leafnode_data *)Mem.mymalloc("glob_leaf_node_data", D->NTopleaves * sizeof(leafnode_data));
+
+  /* share the pseudo-particle data across CPUs */
+  int *recvcounts = (int *)Mem.mymalloc("recvcounts", sizeof(int) * D->NTask);
+  int *recvoffset = (int *)Mem.mymalloc("recvoffset", sizeof(int) * D->NTask);
+  int *bytecounts = (int *)Mem.mymalloc("bytecounts", sizeof(int) * D->NTask);
+  int *byteoffset = (int *)Mem.mymalloc("byteoffset", sizeof(int) * D->NTask);
+
+  for(int task = 0; task < D->NTask; task++)
+    recvcounts[task] = 0;
+
+  for(int n = 0; n < D->NTopleaves; n++)
+    recvcounts[D->TaskOfLeaf[n]]++;
+
+  for(int task = 0; task < D->NTask; task++)
+    bytecounts[task] = recvcounts[task] * sizeof(leafnode_data);
+
+  recvoffset[0] = 0;
+  byteoffset[0] = 0;
+
+  for(int task = 1; task < D->NTask; task++)
+    {
+      recvoffset[task] = recvoffset[task - 1] + recvcounts[task - 1];
+      byteoffset[task] = byteoffset[task - 1] + bytecounts[task - 1];
+    }
+
+  loc_leaf_node_data = (leafnode_data *)Mem.mymalloc("loc_leaf_node_data", recvcounts[D->ThisTask] * sizeof(leafnode_data));
+
+  int idx = 0;
+
+  for(int n = 0; n < D->NTopleaves; n++)
+    {
+      if(D->TaskOfLeaf[n] == D->ThisTask)
+        {
+          int no       = NodeIndex[n];
+          ngbnode *nop = &TopNodes[no];
+
+          leafnode_data *locp = &loc_leaf_node_data[idx];
+
+          locp->MaxCsnd   = nop->MaxCsnd;
+          locp->MaxHsml   = nop->MaxHsml;
+          locp->MaxDtHsml = nop->MaxDtHsml;
+          locp->not_empty = nop->not_empty;
+
+          for(int k = 0; k < 3; k++)
+            {
+              locp->center_offset_min[k] = nop->center_offset_min[k];
+              locp->center_offset_max[k] = nop->center_offset_max[k];
+              locp->vmin[k]              = nop->vmin[k];
+              locp->vmax[k]              = nop->vmax[k];
+            }
+
+          idx++;
+        }
+    }
+
+  MPI_Allgatherv(loc_leaf_node_data, bytecounts[D->ThisTask], MPI_BYTE, glob_leaf_node_data, bytecounts, byteoffset, MPI_BYTE,
+                 D->Communicator);
+
+  for(int task = 0; task < D->NTask; task++)
+    recvcounts[task] = 0;
+
+  for(int n = 0; n < D->NTopleaves; n++)
+    {
+      int task = D->TaskOfLeaf[n];
+      if(task != D->ThisTask)
+        {
+          int no       = NodeIndex[n];
+          ngbnode *nop = &TopNodes[no];
+
+          int idx              = recvoffset[task] + recvcounts[task]++;
+          leafnode_data *globp = &glob_leaf_node_data[idx];
+
+          nop->MaxCsnd    = globp->MaxCsnd;
+          nop->MaxHsml    = globp->MaxHsml;
+          nop->MaxDtHsml  = globp->MaxDtHsml;
+          nop->Ti_Current = All.Ti_Current;
+          nop->not_empty  = globp->not_empty;
+
+          for(int k = 0; k < 3; k++)
+            {
+              nop->center_offset_min[k] = globp->center_offset_min[k];
+              nop->center_offset_max[k] = globp->center_offset_max[k];
+              nop->vmin[k]              = globp->vmin[k];
+              nop->vmax[k]              = globp->vmax[k];
+            }
+
+          nop->Ti_Current = All.Ti_Current;
+        }
+    }
+
+  Mem.myfree(loc_leaf_node_data);
+  Mem.myfree(byteoffset);
+  Mem.myfree(bytecounts);
+  Mem.myfree(recvoffset);
+  Mem.myfree(recvcounts);
+  Mem.myfree(glob_leaf_node_data);
+}
+
+void ngbtree::check_bounds(void)
+{
+  for(int i = 0; i < Ninsert; i++)
+    {
+      if(Tp->P[i].get_Ti_Current() != All.Ti_Current)
+        Tp->drift_particle(&Tp->P[i], &Tp->SphP[i], All.Ti_Current);  // this function avoids race conditions
+
+      int no = Father[i];
+
+      while(no >= 0)
+        {
+          ngbnode *nop = get_nodep(no);
+
+          if(nop->level <= LEVEL_ALWAYS_OPEN)  // don't test the root node
+            break;
+
+          if(nop->Ti_Current != All.Ti_Current)
+            nop->drift_node(All.Ti_Current, Tp);
+
+          int errflag = 0;
+
+          MyIntPosType left[3], right[3];
+
+          left[0]  = Tp->nearest_image_intpos_to_intpos_X(nop->center_offset_min[0] + nop->center[0], Tp->P[i].IntPos[0]);
+          right[0] = Tp->nearest_image_intpos_to_intpos_X(nop->center_offset_max[0] + nop->center[0], Tp->P[i].IntPos[0]);
+
+          /* check whether we can stop walking along this branch */
+          if(left[0] > 0 && right[0] > left[0])
+            errflag |= 1;
+
+          left[1]  = Tp->nearest_image_intpos_to_intpos_Y(nop->center_offset_min[1] + nop->center[1], Tp->P[i].IntPos[1]);
+          right[1] = Tp->nearest_image_intpos_to_intpos_Y(nop->center_offset_max[1] + nop->center[1], Tp->P[i].IntPos[1]);
+
+          /* check whether we can stop walking along this branch */
+          if(left[1] > 0 && right[1] > left[1])
+            errflag |= 2;
+
+          left[2]  = Tp->nearest_image_intpos_to_intpos_Z(nop->center_offset_min[2] + nop->center[2], Tp->P[i].IntPos[2]);
+          right[2] = Tp->nearest_image_intpos_to_intpos_Z(nop->center_offset_max[2] + nop->center[2], Tp->P[i].IntPos[2]);
+
+          /* check whether we can stop walking along this branch */
+          if(left[2] > 0 && right[2] > left[2])
+            errflag |= 4;
+
+          if(errflag)
+            {
+              MyIntPosType range_min[3], range_max[3];
+              for(int k = 0; k < 3; k++)
+                {
+                  range_min[k] = nop->center_offset_min[k] + nop->center[k];
+                  range_max[k] = nop->center_offset_max[k] + nop->center[k];
+                }
+
+              double pos[3], min[3], max[3];
+
+              Tp->intpos_to_pos(Tp->P[i].IntPos, pos);
+              Tp->intpos_to_pos(range_min, min);
+              Tp->intpos_to_pos(range_max, max);
+
+              Terminate(
+                  "level=%d  errflag=%d  pos=%g %g %g  vel=%g %g %g    min=%g %g %g   max=%g %g %g   vmin=%g %g %g  vmax=%g %g %g    "
+                  "\n",
+                  nop->level, errflag, pos[0], pos[1], pos[2], Tp->P[i].Vel[0], Tp->P[i].Vel[1], Tp->P[i].Vel[2], min[0], min[1],
+                  min[2], max[0], max[1], max[2], nop->vmin[0], nop->vmin[1], nop->vmin[2], nop->vmax[0], nop->vmax[1], nop->vmax[2]);
+            }
+
+          errflag = 0;
+
+          for(int k = 0; k < 3; k++)
+            {
+              if(nop->vmin[k] > Tp->P[i].Vel[k])
+                errflag = 1;
+
+              if(nop->vmax[k] < Tp->P[i].Vel[k])
+                errflag = 1;
+            }
+
+          if(errflag)
+            {
+              Terminate("vel=%g %g %g   min=%g %g %g   max=%g %g %g\n", Tp->P[i].Vel[0], Tp->P[i].Vel[1], Tp->P[i].Vel[2],
+                        nop->vmin[0], nop->vmin[1], nop->vmin[2], nop->vmax[0], nop->vmax[1], nop->vmax[2]);
+            }
+
+          no = nop->father;
+        }
+    }
+}
+
+/*! this routine determines the node ranges of a given internal node
+ *  and all its subnodes using a recursive computation. The result is
+ *  stored in the Nodes[] structure in the sequence of this tree-walk.
+ *  mode = 0: process a leaf branch, mode = TREE_MODE_TOPLEVEL: process only the top-level nodes
+ */
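+
+/* The walk below follows the usual Gadget convention, also used by the shared-memory node-request
+ * handlers: starting from nop->nextnode, particles are advanced via Nextnode[], child nodes are
+ * skipped over via their sibling pointer, and the loop terminates once p reaches nop->sibling.
+ * A simplified sketch (ignoring pseudo-particles and imported points):
+ *
+ *   int p = nop->nextnode;
+ *   while(p != nop->sibling)
+ *     p = (p < MaxPart) ? Nextnode[p] : get_nodep(p)->sibling;
+ */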
+
+void ngbtree::update_node_recursive(int no, int sib, int mode)
+{
+  if(!(no >= MaxPart && no < MaxPart + MaxNodes)) /* are we an internal node? */
+    Terminate("no internal node\n");
+
+  ngbnode *nop = get_nodep(no);
+
+  if(mode == TREE_MODE_TOPLEVEL)
+    {
+      int p = nop->nextnode;
+
+      /* if the next node is not a top-level node, we have reached a leaf node, and we need to do nothing */
+      if(p < MaxPart || p >= FirstNonTopLevelNode)
+        return;
+    }
+
+  MyNgbTreeFloat maxcsnd   = 0;
+  MyNgbTreeFloat maxhsml   = 0;
+  MyNgbTreeFloat maxDtHsml = 0;
+
+  MySignedIntPosType center_offset_min[3];
+  MySignedIntPosType center_offset_max[3];
+  MyNgbTreeFloat vmin[3], vmax[3];
+
+  unsigned char not_empty = 0;
+
+  MyIntPosType halflen = ((MyIntPosType)1) << ((BITS_FOR_POSITIONS - 1) - nop->level);
+
+  for(int k = 0; k < 3; k++)
+    {
+      center_offset_min[k] = (halflen - 1);
+      center_offset_max[k] = -halflen;
+
+      vmin[k] = MAX_FLOAT_NUMBER;
+      vmax[k] = -MAX_FLOAT_NUMBER;
+    }
+
+  int p = nop->nextnode;
+
+  while(p != nop->sibling)
+    {
+      if(p >= 0)
+        {
+          if(p >= MaxPart && p < MaxPart + MaxNodes) /* we have an internal node */
+            {
+              int nextsib = get_nodep(p)->sibling;
+
+              update_node_recursive(p, nextsib, mode);
+            }
+
+          if(p < MaxPart) /* a particle */
+            {
+              if(maxcsnd < Tp->get_Csnd(p))
+                maxcsnd = Tp->get_Csnd(p);
+
+              if(maxhsml < Tp->SphP[p].get_Hsml())
+                maxhsml = Tp->SphP[p].get_Hsml();
+
+              if(maxDtHsml < Tp->get_DtHsml(p))
+                maxDtHsml = Tp->get_DtHsml(p);
+
+              MySignedIntPosType offset[3];
+
+              for(int k = 0; k < 3; k++)
+                {
+                  offset[k] = Tp->P[p].IntPos[k] - nop->center[k];
+
+                  if(offset[k] < center_offset_min[k])
+                    center_offset_min[k] = offset[k];
+
+                  if(offset[k] > center_offset_max[k])
+                    center_offset_max[k] = offset[k];
+
+                  if(vmin[k] > Tp->P[p].Vel[k])
+                    vmin[k] = Tp->P[p].Vel[k];
+
+                  if(vmax[k] < Tp->P[p].Vel[k])
+                    vmax[k] = Tp->P[p].Vel[k];
+                }
+
+              not_empty = 1;
+
+              p = Nextnode[p];
+            }
+          else if(p < MaxPart + MaxNodes) /* an internal node  */
+            {
+              ngbnode *noptr = get_nodep(p);
+
+              if(maxcsnd < noptr->MaxCsnd)
+                maxcsnd = noptr->MaxCsnd;
+
+              if(maxhsml < noptr->MaxHsml)
+                maxhsml = noptr->MaxHsml;
+
+              if(maxDtHsml < noptr->MaxDtHsml)
+                maxDtHsml = noptr->MaxDtHsml;
+
+              MySignedIntPosType offset_min[3], offset_max[3];
+
+              for(int k = 0; k < 3; k++)
+                {
+                  offset_min[k] = noptr->center_offset_min[k] + (MySignedIntPosType)(noptr->center[k] - nop->center[k]);
+                  offset_max[k] = noptr->center_offset_max[k] + (MySignedIntPosType)(noptr->center[k] - nop->center[k]);
+
+                  if(offset_min[k] < center_offset_min[k])
+                    center_offset_min[k] = offset_min[k];
+
+                  if(offset_max[k] > center_offset_max[k])
+                    center_offset_max[k] = offset_max[k];
+
+                  if(vmin[k] > noptr->vmin[k])
+                    vmin[k] = noptr->vmin[k];
+
+                  if(vmax[k] < noptr->vmax[k])
+                    vmax[k] = noptr->vmax[k];
+                }
+
+              not_empty |= noptr->not_empty;
+
+              p = noptr->sibling;
+            }
+          else if(p < MaxPart + MaxNodes + D->NTopleaves) /* a pseudo particle */
+            {
+              /* we are processing a local leaf-node which does not have any particles;
+               * we can continue to the next element, which should end the work.
+               */
+              p = Nextnode[p - MaxNodes];
+            }
+          else
+            {
+              /* an imported point */
+
+              Terminate("Oops!");
+              p = Nextnode[p - MaxNodes];
+            }
+        }
+    }
+
+  nop->MaxCsnd   = maxcsnd;
+  nop->MaxHsml   = maxhsml;
+  nop->MaxDtHsml = maxDtHsml;
+
+  nop->cannot_be_opened_locally = 0;
+  nop->not_empty                = not_empty;
+
+  for(int k = 0; k < 3; k++)
+    {
+      nop->center_offset_min[k] = center_offset_min[k];
+      nop->center_offset_max[k] = center_offset_max[k];
+      nop->vmin[k]              = vmin[k];
+      nop->vmax[k]              = vmax[k];
+    }
+
+  nop->Ti_Current = All.Ti_Current;
+}
+
+void ngbtree::update_vbounds(int i, int *nchanged, int *nodelist, char *flag_changed)
+{
+  int no = Father[i];
+
+  while(no >= 0)
+    {
+      ngbnode *nop = get_nodep(no);
+
+      if(nop->Ti_Current != All.Ti_Current)
+        nop->drift_node(All.Ti_Current, Tp);
+
+      int has_changed = 0;
+
+      for(int j = 0; j < 3; j++)
+        {
+          if(nop->vmin[j] > Tp->P[i].Vel[j])
+            {
+              nop->vmin[j] = Tp->P[i].Vel[j];
+              has_changed  = 1;
+            }
+
+          if(nop->vmax[j] < Tp->P[i].Vel[j])
+            {
+              nop->vmax[j] = Tp->P[i].Vel[j];
+              has_changed  = 1;
+            }
+        }
+
+      if(has_changed == 0)
+        break;
+
+      if(no < FirstNonTopLevelNode) /* top-level tree-node reached */
+        {
+          int top_no = no - MaxPart;
+
+          if(flag_changed[top_no] == 0)
+            {
+              flag_changed[top_no] = 1;
+
+              nodelist[*nchanged] = no;
+              *nchanged           = *nchanged + 1;
+            }
+          break;
+        }
+
+      no = nop->father;
+    }
+}
+
+void ngbtree::finish_vounds_update(int nchanged, int *nodelist)
+{
+  int i, j, no, task, tot_nchanged;
+  int *recvcounts, *recvoffset, *bytecounts, *byteoffset;
+  int *tot_nodelist;
+  struct leafnode_data
+  {
+    MyNgbTreeFloat vmin[3];
+    MyNgbTreeFloat vmax[3];
+  };
+  leafnode_data *glob_leaf_node_data, *loc_leaf_node_data;
+
+  /* share the pseudo-particle data across CPUs */
+  recvcounts = (int *)Mem.mymalloc("recvcounts", sizeof(int) * D->NTask);
+  recvoffset = (int *)Mem.mymalloc("recvoffset", sizeof(int) * D->NTask);
+  bytecounts = (int *)Mem.mymalloc("bytecounts", sizeof(int) * D->NTask);
+  byteoffset = (int *)Mem.mymalloc("byteoffset", sizeof(int) * D->NTask);
+
+  MPI_Allgather(&nchanged, 1, MPI_INT, recvcounts, 1, MPI_INT, D->Communicator);
+
+  for(task = 0; task < D->NTask; task++)
+    bytecounts[task] = recvcounts[task] * sizeof(leafnode_data);
+
+  for(task = 1, recvoffset[0] = 0, byteoffset[0] = 0; task < D->NTask; task++)
+    {
+      recvoffset[task] = recvoffset[task - 1] + recvcounts[task - 1];
+      byteoffset[task] = byteoffset[task - 1] + bytecounts[task - 1];
+    }
+
+  loc_leaf_node_data = (leafnode_data *)Mem.mymalloc("loc_leaf_node_data", recvcounts[D->ThisTask] * sizeof(leafnode_data));
+
+  for(i = 0; i < nchanged; i++)
+    {
+      for(j = 0; j < 3; j++)
+        {
+          loc_leaf_node_data[i].vmin[j] = get_nodep(nodelist[i])->vmin[j];
+          loc_leaf_node_data[i].vmax[j] = get_nodep(nodelist[i])->vmax[j];
+        }
+    }
+
+  for(task = 0, tot_nchanged = 0; task < D->NTask; task++)
+    tot_nchanged += recvcounts[task];
+
+  tot_nodelist        = (int *)Mem.mymalloc("tot_nodelist", tot_nchanged * sizeof(int));
+  glob_leaf_node_data = (leafnode_data *)Mem.mymalloc("glob_leaf_node_data", tot_nchanged * sizeof(leafnode_data));
+
+  MPI_Allgatherv(nodelist, nchanged, MPI_INT, tot_nodelist, recvcounts, recvoffset, MPI_INT, D->Communicator);
+  MPI_Allgatherv(loc_leaf_node_data, bytecounts[D->ThisTask], MPI_BYTE, glob_leaf_node_data, bytecounts, byteoffset, MPI_BYTE,
+                 D->Communicator);
+
+  if(TreeSharedMem_ThisTask == 0) /* only one of the shared memory threads needs to update the toplevel tree */
+    {
+      for(i = 0; i < tot_nchanged; i++)
+        {
+          no = tot_nodelist[i];
+
+          ngbnode *nop = get_nodep(no);
+
+          if(nop->Ti_Current != All.Ti_Current)
+            nop->drift_node(All.Ti_Current, Tp);
+
+          for(j = 0; j < 3; j++)
+            {
+              nop->vmin[j] = glob_leaf_node_data[i].vmin[j];
+              nop->vmax[j] = glob_leaf_node_data[i].vmax[j];
+            }
+
+          no = nop->father;
+
+          while(no >= 0)
+            {
+              ngbnode *nop = get_nodep(no);
+
+              if(nop->Ti_Current != All.Ti_Current)
+                nop->drift_node(All.Ti_Current, Tp);
+
+              int flag_changed = 0;
+
+              for(j = 0; j < 3; j++)
+                {
+                  if(nop->vmin[j] > glob_leaf_node_data[i].vmin[j])
+                    {
+                      nop->vmin[j] = glob_leaf_node_data[i].vmin[j];
+                      flag_changed = 1;
+                    }
+
+                  if(nop->vmax[j] < glob_leaf_node_data[i].vmax[j])
+                    {
+                      nop->vmax[j] = glob_leaf_node_data[i].vmax[j];
+                      flag_changed = 1;
+                    }
+                }
+
+              if(flag_changed == 0)
+                break;
+
+              no = nop->father;
+            }
+        }
+    }
+
+  Mem.myfree(glob_leaf_node_data);
+  Mem.myfree(tot_nodelist);
+  Mem.myfree(loc_leaf_node_data);
+  Mem.myfree(byteoffset);
+  Mem.myfree(bytecounts);
+  Mem.myfree(recvoffset);
+  Mem.myfree(recvcounts);
+}
+
+void ngbtree::finish_maxhsml_update(int nchanged, int *nodelist)
+{
+  int i, no, task, tot_nchanged;
+  int *recvcounts, *recvoffset, *bytecounts, *byteoffset;
+  int *tot_nodelist;
+  struct leafnode_data
+  {
+    MyNgbTreeFloat MaxHsml;
+    MyNgbTreeFloat MaxDtHsml;
+  };
+  leafnode_data *glob_leaf_node_data, *loc_leaf_node_data;
+
+  /* share the pseudo-particle data across CPUs */
+  recvcounts = (int *)Mem.mymalloc("recvcounts", sizeof(int) * D->NTask);
+  recvoffset = (int *)Mem.mymalloc("recvoffset", sizeof(int) * D->NTask);
+  bytecounts = (int *)Mem.mymalloc("bytecounts", sizeof(int) * D->NTask);
+  byteoffset = (int *)Mem.mymalloc("byteoffset", sizeof(int) * D->NTask);
+
+  MPI_Allgather(&nchanged, 1, MPI_INT, recvcounts, 1, MPI_INT, D->Communicator);
+
+  for(task = 0; task < D->NTask; task++)
+    bytecounts[task] = recvcounts[task] * sizeof(leafnode_data);
+
+  for(task = 1, recvoffset[0] = 0, byteoffset[0] = 0; task < D->NTask; task++)
+    {
+      recvoffset[task] = recvoffset[task - 1] + recvcounts[task - 1];
+      byteoffset[task] = byteoffset[task - 1] + bytecounts[task - 1];
+    }
+
+  loc_leaf_node_data = (leafnode_data *)Mem.mymalloc("loc_leaf_node_data", recvcounts[D->ThisTask] * sizeof(leafnode_data));
+
+  for(i = 0; i < nchanged; i++)
+    {
+      loc_leaf_node_data[i].MaxHsml   = get_nodep(nodelist[i])->MaxHsml;
+      loc_leaf_node_data[i].MaxDtHsml = get_nodep(nodelist[i])->MaxDtHsml;
+    }
+
+  for(task = 0, tot_nchanged = 0; task < D->NTask; task++)
+    tot_nchanged += recvcounts[task];
+
+  tot_nodelist        = (int *)Mem.mymalloc("tot_nodelist", tot_nchanged * sizeof(int));
+  glob_leaf_node_data = (leafnode_data *)Mem.mymalloc("glob_leaf_node_data", tot_nchanged * sizeof(leafnode_data));
+
+  MPI_Allgatherv(nodelist, nchanged, MPI_INT, tot_nodelist, recvcounts, recvoffset, MPI_INT, D->Communicator);
+  MPI_Allgatherv(loc_leaf_node_data, bytecounts[D->ThisTask], MPI_BYTE, glob_leaf_node_data, bytecounts, byteoffset, MPI_BYTE,
+                 D->Communicator);
+
+  if(TreeSharedMem_ThisTask == 0) /* only one of the shared memory threads needs to update the toplevel tree */
+    {
+      for(i = 0; i < tot_nchanged; i++)
+        {
+          no = tot_nodelist[i];
+
+          ngbnode *nop = get_nodep(no);
+
+          if(nop->Ti_Current != All.Ti_Current)
+            nop->drift_node(All.Ti_Current, Tp);
+
+          nop->MaxHsml   = glob_leaf_node_data[i].MaxHsml;
+          nop->MaxDtHsml = glob_leaf_node_data[i].MaxDtHsml;
+
+          no = nop->father;
+
+          while(no >= 0)
+            {
+              ngbnode *nop = get_nodep(no);
+
+              if(nop->Ti_Current != All.Ti_Current)
+                nop->drift_node(All.Ti_Current, Tp);
+
+              if(glob_leaf_node_data[i].MaxHsml <= nop->MaxHsml && glob_leaf_node_data[i].MaxDtHsml <= nop->MaxDtHsml)
+                break;
+              else
+                {
+                  if(glob_leaf_node_data[i].MaxHsml > nop->MaxHsml)
+                    nop->MaxHsml = glob_leaf_node_data[i].MaxHsml;
+
+                  if(glob_leaf_node_data[i].MaxDtHsml > nop->MaxDtHsml)
+                    nop->MaxDtHsml = glob_leaf_node_data[i].MaxDtHsml;
+                }
+
+              no = nop->father;
+            }
+        }
+    }
+
+  Mem.myfree(glob_leaf_node_data);
+  Mem.myfree(tot_nodelist);
+  Mem.myfree(loc_leaf_node_data);
+  Mem.myfree(byteoffset);
+  Mem.myfree(bytecounts);
+  Mem.myfree(recvoffset);
+  Mem.myfree(recvcounts);
+}
+
+void ngbtree::update_velocities(void)
+{
+  TIMER_START(CPU_NGBTREEUPDATEVEL);
+
+  int nchanged       = 0;
+  int *nodelist      = (int *)Mem.mymalloc("nodelist", D->NTopleaves * sizeof(int));
+  char *flag_changed = (char *)Mem.mymalloc_clear("flag_changed", D->NTopnodes * sizeof(char));
+
+  for(int i = 0; i < Tp->TimeBinsHydro.NActiveParticles; i++)
+    {
+      int target = Tp->TimeBinsHydro.ActiveParticleList[i];
+
+      if(Tp->P[target].getType() == 0)
+        update_vbounds(target, &nchanged, nodelist, flag_changed);
+    }
+
+  for(int timebin = All.HighestSynchronizedTimeBin; timebin >= 0; timebin--)
+    {
+      for(int target = Tp->TimeBinsHydro.FirstInTimeBin[timebin]; target >= 0; target = Tp->TimeBinsHydro.NextInTimeBin[target])
+        if(Tp->P[target].getType() == 0)
+          {
+            update_vbounds(target, &nchanged, nodelist, flag_changed);
+          }
+    }
+
+  finish_vounds_update(nchanged, nodelist);
+
+  Mem.myfree(flag_changed);
+  Mem.myfree(nodelist);
+
+  TIMER_STOP(CPU_NGBTREEUPDATEVEL);
+}
+
+void ngbtree::update_maxhsml(void)
+{
+  TIMER_START(CPU_NGBTREEUPDATEMAXHSML);
+
+  int nchanged       = 0;
+  int *nodelist      = (int *)Mem.mymalloc("nodelist", D->NTopleaves * sizeof(int));
+  char *flag_changed = (char *)Mem.mymalloc_clear("flag_changed", D->NTopnodes * sizeof(char));
+
+  for(int i = 0; i < Tp->TimeBinsHydro.NActiveParticles; i++)
+    {
+      int target = Tp->TimeBinsHydro.ActiveParticleList[i];
+      if(Tp->P[target].getType() == 0)
+        {
+          int no = Father[target];
+
+          while(no >= 0)
+            {
+              ngbnode *nop = get_nodep(no);
+
+              if(nop->Ti_Current != All.Ti_Current)
+                nop->drift_node(All.Ti_Current, Tp);
+
+              if(Tp->SphP[target].Hsml <= nop->MaxHsml && Tp->SphP[target].DtHsml <= nop->MaxDtHsml)
+                break;
+              else
+                {
+                  if(Tp->SphP[target].Hsml > nop->MaxHsml)
+                    nop->MaxHsml = Tp->SphP[target].Hsml;
+
+                  if(Tp->SphP[target].DtHsml > nop->MaxDtHsml)
+                    nop->MaxDtHsml = Tp->SphP[target].DtHsml;
+                }
+
+              if(no < FirstNonTopLevelNode) /* top-level tree-node reached */
+                {
+                  int top_no = no - MaxPart;
+
+                  if(top_no < 0 || top_no >= D->NTopnodes)
+                    Terminate("top_no=%d   D->NTopnodes=%d\n", top_no, D->NTopnodes);
+
+                  if(flag_changed[top_no] == 0)
+                    {
+                      flag_changed[top_no] = 1;
+
+                      nodelist[nchanged++] = no;
+                    }
+                  break;
+                }
+
+              no = nop->father;
+            }
+        }
+    }
+
+  finish_maxhsml_update(nchanged, nodelist);
+
+  Mem.myfree(flag_changed);
+  Mem.myfree(nodelist);
+
+  TIMER_STOP(CPU_NGBTREEUPDATEMAXHSML);
+}
diff --git a/src/ngenic/grid.cc b/src/ngenic/grid.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7c7f66e52d3fe71bc1078d75fc3bc0355247b389
--- /dev/null
+++ b/src/ngenic/grid.cc
@@ -0,0 +1,106 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  grid.cc
+ *
+ *  \brief routines for setting up an unperturbed particle load
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef CREATE_GRID
+
+#include <gsl/gsl_rng.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdlib.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../ngenic/ngenic.h"
+#include "../system/system.h"
+
+void ngenic::create_grid(void)
+{
+  long long gridSize    = All.GridSize;
+  long long partTotal   = gridSize * gridSize * gridSize;
+  long long partPerTask = partTotal / NTask;
+
+  Sp->RegionLen     = All.BoxSize;
+  Sp->FacCoordToInt = pow(2.0, BITS_FOR_POSITIONS) / Sp->RegionLen;
+  Sp->FacIntToCoord = Sp->RegionLen / pow(2.0, BITS_FOR_POSITIONS);
+
+  All.Time = All.TimeBegin;
+
+  double masstot = All.Omega0 * 3 * All.Hubble * All.Hubble / (8 * M_PI * All.G) * All.BoxSize * All.BoxSize * All.BoxSize;
+
+  double m = masstot / (partTotal);
+
+  for(int i = 0; i < NTYPES; i++)
+    All.MassTable[i] = 0.;
+
+  All.MassTable[1] = m;
+
+  Sp->NumGas  = 0;
+  Sp->NumPart = partPerTask;
+
+  if(ThisTask == NTask - 1)
+    {
+      Sp->NumPart = partTotal - Sp->NumPart * (NTask - 1);
+    }
+
+  int max_load, max_sphload;
+  MPI_Allreduce(&Sp->NumPart, &max_load, 1, MPI_INT, MPI_MAX, Communicator);
+  MPI_Allreduce(&Sp->NumGas, &max_sphload, 1, MPI_INT, MPI_MAX, Communicator);
+
+#ifdef GENERATE_GAS_IN_ICS
+  Sp->TotNumGas  = partTotal;
+  Sp->TotNumPart = 2 * partTotal;
+  max_sphload    = max_load;
+  max_load *= 2;
+#else
+  Sp->TotNumPart = partTotal;
+  Sp->TotNumGas  = 0;
+#endif
+
+  Sp->MaxPart    = max_load / (1.0 - 2 * ALLOC_TOLERANCE);
+  Sp->MaxPartSph = max_sphload / (1.0 - 2 * ALLOC_TOLERANCE);
+
+  Sp->allocate_memory();
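+
+  /* Particles are laid out on a regular lattice in row-major order: the linear cell index is
+   * decomposed as ipcell = (x * GridSize + y) * GridSize + z. As a small worked example (values
+   * illustrative only): for GridSize = 2, ipcell = 5 gives x = 5/4 = 1, y = (5%4)/2 = 0, z = 5%2 = 1. */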
+
+  for(int i = 0; i < Sp->NumPart; i++)
+    {
+      long long ipcell = ThisTask * partPerTask + i;
+      int x            = ipcell / (All.GridSize * All.GridSize);
+      int xr           = ipcell % (All.GridSize * All.GridSize);
+      int y            = xr / All.GridSize;
+      int z            = xr % All.GridSize;
+
+      double xyz[3];
+      xyz[0] = x * All.BoxSize / All.GridSize;
+      xyz[1] = y * All.BoxSize / All.GridSize;
+      xyz[2] = z * All.BoxSize / All.GridSize;
+
+      Sp->pos_to_intpos(xyz, Sp->P[i].IntPos);
+
+      Sp->P[i].Vel[0] = 0.;
+      Sp->P[i].Vel[1] = 0.;
+      Sp->P[i].Vel[2] = 0.;
+
+      Sp->P[i].ID.set(ipcell + 1);
+
+      Sp->P[i].setType(1);
+    }
+
+  mpi_printf("NGENIC: generated grid of size %d\n", All.GridSize);
+}
+
+#endif
diff --git a/src/ngenic/ngenic.cc b/src/ngenic/ngenic.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9c592100fb756906d821e8631b9258f01a33fa08
--- /dev/null
+++ b/src/ngenic/ngenic.cc
@@ -0,0 +1,1191 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  ngenic.cc
+ *
+ *  \brief sets up cosmological initial conditions
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef NGENIC
+
+#include <gsl/gsl_rng.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdlib.h>
+#include <algorithm>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../logs/logs.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../mpi_utils/shared_mem_handler.h"
+#include "../ngenic/ngenic.h"
+#include "../pm/pm_mpi_fft.h"
+#include "../system/system.h"
+
+#ifdef GRIDX
+#undef GRIDX
+#undef GRIDY
+#undef GRIDZ
+#undef INTCELL
+#endif
+
+#define GRIDX (NGENIC)
+#define GRIDY (NGENIC)
+#define GRIDZ (NGENIC)
+
+#define INTCELL ((~((MyIntPosType)0)) / GRIDX + 1)
+
+#define GRIDz (GRIDZ / 2 + 1)
+#define GRID2 (2 * GRIDz)
+
+#define FI(x, y, z) (((large_array_offset)GRID2) * (GRIDY * (x) + (y)) + (z))
+#define FC(c, z) (((large_array_offset)GRID2) * ((c)-myplan.firstcol_XY) + (z))
+
+#if(GRIDZ > 1024)
+typedef long long large_array_offset; /* use a larger data type in this case so that we can always address all cells of the 3D grid
+                                         with a single index */
+#else
+typedef unsigned int large_array_offset;
+#endif
+
+#ifdef NUMPART_PER_TASK_LARGE
+typedef long long large_numpart_type; /* if there is a risk that the local particle number times 8 overflows a 32-bit integer, this
+                                         data type should be used */
+#else
+typedef int large_numpart_type;
+#endif
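+
+/* Note on the grid layout (a restatement of the macros above, not new behaviour): the real-space
+ * array is padded in the z-direction to GRID2 = 2*(GRIDZ/2+1) entries per (x,y) column, as needed
+ * for an in-place real-to-complex FFT. FI(x, y, z) addresses cell (x, y, z) of a field stored with
+ * x slowest and the padded z fastest, while FC(c, z) does the same for a column-wise (XY) storage
+ * relative to myplan.firstcol_XY. */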
+
+void ngenic::ngenic_displace_particles(void)
+{
+  TIMER_START(CPU_NGENIC);
+
+  mpi_printf("NGENIC: computing displacement fields...\n");
+
+  All.set_cosmo_factors_for_current_time();
+
+  double vel_prefac1 = All.cf_atime * All.cf_hubble_a * ngenic_f1_omega(All.cf_atime);
+  double vel_prefac2 = All.cf_atime * All.cf_hubble_a * ngenic_f2_omega(All.cf_atime);
+
+  vel_prefac1 /= sqrt(All.cf_atime); /* converts to Gadget velocity */
+  vel_prefac2 /= sqrt(All.cf_atime); /* converts to Gadget velocity */
+
+  mpi_printf("NGENIC: vel_prefac1= %g  hubble_a=%g   fom1=%g\n", vel_prefac1, All.cf_hubble_a, ngenic_f1_omega(All.cf_atime));
+  mpi_printf("NGENIC: vel_prefac2= %g  hubble_a=%g   fom2=%g\n", vel_prefac2, All.cf_hubble_a, ngenic_f2_omega(All.cf_atime));
+
+  rnd_generator_conjugate = gsl_rng_alloc(gsl_rng_ranlxd1);
+  rnd_generator           = gsl_rng_alloc(gsl_rng_ranlxd1);
+  gsl_rng_set(rnd_generator, All.NgenicSeed);
+
+  ngenic_initialize_powerspectrum();
+
+  ngenic_initialize_ffts();
+
+  if(!(seedtable = (unsigned int *)Mem.mymalloc("seedtable", NGENIC * NGENIC * sizeof(unsigned int))))
+    Terminate("could not allocate seed table");
+
+  for(int i = 0; i < NGENIC / 2; i++)
+    {
+      for(int j = 0; j < i; j++)
+        seedtable[i * NGENIC + j] = 0x7fffffff * gsl_rng_uniform(rnd_generator);
+
+      for(int j = 0; j < i + 1; j++)
+        seedtable[j * NGENIC + i] = 0x7fffffff * gsl_rng_uniform(rnd_generator);
+
+      for(int j = 0; j < i; j++)
+        seedtable[(NGENIC - 1 - i) * NGENIC + j] = 0x7fffffff * gsl_rng_uniform(rnd_generator);
+
+      for(int j = 0; j < i + 1; j++)
+        seedtable[(NGENIC - 1 - j) * NGENIC + i] = 0x7fffffff * gsl_rng_uniform(rnd_generator);
+
+      for(int j = 0; j < i; j++)
+        seedtable[i * NGENIC + (NGENIC - 1 - j)] = 0x7fffffff * gsl_rng_uniform(rnd_generator);
+
+      for(int j = 0; j < i + 1; j++)
+        seedtable[j * NGENIC + (NGENIC - 1 - i)] = 0x7fffffff * gsl_rng_uniform(rnd_generator);
+
+      for(int j = 0; j < i; j++)
+        seedtable[(NGENIC - 1 - i) * NGENIC + (NGENIC - 1 - j)] = 0x7fffffff * gsl_rng_uniform(rnd_generator);
+
+      for(int j = 0; j < i + 1; j++)
+        seedtable[(NGENIC - 1 - j) * NGENIC + (NGENIC - 1 - i)] = 0x7fffffff * gsl_rng_uniform(rnd_generator);
+    }
+
+  if(Shmem.Island_NTask != Shmem.World_NTask)
+    {
+      // We actually have multiple shared memory nodes, each of which sets aside one MPI rank for shared memory communication.
+      // In this case, move the seedtable to the communication rank so that this memory is consumed only once per node.
+
+      if(Shmem.Island_ThisTask == 0)
+        {
+          size_t tab_len = NGENIC * NGENIC * sizeof(unsigned int);
+
+          MPI_Send(&tab_len, sizeof(tab_len), MPI_BYTE, Shmem.MyShmRankInGlobal, TAG_TABLE_ALLOC, MPI_COMM_WORLD);
+          MPI_Send(seedtable, tab_len, MPI_BYTE, Shmem.MyShmRankInGlobal, TAG_DMOM, MPI_COMM_WORLD);
+        }
+
+      Mem.myfree(seedtable);
+
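+      // The shared-memory handler rank (the last rank of the node's shared-memory communicator) allocates the
+      // table in the node's shared-memory segment in response to the TAG_TABLE_ALLOC message above and
+      // broadcasts the offset of that allocation; every rank on the node can then address the single copy
+      // through SharedMemBaseAddr instead of holding its own.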
+      ptrdiff_t off;
+      MPI_Bcast(&off, sizeof(ptrdiff_t), MPI_BYTE, Shmem.Island_NTask - 1, Shmem.SharedMemComm);
+
+      seedtable = (unsigned int *)((char *)Shmem.SharedMemBaseAddr[Shmem.Island_NTask - 1] + off);
+    }
+
+  ngenic_distribute_particles();
+
+  /* allocate displacement vectors */
+  Pdisp = (disp_data *)Mem.mymalloc_clear("disp_data", Sp->NumPart * sizeof(disp_data));
+
+#if defined(MULTICOMPONENTGLASSFILE) && defined(DIFFERENT_TRANSFER_FUNC)
+  for(Type = MinType; Type <= MaxType; Type++)
+#endif
+    {
+#ifdef NGENIC_2LPT
+
+      /* allocate temporary buffers for second derivatives */
+      fft_real *d2phi1[3];
+      for(int axes = 0; axes < 3; axes++)
+        d2phi1[axes] = (fft_real *)Mem.mymalloc_clear("d2Phi1", maxfftsize * sizeof(fft_real));
+
+      for(int axes = 0; axes < 3; axes++)
+        {
+          mpi_printf("NGENIC_2LPT: Computing secondary source term, derivatices %d %d\n", axes, axes);
+
+          fft_real *disp = (fft_real *)Mem.mymalloc("disp", maxfftsize * sizeof(fft_real));
+
+          ngenic_setup_modes_in_kspace((fft_complex *)disp);
+          ngenic_get_derivate_from_fourier_field(axes, axes, (fft_complex *)disp);
+
+          memcpy(d2phi1[axes], disp, maxfftsize * sizeof(fft_real));
+
+          Mem.myfree(disp);
+        }
+
+      /* allocate second source potential */
+      fft_real *Phi2 = (fft_real *)Mem.mymalloc_movable(&Phi2, "Phi2", maxfftsize * sizeof(fft_real));
+
+      for(size_t n = 0; n < maxfftsize; n++)
+        Phi2[n] = d2phi1[0][n] * d2phi1[1][n] + d2phi1[0][n] * d2phi1[2][n] + d2phi1[1][n] * d2phi1[2][n];
+
+      for(int axes = 2; axes >= 0; axes--)
+        Mem.myfree_movable(d2phi1[axes]);
+
+      for(int i = 0; i < 3; i++)
+        for(int j = i + 1; j < 3; j++)
+          {
+            mpi_printf("NGENIC_2LPT: Computing secondary source term, derivatices %d %d\n", i, j);
+
+            fft_real *disp = (fft_real *)Mem.mymalloc("disp", maxfftsize * sizeof(fft_real));
+
+            ngenic_setup_modes_in_kspace((fft_complex *)disp);
+            ngenic_get_derivate_from_fourier_field(i, j, (fft_complex *)disp);
+
+            for(size_t n = 0; n < maxfftsize; n++)
+              Phi2[n] -= disp[n] * disp[n];
+
+            Mem.myfree(disp);
+          }
+
+      mpi_printf("NGENIC_2LPT: Secondary source term computed in real space\n");
+
+      /* Do a forward inplace-FFT to get Phi2 in Fourier space */
+      ngenic_compute_transform_of_source_potential(Phi2);
+
+      mpi_printf("NGENIC_2LPT: Done transforming it to k-space\n");
+
+      for(int axes = 0; axes < 3; axes++)
+        {
+          mpi_printf("NGENIC_2LPT: Obtaining second order displacements for axes=%d\n", axes);
+
+          fft_real *disp = (fft_real *)Mem.mymalloc("disp", maxfftsize * sizeof(fft_real));
+
+          memcpy(disp, Phi2, maxfftsize * sizeof(fft_real));
+
+          ngenic_get_derivate_from_fourier_field(axes, -1, (fft_complex *)disp);
+
+          ngenic_readout_disp(disp, axes, 3.0 / 7, 3.0 / 7 * vel_prefac2);
+
+          Mem.myfree(disp);
+        }
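+      /* The factor 3/7 passed to ngenic_readout_disp() above reflects the usual 2LPT approximation
+       * D2 ~ -(3/7) * D1^2 (exact for Omega_m = 1 and a good approximation otherwise), so the second-order
+       * displacement enters with 3/7 of the first-order normalization and its velocity with
+       * 3/7 * vel_prefac2. */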
+
+      Mem.myfree(Phi2);
+#endif
+
+      /* now carry out Zeldovich approximation, yielding first order displacements */
+      for(int axes = 0; axes < 3; axes++)
+        {
+          mpi_printf("NGENIC_2LPT: Obtaining Zeldovich displacements for axes=%d\n", axes);
+
+          fft_real *disp = (fft_real *)Mem.mymalloc("disp", maxfftsize * sizeof(fft_real));
+
+          ngenic_setup_modes_in_kspace((fft_complex *)disp);
+
+          ngenic_get_derivate_from_fourier_field(axes, -1, (fft_complex *)disp);
+
+          ngenic_readout_disp(disp, axes, 1.0, vel_prefac1);
+
+          Mem.myfree(disp);
+        }
+    }
+
+  /* now add displacement to Lagrangian coordinates  */
+  double maxdisp = 0;
+  double maxvel  = 0;
+  for(int n = 0; n < Sp->NumPart; n++)
+    {
+      double posdiff[3] = {Pdisp[n].deltapos[0], Pdisp[n].deltapos[1], Pdisp[n].deltapos[2]};
+
+      MyIntPosType delta[3];
+      Sp->pos_to_signedintpos(posdiff, (MySignedIntPosType *)delta);
+
+      for(int axes = 0; axes < 3; axes++)
+        {
+          Sp->P[n].IntPos[axes] += delta[axes];
+
+          if(Pdisp[n].deltapos[axes] > maxdisp)
+            maxdisp = Pdisp[n].deltapos[axes];
+
+          if(Sp->P[n].Vel[axes] > maxvel)
+            maxvel = Sp->P[n].Vel[axes];
+        }
+    }
+
+  double max_disp_global, maxvel_global;
+  MPI_Reduce(&maxdisp, &max_disp_global, 1, MPI_DOUBLE, MPI_MAX, 0, Communicator);
+  MPI_Reduce(&maxvel, &maxvel_global, 1, MPI_DOUBLE, MPI_MAX, 0, Communicator);
+
+  mpi_printf("\nNGENIC: Maximum displacement: %g, in units of the part-spacing= %g\n\n", max_disp_global,
+             max_disp_global / (All.BoxSize / NGENIC));
+  mpi_printf("\nNGENIC: Maximum velocity component: %g\n\n", maxvel_global);
+
+  Mem.myfree(Pdisp);
+
+  Mem.myfree(partin);
+  Mem.myfree(Rcvpm_offset);
+  Mem.myfree(Rcvpm_count);
+  Mem.myfree(Sndpm_offset);
+  Mem.myfree(Sndpm_count);
+
+  if(Shmem.Island_NTask != Shmem.World_NTask)
+    {
+      if(Shmem.Island_ThisTask == 0)
+        {
+          // tell the shared memory handler rank to free the seed table that was moved there
+          size_t tab_len = NGENIC * NGENIC * sizeof(unsigned int);
+          MPI_Send(&tab_len, sizeof(tab_len), MPI_BYTE, Shmem.MyShmRankInGlobal, TAG_TABLE_FREE, MPI_COMM_WORLD);
+        }
+    }
+  else
+    {
+      Mem.myfree(seedtable);
+    }
+
+#ifndef FFT_COLUMN_BASED
+  my_slab_based_fft_free(&myplan);
+#else
+  my_column_based_fft_free(&myplan);
+#endif
+
+  FFTW(destroy_plan)(myplan.forward_plan_zdir);
+  FFTW(destroy_plan)(myplan.forward_plan_ydir);
+  FFTW(destroy_plan)(myplan.forward_plan_xdir);
+
+  FFTW(destroy_plan)(myplan.backward_plan_zdir);
+  FFTW(destroy_plan)(myplan.backward_plan_ydir);
+  FFTW(destroy_plan)(myplan.backward_plan_xdir);
+
+  if(All.PowerSpectrumType == 2)
+    free_power_table();
+
+  gsl_rng_free(rnd_generator);
+  gsl_rng_free(rnd_generator_conjugate);
+
+  print_spec();
+
+  TIMER_STOP(CPU_NGENIC);
+}
+
+void ngenic::ngenic_distribute_particles(void)
+{
+  Sndpm_count  = (size_t *)Mem.mymalloc("Sndpm_count", NTask * sizeof(size_t));
+  Sndpm_offset = (size_t *)Mem.mymalloc("Sndpm_offset", NTask * sizeof(size_t));
+  Rcvpm_count  = (size_t *)Mem.mymalloc("Rcvpm_count", NTask * sizeof(size_t));
+  Rcvpm_offset = (size_t *)Mem.mymalloc("Rcvpm_offset", NTask * sizeof(size_t));
+
+#ifdef FFT_COLUMN_BASED
+  int columns         = GRIDX * GRIDY;
+  int avg             = (columns - 1) / NTask + 1;
+  int exc             = NTask * avg - columns;
+  int tasklastsection = NTask - exc;
+  int pivotcol        = tasklastsection * avg;
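+  /* Sketch of this column decomposition: the GRIDX * GRIDY xy-columns are split such that the first
+   * 'tasklastsection' tasks hold 'avg' columns each and the remaining tasks hold 'avg - 1'. A column with
+   * index c therefore belongs to task c / avg if c < pivotcol, and to
+   * (c - pivotcol) / (avg - 1) + tasklastsection otherwise. */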
+#endif
+
+  /* determine the slabs/columns each particle accesses */
+  {
+    size_t *send_count = Sndpm_count;
+
+    for(int j = 0; j < NTask; j++)
+      send_count[j] = 0;
+
+    for(int i = 0; i < Sp->NumPart; i++)
+      {
+        int slab_x  = Sp->P[i].IntPos[0] / INTCELL;
+        int slab_xx = slab_x + 1;
+
+        if(slab_xx >= GRIDX)
+          slab_xx = 0;
+
+#ifndef FFT_COLUMN_BASED
+        int task0 = myplan.slab_to_task[slab_x];
+        int task1 = myplan.slab_to_task[slab_xx];
+
+        send_count[task0]++;
+        if(task0 != task1)
+          send_count[task1]++;
+#else
+        int slab_y  = Sp->P[i].IntPos[1] / INTCELL;
+        int slab_yy = slab_y + 1;
+
+        if(slab_yy >= GRIDY)
+          slab_yy = 0;
+
+        int column0 = slab_x * GRIDY + slab_y;
+        int column1 = slab_x * GRIDY + slab_yy;
+        int column2 = slab_xx * GRIDY + slab_y;
+        int column3 = slab_xx * GRIDY + slab_yy;
+
+        int task0, task1, task2, task3;
+
+        if(column0 < pivotcol)
+          task0 = column0 / avg;
+        else
+          task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;
+
+        if(column1 < pivotcol)
+          task1 = column1 / avg;
+        else
+          task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;
+
+        if(column2 < pivotcol)
+          task2 = column2 / avg;
+        else
+          task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;
+
+        if(column3 < pivotcol)
+          task3 = column3 / avg;
+        else
+          task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;
+
+        send_count[task0]++;
+        if(task1 != task0)
+          send_count[task1]++;
+        if(task2 != task1 && task2 != task0)
+          send_count[task2]++;
+        if(task3 != task0 && task3 != task1 && task3 != task2)
+          send_count[task3]++;
+#endif
+      }
+  }
+
+  /* build the send offset table as a prefix sum over the send counts */
+  Sndpm_offset[0] = 0;
+  for(int i = 1; i < NTask; i++)
+    {
+      int ind      = i;
+      int ind_prev = i - 1;
+
+      Sndpm_offset[ind] = Sndpm_offset[ind_prev] + Sndpm_count[ind_prev];
+    }
+
+  MPI_Alltoall(Sndpm_count, sizeof(size_t), MPI_BYTE, Rcvpm_count, sizeof(size_t), MPI_BYTE, Communicator);
+
+  nimport = 0, nexport = 0, Rcvpm_offset[0] = 0, Sndpm_offset[0] = 0;
+  for(int j = 0; j < NTask; j++)
+    {
+      nexport += Sndpm_count[j];
+      nimport += Rcvpm_count[j];
+
+      if(j > 0)
+        {
+          Sndpm_offset[j] = Sndpm_offset[j - 1] + Sndpm_count[j - 1];
+          Rcvpm_offset[j] = Rcvpm_offset[j - 1] + Rcvpm_count[j - 1];
+        }
+    }
+
+  /* allocate import and export buffer */
+  partin  = (partbuf *)Mem.mymalloc("partin", nimport * sizeof(partbuf));
+  partout = (partbuf *)Mem.mymalloc("partout", nexport * sizeof(partbuf));
+
+  {
+    size_t *send_count  = Sndpm_count;
+    size_t *send_offset = Sndpm_offset;
+
+    for(int j = 0; j < NTask; j++)
+      send_count[j] = 0;
+
+    /* fill export buffer */
+    for(int i = 0; i < Sp->NumPart; i++)
+      {
+        int slab_x  = Sp->P[i].IntPos[0] / INTCELL;
+        int slab_xx = slab_x + 1;
+
+        if(slab_xx >= GRIDX)
+          slab_xx = 0;
+
+#ifndef FFT_COLUMN_BASED
+        int task0 = myplan.slab_to_task[slab_x];
+        int task1 = myplan.slab_to_task[slab_xx];
+
+        size_t ind0 = send_offset[task0] + send_count[task0]++;
+        for(int j = 0; j < 3; j++)
+          partout[ind0].IntPos[j] = Sp->P[i].IntPos[j];
+
+        if(task0 != task1)
+          {
+            size_t ind1 = send_offset[task1] + send_count[task1]++;
+            for(int j = 0; j < 3; j++)
+              partout[ind1].IntPos[j] = Sp->P[i].IntPos[j];
+          }
+#else
+        int slab_y  = Sp->P[i].IntPos[1] / INTCELL;
+        int slab_yy = slab_y + 1;
+
+        if(slab_yy >= GRIDY)
+          slab_yy = 0;
+
+        int column0 = slab_x * GRIDY + slab_y;
+        int column1 = slab_x * GRIDY + slab_yy;
+        int column2 = slab_xx * GRIDY + slab_y;
+        int column3 = slab_xx * GRIDY + slab_yy;
+
+        int task0, task1, task2, task3;
+
+        if(column0 < pivotcol)
+          task0 = column0 / avg;
+        else
+          task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;
+
+        if(column1 < pivotcol)
+          task1 = column1 / avg;
+        else
+          task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;
+
+        if(column2 < pivotcol)
+          task2 = column2 / avg;
+        else
+          task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;
+
+        if(column3 < pivotcol)
+          task3 = column3 / avg;
+        else
+          task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;
+
+        size_t ind0 = send_offset[task0] + send_count[task0]++;
+        for(int j = 0; j < 3; j++)
+          partout[ind0].IntPos[j] = Sp->P[i].IntPos[j];
+
+        if(task1 != task0)
+          {
+            size_t ind1 = send_offset[task1] + send_count[task1]++;
+
+            for(int j = 0; j < 3; j++)
+              partout[ind1].IntPos[j] = Sp->P[i].IntPos[j];
+          }
+        if(task2 != task1 && task2 != task0)
+          {
+            size_t ind2 = send_offset[task2] + send_count[task2]++;
+
+            for(int j = 0; j < 3; j++)
+              partout[ind2].IntPos[j] = Sp->P[i].IntPos[j];
+          }
+        if(task3 != task0 && task3 != task1 && task3 != task2)
+          {
+            size_t ind3 = send_offset[task3] + send_count[task3]++;
+
+            for(int j = 0; j < 3; j++)
+              partout[ind3].IntPos[j] = Sp->P[i].IntPos[j];
+          }
+#endif
+      }
+  }
+
+  int flag_big = 0, flag_big_all;
+  for(int i = 0; i < NTask; i++)
+    if(Sndpm_count[i] * sizeof(partbuf) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
+      flag_big = 1;
+
+  /* Raise a flag if any of the send sizes is above our transfer limit; in that case we will
+   * transfer the data in chunks.
+   */
+  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
+
+  /* exchange particle data */
+  myMPI_Alltoallv(partout, Sndpm_count, Sndpm_offset, partin, Rcvpm_count, Rcvpm_offset, sizeof(partbuf), flag_big_all, Communicator);
+
+  Mem.myfree(partout);
+}
+
+void ngenic::ngenic_compute_transform_of_source_potential(fft_real *pot)
+{
+  fft_real *workspace = (fft_real *)Mem.mymalloc("workspace", maxfftsize * sizeof(fft_real));
+
+#ifndef FFT_COLUMN_BASED
+  my_slab_based_fft(&myplan, &pot[0], &workspace[0], +1);
+#else
+  my_column_based_fft(&myplan, pot, workspace, +1);  // result is in workspace, not in Phi2
+  memcpy(pot, workspace, maxfftsize * sizeof(fft_real));
+#endif
+
+  Mem.myfree(workspace);
+
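+  /* FFTW transforms are unnormalized; a forward plus inverse transform therefore picks up a factor of
+   * GRIDX * GRIDY * GRIDZ, which is compensated here. */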
+  double normfac = 1 / (((double)GRIDX) * GRIDY * GRIDZ);
+
+  for(size_t n = 0; n < maxfftsize; n++)
+    pot[n] *= normfac;
+}
+
+/* This function computes derivatives of the potential phi that solves nabla^2 phi = grid.
+ * For axes2 < 0, the component 'axes1' (0, 1, or 2) of the gradient of phi is returned;
+ * for axes2 >= 0, the mixed second derivative with respect to axes1 and axes2 is computed instead.
+ * The function receives the Fourier transform of grid as input, and this field is overwritten
+ * with the result, transformed back to real space.
+ */
+void ngenic::ngenic_get_derivate_from_fourier_field(int axes1, int axes2, fft_complex *fft_of_grid)
+{
+  double kfacx = 2.0 * M_PI / All.BoxSize;
+  double kfacy = 2.0 * M_PI / All.BoxSize;
+  double kfacz = 2.0 * M_PI / All.BoxSize;
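+  /* Conventions assumed in the loop below: for a field with Fourier modes delta_k, the potential solving
+   * nabla^2 phi = delta has phi_k = -delta_k / k^2, and each derivative d/dx_a corresponds to a factor
+   * i*k_a. For axes2 < 0 a mode is thus multiplied by (+/-)i * k_axes1 / k^2 (the overall sign being fixed
+   * by the FFT sign convention of this code), while for axes2 >= 0 the factor k_axes1 * k_axes2 / k^2
+   * yields the mixed second derivative of the potential. */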
+
+#ifdef FFT_COLUMN_BASED
+  for(large_array_offset ip = 0; ip < myplan.second_transposed_ncells; ip += GRIDX)
+    {
+      large_array_offset ipcell = ip + ((large_array_offset)myplan.second_transposed_firstcol) * GRIDX;
+      int y                     = ipcell / (GRIDX * GRIDz);
+      int yr                    = ipcell % (GRIDX * GRIDz);
+      int z                     = yr / GRIDX;
+      if(yr % GRIDX != 0)  // Note: check that x-columns are really complete
+        Terminate("x-column seems incomplete. This is not expected");
+#else
+  for(int y = myplan.slabstart_y; y < myplan.slabstart_y + myplan.nslab_y; y++)
+    for(int z = 0; z < GRIDz; z++)
+      {
+#endif
+
+      for(int x = 0; x < GRIDX; x++)
+        {
+          int xx, yy, zz;
+
+          if(x >= (GRIDX / 2))
+            xx = x - GRIDX;
+          else
+            xx = x;
+          if(y >= (GRIDY / 2))
+            yy = y - GRIDY;
+          else
+            yy = y;
+          if(z >= (GRIDZ / 2))
+            zz = z - GRIDZ;
+          else
+            zz = z;
+
+          double kvec[3];
+          kvec[0] = kfacx * xx;
+          kvec[1] = kfacy * yy;
+          kvec[2] = kfacz * zz;
+
+          double kmag2 = kvec[0] * kvec[0] + kvec[1] * kvec[1] + kvec[2] * kvec[2];
+
+          double smth = 1;
+
+#ifdef CORRECT_CIC
+          if(axes2 >= 0)
+            {
+              /* do deconvolution of CIC interpolation */
+              double fx = 1, fy = 1, fz = 1;
+              if(kvec[0] != 0)
+                {
+                  fx = (kvec[0] * All.BoxSize / 2) / NGENIC;
+                  fx = sin(fx) / fx;
+                }
+              if(kvec[1] != 0)
+                {
+                  fy = (kvec[1] * All.BoxSize / 2) / NGENIC;
+                  fy = sin(fy) / fy;
+                }
+              if(kvec[2] != 0)
+                {
+                  fz = (kvec[2] * All.BoxSize / 2) / NGENIC;
+                  fz = sin(fz) / fz;
+                }
+              double ff = 1 / (fx * fy * fz);
+              smth      = ff * ff;
+              /* end deconvolution */
+            }
+#endif
+
+#ifndef FFT_COLUMN_BASED
+          large_array_offset elem = ((large_array_offset)GRIDz) * (GRIDX * (y - myplan.slabstart_y) + x) + z;
+#else
+            large_array_offset elem = ip + x;
+#endif
+
+          fft_real re = smth * fft_of_grid[elem][0];
+          fft_real im = smth * fft_of_grid[elem][1];
+
+          if(axes2 < 0)
+            {
+              /* first derivative */
+              fft_of_grid[elem][0] = (kmag2 > 0.0 ? -kvec[axes1] / kmag2 * im : 0.0);
+              fft_of_grid[elem][1] = (kmag2 > 0.0 ? kvec[axes1] / kmag2 * re : 0.0);
+            }
+          else
+            {
+              /* second derivative */
+              fft_of_grid[elem][0] = (kmag2 > 0.0 ? kvec[axes1] * kvec[axes2] / kmag2 * re : 0.0);
+              fft_of_grid[elem][1] = (kmag2 > 0.0 ? kvec[axes1] * kvec[axes2] / kmag2 * im : 0.0);
+            }
+        }
+    }
+
+#ifdef FFT_COLUMN_BASED
+  if(myplan.second_transposed_firstcol == 0)
+    fft_of_grid[0][0] = fft_of_grid[0][1] = 0.0;
+#else
+  if(myplan.slabstart_y == 0)
+    fft_of_grid[0][0] = fft_of_grid[0][1] = 0.0;
+#endif
+
+  /* Do the inverse FFT to get the displacement field */
+  fft_real *workspace = (fft_real *)Mem.mymalloc("workspace", maxfftsize * sizeof(fft_real));
+
+#ifndef FFT_COLUMN_BASED
+  my_slab_based_fft(&myplan, &fft_of_grid[0], &workspace[0], -1);
+#else
+  my_column_based_fft(&myplan, fft_of_grid, workspace, -1);  // result is in workspace
+  memcpy(fft_of_grid, workspace, maxfftsize * sizeof(fft_real));
+#endif
+
+  Mem.myfree(workspace);
+}
+
+void ngenic::ngenic_setup_modes_in_kspace(fft_complex *fft_of_grid)
+{
+  double fac = pow(2 * M_PI / All.BoxSize, 1.5);
+
+  /* clear local FFT-mesh */
+  memset(fft_of_grid, 0, maxfftsize * sizeof(fft_real));
+
+  mpi_printf("NGENIC: setting up modes in kspace...\n");
+
+  double kfacx = 2.0 * M_PI / All.BoxSize;
+  double kfacy = 2.0 * M_PI / All.BoxSize;
+  double kfacz = 2.0 * M_PI / All.BoxSize;
+
+#ifdef FFT_COLUMN_BASED
+  for(large_array_offset ip = 0; ip < myplan.second_transposed_ncells; ip += GRIDX)
+    {
+      large_array_offset ipcell = ip + ((large_array_offset)myplan.second_transposed_firstcol) * GRIDX;
+      int y                     = ipcell / (GRIDX * GRIDz);
+      int yr                    = ipcell % (GRIDX * GRIDz);
+      int z                     = yr / GRIDX;
+      if(yr % GRIDX != 0)  // Note: check that x-columns are really complete
+        Terminate("x-column seems incomplete. This is not expected");
+#else
+  for(int y = myplan.slabstart_y; y < myplan.slabstart_y + myplan.nslab_y; y++)
+    for(int z = 0; z < GRIDz; z++)
+      {
+#endif
+
+      // let's use the y and z plane here, because the x-column is available in full for both FFT schemes
+      gsl_rng_set(rnd_generator, seedtable[y * NGENIC + z]);
+
+      // we also create the modes for the conjugate column so that we can fulfill the reality constraint
+      // by using the conjugate of the corresponding mode if needed
+      int y_conj, z_conj;
+      if(y > 0)
+        y_conj = GRIDY - y;
+      else
+        y_conj = 0;
+
+      if(z > 0)
+        z_conj = GRIDZ - z;
+      else
+        z_conj = 0;
+
+      gsl_rng_set(rnd_generator_conjugate, seedtable[y_conj * NGENIC + z_conj]);
+
+#ifndef NGENIC_FIX_MODE_AMPLITUDES
+      double mode_ampl[GRIDX], mode_ampl_conj[GRIDX];
+#endif
+      double mode_phase[GRIDX], mode_phase_conj[GRIDX];
+
+      // in this loop we precompute the modes for both columns, from low-k to high-k,
+      // so that after an increase of resolution, one gets the same modes plus new ones
+      for(int xoff = 0; xoff < GRIDX / 2; xoff++)
+        for(int side = 0; side < 2; side++)
+          {
+            int x;
+            if(side == 0)
+              x = xoff;
+            else
+              x = GRIDX - 1 - xoff;
+
+            double phase      = gsl_rng_uniform(rnd_generator) * 2 * M_PI;
+            double phase_conj = gsl_rng_uniform(rnd_generator_conjugate) * 2 * M_PI;
+
+#ifdef NGENIC_MIRROR_PHASES
+            phase += M_PI;
+            if(phase >= 2 * M_PI)
+              phase -= 2 * M_PI;
+
+            phase_conj += M_PI;
+            if(phase_conj >= 2 * M_PI)
+              phase_conj -= 2 * M_PI;
+#endif
+            mode_phase[x]      = phase;
+            mode_phase_conj[x] = phase_conj;
+
+#ifndef NGENIC_FIX_MODE_AMPLITUDES
+            double ampl;
+            do
+              {
+                ampl = gsl_rng_uniform(rnd_generator);
+              }
+            while(ampl == 0);
+
+            double ampl_conj;
+            do
+              {
+                ampl_conj = gsl_rng_uniform(rnd_generator_conjugate);
+              }
+            while(ampl_conj == 0);
+
+            mode_ampl[x] = ampl;
+
+            mode_ampl_conj[x] = ampl_conj;
+#endif
+          }
+
+      // now let's populate the full x-column of modes
+      for(int x = 0; x < GRIDX; x++)
+        {
+          int xx, yy, zz;
+
+          if(x >= (GRIDX / 2))
+            xx = x - GRIDX;
+          else
+            xx = x;
+          if(y >= (GRIDY / 2))
+            yy = y - GRIDY;
+          else
+            yy = y;
+          if(z >= (GRIDZ / 2))
+            zz = z - GRIDZ;
+          else
+            zz = z;
+
+          double kvec[3];
+          kvec[0] = kfacx * xx;
+          kvec[1] = kfacy * yy;
+          kvec[2] = kfacz * zz;
+
+          double kmag2 = kvec[0] * kvec[0] + kvec[1] * kvec[1] + kvec[2] * kvec[2];
+          double kmag  = sqrt(kmag2);
+
+          if(All.SphereMode == 1)
+            {
+              if(kmag * All.BoxSize / (2 * M_PI) > All.NSample / 2) /* select a sphere in k-space */
+                continue;
+            }
+          else
+            {
+              if(fabs(kvec[0]) * All.BoxSize / (2 * M_PI) > All.NSample / 2)
+                continue;
+              if(fabs(kvec[1]) * All.BoxSize / (2 * M_PI) > All.NSample / 2)
+                continue;
+              if(fabs(kvec[2]) * All.BoxSize / (2 * M_PI) > All.NSample / 2)
+                continue;
+            }
+
+          double p_of_k = ngenic_power_spec(kmag);
+
+          /* Note: kmag and p_of_k are unaffected by whether or not we use the conjugate mode */
+
+          int conjugate_flag = 0;
+
+          if(z == 0 || z == GRIDZ / 2)
+            {
+              if(x > GRIDX / 2 && x < GRIDX)
+                conjugate_flag = 1;
+              else if(x == 0 || x == GRIDX / 2)
+                {
+                  if(y > GRIDY / 2 && y < GRIDY)
+                    conjugate_flag = 1;
+                  else if(y == 0 || y == GRIDY / 2)
+                    {
+                      continue;
+                    }
+                }
+            }
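+          /* Reality of the field requires delta(-k) = conj(delta(k)). On the z = 0 and z = GRIDZ/2 planes a
+           * mode and its mirror both fall into the stored half-space, so half of these entries (those with
+           * conjugate_flag set) are constructed as the conjugate of their mirror mode, drawn from the
+           * mirrored random column. The self-conjugate corner modes are skipped and therefore stay at the
+           * zero value set by the memset above. */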
+
+          // determine location of conjugate mode in x column
+          int x_conj;
+
+          if(x > 0)
+            x_conj = GRIDX - x;
+          else
+            x_conj = 0;
+
+#ifndef NGENIC_FIX_MODE_AMPLITUDES
+          if(conjugate_flag)
+            p_of_k *= -log(mode_ampl_conj[x_conj]);
+          else
+            p_of_k *= -log(mode_ampl[x]);
+#endif
+
+          double delta = fac * sqrt(p_of_k) / Dplus; /* scale back to starting redshift */
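+          /* With u uniform in (0,1], -log(u) is exponentially distributed, so the multiplication of p_of_k
+           * by -log(mode_ampl...) above gives Rayleigh-distributed mode amplitudes, as appropriate for a
+           * Gaussian random field; with NGENIC_FIX_MODE_AMPLITUDES the amplitude is instead fixed to its
+           * rms value. The division by Dplus rescales the z=0 normalized spectrum to the starting
+           * redshift. */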
+
+#ifndef FFT_COLUMN_BASED
+          large_array_offset elem = ((large_array_offset)GRIDz) * (GRIDX * (y - myplan.slabstart_y) + x) + z;
+#else
+            large_array_offset elem = ip + x;
+#endif
+
+          if(conjugate_flag)
+            {
+              fft_of_grid[elem][0] = delta * cos(mode_phase_conj[x_conj]);
+              fft_of_grid[elem][1] = -delta * sin(mode_phase_conj[x_conj]);
+            }
+          else
+            {
+              fft_of_grid[elem][0] = delta * cos(mode_phase[x]);
+              fft_of_grid[elem][1] = delta * sin(mode_phase[x]);
+            }
+        }
+    }
+}
+
+void ngenic::ngenic_readout_disp(fft_real *grid, int axis, double pfac, double vfac)
+{
+#ifdef FFT_COLUMN_BASED
+  int columns         = GRIDX * GRIDY;
+  int avg             = (columns - 1) / NTask + 1;
+  int exc             = NTask * avg - columns;
+  int tasklastsection = NTask - exc;
+  int pivotcol        = tasklastsection * avg;
+#endif
+
+  double *flistin  = (double *)Mem.mymalloc("flistin", nimport * sizeof(double));
+  double *flistout = (double *)Mem.mymalloc("flistout", nexport * sizeof(double));
+
+  for(size_t i = 0; i < nimport; i++)
+    {
+      flistin[i] = 0;
+
+      int slab_x         = partin[i].IntPos[0] / INTCELL;
+      MyIntPosType rmd_x = partin[i].IntPos[0] % INTCELL;
+
+      int slab_y         = partin[i].IntPos[1] / INTCELL;
+      MyIntPosType rmd_y = partin[i].IntPos[1] % INTCELL;
+
+      int slab_z         = partin[i].IntPos[2] / INTCELL;
+      MyIntPosType rmd_z = partin[i].IntPos[2] % INTCELL;
+
+      double dx = rmd_x * (1.0 / INTCELL);
+      double dy = rmd_y * (1.0 / INTCELL);
+      double dz = rmd_z * (1.0 / INTCELL);
+
+      int slab_xx = slab_x + 1;
+      int slab_yy = slab_y + 1;
+      int slab_zz = slab_z + 1;
+
+      if(slab_xx >= GRIDX)
+        slab_xx = 0;
+      if(slab_yy >= GRIDY)
+        slab_yy = 0;
+      if(slab_zz >= GRIDZ)
+        slab_zz = 0;
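+      /* dx, dy, dz are the particle's fractional offsets within its cell; the weights used below, e.g.
+       * (1-dx)*(1-dy)*(1-dz) for the lower corner, form the standard CIC / trilinear interpolation kernel.
+       * Only the part of the 8-point stencil held by this task is accumulated here; the remaining
+       * contributions are added on the tasks owning the neighbouring slab/column and summed up after the
+       * exchange further below. */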
+
+#ifndef FFT_COLUMN_BASED
+      if(myplan.slab_to_task[slab_x] == ThisTask)
+        {
+          slab_x -= myplan.first_slab_x_of_task[ThisTask];
+
+          flistin[i] += grid[FI(slab_x, slab_y, slab_z)] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz) +
+                        grid[FI(slab_x, slab_y, slab_zz)] * (1.0 - dx) * (1.0 - dy) * (dz) +
+                        grid[FI(slab_x, slab_yy, slab_z)] * (1.0 - dx) * (dy) * (1.0 - dz) +
+                        grid[FI(slab_x, slab_yy, slab_zz)] * (1.0 - dx) * (dy) * (dz);
+        }
+
+      if(myplan.slab_to_task[slab_xx] == ThisTask)
+        {
+          slab_xx -= myplan.first_slab_x_of_task[ThisTask];
+
+          flistin[i] += grid[FI(slab_xx, slab_y, slab_z)] * (dx) * (1.0 - dy) * (1.0 - dz) +
+                        grid[FI(slab_xx, slab_y, slab_zz)] * (dx) * (1.0 - dy) * (dz) +
+                        grid[FI(slab_xx, slab_yy, slab_z)] * (dx) * (dy) * (1.0 - dz) +
+                        grid[FI(slab_xx, slab_yy, slab_zz)] * (dx) * (dy) * (dz);
+        }
+#else
+      int column0 = slab_x * GRIDY + slab_y;
+      int column1 = slab_x * GRIDY + slab_yy;
+      int column2 = slab_xx * GRIDY + slab_y;
+      int column3 = slab_xx * GRIDY + slab_yy;
+
+      if(column0 >= myplan.firstcol_XY && column0 <= myplan.lastcol_XY)
+        {
+          flistin[i] += grid[FC(column0, slab_z)] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz) +
+                        grid[FC(column0, slab_zz)] * (1.0 - dx) * (1.0 - dy) * (dz);
+        }
+      if(column1 >= myplan.firstcol_XY && column1 <= myplan.lastcol_XY)
+        {
+          flistin[i] +=
+              grid[FC(column1, slab_z)] * (1.0 - dx) * (dy) * (1.0 - dz) + grid[FC(column1, slab_zz)] * (1.0 - dx) * (dy) * (dz);
+        }
+
+      if(column2 >= myplan.firstcol_XY && column2 <= myplan.lastcol_XY)
+        {
+          flistin[i] +=
+              grid[FC(column2, slab_z)] * (dx) * (1.0 - dy) * (1.0 - dz) + grid[FC(column2, slab_zz)] * (dx) * (1.0 - dy) * (dz);
+        }
+
+      if(column3 >= myplan.firstcol_XY && column3 <= myplan.lastcol_XY)
+        {
+          flistin[i] += grid[FC(column3, slab_z)] * (dx) * (dy) * (1.0 - dz) + grid[FC(column3, slab_zz)] * (dx) * (dy) * (dz);
+        }
+#endif
+    }
+
+  /* exchange the potential component data */
+  int flag_big = 0, flag_big_all;
+  for(int i = 0; i < NTask; i++)
+    if(Sndpm_count[i] * sizeof(double) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
+      flag_big = 1;
+
+  /* Raise a flag if any of the send sizes is above our transfer limit; in that case we will
+   * transfer the data in chunks.
+   */
+  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
+
+  /* exchange  data */
+  myMPI_Alltoallv(flistin, Rcvpm_count, Rcvpm_offset, flistout, Sndpm_count, Sndpm_offset, sizeof(double), flag_big_all, Communicator);
+
+  {
+    size_t *send_count  = Sndpm_count;
+    size_t *send_offset = Sndpm_offset;
+
+    for(int j = 0; j < NTask; j++)
+      send_count[j] = 0;
+
+    for(int i = 0; i < Sp->NumPart; i++)
+      {
+        int slab_x  = Sp->P[i].IntPos[0] / INTCELL;
+        int slab_xx = slab_x + 1;
+
+        if(slab_xx >= GRIDX)
+          slab_xx = 0;
+
+#ifndef FFT_COLUMN_BASED
+        int task0 = myplan.slab_to_task[slab_x];
+        int task1 = myplan.slab_to_task[slab_xx];
+
+        double value = flistout[send_offset[task0] + send_count[task0]++];
+
+        if(task0 != task1)
+          value += flistout[send_offset[task1] + send_count[task1]++];
+#else
+        int slab_y  = Sp->P[i].IntPos[1] / INTCELL;
+        int slab_yy = slab_y + 1;
+
+        if(slab_yy >= GRIDY)
+          slab_yy = 0;
+
+        int column0 = slab_x * GRIDY + slab_y;
+        int column1 = slab_x * GRIDY + slab_yy;
+        int column2 = slab_xx * GRIDY + slab_y;
+        int column3 = slab_xx * GRIDY + slab_yy;
+
+        int task0, task1, task2, task3;
+
+        if(column0 < pivotcol)
+          task0 = column0 / avg;
+        else
+          task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;
+
+        if(column1 < pivotcol)
+          task1 = column1 / avg;
+        else
+          task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;
+
+        if(column2 < pivotcol)
+          task2 = column2 / avg;
+        else
+          task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;
+
+        if(column3 < pivotcol)
+          task3 = column3 / avg;
+        else
+          task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;
+
+        double value = flistout[send_offset[task0] + send_count[task0]++];
+
+        if(task1 != task0)
+          value += flistout[send_offset[task1] + send_count[task1]++];
+
+        if(task2 != task1 && task2 != task0)
+          value += flistout[send_offset[task2] + send_count[task2]++];
+
+        if(task3 != task0 && task3 != task1 && task3 != task2)
+          value += flistout[send_offset[task3] + send_count[task3]++];
+#endif
+
+        Pdisp[i].deltapos[axis] += pfac * value;
+        Sp->P[i].Vel[axis] += vfac * value;
+      }
+  }
+
+  Mem.myfree(flistout);
+  Mem.myfree(flistin);
+}
+
+void ngenic::ngenic_initialize_ffts(void)
+{
+#ifdef LONG_X
+  if(LONG_X != (int)(LONG_X))
+    Terminate("LONG_X must be an integer if used with PMGRID");
+#endif
+
+#ifdef LONG_Y
+  if(LONG_Y != (int)(LONG_Y))
+    Terminate("LONG_Y must be an integer if used with PMGRID");
+#endif
+
+#ifdef LONG_Z
+  if(LONG_Z != (int)(LONG_Z))
+    Terminate("LONG_Z must be an integer if used with PMGRID");
+#endif
+
+  /* Set up the FFTW-3 plans. */
+  int ndimx[1] = {GRIDX}; /* dimension of the 1D transforms */
+  int ndimy[1] = {GRIDY}; /* dimension of the 1D transforms */
+  int ndimz[1] = {GRIDZ}; /* dimension of the 1D transforms */
+
+  int max_GRID2 = 2 * (std::max<int>(std::max<int>(GRIDX, GRIDY), GRIDZ) / 2 + 1);
+
+  /* temporarily allocate some arrays to make sure that out-of-place plans are created */
+  fft_real *DispGrid           = (fft_real *)Mem.mymalloc("DispGrid", max_GRID2 * sizeof(fft_real));
+  fft_complex *fft_of_DispGrid = (fft_complex *)Mem.mymalloc("fft_of_DispGrid", max_GRID2 * sizeof(fft_real));
+
+#ifdef DOUBLEPRECISION_FFTW
+  int alignflag = 0;
+#else
+  /* for single precision, the start of our FFT columns is presently only guaranteed to be 8-byte aligned */
+  int alignflag = FFTW_UNALIGNED;
+#endif
+
+#ifndef FFT_COLUMN_BASED
+  int stride = GRIDz;
+#else
+  int stride    = 1;
+#endif
+
+  myplan.backward_plan_xdir =
+      FFTW(plan_many_dft)(1, ndimx, 1, (fft_complex *)DispGrid, 0, stride, GRIDz * GRIDX, fft_of_DispGrid, 0, stride, GRIDz * GRIDX,
+                          FFTW_BACKWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
+
+  myplan.backward_plan_ydir =
+      FFTW(plan_many_dft)(1, ndimy, 1, (fft_complex *)DispGrid, 0, stride, GRIDz * GRIDY, fft_of_DispGrid, 0, stride, GRIDz * GRIDY,
+                          FFTW_BACKWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
+
+  myplan.backward_plan_zdir = FFTW(plan_many_dft_c2r)(1, ndimz, 1, (fft_complex *)DispGrid, 0, 1, GRIDz, (fft_real *)fft_of_DispGrid,
+                                                      0, 1, GRID2, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
+
+  myplan.forward_plan_xdir = FFTW(plan_many_dft)(1, ndimx, 1, (fft_complex *)DispGrid, 0, stride, GRIDz * GRIDX, fft_of_DispGrid, 0,
+                                                 stride, GRIDz * GRIDX, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
+
+  myplan.forward_plan_ydir = FFTW(plan_many_dft)(1, ndimy, 1, (fft_complex *)DispGrid, 0, stride, GRIDz * GRIDY, fft_of_DispGrid, 0,
+                                                 stride, GRIDz * GRIDY, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
+
+  myplan.forward_plan_zdir = FFTW(plan_many_dft_r2c)(1, ndimz, 1, DispGrid, 0, 1, GRID2, (fft_complex *)fft_of_DispGrid, 0, 1, GRIDz,
+                                                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
+
+  Mem.myfree(fft_of_DispGrid);
+  Mem.myfree(DispGrid);
+
+#ifndef FFT_COLUMN_BASED
+
+  my_slab_based_fft_init(&myplan, GRIDX, GRIDY, GRIDZ);
+
+  maxfftsize = std::max<int>(myplan.largest_x_slab * GRIDY, myplan.largest_y_slab * GRIDX) * ((size_t)GRID2);
+
+#else
+
+  my_column_based_fft_init(&myplan, GRIDX, GRIDY, GRIDZ);
+
+  maxfftsize = myplan.max_datasize;
+
+#endif
+}
+
+void ngenic::print_spec(void)
+{
+  if(ThisTask == 0)
+    {
+      char buf[3 * MAXLEN_PATH];
+      sprintf(buf, "%s/inputspec_%s.txt", All.OutputDir, All.SnapshotFileBase);
+
+      FILE *fd = fopen(buf, "w");
+
+      double gf = ngenic_growth_factor(0.001, 1.0) / (1.0 / 0.001);
+
+      double DDD = ngenic_growth_factor(All.cf_atime, 1.0);
+
+      fprintf(fd, "%12g %12g\n", All.cf_redshift, DDD); /* print actual starting redshift and
+                                                           linear growth factor for this cosmology */
+
+      double kstart = 2 * M_PI / (1000.0 * (1e6 * PARSEC / All.UnitLength_in_cm));  /* 1000 Mpc/h */
+      double kend   = 2 * M_PI / (0.001 * (1e6 * PARSEC / All.UnitLength_in_cm));  /* 0.001 Mpc/h */
+
+      for(double k = kstart; k < kend; k *= 1.025)
+        {
+          double po = ngenic_power_spec(k);
+          double dl = 4.0 * M_PI * k * k * k * po;
+
+          double kf = 0.5;
+
+          double po2 = ngenic_power_spec(1.001 * k * kf);
+          double po1 = ngenic_power_spec(k * kf);
+          double dnl = 0, knl = 0;
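+          /* dl = 4*pi*k^3*P(k) is the dimensionless linear power Delta^2_L in the normalization used by this
+           * code. Following the Peacock & Dodds fitting procedure, the effective slope neff is measured at
+           * half the wavenumber (kf = 0.5), the coefficients A, B, alpha, beta, V below are their fits in
+           * (1 + neff/3), and fnl() maps Delta^2_L to the nonlinear Delta^2_NL, with knl the corresponding
+           * nonlinear wavenumber. */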
+
+          if(po != 0 && po1 != 0 && po2 != 0)
+            {
+              double neff = (log(po2) - log(po1)) / (log(1.001 * k * kf) - log(k * kf));
+
+              if(1 + neff / 3 > 0)
+                {
+                  double A     = 0.482 * pow(1 + neff / 3, -0.947);
+                  double B     = 0.226 * pow(1 + neff / 3, -1.778);
+                  double alpha = 3.310 * pow(1 + neff / 3, -0.244);
+                  double beta  = 0.862 * pow(1 + neff / 3, -0.287);
+                  double V     = 11.55 * pow(1 + neff / 3, -0.423) * 1.2;
+
+                  dnl = fnl(dl, A, B, alpha, beta, V, gf);
+                  knl = k * pow(1 + dnl, 1.0 / 3);
+                }
+            }
+
+          fprintf(fd, "%12g %12g    %12g %12g\n", k, dl, knl, dnl);
+        }
+      fclose(fd);
+    }
+}
+
+#endif
diff --git a/src/ngenic/ngenic.h b/src/ngenic/ngenic.h
new file mode 100644
index 0000000000000000000000000000000000000000..81671c963c9a832f7f54462f2930a73039357be5
--- /dev/null
+++ b/src/ngenic/ngenic.h
@@ -0,0 +1,152 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  ngenic.h
+ *
+ *  \brief definition of a class for the construction of cosmological initial conditions
+ */
+
+#ifndef NGENIC_H
+#define NGENIC_H
+
+#ifdef NGENIC
+
+#ifndef PERIODIC
+#error NGENIC requires PERIODIC
+#endif
+
+#include <fftw3.h>
+
+#ifdef DOUBLEPRECISION_FFTW
+typedef double fft_real;
+typedef fftw_complex fft_complex;
+#else
+typedef float fft_real;
+typedef fftwf_complex fft_complex;
+#endif
+
+#include "../data/simparticles.h"
+#include "../pm/pm_mpi_fft.h"
+
+class ngenic : public pm_mpi_fft
+{
+ private:
+  simparticles *Sp;
+
+ public:
+  ngenic(MPI_Comm comm, simparticles *Sp_ptr) : setcomm(comm), pm_mpi_fft(comm) /* constructor */ { Sp = Sp_ptr; }
+
+ public:
+  void ngenic_displace_particles(void);
+
+  void create_grid(void);
+
+ private:
+  double ngenic_power_spec(double k);
+  double ngenic_f1_omega(double a);
+  double ngenic_f2_omega(double a);
+  double ngenic_growth_factor(double astart, double aend);
+  void ngenic_initialize_powerspectrum(void);
+  void free_power_table(void);
+
+  double Dplus;
+
+  unsigned int *seedtable;
+
+  fft_plan myplan;
+  size_t maxfftsize;
+
+  struct partbuf
+  {
+    MyIntPosType IntPos[3];
+  };
+  partbuf *partin, *partout;
+
+  size_t nimport, nexport;
+
+  size_t *Sndpm_count, *Sndpm_offset;
+  size_t *Rcvpm_count, *Rcvpm_offset;
+
+  gsl_rng *rnd_generator;
+  gsl_rng *rnd_generator_conjugate;
+
+  struct disp_data
+  {
+    fft_real deltapos[3];
+  };
+
+  disp_data *Pdisp;
+
+  void ngenic_distribute_particles();
+  void ngenic_setup_modes_in_kspace(fft_complex *fft_of_grid);
+  void ngenic_readout_disp(fft_real *grid, int axis, double pfac, double vfac);
+  void ngenic_initialize_ffts(void);
+  void ngenic_get_derivate_from_fourier_field(int axes1, int axes2, fft_complex *fft_of_grid);
+  void ngenic_compute_transform_of_source_potential(fft_real *pot);
+  void print_spec(void);
+
+  double R8;
+
+  double AA, BB, CC;
+  double nu;
+  double Norm;
+
+  int NPowerTable;
+
+  struct pow_table
+  {
+    double logk, logD;
+    bool operator<(const pow_table &other) const { return logk < other.logk; }
+  };
+  pow_table *PowerTable;
+
+  double ngenic_powerspec_tabulated(double k);
+  double ngenic_powerspec_efstathiou(double k);
+  double ngenic_powerspec_eh(double k);
+  double ngenic_tophat_sigma2(double R);
+  double ngenic_tk_eh(double k);
+  double ngenic_growth(double a);
+  void read_power_table(void);
+
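+  /* Integrand of the linear growing-mode integral: since H(a)^2 / H0^2 = Omega0/a^3 +
+   * (1 - Omega0 - OmegaLambda)/a^2 + OmegaLambda, one has
+   * 1/(a*H/H0)^3 = [a / (Omega0 + (1 - Omega0 - OmegaLambda)*a + OmegaLambda*a^3)]^(3/2),
+   * so D(a) follows (up to normalization) as H(a) times the integral of this expression from 0 to a,
+   * see ngenic_growth(). */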
+  static double ngenic_growth_int(double a, void *param)
+  {
+    return pow(a / (All.Omega0 + (1 - All.Omega0 - All.OmegaLambda) * a + All.OmegaLambda * a * a * a), 1.5);
+  }
+
+  double fnl(double x, double A, double B, double alpha, double beta, double V, double gf) /* Peacock & Dodds formula */
+  {
+    return x * pow((1 + B * beta * x + pow(A * x, alpha * beta)) / (1 + pow(pow(A * x, alpha) * gf * gf * gf / (V * sqrt(x)), beta)),
+                   1 / beta);
+  }
+
+  struct myparams
+  {
+    double R;
+    ngenic *obj;
+  };
+
+  static double sigma2_int(double lnk, void *param)
+  {
+    myparams *par   = (myparams *)param;
+    double r_tophat = par->R;
+
+    double k   = exp(lnk);
+    double kr  = r_tophat * k;
+    double kr2 = kr * kr;
+    double kr3 = kr2 * kr;
+
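+    /* w below is the Fourier transform of a real-space top-hat of radius R, W(kR) = 3*(sin(kR) - kR*cos(kR)) / (kR)^3,
+     * and the returned k*x is the integrand of sigma^2(R) = Integral d(ln k) 4*pi*k^3 P(k) W^2(kR) in the
+     * power spectrum normalization of this code (the integration variable being ln k). */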
+    if(kr < 1e-8)
+      return 0;
+
+    double w = 3 * (sin(kr) / kr3 - cos(kr) / kr2);
+    double x = 4 * M_PI * k * k * w * w * par->obj->ngenic_power_spec(k);
+
+    return k * x;
+  }
+};
+
+#endif
+#endif
diff --git a/src/ngenic/power.cc b/src/ngenic/power.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5062b0aece56a438b6f4d551313a3e7decae281f
--- /dev/null
+++ b/src/ngenic/power.cc
@@ -0,0 +1,341 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  power.cc
+ *
+ *  \brief auxiliary routines for computing the linear power spectrum for the ICs
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef NGENIC
+
+#include <gsl/gsl_integration.h>
+#include <gsl/gsl_rng.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdlib.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../ngenic/ngenic.h"
+#include "../pm/pm_mpi_fft.h"
+#include "../system/system.h"
+
+double ngenic::ngenic_power_spec(double k)
+{
+  double power = 0;
+
+#if defined(MULTICOMPONENTGLASSFILE) && defined(DIFFERENT_TRANSFER_FUNC)
+  if(Type == 1)
+#endif
+    switch(All.PowerSpectrumType)
+      {
+        case 1:
+          power = ngenic_powerspec_eh(k);
+          break;
+
+        case 2:
+          power = ngenic_powerspec_tabulated(k);
+          break;
+
+        default:
+          power = ngenic_powerspec_efstathiou(k);
+          break;
+      }
+
+#if defined(MULTICOMPONENTGLASSFILE) && defined(DIFFERENT_TRANSFER_FUNC)
+  if(Type == 2)
+    {
+      power = PowerSpec_DM_2ndSpecies(k);
+    }
+#endif
+
+  power *= pow(k, All.PrimordialIndex - 1.0);
+
+  return power;
+}
+
+void ngenic::free_power_table(void) { Mem.myfree(PowerTable); }
+
+void ngenic::read_power_table(void)
+{
+  FILE *fd;
+  char buf[MAXLEN_PATH];
+  double k, p;
+
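+  /* Judging from ngenic_powerspec_tabulated(), each row of the file is expected to contain two columns,
+   * log10(k) and log10(Delta^2(k)), with k in the units set by InputSpectrum_UnitLength_in_cm. */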
+  sprintf(buf, "%s", All.PowerSpectrumFile);
+
+  if(!(fd = fopen(buf, "r")))
+    {
+      Terminate("can't read input spectrum in file '%s' on task %d\n", buf, ThisTask);
+    }
+
+  NPowerTable = 0;
+  do
+    {
+      if(fscanf(fd, " %lg %lg ", &k, &p) == 2)
+        NPowerTable++;
+      else
+        break;
+    }
+  while(1);
+
+  fclose(fd);
+
+  mpi_printf("found %d rows in input spectrum table\n", NPowerTable);
+
+  PowerTable = (pow_table *)Mem.mymalloc("PowerTable", NPowerTable * sizeof(pow_table));
+
+  sprintf(buf, "%s", All.PowerSpectrumFile);
+
+  if(!(fd = fopen(buf, "r")))
+    {
+      Terminate("can't read input spectrum in file '%s' on task %d\n", buf, ThisTask);
+    }
+
+  NPowerTable = 0;
+  do
+    {
+      double p;
+
+      if(fscanf(fd, " %lg %lg ", &k, &p) == 2)
+        {
+          PowerTable[NPowerTable].logk = k;
+          PowerTable[NPowerTable].logD = p;
+          NPowerTable++;
+        }
+      else
+        break;
+    }
+  while(1);
+
+  fclose(fd);
+
+  std::sort(PowerTable, PowerTable + NPowerTable);
+}
+
+void ngenic::ngenic_initialize_powerspectrum(void)
+{
+  double res;
+
+  AA = 6.4 / All.ShapeGamma * (3.085678e24 / All.UnitLength_in_cm);
+  BB = 3.0 / All.ShapeGamma * (3.085678e24 / All.UnitLength_in_cm);
+  CC = 1.7 / All.ShapeGamma * (3.085678e24 / All.UnitLength_in_cm);
+  nu = 1.13;
+
+  R8 = 8 * (3.085678e24 / All.UnitLength_in_cm); /* 8 Mpc/h */
+
+  if(All.PowerSpectrumType == 2)
+    read_power_table();
+
+  if(All.ReNormalizeInputSpectrum == 0 && All.PowerSpectrumType == 2)
+    {
+      Norm = 1.0;
+      /* tabulated file is already at the initial redshift */
+      Dplus = 1.0;
+    }
+  else
+    {
+#ifdef DIFFERENT_TRANSFER_FUNC
+      Type = 1;
+#endif
+      Norm = 1.0;
+      res  = ngenic_tophat_sigma2(R8);
+
+      if(ThisTask == 0 && All.PowerSpectrumType == 2)
+        printf("\nNormalization of spectrum in file:  Sigma8 = %g\n", sqrt(res));
+
+      Norm = All.Sigma8 * All.Sigma8 / res;
+
+      if(ThisTask == 0 && All.PowerSpectrumType == 2)
+        printf("Normalization adjusted to  Sigma8=%g   (Normfac=%g)\n\n", All.Sigma8, Norm);
+
+      Dplus = ngenic_growth_factor(All.cf_atime, 1.0);
+    }
+  mpi_printf("NGENIC: Dplus=%g\n", Dplus);
+}
+
+double ngenic::ngenic_powerspec_tabulated(double k)
+{
+  double kold = k;
+
+  k *= (All.InputSpectrum_UnitLength_in_cm / All.UnitLength_in_cm);  // convert to h/Mpc
+
+  double logk = log10(k);
+
+  if(logk < PowerTable[0].logk || logk > PowerTable[NPowerTable - 1].logk)
+    return 0;
+
+  int binlow  = 0;
+  int binhigh = NPowerTable - 1;
+
+  while(binhigh - binlow > 1)
+    {
+      int binmid = (binhigh + binlow) / 2;
+      if(logk < PowerTable[binmid].logk)
+        binhigh = binmid;
+      else
+        binlow = binmid;
+    }
+
+  double dlogk = PowerTable[binhigh].logk - PowerTable[binlow].logk;
+
+  if(dlogk == 0)
+    Terminate("dlogk == 0");
+
+  double u = (logk - PowerTable[binlow].logk) / dlogk;
+
+  double logD = (1 - u) * PowerTable[binlow].logD + u * PowerTable[binhigh].logD;
+
+  double Delta2 = pow(10.0, logD);
+
+  double P = Norm * Delta2 / (4 * M_PI * kold * kold * kold);
+
+  return P;
+}
+
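+/* CDM fitting function parameterized by the shape parameter Gamma = All.ShapeGamma (the form commonly
+ * attributed to Efstathiou, Bond & White), with the coefficients AA, BB, CC and the index nu set in
+ * ngenic_initialize_powerspectrum(). */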
+double ngenic::ngenic_powerspec_efstathiou(double k)
+{
+  return Norm * k / pow(1 + pow(AA * k + pow(BB * k, 1.5) + CC * CC * k * k, nu), 2 / nu);
+}
+
+double ngenic::ngenic_powerspec_eh(double k) /* Eisenstein & Hu */ { return Norm * k * pow(ngenic_tk_eh(k), 2); }
+
+double ngenic::ngenic_tk_eh(double k) /* from Martin White */
+{
+  double q, theta, ommh2, a, s, gamma, L0, C0;
+  double tmp;
+  double omegam, ombh2;
+
+  /* other input parameters */
+
+  omegam = All.Omega0;
+  ombh2  = All.OmegaBaryon * All.HubbleParam * All.HubbleParam;
+
+  if(All.OmegaBaryon == 0)  // TODO AB: remove this???
+    ombh2 = 0.04 * All.HubbleParam * All.HubbleParam;
+
+  k *= (3.085678e24 / All.UnitLength_in_cm); /* convert to h/Mpc */
+
+  theta = 2.728 / 2.7;
+  ommh2 = omegam * All.HubbleParam * All.HubbleParam;
+  s     = 44.5 * log(9.83 / ommh2) / sqrt(1. + 10. * exp(0.75 * log(ombh2))) * All.HubbleParam;
+  a     = 1. - 0.328 * log(431. * ommh2) * ombh2 / ommh2 + 0.380 * log(22.3 * ommh2) * (ombh2 / ommh2) * (ombh2 / ommh2);
+  gamma = a + (1. - a) / (1. + exp(4 * log(0.43 * k * s)));
+  gamma *= omegam * All.HubbleParam;
+  q   = k * theta * theta / gamma;
+  L0  = log(2. * exp(1.) + 1.8 * q);
+  C0  = 14.2 + 731. / (1. + 62.5 * q);
+  tmp = L0 / (L0 + C0 * q * q);
+  return (tmp);
+}
+
+double ngenic::ngenic_tophat_sigma2(double R)
+{
+  const int worksize = 1000000;
+
+  double result, abserr, kmin, kmax;
+  gsl_function F;
+
+  myparams par = {R, this};
+
+  gsl_integration_workspace *workspace = gsl_integration_workspace_alloc(worksize);
+  F.function                           = &sigma2_int;
+  F.params                             = &par;
+
+  if(All.PowerSpectrumType == 2)
+    {
+      kmin = pow(10.0, PowerTable[0].logk) * (All.UnitLength_in_cm / All.InputSpectrum_UnitLength_in_cm);
+      kmax = pow(10.0, PowerTable[NPowerTable - 1].logk) * (All.UnitLength_in_cm / All.InputSpectrum_UnitLength_in_cm);
+    }
+  else
+    {
+      kmin = 1.0e-15 / R;
+      kmax = 1.0e3 / R;
+    }
+
+  if(All.PowerSpectrumType == 2)
+    {
+      /* because of the oscillatory behaviour of the integrand, the gsl_integration_qag() has trouble with its error estimates
+       * when the function is piece-wise interpolated. That's why we integrate the tabulated function segment by segment.
+       */
+
+      /* first get a rough result with up to 10% relative error */
+      gsl_integration_qag(&F, log(kmin), log(kmax), 0, 0.1, worksize, GSL_INTEG_GAUSS15, workspace, &result, &abserr);
+
+      /* now set a low absolute error bound for each segment */
+      double errbound = 1.0e-8 / NPowerTable * result;
+      result          = 0.0;
+
+      for(int i = 0; i < NPowerTable - 2; i++)
+        {
+          double k0 = pow(10.0, PowerTable[i].logk) * (All.UnitLength_in_cm / All.InputSpectrum_UnitLength_in_cm);
+          double k1 = pow(10.0, PowerTable[i + 1].logk) * (All.UnitLength_in_cm / All.InputSpectrum_UnitLength_in_cm);
+          double x;
+
+          gsl_integration_qag(&F, log(k0), log(k1), errbound, 0, worksize, GSL_INTEG_GAUSS15, workspace, &x, &abserr);
+
+          result += x;
+        }
+    }
+  else
+    {
+      /* for the smooth analytic function, we integrate directly with a relative error estimate */
+      gsl_integration_qag(&F, log(kmin), log(kmax), 0, 1.0e-8, worksize, GSL_INTEG_GAUSS15, workspace, &result, &abserr);
+    }
+
+  gsl_integration_workspace_free(workspace);
+
+  return result;
+}
+
+double ngenic::ngenic_growth_factor(double astart, double aend) { return ngenic_growth(aend) / ngenic_growth(astart); }
+
+double ngenic::ngenic_growth(double a)
+{
+  double hubble_a;
+
+  hubble_a = sqrt(All.Omega0 / (a * a * a) + (1 - All.Omega0 - All.OmegaLambda) / (a * a) + All.OmegaLambda);
+
+  const int worksize = 100000;
+
+  double result, abserr;
+  gsl_function F;
+
+  gsl_integration_workspace *workspace = gsl_integration_workspace_alloc(worksize);
+  F.function                           = &ngenic_growth_int;
+
+  gsl_integration_qag(&F, 0, a, 0, 1.0e-8, worksize, GSL_INTEG_GAUSS41, workspace, &result, &abserr);
+
+  gsl_integration_workspace_free(workspace);
+
+  return hubble_a * result;
+}
+
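+/* f1(a) = dlnD1/dlna and f2(a) = dlnD2/dlna are approximated by the widely used fits Omega^0.6 and
+ * 2*Omega^(4/7); both are exact for Omega_m = 1 (where D1 ~ a and D2 ~ a^2) and remain good approximations
+ * for the cosmologies of interest here. */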
+double ngenic::ngenic_f1_omega(double a)
+{
+  double omega_a;
+
+  omega_a = All.Omega0 / (All.Omega0 + a * (1 - All.Omega0 - All.OmegaLambda) + a * a * a * All.OmegaLambda);
+
+  return pow(omega_a, 0.6);
+}
+
+double ngenic::ngenic_f2_omega(double a)
+{
+  double omega_a;
+
+  omega_a = All.Omega0 / (All.Omega0 + a * (1 - All.Omega0 - All.OmegaLambda) + a * a * a * All.OmegaLambda);
+
+  return 2 * pow(omega_a, 4.0 / 7);
+}
+
+#endif
diff --git a/src/pm/pm.h b/src/pm/pm.h
new file mode 100644
index 0000000000000000000000000000000000000000..91cc56ef2cfb330fe1641b6e311dd9fbf309f2f1
--- /dev/null
+++ b/src/pm/pm.h
@@ -0,0 +1,41 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  pm.h
+ *
+ *  \brief definition of a class to bundle the PM-force calculation algorithms
+ */
+
+#ifndef PM_H
+#define PM_H
+
+#if defined(PMGRID) || defined(NGENIC)
+
+#include <fftw3.h>
+
+typedef ptrdiff_t fft_ptrdiff_t;
+
+#ifdef DOUBLEPRECISION_FFTW
+typedef double fft_real;
+typedef fftw_complex fft_complex;
+#else
+typedef float fft_real;
+typedef fftwf_complex fft_complex;
+#endif
+
+#include "../mpi_utils/setcomm.h"
+#include "../pm/pm_nonperiodic.h"
+#include "../pm/pm_periodic.h"
+
+class pm : public pm_periodic, public pm_nonperiodic
+{
+ public:
+  pm(MPI_Comm comm) : setcomm(comm), pm_periodic(comm), pm_nonperiodic(comm) {}
+};
+
+#endif
+
+#endif
diff --git a/src/pm/pm_mpi_fft.cc b/src/pm/pm_mpi_fft.cc
new file mode 100644
index 0000000000000000000000000000000000000000..767a901e0cb2bbcf7c9a4b4a755fbc8a2aef2a6c
--- /dev/null
+++ b/src/pm/pm_mpi_fft.cc
@@ -0,0 +1,1208 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  pm_mpi_fft.cc
+ *
+ *  \brief code for doing different variants of parallel FFT transforms
+ */
+
+#include "gadgetconfig.h"
+
+#if defined(PMGRID) || defined(NGENIC)
+
+#include <fftw3.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <algorithm>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../pm/pm.h"
+#include "../system/system.h"
+
+/* We only use the one-dimensional FFTW3 routines, because the MPI versions of FFTW3
+ * allocate memory for themselves during the transforms (which we want to strictly avoid),
+ * and because we want to allow transforms that are so big that more than 2GB may be
+ * transferred between processors.
+ */
+
+#ifndef FFT_COLUMN_BASED
+
+void pm_mpi_fft::my_slab_based_fft_init(fft_plan *plan, int NgridX, int NgridY, int NgridZ)
+{
+  subdivide_evenly(NgridX, NTask, ThisTask, &plan->slabstart_x, &plan->nslab_x);
+  subdivide_evenly(NgridY, NTask, ThisTask, &plan->slabstart_y, &plan->nslab_y);
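+  /* subdivide_evenly() assigns this task a contiguous range of x-slabs (slabstart_x, nslab_x) used for the
+   * real-space layout, and a range of y-slabs (slabstart_y, nslab_y) used for the transposed layout in
+   * Fourier space, distributing any remainder slabs as evenly as possible across the tasks. */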
+
+  plan->slab_to_task = (int *)Mem.mymalloc_movable(&plan->slab_to_task, "slab_to_task", NgridX * sizeof(int));
+
+  for(int task = 0; task < NTask; task++)
+    {
+      int start, n;
+
+      subdivide_evenly(NgridX, NTask, task, &start, &n);
+
+      for(int i = start; i < start + n; i++)
+        plan->slab_to_task[i] = task;
+    }
+
+  MPI_Allreduce(&plan->nslab_x, &plan->largest_x_slab, 1, MPI_INT, MPI_MAX, Communicator);
+  MPI_Allreduce(&plan->nslab_y, &plan->largest_y_slab, 1, MPI_INT, MPI_MAX, Communicator);
+
+  plan->slabs_x_per_task = (int *)Mem.mymalloc_movable(&plan->slabs_x_per_task, "slabs_x_per_task", NTask * sizeof(int));
+  MPI_Allgather(&plan->nslab_x, 1, MPI_INT, plan->slabs_x_per_task, 1, MPI_INT, Communicator);
+
+  plan->first_slab_x_of_task = (int *)Mem.mymalloc_movable(&plan->first_slab_x_of_task, "first_slab_x_of_task", NTask * sizeof(int));
+  MPI_Allgather(&plan->slabstart_x, 1, MPI_INT, plan->first_slab_x_of_task, 1, MPI_INT, Communicator);
+
+  plan->slabs_y_per_task = (int *)Mem.mymalloc_movable(&plan->slabs_y_per_task, "slabs_y_per_task", NTask * sizeof(int));
+  MPI_Allgather(&plan->nslab_y, 1, MPI_INT, plan->slabs_y_per_task, 1, MPI_INT, Communicator);
+
+  plan->first_slab_y_of_task = (int *)Mem.mymalloc_movable(&plan->first_slab_y_of_task, "first_slab_y_of_task", NTask * sizeof(int));
+  MPI_Allgather(&plan->slabstart_y, 1, MPI_INT, plan->first_slab_y_of_task, 1, MPI_INT, Communicator);
+
+  plan->NgridX = NgridX;
+  plan->NgridY = NgridY;
+  plan->NgridZ = NgridZ;
+
+  int Ngridz = NgridZ / 2 + 1; /* dimension needed in complex space */
+
+  plan->Ngridz = Ngridz;
+  plan->Ngrid2 = 2 * Ngridz;
+}
+
+void pm_mpi_fft::my_slab_based_fft_free(fft_plan *plan)
+{
+  Mem.myfree(plan->first_slab_y_of_task);
+  Mem.myfree(plan->slabs_y_per_task);
+  Mem.myfree(plan->first_slab_x_of_task);
+  Mem.myfree(plan->slabs_x_per_task);
+  Mem.myfree(plan->slab_to_task);
+}
+
+/*! \brief Transposes the array field
+ *
+ * The array field is transposed such that the data in the x direction is local to only one task.
+ * This is done so that the force in the x-direction can be obtained by finite differencing.
+ * However, the array is not fully transposed, i.e. the x-direction is not the fastest
+ * running array index.
+ *
+ * \param plan    the FFT plan describing the slab decomposition
+ * \param field   the array to transpose
+ * \param scratch scratch space used during communication (same size as field)
+ */
+void pm_mpi_fft::my_slab_transposeA(fft_plan *plan, fft_real *field, fft_real *scratch)
+{
+  int n, prod, task, flag_big = 0, flag_big_all = 0;
+
+  prod = NTask * plan->nslab_x;
+
+  for(n = 0; n < prod; n++)
+    {
+      int x    = n / NTask;
+      int task = n % NTask;
+
+      int y;
+
+      for(y = plan->first_slab_y_of_task[task]; y < plan->first_slab_y_of_task[task] + plan->slabs_y_per_task[task]; y++)
+        memcpy(scratch + ((size_t)plan->NgridZ) * (plan->first_slab_y_of_task[task] * plan->nslab_x +
+                                                   x * plan->slabs_y_per_task[task] + (y - plan->first_slab_y_of_task[task])),
+               field + ((size_t)plan->Ngrid2) * (plan->NgridY * x + y), plan->NgridZ * sizeof(fft_real));
+    }
+
+  size_t *scount = (size_t *)Mem.mymalloc("scount", NTask * sizeof(size_t));
+  size_t *rcount = (size_t *)Mem.mymalloc("rcount", NTask * sizeof(size_t));
+  size_t *soff   = (size_t *)Mem.mymalloc("soff", NTask * sizeof(size_t));
+  size_t *roff   = (size_t *)Mem.mymalloc("roff", NTask * sizeof(size_t));
+
+  for(task = 0; task < NTask; task++)
+    {
+      scount[task] = plan->nslab_x * plan->slabs_y_per_task[task] * (plan->NgridZ * sizeof(fft_real));
+      rcount[task] = plan->nslab_y * plan->slabs_x_per_task[task] * (plan->NgridZ * sizeof(fft_real));
+
+      soff[task] = plan->first_slab_y_of_task[task] * plan->nslab_x * (plan->NgridZ * sizeof(fft_real));
+      roff[task] = plan->first_slab_x_of_task[task] * plan->nslab_y * (plan->NgridZ * sizeof(fft_real));
+
+      if(scount[task] > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
+        flag_big = 1;
+    }
+
+  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
+
+  myMPI_Alltoallv(scratch, scount, soff, field, rcount, roff, 1, flag_big_all, Communicator);
+
+  Mem.myfree(roff);
+  Mem.myfree(soff);
+  Mem.myfree(rcount);
+  Mem.myfree(scount);
+}
+
+/*! \brief Undoes the transposition of the array field
+ *
+ * The transposition of the array field is undone such that the data in the
+ * x-direction is distributed among all tasks again. Thus the result of the
+ * force computation in the x-direction is sent back to the original task.
+ *
+ * \param plan    the FFT plan holding the slab decomposition
+ * \param field   the array to transpose
+ * \param scratch scratch space used during communication (same size as field)
+ */
+void pm_mpi_fft::my_slab_transposeB(fft_plan *plan, fft_real *field, fft_real *scratch)
+{
+  int n, prod, task, flag_big = 0, flag_big_all = 0;
+
+  size_t *scount = (size_t *)Mem.mymalloc("scount", NTask * sizeof(size_t));
+  size_t *rcount = (size_t *)Mem.mymalloc("rcount", NTask * sizeof(size_t));
+  size_t *soff   = (size_t *)Mem.mymalloc("soff", NTask * sizeof(size_t));
+  size_t *roff   = (size_t *)Mem.mymalloc("roff", NTask * sizeof(size_t));
+
+  for(task = 0; task < NTask; task++)
+    {
+      rcount[task] = plan->nslab_x * plan->slabs_y_per_task[task] * (plan->NgridZ * sizeof(fft_real));
+      scount[task] = plan->nslab_y * plan->slabs_x_per_task[task] * (plan->NgridZ * sizeof(fft_real));
+
+      roff[task] = plan->first_slab_y_of_task[task] * plan->nslab_x * (plan->NgridZ * sizeof(fft_real));
+      soff[task] = plan->first_slab_x_of_task[task] * plan->nslab_y * (plan->NgridZ * sizeof(fft_real));
+
+      if(scount[task] > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
+        flag_big = 1;
+    }
+
+  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
+
+  myMPI_Alltoallv(field, scount, soff, scratch, rcount, roff, 1, flag_big_all, Communicator);
+
+  Mem.myfree(roff);
+  Mem.myfree(soff);
+  Mem.myfree(rcount);
+  Mem.myfree(scount);
+
+  prod = NTask * plan->nslab_x;
+
+  for(n = 0; n < prod; n++)
+    {
+      int x    = n / NTask;
+      int task = n % NTask;
+
+      int y;
+      for(y = plan->first_slab_y_of_task[task]; y < plan->first_slab_y_of_task[task] + plan->slabs_y_per_task[task]; y++)
+        memcpy(field + ((size_t)plan->Ngrid2) * (plan->NgridY * x + y),
+               scratch + ((size_t)plan->NgridZ) * (plan->first_slab_y_of_task[task] * plan->nslab_x +
+                                                   x * plan->slabs_y_per_task[task] + (y - plan->first_slab_y_of_task[task])),
+               plan->NgridZ * sizeof(fft_real));
+    }
+}
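+
+/* Usage sketch (illustrative only, not one of the original call sites): a caller that wants to
+ * difference the potential along the x-direction would typically do
+ *
+ *   my_slab_transposeA(&plan, field, scratch);   // make the x-direction local to each task
+ *   ... finite differencing along x on the transposed layout ...
+ *   my_slab_transposeB(&plan, field, scratch);   // redistribute the result again
+ *
+ * where 'field' and 'scratch' are equally sized, slab-decomposed buffers.
+ */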
+
+/* Given a slab-decomposed 3D field a[...] with total dimension [nx x ny x nz], whose first dimension is
+ * split across the processors, this routine outputs in b[] the transpose where then the second dimension is split
+ * across the processors. sx[] gives for each MPI task how many slabs it has, and firstx[] is the first
+ * slab for a given task. Likewise, sy[]/firsty[] gives the same thing for the transposed order. Note, the
+ * contents of the array a[] will be destroyed by the routine.
+ *
+ * An element (x,y,z) is accessed in a[] with index [([x - firstx] * ny + y) * nz + z]
+ * and in b[] as [((y - firsty) * nx + x) * nz + z]
+ *
+ * if mode = 1, the reverse operation is carried out.
+ */
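+
+/* Concrete illustration of the indexing above: for nx = ny = nz = 4, the task whose first
+ * x-slab is 2 (firstx[ThisTask] = 2) stores element (x,y,z) = (2,1,3) in a[] at index
+ * ((2 - 2) * 4 + 1) * 4 + 3 = 7, while after the transpose the task with firsty[ThisTask] = 1
+ * finds it in b[] at index ((1 - 1) * 4 + 2) * 4 + 3 = 11.
+ */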
+void pm_mpi_fft::my_slab_transpose(void *av, void *bv, int *sx, int *firstx, int *sy, int *firsty, int nx, int ny, int nz, int mode)
+{
+  char *a = (char *)av;
+  char *b = (char *)bv;
+
+  size_t *scount = (size_t *)Mem.mymalloc("scount", NTask * sizeof(size_t));
+  size_t *rcount = (size_t *)Mem.mymalloc("rcount", NTask * sizeof(size_t));
+  size_t *soff   = (size_t *)Mem.mymalloc("soff", NTask * sizeof(size_t));
+  size_t *roff   = (size_t *)Mem.mymalloc("roff", NTask * sizeof(size_t));
+  int i, n, prod, flag_big = 0, flag_big_all = 0;
+
+  for(i = 0; i < NTask; i++)
+    {
+      scount[i] = sy[i] * sx[ThisTask] * ((size_t)nz);
+      rcount[i] = sy[ThisTask] * sx[i] * ((size_t)nz);
+      soff[i]   = firsty[i] * sx[ThisTask] * ((size_t)nz);
+      roff[i]   = sy[ThisTask] * firstx[i] * ((size_t)nz);
+
+      if(scount[i] * sizeof(fft_complex) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
+        flag_big = 1;
+    }
+
+  /* produce a flag if any of the send sizes is above our transfer limit, in this case we will
+   * transfer the data in chunks.
+   */
+  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
+
+  if(mode == 0)
+    {
+      /* first pack the data into contiguous blocks */
+      prod = NTask * sx[ThisTask];
+      for(n = 0; n < prod; n++)
+        {
+          int k = n / NTask;
+          int i = n % NTask;
+          int j;
+
+          for(j = 0; j < sy[i]; j++)
+            memcpy(b + (k * sy[i] + j + firsty[i] * sx[ThisTask]) * (nz * sizeof(fft_complex)),
+                   a + (k * ny + (firsty[i] + j)) * (nz * sizeof(fft_complex)), nz * sizeof(fft_complex));
+        }
+
+      /* transfer the data */
+      myMPI_Alltoallv(b, scount, soff, a, rcount, roff, sizeof(fft_complex), flag_big_all, Communicator);
+
+      /* unpack the data into the right order */
+      prod = NTask * sy[ThisTask];
+      for(n = 0; n < prod; n++)
+        {
+          int j = n / NTask;
+          int i = n % NTask;
+          int k;
+
+          for(k = 0; k < sx[i]; k++)
+            memcpy(b + (j * nx + k + firstx[i]) * (nz * sizeof(fft_complex)),
+                   a + ((k + firstx[i]) * sy[ThisTask] + j) * (nz * sizeof(fft_complex)), nz * sizeof(fft_complex));
+        }
+    }
+  else
+    {
+      /* first pack the data into contiguous blocks */
+      prod = NTask * sy[ThisTask];
+      for(n = 0; n < prod; n++)
+        {
+          int j = n / NTask;
+          int i = n % NTask;
+          int k;
+
+          for(k = 0; k < sx[i]; k++)
+            memcpy(b + ((k + firstx[i]) * sy[ThisTask] + j) * (nz * sizeof(fft_complex)),
+                   a + (j * nx + k + firstx[i]) * (nz * sizeof(fft_complex)), nz * sizeof(fft_complex));
+        }
+
+      /* transfer the data */
+      myMPI_Alltoallv(b, rcount, roff, a, scount, soff, sizeof(fft_complex), flag_big_all, Communicator);
+
+      /* unpack the data into the right order */
+      prod = NTask * sx[ThisTask];
+      for(n = 0; n < prod; n++)
+        {
+          int k = n / NTask;
+          int i = n % NTask;
+          int j;
+
+          for(j = 0; j < sy[i]; j++)
+            memcpy(b + (k * ny + (firsty[i] + j)) * (nz * sizeof(fft_complex)),
+                   a + (k * sy[i] + j + firsty[i] * sx[ThisTask]) * (nz * sizeof(fft_complex)), nz * sizeof(fft_complex));
+        }
+    }
+
+  /* now the result is in b[] */
+
+  Mem.myfree(roff);
+  Mem.myfree(soff);
+  Mem.myfree(rcount);
+  Mem.myfree(scount);
+}
+
+void pm_mpi_fft::my_slab_based_fft(fft_plan *plan, void *data, void *workspace, int forward)
+{
+  int n, prod;
+  int slabsx = plan->slabs_x_per_task[ThisTask];
+  int slabsy = plan->slabs_y_per_task[ThisTask];
+
+  int ngridx  = plan->NgridX;
+  int ngridy  = plan->NgridY;
+  int ngridz  = plan->Ngridz;
+  int ngridz2 = 2 * ngridz;
+
+  size_t ngridx_long  = ngridx;
+  size_t ngridy_long  = ngridy;
+  size_t ngridz_long  = ngridz;
+  size_t ngridz2_long = ngridz2;
+
+  fft_real *data_real       = (fft_real *)data;
+  fft_complex *data_complex = (fft_complex *)data, *workspace_complex = (fft_complex *)workspace;
+
+  if(forward == 1)
+    {
+      /* do the z-direction FFT, real to complex */
+      prod = slabsx * ngridy;
+      for(n = 0; n < prod; n++)
+        {
+          FFTW(execute_dft_r2c)(plan->forward_plan_zdir, data_real + n * ngridz2_long, workspace_complex + n * ngridz_long);
+        }
+
+      /* do the y-direction FFT, complex to complex */
+      prod = slabsx * ngridz;
+      for(n = 0; n < prod; n++)
+        {
+          int i = n / ngridz;
+          int j = n % ngridz;
+
+          FFTW(execute_dft)
+          (plan->forward_plan_ydir, workspace_complex + i * ngridz * ngridy_long + j, data_complex + i * ngridz * ngridy_long + j);
+        }
+
+      /* now our data resides in data_complex[] */
+
+      /* do the transpose */
+      my_slab_transpose(data_complex, workspace_complex, plan->slabs_x_per_task, plan->first_slab_x_of_task, plan->slabs_y_per_task,
+                        plan->first_slab_y_of_task, ngridx, ngridy, ngridz, 0);
+
+      /* now the data is in workspace_complex[] */
+
+      /* finally, do the transform along the x-direction (we are in transposed order, x and y have been interchanged) */
+      prod = slabsy * ngridz;
+      for(n = 0; n < prod; n++)
+        {
+          int i = n / ngridz;
+          int j = n % ngridz;
+
+          FFTW(execute_dft)
+          (plan->forward_plan_xdir, workspace_complex + i * ngridz * ngridx_long + j, data_complex + i * ngridz * ngridx_long + j);
+        }
+
+      /* now the result is in data_complex[] */
+    }
+  else
+    {
+      prod = slabsy * ngridz;
+
+      for(n = 0; n < prod; n++)
+        {
+          int i = n / ngridz;
+          int j = n % ngridz;
+
+          FFTW(execute_dft)
+          (plan->backward_plan_xdir, data_complex + i * ngridz * ngridx_long + j, workspace_complex + i * ngridz * ngridx_long + j);
+        }
+
+      my_slab_transpose(workspace_complex, data_complex, plan->slabs_x_per_task, plan->first_slab_x_of_task, plan->slabs_y_per_task,
+                        plan->first_slab_y_of_task, ngridx, ngridy, ngridz, 1);
+
+      prod = slabsx * ngridz;
+
+      for(n = 0; n < prod; n++)
+        {
+          int i = n / ngridz;
+          int j = n % ngridz;
+
+          FFTW(execute_dft)
+          (plan->backward_plan_ydir, data_complex + i * ngridz * ngridy_long + j, workspace_complex + i * ngridz * ngridy_long + j);
+        }
+
+      prod = slabsx * ngridy;
+
+      for(n = 0; n < prod; n++)
+        {
+          FFTW(execute_dft_c2r)(plan->backward_plan_zdir, workspace_complex + n * ngridz_long, data_real + n * ngridz2_long);
+        }
+
+      /* now the result is in data[] */
+    }
+}
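+
+/* Minimal usage sketch (illustrative only; the actual drivers live in the PM modules):
+ *
+ *   fft_plan plan;
+ *   my_slab_based_fft_init(&plan, Ngrid, Ngrid, Ngrid);
+ *   ... create the one-dimensional FFTW plans stored in 'plan' and fill the density field ...
+ *   my_slab_based_fft(&plan, rhogrid, workspace, 1);   // forward transform
+ *   ... multiply by the Green's function in Fourier (transposed) order ...
+ *   my_slab_based_fft(&plan, rhogrid, workspace, 0);   // backward transform
+ *   my_slab_based_fft_free(&plan);
+ *
+ * where 'rhogrid' and 'workspace' are hypothetical, suitably sized buffers.
+ */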
+
+#else
+
+void pm_mpi_fft::my_column_based_fft_init(fft_plan *plan, int NgridX, int NgridY, int NgridZ)
+{
+  plan->NgridX = NgridX;
+  plan->NgridY = NgridY;
+  plan->NgridZ = NgridZ;
+
+  int Ngridz = NgridZ / 2 + 1;
+
+  plan->Ngridz = Ngridz;
+  plan->Ngrid2 = 2 * Ngridz;
+
+  subdivide_evenly(plan->NgridX * plan->NgridY, NTask, ThisTask, &plan->firstcol_XY, &plan->ncol_XY);
+  subdivide_evenly(plan->NgridX * plan->Ngrid2, NTask, ThisTask, &plan->firstcol_XZ, &plan->ncol_XZ);
+  subdivide_evenly(plan->Ngrid2 * plan->NgridY, NTask, ThisTask, &plan->firstcol_ZY, &plan->ncol_ZY);
+
+  plan->lastcol_XY = plan->firstcol_XY + plan->ncol_XY - 1;
+  plan->lastcol_XZ = plan->firstcol_XZ + plan->ncol_XZ - 1;
+  plan->lastcol_ZY = plan->firstcol_ZY + plan->ncol_ZY - 1;
+
+  subdivide_evenly(NgridX * Ngridz, NTask, ThisTask, &plan->transposed_firstcol, &plan->transposed_ncol);
+  subdivide_evenly(NgridY * Ngridz, NTask, ThisTask, &plan->second_transposed_firstcol, &plan->second_transposed_ncol);
+
+  plan->second_transposed_ncells = ((size_t)plan->NgridX) * plan->second_transposed_ncol;
+
+  plan->max_datasize = ((size_t)plan->Ngrid2) * plan->ncol_XY;
+  plan->max_datasize = std::max<size_t>(plan->max_datasize, 2 * ((size_t)plan->NgridY) * plan->transposed_ncol);
+  plan->max_datasize = std::max<size_t>(plan->max_datasize, 2 * ((size_t)plan->NgridX) * plan->second_transposed_ncol);
+  plan->max_datasize = std::max<size_t>(plan->max_datasize, ((size_t)plan->ncol_XZ) * plan->NgridY);
+  plan->max_datasize = std::max<size_t>(plan->max_datasize, ((size_t)plan->ncol_ZY) * plan->NgridX);
+
+  plan->fftsize = plan->max_datasize;
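+
+  /* max_datasize (copied into fftsize) is, in fft_real elements with complex values counted as
+   * two reals, the size of the largest intermediate layout the data passes through, so a single
+   * buffer of this size can hold the field at any stage of the column-based transform. */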
+
+  plan->offsets_send_A = (size_t *)Mem.mymalloc_movable_clear(&plan->offsets_send_A, "offsets_send_A", NTask * sizeof(size_t));
+  plan->offsets_recv_A = (size_t *)Mem.mymalloc_movable_clear(&plan->offsets_recv_A, "offsets_recv_A", NTask * sizeof(size_t));
+  plan->offsets_send_B = (size_t *)Mem.mymalloc_movable_clear(&plan->offsets_send_B, "offsets_send_B", NTask * sizeof(size_t));
+  plan->offsets_recv_B = (size_t *)Mem.mymalloc_movable_clear(&plan->offsets_recv_B, "offsets_recv_B", NTask * sizeof(size_t));
+  plan->offsets_send_C = (size_t *)Mem.mymalloc_movable_clear(&plan->offsets_send_C, "offsets_send_C", NTask * sizeof(size_t));
+  plan->offsets_recv_C = (size_t *)Mem.mymalloc_movable_clear(&plan->offsets_recv_C, "offsets_recv_C", NTask * sizeof(size_t));
+  plan->offsets_send_D = (size_t *)Mem.mymalloc_movable_clear(&plan->offsets_send_D, "offsets_send_D", NTask * sizeof(size_t));
+  plan->offsets_recv_D = (size_t *)Mem.mymalloc_movable_clear(&plan->offsets_recv_D, "offsets_recv_D", NTask * sizeof(size_t));
+
+  plan->count_send_A  = (size_t *)Mem.mymalloc_movable_clear(&plan->count_send_A, "count_send_A", NTask * sizeof(size_t));
+  plan->count_recv_A  = (size_t *)Mem.mymalloc_movable_clear(&plan->count_recv_A, "count_recv_A", NTask * sizeof(size_t));
+  plan->count_send_B  = (size_t *)Mem.mymalloc_movable_clear(&plan->count_send_B, "count_send_B", NTask * sizeof(size_t));
+  plan->count_recv_B  = (size_t *)Mem.mymalloc_movable_clear(&plan->count_recv_B, "count_recv_B", NTask * sizeof(size_t));
+  plan->count_send_C  = (size_t *)Mem.mymalloc_movable_clear(&plan->count_send_C, "count_send_C", NTask * sizeof(size_t));
+  plan->count_recv_C  = (size_t *)Mem.mymalloc_movable_clear(&plan->count_recv_C, "count_recv_C", NTask * sizeof(size_t));
+  plan->count_send_D  = (size_t *)Mem.mymalloc_movable_clear(&plan->count_send_D, "count_send_D", NTask * sizeof(size_t));
+  plan->count_recv_D  = (size_t *)Mem.mymalloc_movable_clear(&plan->count_recv_D, "count_recv_D", NTask * sizeof(size_t));
+  plan->count_send_13 = (size_t *)Mem.mymalloc_movable_clear(&plan->count_send_13, "count_send_13", NTask * sizeof(size_t));
+  plan->count_recv_13 = (size_t *)Mem.mymalloc_movable_clear(&plan->count_recv_13, "count_recv_13", NTask * sizeof(size_t));
+  plan->count_send_23 = (size_t *)Mem.mymalloc_movable_clear(&plan->count_send_23, "count_send_23", NTask * sizeof(size_t));
+  plan->count_recv_23 = (size_t *)Mem.mymalloc_movable_clear(&plan->count_recv_23, "count_recv_23", NTask * sizeof(size_t));
+  plan->count_send_13back =
+      (size_t *)Mem.mymalloc_movable_clear(&plan->count_send_13back, "count_send_13back", NTask * sizeof(size_t));
+  plan->count_recv_13back =
+      (size_t *)Mem.mymalloc_movable_clear(&plan->count_recv_13back, "count_recv_13back", NTask * sizeof(size_t));
+  plan->count_send_23back =
+      (size_t *)Mem.mymalloc_movable_clear(&plan->count_send_23back, "count_send_23back", NTask * sizeof(size_t));
+  plan->count_recv_23back =
+      (size_t *)Mem.mymalloc_movable_clear(&plan->count_recv_23back, "count_recv_23back", NTask * sizeof(size_t));
+
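+  /* The calls below run the remap/transpose machinery once in counting mode
+   * (just_count_flag = 1, with NULL data pointers) so that the send/receive counts
+   * (and, for the remaps, the offsets) of the later data exchanges are established up front.
+   */
+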
+  int dimA[3]  = {plan->NgridX, plan->NgridY, plan->Ngridz};
+  int permA[3] = {0, 2, 1};
+
+  my_fft_column_remap(NULL, dimA, plan->firstcol_XY, plan->ncol_XY, NULL, permA, plan->transposed_firstcol, plan->transposed_ncol,
+                      plan->offsets_send_A, plan->offsets_recv_A, plan->count_send_A, plan->count_recv_A, 1);
+
+  int dimB[3]  = {plan->NgridX, plan->Ngridz, plan->NgridY};
+  int permB[3] = {2, 1, 0};
+
+  my_fft_column_remap(NULL, dimB, plan->transposed_firstcol, plan->transposed_ncol, NULL, permB, plan->second_transposed_firstcol,
+                      plan->second_transposed_ncol, plan->offsets_send_B, plan->offsets_recv_B, plan->count_send_B, plan->count_recv_B,
+                      1);
+
+  int dimC[3]  = {plan->NgridY, plan->Ngridz, plan->NgridX};
+  int permC[3] = {2, 1, 0};
+
+  my_fft_column_remap(NULL, dimC, plan->second_transposed_firstcol, plan->second_transposed_ncol, NULL, permC,
+                      plan->transposed_firstcol, plan->transposed_ncol, plan->offsets_send_C, plan->offsets_recv_C, plan->count_send_C,
+                      plan->count_recv_C, 1);
+
+  int dimD[3]  = {plan->NgridX, plan->Ngridz, plan->NgridY};
+  int permD[3] = {0, 2, 1};
+
+  my_fft_column_remap(NULL, dimD, plan->transposed_firstcol, plan->transposed_ncol, NULL, permD, plan->firstcol_XY, plan->ncol_XY,
+                      plan->offsets_send_D, plan->offsets_recv_D, plan->count_send_D, plan->count_recv_D, 1);
+
+  int dim23[3]  = {plan->NgridX, plan->NgridY, plan->Ngrid2};
+  int perm23[3] = {0, 2, 1};
+
+  my_fft_column_transpose(NULL, dim23, plan->firstcol_XY, plan->ncol_XY, NULL, perm23, plan->firstcol_XZ, plan->ncol_XZ,
+                          plan->count_send_23, plan->count_recv_23, 1);
+
+  int dim23back[3]  = {plan->NgridX, plan->Ngrid2, plan->NgridY};
+  int perm23back[3] = {0, 2, 1};
+
+  my_fft_column_transpose(NULL, dim23back, plan->firstcol_XZ, plan->ncol_XZ, NULL, perm23back, plan->firstcol_XY, plan->ncol_XY,
+                          plan->count_send_23back, plan->count_recv_23back, 1);
+
+  int dim13[3]  = {plan->NgridX, plan->NgridY, plan->Ngrid2};
+  int perm13[3] = {2, 1, 0};
+
+  my_fft_column_transpose(NULL, dim13, plan->firstcol_XY, plan->ncol_XY, NULL, perm13, plan->firstcol_ZY, plan->ncol_ZY,
+                          plan->count_send_13, plan->count_recv_13, 1);
+
+  int dim13back[3]  = {plan->Ngrid2, plan->NgridY, plan->NgridX};
+  int perm13back[3] = {2, 1, 0};
+
+  my_fft_column_transpose(NULL, dim13back, plan->firstcol_ZY, plan->ncol_ZY, NULL, perm13back, plan->firstcol_XY, plan->ncol_XY,
+                          plan->count_send_13back, plan->count_recv_13back, 1);
+}
+
+void pm_mpi_fft::my_column_based_fft_free(fft_plan *plan)
+{
+  Mem.myfree(plan->count_recv_23back);
+  Mem.myfree(plan->count_send_23back);
+  Mem.myfree(plan->count_recv_13back);
+  Mem.myfree(plan->count_send_13back);
+  Mem.myfree(plan->count_recv_23);
+  Mem.myfree(plan->count_send_23);
+  Mem.myfree(plan->count_recv_13);
+  Mem.myfree(plan->count_send_13);
+  Mem.myfree(plan->count_recv_D);
+  Mem.myfree(plan->count_send_D);
+  Mem.myfree(plan->count_recv_C);
+  Mem.myfree(plan->count_send_C);
+  Mem.myfree(plan->count_recv_B);
+  Mem.myfree(plan->count_send_B);
+  Mem.myfree(plan->count_recv_A);
+  Mem.myfree(plan->count_send_A);
+
+  Mem.myfree(plan->offsets_recv_D);
+  Mem.myfree(plan->offsets_send_D);
+  Mem.myfree(plan->offsets_recv_C);
+  Mem.myfree(plan->offsets_send_C);
+  Mem.myfree(plan->offsets_recv_B);
+  Mem.myfree(plan->offsets_send_B);
+  Mem.myfree(plan->offsets_recv_A);
+  Mem.myfree(plan->offsets_send_A);
+}
+
+void pm_mpi_fft::my_fft_swap23(fft_plan *plan, fft_real *data, fft_real *out)
+{
+  int dim23[3]  = {plan->NgridX, plan->NgridY, plan->Ngrid2};
+  int perm23[3] = {0, 2, 1};
+
+  my_fft_column_transpose(data, dim23, plan->firstcol_XY, plan->ncol_XY, out, perm23, plan->firstcol_XZ, plan->ncol_XZ,
+                          plan->count_send_23, plan->count_recv_23, 0);
+}
+
+void pm_mpi_fft::my_fft_swap23back(fft_plan *plan, fft_real *data, fft_real *out)
+{
+  int dim23back[3]  = {plan->NgridX, plan->Ngrid2, plan->NgridY};
+  int perm23back[3] = {0, 2, 1};
+
+  my_fft_column_transpose(data, dim23back, plan->firstcol_XZ, plan->ncol_XZ, out, perm23back, plan->firstcol_XY, plan->ncol_XY,
+                          plan->count_send_23back, plan->count_recv_23back, 0);
+}
+
+void pm_mpi_fft::my_fft_swap13(fft_plan *plan, fft_real *data, fft_real *out)
+{
+  int dim13[3]  = {plan->NgridX, plan->NgridY, plan->Ngrid2};
+  int perm13[3] = {2, 1, 0};
+
+  my_fft_column_transpose(data, dim13, plan->firstcol_XY, plan->ncol_XY, out, perm13, plan->firstcol_ZY, plan->ncol_ZY,
+                          plan->count_send_13, plan->count_recv_13, 0);
+}
+
+void pm_mpi_fft::my_fft_swap13back(fft_plan *plan, fft_real *data, fft_real *out)
+{
+  int dim13back[3]  = {plan->Ngrid2, plan->NgridY, plan->NgridX};
+  int perm13back[3] = {2, 1, 0};
+
+  my_fft_column_transpose(data, dim13back, plan->firstcol_ZY, plan->ncol_ZY, out, perm13back, plan->firstcol_XY, plan->ncol_XY,
+                          plan->count_send_13back, plan->count_recv_13back, 0);
+}
+
+void pm_mpi_fft::my_column_based_fft(fft_plan *plan, void *data, void *workspace, int forward)
+{
+  size_t n;
+  fft_real *data_real = (fft_real *)data, *workspace_real = (fft_real *)workspace;
+  fft_complex *data_complex = (fft_complex *)data, *workspace_complex = (fft_complex *)workspace;
+
+  if(forward == 1)
+    {
+      /* do the z-direction FFT, real to complex */
+      for(n = 0; n < plan->ncol_XY; n++)
+        FFTW(execute_dft_r2c)(plan->forward_plan_zdir, data_real + n * plan->Ngrid2, workspace_complex + n * plan->Ngridz);
+
+      int dimA[3]  = {plan->NgridX, plan->NgridY, plan->Ngridz};
+      int permA[3] = {0, 2, 1};
+
+      my_fft_column_remap(workspace_complex, dimA, plan->firstcol_XY, plan->ncol_XY, data_complex, permA, plan->transposed_firstcol,
+                          plan->transposed_ncol, plan->offsets_send_A, plan->offsets_recv_A, plan->count_send_A, plan->count_recv_A,
+                          0);
+
+      /* do the y-direction FFT in 'data', complex to complex */
+      for(n = 0; n < plan->transposed_ncol; n++)
+        FFTW(execute_dft)(plan->forward_plan_ydir, data_complex + n * plan->NgridY, workspace_complex + n * plan->NgridY);
+
+      int dimB[3]  = {plan->NgridX, plan->Ngridz, plan->NgridY};
+      int permB[3] = {2, 1, 0};
+
+      my_fft_column_remap(workspace_complex, dimB, plan->transposed_firstcol, plan->transposed_ncol, data_complex, permB,
+                          plan->second_transposed_firstcol, plan->second_transposed_ncol, plan->offsets_send_B, plan->offsets_recv_B,
+                          plan->count_send_B, plan->count_recv_B, 0);
+
+      /* do the x-direction FFT in 'data', complex to complex */
+      for(n = 0; n < plan->second_transposed_ncol; n++)
+        FFTW(execute_dft)(plan->forward_plan_xdir, data_complex + n * plan->NgridX, workspace_complex + n * plan->NgridX);
+
+      /* result is now in workspace */
+    }
+  else
+    {
+      /* do inverse FFT in 'data' */
+      for(n = 0; n < plan->second_transposed_ncol; n++)
+        FFTW(execute_dft)(plan->backward_plan_xdir, data_complex + n * plan->NgridX, workspace_complex + n * plan->NgridX);
+
+      int dimC[3]  = {plan->NgridY, plan->Ngridz, plan->NgridX};
+      int permC[3] = {2, 1, 0};
+
+      my_fft_column_remap(workspace_complex, dimC, plan->second_transposed_firstcol, plan->second_transposed_ncol, data_complex, permC,
+                          plan->transposed_firstcol, plan->transposed_ncol, plan->offsets_send_C, plan->offsets_recv_C,
+                          plan->count_send_C, plan->count_recv_C, 0);
+
+      /* do inverse FFT in 'data' */
+      for(n = 0; n < plan->transposed_ncol; n++)
+        FFTW(execute_dft)(plan->backward_plan_ydir, data_complex + n * plan->NgridY, workspace_complex + n * plan->NgridY);
+
+      int dimD[3]  = {plan->NgridX, plan->Ngridz, plan->NgridY};
+      int permD[3] = {0, 2, 1};
+
+      my_fft_column_remap(workspace_complex, dimD, plan->transposed_firstcol, plan->transposed_ncol, data_complex, permD,
+                          plan->firstcol_XY, plan->ncol_XY, plan->offsets_send_D, plan->offsets_recv_D, plan->count_send_D,
+                          plan->count_recv_D, 0);
+
+      /* do complex-to-real inverse transform on z-coordinates */
+      for(n = 0; n < plan->ncol_XY; n++)
+        FFTW(execute_dft_c2r)(plan->backward_plan_zdir, data_complex + n * plan->Ngridz, workspace_real + n * plan->Ngrid2);
+    }
+}
+
+void pm_mpi_fft::my_fft_column_remap(fft_complex *data, int Ndims[3], /* global dimensions of data cube */
+                                     int in_firstcol, int in_ncol,    /* first column and number of columns */
+                                     fft_complex *out, int perm[3], int out_firstcol, int out_ncol, size_t *offset_send,
+                                     size_t *offset_recv, size_t *count_send, size_t *count_recv, size_t just_count_flag)
+{
+  int j, target, origin, ngrp, recvTask, perm_rev[3], xyz[3], uvw[3];
+  size_t nimport, nexport;
+
+  /* determine the inverse permutation */
+  for(j = 0; j < 3; j++)
+    perm_rev[j] = perm[j];
+
+  if(!(perm_rev[perm[0]] == 0 && perm_rev[perm[1]] == 1 && perm_rev[perm[2]] == 2)) /* not yet the inverse */
+    {
+      for(j = 0; j < 3; j++)
+        perm_rev[j] = perm[perm[j]];
+
+      if(!(perm_rev[perm[0]] == 0 && perm_rev[perm[1]] == 1 && perm_rev[perm[2]] == 2))
+        Terminate("bummer");
+    }
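+
+  /* Example: the permutations used by the callers in this file, such as {0,2,1} or {2,1,0},
+   * are their own inverse, so the first test already succeeds; for a cyclic permutation like
+   * {1,2,0} the inverse is obtained by applying the permutation twice, giving {2,0,1}.
+   */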
+
+  int in_columns          = Ndims[0] * Ndims[1];
+  int in_avg              = (in_columns - 1) / NTask + 1;
+  int in_exc              = NTask * in_avg - in_columns;
+  int in_tasklastsection  = NTask - in_exc;
+  int in_pivotcol         = in_tasklastsection * in_avg;
+
+  int out_columns         = Ndims[perm[0]] * Ndims[perm[1]];
+  int out_avg             = (out_columns - 1) / NTask + 1;
+  int out_exc             = NTask * out_avg - out_columns;
+  int out_tasklastsection = NTask - out_exc;
+  int out_pivotcol        = out_tasklastsection * out_avg;
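+
+  /* Illustrative example (not from the original code): with in_columns = 10 and NTask = 4 one
+   * gets in_avg = 3, in_exc = 2, in_tasklastsection = 2 and in_pivotcol = 6, i.e. columns 0-5
+   * live on tasks 0 and 1 (3 columns each) and columns 6-9 on tasks 2 and 3 (2 columns each).
+   */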
+
+  size_t i, ncells = ((size_t)in_ncol) * Ndims[2];
+
+  xyz[0] = in_firstcol / Ndims[1];
+  xyz[1] = in_firstcol % Ndims[1];
+  xyz[2] = 0;
+
+  memset(count_send, 0, NTask * sizeof(size_t));
+
+  /* loop over all cells in input array and determine target processor */
+  for(i = 0; i < ncells; i++)
+    {
+      /* determine target task */
+      uvw[0] = xyz[perm[0]];
+      uvw[1] = xyz[perm[1]];
+      uvw[2] = xyz[perm[2]];
+
+      int newcol = Ndims[perm[1]] * uvw[0] + uvw[1];
+      if(newcol < out_pivotcol)
+        target = newcol / out_avg;
+      else
+        target = (newcol - out_pivotcol) / (out_avg - 1) + out_tasklastsection;
+
+      /* move data element to target task */
+
+      if(just_count_flag)
+        count_send[target]++;
+      else
+        {
+          size_t off  = offset_send[target] + count_send[target]++;
+          out[off][0] = data[i][0];
+          out[off][1] = data[i][1];
+        }
+      xyz[2]++;
+      if(xyz[2] == Ndims[2])
+        {
+          xyz[2] = 0;
+          xyz[1]++;
+          if(xyz[1] == Ndims[1])
+            {
+              xyz[1] = 0;
+              xyz[0]++;
+            }
+        }
+    }
+
+  if(just_count_flag)
+    {
+      MPI_Alltoall(count_send, sizeof(size_t), MPI_BYTE, count_recv, sizeof(size_t), MPI_BYTE, Communicator);
+
+      for(j = 0, nimport = 0, nexport = 0, offset_send[0] = 0, offset_recv[0] = 0; j < NTask; j++)
+        {
+          nexport += count_send[j];
+          nimport += count_recv[j];
+
+          if(j > 0)
+            {
+              offset_send[j] = offset_send[j - 1] + count_send[j - 1];
+              offset_recv[j] = offset_recv[j - 1] + count_recv[j - 1];
+            }
+        }
+
+      if(nexport != ncells)
+        Terminate("nexport=%lld != ncells=%lld", (long long)nexport, (long long)ncells);
+    }
+  else
+    {
+      nimport = 0;
+
+      /* exchange all the data */
+      for(ngrp = 0; ngrp < (1 << PTask); ngrp++)
+        {
+          recvTask = ThisTask ^ ngrp;
+
+          if(recvTask < NTask)
+            {
+              if(count_send[recvTask] > 0 || count_recv[recvTask] > 0)
+                myMPI_Sendrecv(&out[offset_send[recvTask]], count_send[recvTask] * sizeof(fft_complex), MPI_BYTE, recvTask, TAG_DENS_A,
+                               &data[offset_recv[recvTask]], count_recv[recvTask] * sizeof(fft_complex), MPI_BYTE, recvTask,
+                               TAG_DENS_A, Communicator, MPI_STATUS_IGNORE);
+
+              nimport += count_recv[recvTask];
+            }
+        }
+
+      /* now loop over the new cell layout */
+      /* find enclosing rectangle around columns in new plane */
+
+      int first[3], last[3];
+
+      first[0] = out_firstcol / Ndims[perm[1]];
+      first[1] = out_firstcol % Ndims[perm[1]];
+      first[2] = 0;
+
+      last[0] = (out_firstcol + out_ncol - 1) / Ndims[perm[1]];
+      last[1] = (out_firstcol + out_ncol - 1) % Ndims[perm[1]];
+      last[2] = Ndims[perm[2]] - 1;
+
+      if(first[1] + out_ncol >= Ndims[perm[1]])
+        {
+          first[1] = 0;
+          last[1]  = Ndims[perm[1]] - 1;
+        }
+
+      /* now need to map this back to the old coordinates */
+
+      int xyz_first[3], xyz_last[3];
+
+      for(j = 0; j < 3; j++)
+        {
+          xyz_first[j] = first[perm_rev[j]];
+          xyz_last[j]  = last[perm_rev[j]];
+        }
+
+      memset(count_recv, 0, NTask * sizeof(size_t));
+
+      size_t count = 0;
+
+      /* traverse an enclosing box around the new cell layout in the old order */
+      for(xyz[0] = xyz_first[0]; xyz[0] <= xyz_last[0]; xyz[0]++)
+        for(xyz[1] = xyz_first[1]; xyz[1] <= xyz_last[1]; xyz[1]++)
+          for(xyz[2] = xyz_first[2]; xyz[2] <= xyz_last[2]; xyz[2]++)
+            {
+              /* check that the point is actually part of a column */
+              uvw[0] = xyz[perm[0]];
+              uvw[1] = xyz[perm[1]];
+              uvw[2] = xyz[perm[2]];
+
+              int col = uvw[0] * Ndims[perm[1]] + uvw[1];
+
+              if(col >= out_firstcol && col < out_firstcol + out_ncol)
+                {
+                  /* determine origin task */
+                  int newcol = Ndims[1] * xyz[0] + xyz[1];
+                  if(newcol < in_pivotcol)
+                    origin = newcol / in_avg;
+                  else
+                    origin = (newcol - in_pivotcol) / (in_avg - 1) + in_tasklastsection;
+
+                  size_t index = ((size_t)Ndims[perm[2]]) * (col - out_firstcol) + uvw[2];
+
+                  /* move data element from origin task */
+                  size_t off    = offset_recv[origin] + count_recv[origin]++;
+                  out[index][0] = data[off][0];
+                  out[index][1] = data[off][1];
+
+                  count++;
+                }
+            }
+
+      if(count != nimport)
+        {
+          int fi = out_firstcol % Ndims[perm[1]];
+          int la = (out_firstcol + out_ncol - 1) % Ndims[perm[1]];
+
+          Terminate("count=%lld nimport=%lld   ncol=%d fi=%d la=%d first=%d last=%d\n", (long long)count, (long long)nimport, out_ncol,
+                    fi, la, first[1], last[1]);
+        }
+    }
+}
+
+void pm_mpi_fft::my_fft_column_transpose(fft_real *data, int Ndims[3], /* global dimensions of data cube */
+                                         int in_firstcol, int in_ncol, /* first column and number of columns */
+                                         fft_real *out, int perm[3], int out_firstcol, int out_ncol, size_t *count_send,
+                                         size_t *count_recv, size_t just_count_flag)
+{
+  /* determine the inverse permutation */
+  int perm_rev[3];
+  for(int j = 0; j < 3; j++)
+    perm_rev[j] = perm[j];
+
+  if(!(perm_rev[perm[0]] == 0 && perm_rev[perm[1]] == 1 && perm_rev[perm[2]] == 2)) /* not yet the inverse */
+    {
+      for(int j = 0; j < 3; j++)
+        perm_rev[j] = perm[perm[j]];
+
+      if(!(perm_rev[perm[0]] == 0 && perm_rev[perm[1]] == 1 && perm_rev[perm[2]] == 2))
+        Terminate("bummer");
+    }
+
+  int in_columns          = Ndims[0] * Ndims[1];
+  int in_avg              = (in_columns - 1) / NTask + 1;
+  int in_exc              = NTask * in_avg - in_columns;
+  int in_tasklastsection  = NTask - in_exc;
+  int in_pivotcol         = in_tasklastsection * in_avg;
+
+  int out_columns         = Ndims[perm[0]] * Ndims[perm[1]];
+  int out_avg             = (out_columns - 1) / NTask + 1;
+  int out_exc             = NTask * out_avg - out_columns;
+  int out_tasklastsection = NTask - out_exc;
+  int out_pivotcol        = out_tasklastsection * out_avg;
+
+  if(just_count_flag)
+    memset(count_send, 0, NTask * sizeof(size_t));
+
+  /* exchange all the data */
+  for(int ngrp = 0; ngrp < (1 << PTask); ngrp++)
+    {
+      int target = ThisTask ^ ngrp;
+
+      if(target < NTask)
+        {
+          // check whether we have anything to do
+          if(count_send[target] == 0 && count_recv[target] == 0 && just_count_flag == 0)
+            continue;
+
+          /* determine enclosing rectangle of current region */
+          int source_first[3];
+          source_first[0] = in_firstcol / Ndims[1];
+          source_first[1] = in_firstcol % Ndims[1];
+          source_first[2] = 0;
+
+          int source_last[3];
+          source_last[0] = (in_firstcol + in_ncol - 1) / Ndims[1];
+          source_last[1] = (in_firstcol + in_ncol - 1) % Ndims[1];
+          source_last[2] = Ndims[2] - 1;
+
+          if(source_first[1] + in_ncol >= Ndims[1])
+            {
+              source_first[1] = 0;
+              source_last[1]  = Ndims[1] - 1;
+            }
+
+          /* determine target columns */
+
+          int target_first_col = 0;
+          long target_num_col  = 0;
+
+          if(target < out_tasklastsection)
+            {
+              target_first_col = target * out_avg;
+              target_num_col   = out_avg;
+            }
+          else
+            {
+              target_first_col = (target - out_tasklastsection) * (out_avg - 1) + out_pivotcol;
+              target_num_col   = (out_avg - 1);
+            }
+
+          /* find enclosing rectangle around columns in new plane */
+          int first[3], last[3];
+
+          first[0] = target_first_col / Ndims[perm[1]];
+          first[1] = target_first_col % Ndims[perm[1]];
+          first[2] = 0;
+
+          last[0] = (target_first_col + target_num_col - 1) / Ndims[perm[1]];
+          last[1] = (target_first_col + target_num_col - 1) % Ndims[perm[1]];
+          last[2] = Ndims[perm[2]] - 1;
+
+          if(first[1] + target_num_col >= Ndims[perm[1]])
+            {
+              first[1] = 0;
+              last[1]  = Ndims[perm[1]] - 1;
+            }
+
+          /* now we map this back to the old coordinates */
+          int xyz_first[3], xyz_last[3];
+
+          for(int j = 0; j < 3; j++)
+            {
+              xyz_first[j] = first[perm_rev[j]];
+              xyz_last[j]  = last[perm_rev[j]];
+            }
+
+          /* determine common box */
+          int xyz_start[3], xyz_end[3];
+          for(int j = 0; j < 3; j++)
+            {
+              xyz_start[j] = std::max<int>(xyz_first[j], source_first[j]);
+              xyz_end[j]   = std::min<int>(xyz_last[j], source_last[j]);
+            }
+
+          int xyz[3];
+          for(int j = 0; j < 3; j++)
+            xyz[j] = xyz_start[j];
+
+          /* now do the same determination for the flipped situation on the target side */
+
+          int flip_in_firstcol = 0;
+          int flip_in_ncol     = 0;
+
+          if(target < in_tasklastsection)
+            {
+              flip_in_firstcol = target * in_avg;
+              flip_in_ncol     = in_avg;
+            }
+          else
+            {
+              flip_in_firstcol = (target - in_tasklastsection) * (in_avg - 1) + in_pivotcol;
+              flip_in_ncol     = (in_avg - 1);
+            }
+
+          /* determine enclosing rectangle of current region */
+          int flip_source_first[3];
+          flip_source_first[0] = flip_in_firstcol / Ndims[1];
+          flip_source_first[1] = flip_in_firstcol % Ndims[1];
+          flip_source_first[2] = 0;
+
+          int flip_source_last[3];
+          flip_source_last[0] = (flip_in_firstcol + flip_in_ncol - 1) / Ndims[1];
+          flip_source_last[1] = (flip_in_firstcol + flip_in_ncol - 1) % Ndims[1];
+          flip_source_last[2] = Ndims[2] - 1;
+
+          if(flip_source_first[1] + flip_in_ncol >= Ndims[1])
+            {
+              flip_source_first[1] = 0;
+              flip_source_last[1]  = Ndims[1] - 1;
+            }
+
+          /* determine target columns */
+
+          int flip_first_col = 0;
+          int flip_num_col   = 0;
+
+          if(ThisTask < out_tasklastsection)
+            {
+              flip_first_col = ThisTask * out_avg;
+              flip_num_col   = out_avg;
+            }
+          else
+            {
+              flip_first_col = (ThisTask - out_tasklastsection) * (out_avg - 1) + out_pivotcol;
+              flip_num_col   = (out_avg - 1);
+            }
+
+          /* find enclosing rectangle around columns in new plane */
+          int flip_first[3], flip_last[3];
+
+          flip_first[0] = flip_first_col / Ndims[perm[1]];
+          flip_first[1] = flip_first_col % Ndims[perm[1]];
+          flip_first[2] = 0;
+
+          flip_last[0] = (flip_first_col + flip_num_col - 1) / Ndims[perm[1]];
+          flip_last[1] = (flip_first_col + flip_num_col - 1) % Ndims[perm[1]];
+          flip_last[2] = Ndims[perm[2]] - 1;
+
+          if(flip_first[1] + flip_num_col >= Ndims[perm[1]])
+            {
+              flip_first[1] = 0;
+              flip_last[1]  = Ndims[perm[1]] - 1;
+            }
+
+          /* now we map this back to the old coordinates */
+          int abc_first[3], abc_last[3];
+
+          for(int j = 0; j < 3; j++)
+            {
+              abc_first[j] = flip_first[perm_rev[j]];
+              abc_last[j]  = flip_last[perm_rev[j]];
+            }
+
+          /* determine common box */
+          int abc_start[3], abc_end[3];
+          for(int j = 0; j < 3; j++)
+            {
+              abc_start[j] = std::max<int>(abc_first[j], flip_source_first[j]);
+              abc_end[j]   = std::min<int>(abc_last[j], flip_source_last[j]);
+            }
+
+          int abc[3];
+
+          for(int j = 0; j < 3; j++)
+            abc[j] = abc_start[j];
+
+          size_t tot_count_send = 0;
+          size_t tot_count_recv = 0;
+
+          /* now check how much free memory there is on the two partners, use at most half of it */
+          size_t partner_freebytes;
+          myMPI_Sendrecv(&Mem.FreeBytes, sizeof(size_t), MPI_BYTE, target, TAG_DENS_B, &partner_freebytes, sizeof(size_t), MPI_BYTE,
+                         target, TAG_DENS_B, Communicator, MPI_STATUS_IGNORE);
+
+          size_t freeb = std::min<size_t>(partner_freebytes, Mem.FreeBytes);
+
+          size_t limit = 0.5 * freeb / (sizeof(fft_real) + sizeof(fft_real));
+
+          if(just_count_flag)
+            limit = SIZE_MAX;
+
+          int iter = 0;
+          do
+            {
+              size_t limit_send = count_send[target] - tot_count_send;
+              size_t limit_recv = count_recv[target] - tot_count_recv;
+
+              if(just_count_flag)
+                {
+                  limit_send = SIZE_MAX;
+                  limit_recv = SIZE_MAX;
+                }
+              else
+                {
+                  if(limit_send > limit)
+                    limit_send = limit;
+
+                  if(limit_recv > limit)
+                    limit_recv = limit;
+                }
+
+              fft_real *buffer_send = NULL;
+              fft_real *buffer_recv = NULL;
+
+              if(just_count_flag == 0)
+                {
+                  buffer_send = (fft_real *)Mem.mymalloc("buffer_send", limit_send * sizeof(fft_real));
+                  buffer_recv = (fft_real *)Mem.mymalloc("buffer_recv", limit_recv * sizeof(fft_real));
+                }
+
+              /* traverse the common box between the new and old layout  */
+              size_t count = 0;
+
+              while(count < limit_send && xyz[0] <= xyz_end[0] && xyz[1] <= xyz_end[1] && xyz[2] <= xyz_end[2])
+                {
+                  /* check that the point is actually part of a column in the old layout */
+                  int col_old = xyz[0] * Ndims[1] + xyz[1];
+
+                  if(col_old >= in_firstcol && col_old < in_firstcol + in_ncol)
+                    {
+                      /* check that the point is actually part of a column in the new layout */
+                      int uvw[3];
+                      uvw[0] = xyz[perm[0]];
+                      uvw[1] = xyz[perm[1]];
+                      uvw[2] = xyz[perm[2]];
+
+                      int col_new = uvw[0] * Ndims[perm[1]] + uvw[1];
+
+                      if(col_new >= target_first_col && col_new < target_first_col + target_num_col)
+                        {
+                          // ok, we found a match
+
+                          if(just_count_flag)
+                            count_send[target]++;
+                          else
+                            {
+                              long long source_cell = (Ndims[1] * xyz[0] + xyz[1] - in_firstcol) * Ndims[2] + xyz[2];
+
+                              buffer_send[count++] = data[source_cell];
+                              tot_count_send++;
+                            }
+                        }
+                    }
+
+                  xyz[2]++;
+                  if(xyz[2] > xyz_end[2])
+                    {
+                      xyz[2] = xyz_start[2];
+                      xyz[1]++;
+                      if(xyz[1] > xyz_end[1])
+                        {
+                          xyz[1] = xyz_start[1];
+                          xyz[0]++;
+                        }
+                    }
+                }
+
+              if(just_count_flag == 0)
+                {
+                  myMPI_Sendrecv(buffer_send, limit_send * sizeof(fft_real), MPI_BYTE, target, TAG_DENS_A, buffer_recv,
+                                 limit_recv * sizeof(fft_real), MPI_BYTE, target, TAG_DENS_A, Communicator, MPI_STATUS_IGNORE);
+
+                  size_t count = 0;
+                  while(count < limit_recv && abc[0] <= abc_end[0] && abc[1] <= abc_end[1] && abc[2] <= abc_end[2])
+                    {
+                      /* check that the point is actually part of a column in the old layout */
+                      int col_old = abc[0] * Ndims[1] + abc[1];
+
+                      if(col_old >= flip_in_firstcol && col_old < flip_in_firstcol + flip_in_ncol)
+                        {
+                          /* check that the point is actually part of a column in the new layout */
+                          int uvw[3];
+                          uvw[0] = abc[perm[0]];
+                          uvw[1] = abc[perm[1]];
+                          uvw[2] = abc[perm[2]];
+
+                          int col_new = uvw[0] * Ndims[perm[1]] + uvw[1];
+
+                          if(col_new >= flip_first_col && col_new < flip_first_col + flip_num_col)
+                            {
+                              // ok, we found a match
+
+                              long long target_cell = (Ndims[perm[1]] * uvw[0] + uvw[1] - flip_first_col) * Ndims[perm[2]] + uvw[2];
+
+                              out[target_cell] = buffer_recv[count++];
+                              tot_count_recv++;
+                            }
+                        }
+
+                      abc[2]++;
+                      if(abc[2] > abc_end[2])
+                        {
+                          abc[2] = abc_start[2];
+                          abc[1]++;
+                          if(abc[1] > abc_end[1])
+                            {
+                              abc[1] = abc_start[1];
+                              abc[0]++;
+                            }
+                        }
+                    }
+
+                  Mem.myfree(buffer_recv);
+                  Mem.myfree(buffer_send);
+                }
+              else
+                break;
+
+              iter++;
+
+              if(iter > 20)
+                Terminate("high number of iterations: limit=%lld", (long long)limit);
+            }
+          while(tot_count_send < count_send[target] || tot_count_recv < count_recv[target]);
+        }
+    }
+  if(just_count_flag)
+    MPI_Alltoall(count_send, sizeof(size_t), MPI_BYTE, count_recv, sizeof(size_t), MPI_BYTE, Communicator);
+}
+
+#endif
+
+#endif
diff --git a/src/pm/pm_mpi_fft.h b/src/pm/pm_mpi_fft.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a54ef1ea636a627073e61a9228b04929806c9c6
--- /dev/null
+++ b/src/pm/pm_mpi_fft.h
@@ -0,0 +1,134 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  pm_mpi_fft.h
+ *
+ *  \brief declaration of a class for carrying out different variants of parallel FFT transforms
+ */
+
+#ifndef PM_MPI_FFT_H
+#define PM_MPI_FFT_H
+
+#include "../mpi_utils/setcomm.h"
+
+#ifndef FFTW
+#define CONCAT(prefix, name) prefix##name
+#ifdef DOUBLEPRECISION_FFTW
+#define FFTW(x) CONCAT(fftw_, x)
+#else
+#define FFTW(x) CONCAT(fftwf_, x)
+#endif
+#endif
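+
+/* For example, with DOUBLEPRECISION_FFTW enabled, FFTW(execute_dft) expands to
+ * fftw_execute_dft, otherwise to the single-precision variant fftwf_execute_dft. */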
+
+class pm_mpi_fft : public virtual setcomm
+{
+ public:
+  pm_mpi_fft(MPI_Comm comm) : setcomm(comm) {}
+
+  struct fft_plan
+  {
+    int NgridX, NgridY, NgridZ;
+    int Ngridz, Ngrid2;
+
+    FFTW(plan) forward_plan_zdir;
+    FFTW(plan) forward_plan_xdir;
+    FFTW(plan) forward_plan_ydir;
+
+    FFTW(plan) backward_plan_zdir;
+    FFTW(plan) backward_plan_ydir;
+    FFTW(plan) backward_plan_xdir;
+
+#ifndef FFT_COLUMN_BASED
+
+    int *slab_to_task;         /*!< Maps a slab index to the task responsible for the slab */
+    int *slabs_x_per_task;     /*!< Array containing the number of x-slabs each task is responsible for */
+    int *first_slab_x_of_task; /*!< Array containing the index of the first x-slab of each task */
+    int *slabs_y_per_task;     /*!< Array containing the number of y-slabs each task is responsible for */
+    int *first_slab_y_of_task; /*!< Array containing the index of the first y-slab of each task */
+
+    int nslab_x, slabstart_x, nslab_y, slabstart_y;
+    int largest_x_slab; /*!< largest number of x-slabs held by any task */
+    int largest_y_slab; /*!< largest number of y-slabs held by any task */
+
+#else
+    size_t max_datasize;
+    size_t fftsize;
+
+    int firstcol_XY, ncol_XY, lastcol_XY;
+    int firstcol_XZ, ncol_XZ, lastcol_XZ;
+    int firstcol_ZY, ncol_ZY, lastcol_ZY;
+
+    int transposed_firstcol, transposed_ncol;
+    int second_transposed_firstcol, second_transposed_ncol;
+    size_t second_transposed_ncells;
+
+    //  int pivotcol; /* to go from column number to task */
+    //   int avg;
+    //   int tasklastsection;
+
+    size_t *offsets_send_A;
+    size_t *offsets_recv_A;
+    size_t *offsets_send_B;
+    size_t *offsets_recv_B;
+    size_t *offsets_send_C;
+    size_t *offsets_recv_C;
+    size_t *offsets_send_D;
+    size_t *offsets_recv_D;
+
+    size_t *count_send_A;
+    size_t *count_recv_A;
+    size_t *count_send_B;
+    size_t *count_recv_B;
+    size_t *count_send_C;
+    size_t *count_recv_C;
+    size_t *count_send_D;
+    size_t *count_recv_D;
+    size_t *count_send_13;
+    size_t *count_recv_13;
+    size_t *count_send_23;
+    size_t *count_recv_23;
+    size_t *count_send_13back;
+    size_t *count_recv_13back;
+    size_t *count_send_23back;
+    size_t *count_recv_23back;
+#endif
+  };
+
+  void my_slab_based_fft_init(fft_plan *plan, int NgridX, int NgridY, int NgridZ);
+  void my_slab_based_fft(fft_plan *plan, void *data, void *workspace, int forward);
+  void my_slab_based_fft_free(fft_plan *plan);
+
+  void my_column_based_fft_init(fft_plan *plan, int NgridX, int NgridY, int NgridZ);
+  void my_column_based_fft(fft_plan *plan, void *data, void *workspace, int forward);
+  void my_column_based_fft_free(fft_plan *plan);
+
+  void my_slab_transposeA(fft_plan *plan, fft_real *field, fft_real *scratch);
+  void my_slab_transposeB(fft_plan *plan, fft_real *field, fft_real *scratch);
+
+  void my_fft_swap23(fft_plan *plan, fft_real *data, fft_real *out);
+  void my_fft_swap13(fft_plan *plan, fft_real *data, fft_real *out);
+  void my_fft_swap23back(fft_plan *plan, fft_real *data, fft_real *out);
+  void my_fft_swap13back(fft_plan *plan, fft_real *data, fft_real *out);
+
+ private:
+#ifndef FFT_COLUMN_BASED
+
+  void my_slab_transpose(void *av, void *bv, int *sx, int *firstx, int *sy, int *firsty, int nx, int ny, int nz, int mode);
+
+#else
+  void my_fft_column_remap(fft_complex *data, int Ndims[3], int in_firstcol, int in_ncol, fft_complex *out, int perm[3],
+                           int out_firstcol, int out_ncol, size_t *offset_send, size_t *offset_recv, size_t *count_send,
+                           size_t *count_recv, size_t just_count_flag);
+
+  void my_fft_column_transpose(fft_real *data, int Ndims[3], /* global dimensions of data cube */
+                               int in_firstcol, int in_ncol, /* first column and number of columns */
+                               fft_real *out, int perm[3], int out_firstcol, int out_ncol, size_t *count_send, size_t *count_recv,
+                               size_t just_count_flag);
+
+#endif
+};
+
+#endif
diff --git a/src/pm/pm_nonperiodic.cc b/src/pm/pm_nonperiodic.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fca4575786ec64d00eacf0f892d64986e133c513
--- /dev/null
+++ b/src/pm/pm_nonperiodic.cc
@@ -0,0 +1,1901 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  pm_nonperiodic.cc
+ *
+ *  \brief code for non-periodic long-range PM force calculation
+ */
+
+#include "gadgetconfig.h"
+
+#if defined(PMGRID) && (!defined(PERIODIC) || defined(PLACEHIGHRESREGION))
+
+#include <fftw3.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../pm/pm.h"
+#include "../pm/pm_mpi_fft.h"
+#include "../pm/pm_nonperiodic.h"
+#include "../sort/cxxsort.h"
+#include "../src/gravtree/gravtree.h"
+#include "../src/time_integration/timestep.h"
+#include "../system/system.h"
+
+#define GRID (HRPMGRID)
+#define GRIDz (GRID / 2 + 1)
+#define GRID2 (2 * GRIDz)
+
+#define FI(x, y, z) (((large_array_offset)GRID2) * (GRID * (x) + (y)) + (z))
+#define FC(c, z) (((large_array_offset)GRID2) * ((c)-myplan.firstcol_XY) + (z))
+#define TI(x, y, z) (((large_array_offset)GRID) * ((x) + (y)*myplan.nslab_x) + (z))
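+
+/* Index helpers: FI(x,y,z) addresses the slab-decomposed, z-padded real-space field (x is the
+ * slowest index, z runs over GRID2 padded entries); FC(c,z) addresses the column-decomposed
+ * field through its global XY column number c; TI(x,y,z) addresses the transposed slab layout,
+ * whose z-dimension is the unpadded GRID.
+ */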
+
+/*! This function determines the spatial extent of all particles (and, if
+ *  PLACEHIGHRESREGION is used, separately of the particle types selected for
+ *  it), and then determines the boundaries of the non-periodic FFT mesh that
+ *  can be placed on this region. Note that a sufficient buffer region at the
+ *  rim of the occupied part of the mesh needs to be reserved in order to allow
+ *  a correct finite differencing using a 4-point formula. In addition, to
+ *  allow non-periodic boundaries, the actual FFT mesh used is twice as large
+ *  in each dimension compared with GRID.
+ */
+void pm_nonperiodic::pm_init_regionsize(void)
+{
+  /* first, find a reference coordinate by selecting an arbitrary particle in the respective regions. For definiteness, we choose the
+   * first particle */
+
+  particle_data *P  = Sp->P;
+  int have_low_mesh = NTask, have_high_mesh = NTask; /* default is we don't have a particle */
+
+  if(Sp->NumPart > 0)
+    {
+      for(int j = 0; j < 3; j++)
+        Sp->ReferenceIntPos[LOW_MESH][j] = P[0].IntPos[j];
+
+      have_low_mesh = ThisTask;
+    }
+
+  for(int i = 0; i < Sp->NumPart; i++)
+    {
+#ifdef PLACEHIGHRESREGION
+      if(((1 << P[i].getType()) & (PLACEHIGHRESREGION)))
+        {
+          for(int j = 0; j < 3; j++)
+            Sp->ReferenceIntPos[HIGH_MESH][j] = P[i].IntPos[j];
+
+          have_high_mesh = ThisTask;
+          break;
+        }
+#endif
+    }
+
+  int have_global[4] = {have_low_mesh, ThisTask, have_high_mesh, ThisTask};
+
+  MPI_Allreduce(MPI_IN_PLACE, have_global, 2, MPI_2INT, MPI_MINLOC, Communicator);
+
+  if(have_global[0] >= NTask)
+    Terminate("have_global[0] >= NTask: Don't we have any particle?");
+
+  MPI_Bcast(&Sp->ReferenceIntPos[LOW_MESH][0], 3 * sizeof(MyIntPosType), MPI_BYTE, have_global[1], Communicator);
+
+#ifdef PLACEHIGHRESREGION
+  if(have_global[2] >= NTask)
+    Terminate("have_global[2] >= NTask: Apparently there are no particles in high res region");
+
+  MPI_Bcast(&Sp->ReferenceIntPos[HIGH_MESH][0], 3 * sizeof(MyIntPosType), MPI_BYTE, have_global[3], Communicator);
+#endif
+
+  /* find enclosing rectangle */
+
+  MySignedIntPosType xmin[2][3], xmax[2][3];
+
+  for(int j = 0; j < 3; j++)
+    {
+      xmin[LOW_MESH][j] = xmin[HIGH_MESH][j] = 0;
+      xmax[LOW_MESH][j] = xmax[HIGH_MESH][j] = 0;
+    }
+
+  for(int i = 0; i < Sp->NumPart; i++)
+    {
+      MyIntPosType diff[3] = {P[i].IntPos[0] - Sp->ReferenceIntPos[LOW_MESH][0], P[i].IntPos[1] - Sp->ReferenceIntPos[LOW_MESH][1],
+                              P[i].IntPos[2] - Sp->ReferenceIntPos[LOW_MESH][2]};
+
+      MySignedIntPosType *delta = (MySignedIntPosType *)diff;
+
+      for(int j = 0; j < 3; j++)
+        {
+          if(delta[j] > xmax[LOW_MESH][j])
+            xmax[LOW_MESH][j] = delta[j];
+          if(delta[j] < xmin[LOW_MESH][j])
+            xmin[LOW_MESH][j] = delta[j];
+        }
+
+#ifdef PLACEHIGHRESREGION
+      if(((1 << P[i].getType()) & (PLACEHIGHRESREGION)))
+        {
+          MyIntPosType diff[3] = {P[i].IntPos[0] - Sp->ReferenceIntPos[HIGH_MESH][0],
+                                  P[i].IntPos[1] - Sp->ReferenceIntPos[HIGH_MESH][1],
+                                  P[i].IntPos[2] - Sp->ReferenceIntPos[HIGH_MESH][2]};
+
+          MySignedIntPosType *delta = (MySignedIntPosType *)diff;
+
+          for(int j = 0; j < 3; j++)
+            {
+              if(delta[j] > xmax[HIGH_MESH][j])
+                xmax[HIGH_MESH][j] = delta[j];
+              if(delta[j] < xmin[HIGH_MESH][j])
+                xmin[HIGH_MESH][j] = delta[j];
+            }
+        }
+#endif
+    }
+
+  MPI_Allreduce(xmin, Sp->Xmintot, 6, MPI_MyIntPosType, MPI_MIN_MySignedIntPosType, Communicator);
+  MPI_Allreduce(xmax, Sp->Xmaxtot, 6, MPI_MyIntPosType, MPI_MAX_MySignedIntPosType, Communicator);
+
+  for(int i = 0; i < 3; i++)
+    {
+      Sp->Xmaxtot[LOW_MESH][i] += 1; /* so that all particles fulfill   xmin <= pos < xmax instead of xmin <= pos <= xmax*/
+      Sp->Xmaxtot[HIGH_MESH][i] += 1;
+    }
+
+  MyIntPosType inner_meshsize[2], enclosing_meshsize[2];
+  int flag_recompute_kernel = 0;
+
+  for(int mesh = 0; mesh < 2; mesh++)
+    {
+      inner_meshsize[mesh] = (MyIntPosType)(Sp->Xmaxtot[mesh][0] - Sp->Xmintot[mesh][0]);
+
+      if((MyIntPosType)(Sp->Xmaxtot[mesh][1] - Sp->Xmintot[mesh][1]) > inner_meshsize[mesh])
+        inner_meshsize[mesh] = Sp->Xmaxtot[mesh][1] - Sp->Xmintot[mesh][1];
+
+      if((MyIntPosType)(Sp->Xmaxtot[mesh][2] - Sp->Xmintot[mesh][2]) > inner_meshsize[mesh])
+        inner_meshsize[mesh] = Sp->Xmaxtot[mesh][2] - Sp->Xmintot[mesh][2];
+    }
+
+  for(int mesh = 0; mesh < 2; mesh++)
+    {
+#ifdef PERIODIC
+      if(mesh == LOW_MESH)
+        continue;
+#endif
+#ifndef PLACEHIGHRESREGION
+      if(mesh == HIGH_MESH)
+        continue;
+#endif
+
+      MyIntPosType blocksize = 1;
+      MyIntPosType mask      = ~((MyIntPosType)0);
+
+      if(mesh == LOW_MESH)
+        {
+          MyIntPosType refsize = inner_meshsize[mesh] >> 4; /* pick 1/16 of the inner mesh size as reference block size */
+
+          for(int i = 0; i < BITS_FOR_POSITIONS; i++)
+            {
+              if(blocksize >= refsize)
+                break;
+
+              blocksize <<= 1;
+              mask <<= 1;
+            }
+        }
+      else
+        {
+          blocksize = Sp->PlacingBlocksize;
+        }
+
+      mpi_printf(
+          "PM-NONPERIODIC: Allowed region for isolated PM mesh (%s):  BEFORE  (%g|%g|%g) -> (%g|%g|%g)  inner meshsize=%g  "
+          "blocksize=%g\n",
+          mesh == LOW_MESH ? "coarse" : "fine", Sp->FacIntToCoord * Sp->Xmintot[mesh][0], Sp->FacIntToCoord * Sp->Xmintot[mesh][1],
+          Sp->FacIntToCoord * Sp->Xmintot[mesh][2], Sp->FacIntToCoord * Sp->Xmaxtot[mesh][0], Sp->FacIntToCoord * Sp->Xmaxtot[mesh][1],
+          Sp->FacIntToCoord * Sp->Xmaxtot[mesh][2], Sp->FacIntToCoord * inner_meshsize[mesh], Sp->FacIntToCoord * blocksize);
+
+      enclosing_meshsize[mesh] = 0;
+
+      /* expand the box so that it aligns with blocksize */
+      for(int i = 0; i < 3; i++)
+        {
+          MyIntPosType left, right;
+          if(mesh == LOW_MESH)
+            {
+              /* now round it down */
+              left = ((Sp->Xmintot[mesh][i] + Sp->Xmaxtot[mesh][i]) / 2 - inner_meshsize[mesh] / 2) + Sp->ReferenceIntPos[mesh][i];
+              left &= mask;
+
+              /* now round it up */
+              right = ((Sp->Xmintot[mesh][i] + Sp->Xmaxtot[mesh][i]) / 2 + inner_meshsize[mesh] / 2) + Sp->ReferenceIntPos[mesh][i];
+              right &= mask;
+              right += blocksize;
+            }
+          else
+            {
+              left  = (Sp->ReferenceIntPos[HIGH_MESH][i] + Sp->Xmintot[HIGH_MESH][i]) & Sp->PlacingMask;
+              right = left + Sp->PlacingBlocksize;
+            }
+
+          Sp->Xmintot[mesh][i] = left - Sp->ReferenceIntPos[mesh][i];
+          Sp->Xmaxtot[mesh][i] = right - Sp->ReferenceIntPos[mesh][i];
+
+          Sp->Left[mesh][i] = left;
+
+          Sp->MeshSize[mesh][i] = Sp->Xmaxtot[mesh][i] - Sp->Xmintot[mesh][i];
+
+          if(Sp->MeshSize[mesh][i] > enclosing_meshsize[mesh])
+            enclosing_meshsize[mesh] = Sp->MeshSize[mesh][i];
+        }
+
+      mpi_printf(
+          "PM-NONPERIODIC: Allowed region for isolated PM mesh (%s):  AFTER   (%g|%g|%g) -> (%g|%g|%g)  enclosing_meshsize=%g   "
+          "absolute space (%g|%g|%g) -> (%g|%g|%g)\n",
+          mesh == LOW_MESH ? "coarse" : "fine", Sp->FacIntToCoord * Sp->Xmintot[mesh][0], Sp->FacIntToCoord * Sp->Xmintot[mesh][1],
+          Sp->FacIntToCoord * Sp->Xmintot[mesh][2], Sp->FacIntToCoord * Sp->Xmaxtot[mesh][0], Sp->FacIntToCoord * Sp->Xmaxtot[mesh][1],
+          Sp->FacIntToCoord * Sp->Xmaxtot[mesh][2], Sp->FacIntToCoord * enclosing_meshsize[mesh],
+          Sp->FacIntToCoord * (Sp->Xmintot[mesh][0] + Sp->ReferenceIntPos[mesh][0]),
+          Sp->FacIntToCoord * (Sp->Xmintot[mesh][1] + Sp->ReferenceIntPos[mesh][1]),
+          Sp->FacIntToCoord * (Sp->Xmintot[mesh][2] + Sp->ReferenceIntPos[mesh][2]),
+          Sp->FacIntToCoord * (Sp->Xmaxtot[mesh][0] + Sp->ReferenceIntPos[mesh][0]),
+          Sp->FacIntToCoord * (Sp->Xmaxtot[mesh][1] + Sp->ReferenceIntPos[mesh][1]),
+          Sp->FacIntToCoord * (Sp->Xmaxtot[mesh][2] + Sp->ReferenceIntPos[mesh][2]));
+
+      if(enclosing_meshsize[mesh] != Sp->OldMeshSize[mesh])
+        {
+          flag_recompute_kernel = 1;
+          Sp->OldMeshSize[mesh] = enclosing_meshsize[mesh];
+        }
+
+      /* this will produce enough room for zero-padding and buffer region to
+       allow finite differencing of the potential  */
+      Sp->TotalMeshSize[mesh] = 2.0 * enclosing_meshsize[mesh] * (GRID / ((double)(GRID - 10)));
+
+      /* move lower left corner by two cells to allow finite differencing of the potential by a 4-point function */
+      for(int i = 0; i < 3; i++)
+        Sp->Corner[mesh][i] = Sp->Xmintot[mesh][i] - (2.0 * Sp->TotalMeshSize[mesh] / GRID);
+
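+      /* Asmth: force-split scale in physical units (ASMTH is given in mesh cells), Rcut: the matching short-range cutoff radius */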
+      Sp->Asmth[mesh] = ASMTH * (Sp->FacIntToCoord * Sp->TotalMeshSize[mesh]) / GRID;
+      Sp->Rcut[mesh]  = RCUT * Sp->Asmth[mesh];
+    }
+
+  static int first_init_done = 0; /* for detecting restart from restartfiles */
+  if(flag_recompute_kernel || first_init_done == 0)
+    {
+#ifndef PERIODIC
+      mpi_printf("PM-NONPERIODIC: Recompute kernel course mesh:  Asmth=%g   Rcut=%g     mesh cell size=%g\n", Sp->Asmth[LOW_MESH],
+                 Sp->Rcut[LOW_MESH], Sp->FacIntToCoord * Sp->TotalMeshSize[LOW_MESH] / GRID);
+#endif
+#ifdef PLACEHIGHRESREGION
+      mpi_printf("PM-NONPERIODIC: Recompute kernel fine mesh:    Asmth=%g   Rcut=%g     mesh cell size=%g\n", Sp->Asmth[HIGH_MESH],
+                 Sp->Rcut[HIGH_MESH], Sp->FacIntToCoord * Sp->TotalMeshSize[HIGH_MESH] / GRID);
+#endif
+
+      pm_setup_nonperiodic_kernel();
+      first_init_done = 1;
+    }
+}
+
+/*! Initialization of the non-periodic PM routines. The FFTW plans are
+ *  created, and the memory for the non-periodic Green's function
+ *  kernel(s) is allocated.
+ */
+void pm_nonperiodic::pm_init_nonperiodic(simparticles *Sp_ptr)
+{
+  Sp = Sp_ptr;
+
+  /* Set up the FFTW-3 plans. */
+  int ndim[1] = {GRID}; /* dimension of the 1D transforms */
+
+  /* temporarily allocate some arrays to make sure that out-of-place plans are created */
+  rhogrid   = (fft_real *)Mem.mymalloc("rhogrid", GRID2 * sizeof(fft_real));
+  forcegrid = (fft_real *)Mem.mymalloc("forcegrid", GRID2 * sizeof(fft_real));
+
+#ifdef DOUBLEPRECISION_FFTW
+  int alignflag = 0;
+#else
+  /* for single precision, the start of our FFT columns is presently only guaranteed to be 8-byte aligned */
+  int alignflag = FFTW_UNALIGNED;
+#endif
+#ifndef FFT_COLUMN_BASED
+  int stride = GRIDz;
+#else
+  int stride    = 1;
+#endif
+
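+  /* the 3D FFT is assembled from 1D transforms along each axis; the slab/column machinery performs the transposes in between */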
+  myplan.forward_plan_zdir = FFTW(plan_many_dft_r2c)(1, ndim, 1, rhogrid, 0, 1, GRID2, (fft_complex *)forcegrid, 0, 1, GRIDz,
+                                                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
+
+  myplan.forward_plan_xdir =
+      FFTW(plan_many_dft)(1, ndim, 1, (fft_complex *)rhogrid, 0, stride, GRIDz * GRID, (fft_complex *)forcegrid, 0, stride,
+                          GRIDz * GRID, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
+
+  myplan.forward_plan_ydir =
+      FFTW(plan_many_dft)(1, ndim, 1, (fft_complex *)rhogrid, 0, stride, GRIDz * GRID, (fft_complex *)forcegrid, 0, stride,
+                          GRIDz * GRID, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
+
+  myplan.backward_plan_zdir = FFTW(plan_many_dft_c2r)(1, ndim, 1, (fft_complex *)rhogrid, 0, 1, GRIDz, forcegrid, 0, 1, GRID2,
+                                                      FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
+
+  myplan.backward_plan_xdir =
+      FFTW(plan_many_dft)(1, ndim, 1, (fft_complex *)rhogrid, 0, stride, GRIDz * GRID, (fft_complex *)forcegrid, 0, stride,
+                          GRIDz * GRID, FFTW_BACKWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
+
+  myplan.backward_plan_ydir =
+      FFTW(plan_many_dft)(1, ndim, 1, (fft_complex *)rhogrid, 0, stride, GRIDz * GRID, (fft_complex *)forcegrid, 0, stride,
+                          GRIDz * GRID, FFTW_BACKWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
+
+  Mem.myfree(forcegrid);
+  Mem.myfree(rhogrid);
+
+#ifndef FFT_COLUMN_BASED
+
+  my_slab_based_fft_init(&myplan, GRID, GRID, GRID);
+
+  maxfftsize = myplan.largest_x_slab * GRID * ((size_t)GRID2);
+
+#else
+
+  my_column_based_fft_init(&myplan, GRID, GRID, GRID);
+
+  maxfftsize = myplan.max_datasize;
+
+#endif
+
+  /* now allocate memory to hold the FFT fields */
+
+  size_t bytes, bytes_tot = 0;
+
+#if !defined(PERIODIC)
+  kernel[0] = (fft_real *)Mem.mymalloc("kernel[0]", bytes = maxfftsize * sizeof(fft_real));
+  bytes_tot += bytes;
+  fft_of_kernel[0] = (fft_complex *)kernel[0];
+#endif
+
+#if defined(PLACEHIGHRESREGION)
+  kernel[1] = (fft_real *)Mem.mymalloc("kernel[1]", bytes = maxfftsize * sizeof(fft_real));
+  bytes_tot += bytes;
+  fft_of_kernel[1] = (fft_complex *)kernel[1];
+#endif
+
+  mpi_printf("\nPM-NONPERIODIC: Allocated %g MByte for FFT kernel(s).\n\n", bytes_tot / (1024.0 * 1024.0));
+}
+
+#ifdef PM_ZOOM_OPTIMIZED
+
+void pm_nonperiodic::pmforce_nonperiodic_zoom_optimized_prepare_density(int grnr)
+{
+  MPI_Status status;
+
+  particle_data *P = Sp->P;
+
+  double to_slab_fac = GRID / ((double)Sp->TotalMeshSize[grnr]);
+
+  part = (part_slab_data *)Mem.mymalloc("part", 8 * (NSource * sizeof(part_slab_data)));
+  large_numpart_type *part_sortindex =
+      (large_numpart_type *)Mem.mymalloc("part_sortindex", 8 * (NSource * sizeof(large_numpart_type)));
+
+  int ngrid = 0;
+
+#ifdef FFT_COLUMN_BASED
+  int columns         = GRIDX * GRIDY;
+  int avg             = (columns - 1) / NTask + 1;
+  int exc             = NTask * avg - columns;
+  int tasklastsection = NTask - exc;
+  int pivotcol        = tasklastsection * avg;
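+  /* columns 0..pivotcol-1 are distributed with 'avg' per task; the remaining tasks hold 'avg - 1' columns each */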
+#endif
+
+  /* determine the cells each particle accesses */
+  for(int idx = 0; idx < NSource; idx++)
+    {
+      int i = Sp->get_active_index(idx);
+
+      if(P[i].Ti_Current != All.Ti_Current)
+        Sp->drift_particle(&P[i], &Sp->SphP[i], All.Ti_Current);
+
+      MyIntPosType diff[3] = {P[i].IntPos[0] - Sp->ReferenceIntPos[grnr][0], P[i].IntPos[1] - Sp->ReferenceIntPos[grnr][1],
+                              P[i].IntPos[2] - Sp->ReferenceIntPos[grnr][2]};
+
+      MySignedIntPosType *delta = (MySignedIntPosType *)diff;
+
+      if(delta[0] < Sp->Xmintot[grnr][0] || delta[0] >= Sp->Xmaxtot[grnr][0])
+        continue;
+
+      if(delta[1] < Sp->Xmintot[grnr][1] || delta[1] >= Sp->Xmaxtot[grnr][1])
+        continue;
+
+      if(delta[2] < Sp->Xmintot[grnr][2] || delta[2] >= Sp->Xmaxtot[grnr][2])
+        continue;
+
+      int slab_x = (int)(to_slab_fac * (delta[0] - Sp->Corner[grnr][0]));
+      int slab_y = (int)(to_slab_fac * (delta[1] - Sp->Corner[grnr][1]));
+      int slab_z = (int)(to_slab_fac * (delta[2] - Sp->Corner[grnr][2]));
+      int myngrid;
+
+      myngrid = ngrid;
+      ngrid += 1;
+
+      large_numpart_type index_on_grid = ((large_numpart_type)myngrid) * 8;
+
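+      /* register the 8 cells of this particle's CIC interpolation cube */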
+      for(int xx = 0; xx < 2; xx++)
+        for(int yy = 0; yy < 2; yy++)
+          for(int zz = 0; zz < 2; zz++)
+            {
+              int slab_xx = slab_x + xx;
+              int slab_yy = slab_y + yy;
+              int slab_zz = slab_z + zz;
+
+              if(slab_xx >= GRID)
+                slab_xx -= GRID;
+              if(slab_yy >= GRID)
+                slab_yy -= GRID;
+              if(slab_zz >= GRID)
+                slab_zz -= GRID;
+
+              large_array_offset offset = FI(slab_xx, slab_yy, slab_zz);
+
+              part[index_on_grid].partindex   = (i << 3) + (xx << 2) + (yy << 1) + zz;
+              part[index_on_grid].globalindex = offset;
+              part_sortindex[index_on_grid]   = index_on_grid;
+              index_on_grid++;
+            }
+    }
+
+  /* note: num_on_grid will be 8 times the number of binned particles, but num_field_points will generally be much smaller */
+  num_on_grid = ((large_numpart_type)ngrid) * 8;
+
+  /* bring the part-field into the order of the accessed cells. This allows the removal of duplicates */
+  mycxxsort(part_sortindex, part_sortindex + num_on_grid, pm_nonperiodic_sortindex_comparator(part));
+
+  large_array_offset num_field_points;
+
+  if(num_on_grid > 0)
+    num_field_points = 1;
+  else
+    num_field_points = 0;
+
+  /* determine the number of unique field points */
+  for(large_numpart_type i = 1; i < num_on_grid; i++)
+    {
+      if(part[part_sortindex[i]].globalindex != part[part_sortindex[i - 1]].globalindex)
+        num_field_points++;
+    }
+
+  /* allocate the local field */
+  localfield_globalindex = (large_array_offset *)Mem.mymalloc_movable(&localfield_globalindex, "localfield_globalindex",
+                                                                      num_field_points * sizeof(large_array_offset));
+  localfield_data        = (fft_real *)Mem.mymalloc_movable(&localfield_data, "localfield_data", num_field_points * sizeof(fft_real));
+  localfield_first       = (size_t *)Mem.mymalloc_movable(&localfield_first, "localfield_first", NTask * sizeof(size_t));
+  localfield_sendcount   = (size_t *)Mem.mymalloc_movable(&localfield_sendcount, "localfield_sendcount", NTask * sizeof(size_t));
+  localfield_offset      = (size_t *)Mem.mymalloc_movable(&localfield_offset, "localfield_offset", NTask * sizeof(size_t));
+  localfield_recvcount   = (size_t *)Mem.mymalloc_movable(&localfield_recvcount, "localfield_recvcount", NTask * sizeof(size_t));
+
+  for(int i = 0; i < NTask; i++)
+    {
+      localfield_first[i]     = 0;
+      localfield_sendcount[i] = 0;
+    }
+
+  /* establish the cross link between the part[ ]-array and the local list of
+   * mesh points. Also, count how many of the needed field points are stored on each task.
+   */
+  num_field_points = 0;
+  for(large_numpart_type i = 0; i < num_on_grid; i++)
+    {
+      if(i > 0)
+        if(part[part_sortindex[i]].globalindex != part[part_sortindex[i - 1]].globalindex)
+          num_field_points++;
+
+      part[part_sortindex[i]].localindex = num_field_points;
+
+      if(i > 0)
+        if(part[part_sortindex[i]].globalindex == part[part_sortindex[i - 1]].globalindex)
+          continue;
+
+      localfield_globalindex[num_field_points] = part[part_sortindex[i]].globalindex;
+
+#ifndef FFT_COLUMN_BASED
+      int slab = part[part_sortindex[i]].globalindex / (GRID * GRID2);
+      int task = myplan.slab_to_task[slab];
+#else
+      int task, column = part[part_sortindex[i]].globalindex / (GRID2);
+
+      if(column < pivotcol)
+        task = column / avg;
+      else
+        task = (column - pivotcol) / (avg - 1) + tasklastsection;
+#endif
+
+      if(localfield_sendcount[task] == 0)
+        localfield_first[task] = num_field_points;
+
+      localfield_sendcount[task]++;
+    }
+  num_field_points++;
+
+  localfield_offset[0] = 0;
+  for(int i = 1; i < NTask; i++)
+    localfield_offset[i] = localfield_offset[i - 1] + localfield_sendcount[i - 1];
+
+  Mem.myfree_movable(part_sortindex);
+  part_sortindex = NULL;
+
+  /* now bin the local particle data onto the mesh list */
+  for(large_numpart_type i = 0; i < num_field_points; i++)
+    localfield_data[i] = 0;
+
+  for(large_numpart_type i = 0; i < num_on_grid; i += 8)
+    {
+      int pindex = (part[i].partindex >> 3);
+
+      MyIntPosType diff[3] = {P[pindex].IntPos[0] - Sp->ReferenceIntPos[grnr][0], P[pindex].IntPos[1] - Sp->ReferenceIntPos[grnr][1],
+                              P[pindex].IntPos[2] - Sp->ReferenceIntPos[grnr][2]};
+
+      MySignedIntPosType *delta = (MySignedIntPosType *)diff;
+
+      double dx = to_slab_fac * (delta[0] - Sp->Corner[grnr][0]);
+      double dy = to_slab_fac * (delta[1] - Sp->Corner[grnr][1]);
+      double dz = to_slab_fac * (delta[2] - Sp->Corner[grnr][2]);
+
+      int slab_x = (int)(dx);
+      int slab_y = (int)(dy);
+      int slab_z = (int)(dz);
+
+      dx -= slab_x;
+      dy -= slab_y;
+      dz -= slab_z;
+
+      double weight = P[pindex].getMass();
+
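+      /* CIC (trilinear) weights; the ordering of the 8 entries matches the (xx,yy,zz) loop used when the cell list was built */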
+      localfield_data[part[i + 0].localindex] += weight * (1.0 - dx) * (1.0 - dy) * (1.0 - dz);
+      localfield_data[part[i + 1].localindex] += weight * (1.0 - dx) * (1.0 - dy) * dz;
+      localfield_data[part[i + 2].localindex] += weight * (1.0 - dx) * dy * (1.0 - dz);
+      localfield_data[part[i + 3].localindex] += weight * (1.0 - dx) * dy * dz;
+      localfield_data[part[i + 4].localindex] += weight * (dx) * (1.0 - dy) * (1.0 - dz);
+      localfield_data[part[i + 5].localindex] += weight * (dx) * (1.0 - dy) * dz;
+      localfield_data[part[i + 6].localindex] += weight * (dx)*dy * (1.0 - dz);
+      localfield_data[part[i + 7].localindex] += weight * (dx)*dy * dz;
+    }
+
+  rhogrid = (fft_real *)Mem.mymalloc("rhogrid", maxfftsize * sizeof(fft_real));
+
+  /* clear local FFT-mesh density field */
+  for(large_array_offset ii = 0; ii < maxfftsize; ii++)
+    rhogrid[ii] = 0;
+
+  /* exchange data and add contributions to the local mesh patch */
+  MPI_Alltoall(localfield_sendcount, sizeof(size_t), MPI_BYTE, localfield_recvcount, sizeof(size_t), MPI_BYTE, Communicator);
+
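+  /* pairwise hypercube exchange: at each level a task communicates with partner ThisTask ^ level */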
+  for(int level = 0; level < (1 << PTask); level++) /* note: for level=0, target is the same task */
+    {
+      int recvTask = ThisTask ^ level;
+
+      if(recvTask < NTask)
+        {
+          if(level > 0)
+            {
+              import_data        = (fft_real *)Mem.mymalloc("import_data", localfield_recvcount[recvTask] * sizeof(fft_real));
+              import_globalindex = (large_array_offset *)Mem.mymalloc("import_globalindex",
+                                                                      localfield_recvcount[recvTask] * sizeof(large_array_offset));
+
+              if(localfield_sendcount[recvTask] > 0 || localfield_recvcount[recvTask] > 0)
+                {
+                  myMPI_Sendrecv(localfield_data + localfield_offset[recvTask], localfield_sendcount[recvTask] * sizeof(fft_real),
+                                 MPI_BYTE, recvTask, TAG_NONPERIOD_A, import_data, localfield_recvcount[recvTask] * sizeof(fft_real),
+                                 MPI_BYTE, recvTask, TAG_NONPERIOD_A, Communicator, &status);
+
+                  myMPI_Sendrecv(localfield_globalindex + localfield_offset[recvTask],
+                                 localfield_sendcount[recvTask] * sizeof(large_array_offset), MPI_BYTE, recvTask, TAG_NONPERIOD_B,
+                                 import_globalindex, localfield_recvcount[recvTask] * sizeof(large_array_offset), MPI_BYTE, recvTask,
+                                 TAG_NONPERIOD_B, Communicator, &status);
+                }
+            }
+          else
+            {
+              import_data        = localfield_data + localfield_offset[ThisTask];
+              import_globalindex = localfield_globalindex + localfield_offset[ThisTask];
+            }
+
+          /* note: here every element in rhogrid is only accessed once, so there should be no race condition */
+          for(large_numpart_type i = 0; i < localfield_recvcount[recvTask]; i++)
+            {
+              /* determine offset in local FFT slab */
+#ifndef FFT_COLUMN_BASED
+              large_array_offset offset =
+                  import_globalindex[i] - myplan.first_slab_x_of_task[ThisTask] * GRID * ((large_array_offset)GRID2);
+#else
+              large_array_offset offset = import_globalindex[i] - myplan.firstcol_XY * ((large_array_offset)GRID2);
+#endif
+              rhogrid[offset] += import_data[i];
+            }
+
+          if(level > 0)
+            {
+              Mem.myfree(import_globalindex);
+              Mem.myfree(import_data);
+            }
+        }
+    }
+}
+
+/* Function to read out the force component corresponding to spatial dimension 'dim'.
+ * If dim is negative, potential values are read out and assigned to particles.
+ */
+void pm_nonperiodic::pmforce_nonperiodic_zoom_optimized_readout_forces_or_potential(int grnr, int dim)
+{
+#ifdef EVALPOTENTIAL
+  double fac = 1.0 / (Sp->FacIntToCoord * Sp->TotalMeshSize[grnr]) / pow(GRID, 3); /* to get potential */
+#endif
+
+  particle_data *P = Sp->P;
+
+  MPI_Status status;
+
+  fft_real *grid;
+
+  if(dim < 0)
+    grid = rhogrid;
+  else
+    grid = forcegrid;
+
+  double to_slab_fac = GRID / ((double)Sp->TotalMeshSize[grnr]);
+
+  for(int level = 0; level < (1 << PTask); level++) /* note: for level=0, target is the same task */
+    {
+      int recvTask = ThisTask ^ level;
+
+      if(recvTask < NTask)
+        {
+          if(level > 0)
+            {
+              import_data        = (fft_real *)Mem.mymalloc("import_data", localfield_recvcount[recvTask] * sizeof(fft_real));
+              import_globalindex = (large_array_offset *)Mem.mymalloc("import_globalindex",
+                                                                      localfield_recvcount[recvTask] * sizeof(large_array_offset));
+
+              if(localfield_sendcount[recvTask] > 0 || localfield_recvcount[recvTask] > 0)
+                {
+                  myMPI_Sendrecv(localfield_globalindex + localfield_offset[recvTask],
+                                 localfield_sendcount[recvTask] * sizeof(large_array_offset), MPI_BYTE, recvTask, TAG_NONPERIOD_C,
+                                 import_globalindex, localfield_recvcount[recvTask] * sizeof(large_array_offset), MPI_BYTE, recvTask,
+                                 TAG_NONPERIOD_C, Communicator, &status);
+                }
+            }
+          else
+            {
+              import_data        = localfield_data + localfield_offset[ThisTask];
+              import_globalindex = localfield_globalindex + localfield_offset[ThisTask];
+            }
+
+          for(large_numpart_type i = 0; i < localfield_recvcount[recvTask]; i++)
+            {
+#ifndef FFT_COLUMN_BASED
+              large_array_offset offset =
+                  import_globalindex[i] - myplan.first_slab_x_of_task[ThisTask] * GRID * ((large_array_offset)GRID2);
+#else
+              large_array_offset offset = import_globalindex[i] - myplan.firstcol_XY * ((large_array_offset)GRID2);
+#endif
+              import_data[i] = grid[offset];
+            }
+
+          if(level > 0)
+            {
+              myMPI_Sendrecv(import_data, localfield_recvcount[recvTask] * sizeof(fft_real), MPI_BYTE, recvTask, TAG_NONPERIOD_A,
+                             localfield_data + localfield_offset[recvTask], localfield_sendcount[recvTask] * sizeof(fft_real),
+                             MPI_BYTE, recvTask, TAG_NONPERIOD_A, Communicator, &status);
+
+              Mem.myfree(import_globalindex);
+              Mem.myfree(import_data);
+            }
+        }
+    }
+
+  /* read out the force/potential values, which all have been assembled in localfield_data */
+
+  int ngrid = (num_on_grid >> 3);
+
+  for(int k = 0; k < ngrid; k++)
+    {
+      large_numpart_type j = (((large_numpart_type)k) << 3);
+
+      int i = (part[j].partindex >> 3);
+
+#if !defined(HIERARCHICAL_GRAVITY) && defined(TREEPM_NOTIMESPLIT)
+      if(!Sp->TimeBinSynchronized[P[i].TimeBinGrav])
+        continue;
+#endif
+
+      MyIntPosType diff[3] = {P[i].IntPos[0] - Sp->ReferenceIntPos[grnr][0], P[i].IntPos[1] - Sp->ReferenceIntPos[grnr][1],
+                              P[i].IntPos[2] - Sp->ReferenceIntPos[grnr][2]};
+
+      MySignedIntPosType *delta = (MySignedIntPosType *)diff;
+
+      double dx = to_slab_fac * (delta[0] - Sp->Corner[grnr][0]);
+      double dy = to_slab_fac * (delta[1] - Sp->Corner[grnr][1]);
+      double dz = to_slab_fac * (delta[2] - Sp->Corner[grnr][2]);
+
+      int slab_x = (int)(dx);
+      int slab_y = (int)(dy);
+      int slab_z = (int)(dz);
+
+      dx -= slab_x;
+      dy -= slab_y;
+      dz -= slab_z;
+
+      double value = +localfield_data[part[j + 0].localindex] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz) +
+                     localfield_data[part[j + 1].localindex] * (1.0 - dx) * (1.0 - dy) * dz +
+                     localfield_data[part[j + 2].localindex] * (1.0 - dx) * dy * (1.0 - dz) +
+                     localfield_data[part[j + 3].localindex] * (1.0 - dx) * dy * dz +
+                     localfield_data[part[j + 4].localindex] * (dx) * (1.0 - dy) * (1.0 - dz) +
+                     localfield_data[part[j + 5].localindex] * (dx) * (1.0 - dy) * dz +
+                     localfield_data[part[j + 6].localindex] * (dx)*dy * (1.0 - dz) +
+                     localfield_data[part[j + 7].localindex] * (dx)*dy * dz;
+
+      if(dim < 0)
+        {
+#ifdef EVALPOTENTIAL
+          P[i].Potential += value * fac;
+#endif
+        }
+      else
+        {
+          Sp->P[i].GravAccel[dim] += value;
+        }
+    }
+}
+
+#else
+
+void pm_nonperiodic::pmforce_nonperiodic_uniform_optimized_prepare_density(int grnr)
+{
+  double to_slab_fac = GRID / ((double)Sp->TotalMeshSize[grnr]);
+
+  Sndpm_count = (size_t *)Mem.mymalloc("Sndpm_count", NTask * sizeof(size_t));
+  Sndpm_offset = (size_t *)Mem.mymalloc("Sndpm_offset", NTask * sizeof(size_t));
+  Rcvpm_count = (size_t *)Mem.mymalloc("Rcvpm_count", NTask * sizeof(size_t));
+  Rcvpm_offset = (size_t *)Mem.mymalloc("Rcvpm_offset", NTask * sizeof(size_t));
+
+#ifdef FFT_COLUMN_BASED
+  int columns = GRIDX * GRIDY;
+  int avg = (columns - 1) / NTask + 1;
+  int exc = NTask * avg - columns;
+  int tasklastsection = NTask - exc;
+  int pivotcol = tasklastsection * avg;
+#endif
+
+  /* determine the slabs/columns each particle accesses */
+  {
+    size_t *send_count = Sndpm_count;
+
+    for(int j = 0; j < NTask; j++)
+      send_count[j] = 0;
+
+    for(int idx = 0; idx < NSource; idx++)
+      {
+        int i = Sp->get_active_index(idx);
+
+        if(Sp->P[i].Ti_Current != All.Ti_Current)
+          Sp->drift_particle(&Sp->P[i], &Sp->SphP[i], All.Ti_Current);
+
+        MyIntPosType diff[3] = {Sp->P[i].IntPos[0] - Sp->ReferenceIntPos[grnr][0], Sp->P[i].IntPos[1] - Sp->ReferenceIntPos[grnr][1],
+                                Sp->P[i].IntPos[2] - Sp->ReferenceIntPos[grnr][2]};
+
+        MySignedIntPosType *delta = (MySignedIntPosType *)diff;
+
+        if(delta[0] < Sp->Xmintot[grnr][0] || delta[0] >= Sp->Xmaxtot[grnr][0])
+          continue;
+
+        if(delta[1] < Sp->Xmintot[grnr][1] || delta[1] >= Sp->Xmaxtot[grnr][1])
+          continue;
+
+        if(delta[2] < Sp->Xmintot[grnr][2] || delta[2] >= Sp->Xmaxtot[grnr][2])
+          continue;
+
+        int slab_x = (int)(to_slab_fac * (delta[0] - Sp->Corner[grnr][0]));
+        int slab_xx = slab_x + 1;
+
+#ifndef FFT_COLUMN_BASED
+        int task0 = myplan.slab_to_task[slab_x];
+        int task1 = myplan.slab_to_task[slab_xx];
+
+        send_count[task0]++;
+        if(task0 != task1)
+          send_count[task1]++;
+#else
+        int slab_y  = (int)(to_slab_fac * (delta[1] - Sp->Corner[grnr][1]));
+        int slab_yy = slab_y + 1;
+
+        int column0 = slab_x * GRID + slab_y;
+        int column1 = slab_x * GRID + slab_yy;
+        int column2 = slab_xx * GRID + slab_y;
+        int column3 = slab_xx * GRID + slab_yy;
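+        /* a particle touches at most a 2x2 patch of XY columns; the interpolation in z stays within a column */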
+
+        int task0, task1, task2, task3;
+
+        if(column0 < pivotcol)
+          task0 = column0 / avg;
+        else
+          task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;
+
+        if(column1 < pivotcol)
+          task1 = column1 / avg;
+        else
+          task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;
+
+        if(column2 < pivotcol)
+          task2 = column2 / avg;
+        else
+          task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;
+
+        if(column3 < pivotcol)
+          task3 = column3 / avg;
+        else
+          task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;
+
+        send_count[task0]++;
+        if(task1 != task0)
+          send_count[task1]++;
+        if(task2 != task1 && task2 != task0)
+          send_count[task2]++;
+        if(task3 != task0 && task3 != task1 && task3 != task2)
+          send_count[task3]++;
+#endif
+      }
+  }
+
+  /* build the send offset table from the send counts */
+  Sndpm_offset[0] = 0;
+  for(int i = 1; i < NTask; i++)
+    {
+      int ind = i;
+      int ind_prev = i - 1;
+
+      Sndpm_offset[ind] = Sndpm_offset[ind_prev] + Sndpm_count[ind_prev];
+    }
+
+  MPI_Alltoall(Sndpm_count, sizeof(size_t), MPI_BYTE, Rcvpm_count, sizeof(size_t), MPI_BYTE, Communicator);
+
+  nimport = 0, nexport = 0, Rcvpm_offset[0] = 0, Sndpm_offset[0] = 0;
+  for(int j = 0; j < NTask; j++)
+    {
+      nexport += Sndpm_count[j];
+      nimport += Rcvpm_count[j];
+
+      if(j > 0)
+        {
+          Sndpm_offset[j] = Sndpm_offset[j - 1] + Sndpm_count[j - 1];
+          Rcvpm_offset[j] = Rcvpm_offset[j - 1] + Rcvpm_count[j - 1];
+        }
+    }
+
+  /* allocate import and export buffer */
+  partin = (partbuf *)Mem.mymalloc("partin", nimport * sizeof(partbuf));
+  partout = (partbuf *)Mem.mymalloc("partout", nexport * sizeof(partbuf));
+
+  {
+    size_t *send_count = Sndpm_count;
+    size_t *send_offset = Sndpm_offset;
+
+    for(int j = 0; j < NTask; j++)
+      send_count[j] = 0;
+
+    /* fill export buffer */
+    for(int idx = 0; idx < NSource; idx++)
+      {
+        int i = Sp->get_active_index(idx);
+
+        MyIntPosType diff[3] = {Sp->P[i].IntPos[0] - Sp->ReferenceIntPos[grnr][0], Sp->P[i].IntPos[1] - Sp->ReferenceIntPos[grnr][1],
+                                Sp->P[i].IntPos[2] - Sp->ReferenceIntPos[grnr][2]};
+
+        MySignedIntPosType *delta = (MySignedIntPosType *)diff;
+
+        if(delta[0] < Sp->Xmintot[grnr][0] || delta[0] >= Sp->Xmaxtot[grnr][0])
+          continue;
+
+        if(delta[1] < Sp->Xmintot[grnr][1] || delta[1] >= Sp->Xmaxtot[grnr][1])
+          continue;
+
+        if(delta[2] < Sp->Xmintot[grnr][2] || delta[2] >= Sp->Xmaxtot[grnr][2])
+          continue;
+
+        int slab_x = (int)(to_slab_fac * (delta[0] - Sp->Corner[grnr][0]));
+        int slab_xx = slab_x + 1;
+
+#ifndef FFT_COLUMN_BASED
+        int task0 = myplan.slab_to_task[slab_x];
+        int task1 = myplan.slab_to_task[slab_xx];
+
+        size_t ind0 = send_offset[task0] + send_count[task0]++;
+        partout[ind0].Mass = Sp->P[i].getMass();
+        for(int j = 0; j < 3; j++)
+          partout[ind0].IntPos[j] = Sp->P[i].IntPos[j];
+
+        if(task0 != task1)
+          {
+            size_t ind1 = send_offset[task1] + send_count[task1]++;
+            partout[ind1].Mass = Sp->P[i].getMass();
+            for(int j = 0; j < 3; j++)
+              partout[ind1].IntPos[j] = Sp->P[i].IntPos[j];
+          }
+#else
+        int slab_y  = (int)(to_slab_fac * (delta[1] - Sp->Corner[grnr][1]));
+        int slab_yy = slab_y + 1;
+
+        int column0 = slab_x * GRID + slab_y;
+        int column1 = slab_x * GRID + slab_yy;
+        int column2 = slab_xx * GRID + slab_y;
+        int column3 = slab_xx * GRID + slab_yy;
+
+        int task0, task1, task2, task3;
+
+        if(column0 < pivotcol)
+          task0 = column0 / avg;
+        else
+          task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;
+
+        if(column1 < pivotcol)
+          task1 = column1 / avg;
+        else
+          task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;
+
+        if(column2 < pivotcol)
+          task2 = column2 / avg;
+        else
+          task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;
+
+        if(column3 < pivotcol)
+          task3 = column3 / avg;
+        else
+          task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;
+
+        size_t ind0        = send_offset[task0] + send_count[task0]++;
+        partout[ind0].Mass = Sp->P[i].getMass();
+        for(int j = 0; j < 3; j++)
+          partout[ind0].IntPos[j] = Sp->P[i].IntPos[j];
+
+        if(task1 != task0)
+          {
+            size_t ind1        = send_offset[task1] + send_count[task1]++;
+            partout[ind1].Mass = Sp->P[i].getMass();
+            for(int j = 0; j < 3; j++)
+              partout[ind1].IntPos[j] = Sp->P[i].IntPos[j];
+          }
+        if(task2 != task1 && task2 != task0)
+          {
+            size_t ind2        = send_offset[task2] + send_count[task2]++;
+            partout[ind2].Mass = Sp->P[i].getMass();
+            for(int j = 0; j < 3; j++)
+              partout[ind2].IntPos[j] = Sp->P[i].IntPos[j];
+          }
+        if(task3 != task0 && task3 != task1 && task3 != task2)
+          {
+            size_t ind3        = send_offset[task3] + send_count[task3]++;
+            partout[ind3].Mass = Sp->P[i].getMass();
+            for(int j = 0; j < 3; j++)
+              partout[ind3].IntPos[j] = Sp->P[i].IntPos[j];
+          }
+#endif
+      }
+  }
+
+  int flag_big = 0, flag_big_all;
+  for(int i = 0; i < NTask; i++)
+    if(Sndpm_count[i] * sizeof(partbuf) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
+      flag_big = 1;
+
+  /* produce a flag if any of the send sizes is above our transfer limit, in this case we will
+   * transfer the data in chunks.
+   */
+  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
+
+  /* exchange particle data */
+  myMPI_Alltoallv(partout, Sndpm_count, Sndpm_offset, partin, Rcvpm_count, Rcvpm_offset, sizeof(partbuf), flag_big_all, Communicator);
+
+  Mem.myfree(partout);
+
+  /* allocate density field */
+  rhogrid = (fft_real *)Mem.mymalloc("rhogrid", maxfftsize * sizeof(fft_real));
+
+  /* clear local FFT-mesh density field */
+  for(size_t ii = 0; ii < maxfftsize; ii++)
+    rhogrid[ii] = 0;
+
+#ifndef FFT_COLUMN_BASED
+  /* bin particle data onto the mesh */
+  {
+    int first_y, count_y;
+    subdivide_evenly(GRID, 1, 0, &first_y, &count_y);
+    int last_y = first_y + count_y - 1;
+
+    for(size_t i = 0; i < nimport; i++)
+      {
+        MyIntPosType diff[3] = {partin[i].IntPos[0] - Sp->ReferenceIntPos[grnr][0], partin[i].IntPos[1] - Sp->ReferenceIntPos[grnr][1],
+                                partin[i].IntPos[2] - Sp->ReferenceIntPos[grnr][2]};
+
+        MySignedIntPosType *delta = (MySignedIntPosType *)diff;
+
+        double dy = to_slab_fac * (delta[1] - Sp->Corner[grnr][1]);
+        int slab_y = (int)(dy);
+        dy -= slab_y;
+
+        int slab_yy = slab_y + 1;
+        int flag_slab_y, flag_slab_yy;
+
+        if(slab_y >= first_y && slab_y <= last_y)
+          flag_slab_y = 1;
+        else
+          flag_slab_y = 0;
+
+        if(slab_yy >= first_y && slab_yy <= last_y)
+          flag_slab_yy = 1;
+        else
+          flag_slab_yy = 0;
+
+        if(flag_slab_y || flag_slab_yy)
+          {
+            double mass = partin[i].Mass;
+
+            double dx = to_slab_fac * (delta[0] - Sp->Corner[grnr][0]);
+            double dz = to_slab_fac * (delta[2] - Sp->Corner[grnr][2]);
+            int slab_x = (int)(dx);
+            int slab_z = (int)(dz);
+            dx -= slab_x;
+            dz -= slab_z;
+
+            int slab_xx = slab_x + 1;
+            int slab_zz = slab_z + 1;
+
+            int flag_slab_x, flag_slab_xx;
+
+            if(myplan.slab_to_task[slab_x] == ThisTask)
+              {
+                slab_x -= myplan.first_slab_x_of_task[ThisTask];
+                flag_slab_x = 1;
+              }
+            else
+              flag_slab_x = 0;
+
+            if(myplan.slab_to_task[slab_xx] == ThisTask)
+              {
+                slab_xx -= myplan.first_slab_x_of_task[ThisTask];
+                flag_slab_xx = 1;
+              }
+            else
+              flag_slab_xx = 0;
+
+            if(flag_slab_x)
+              {
+                if(flag_slab_y)
+                  {
+                    rhogrid[FI(slab_x, slab_y, slab_z)] += (mass * (1.0 - dx) * (1.0 - dy) * (1.0 - dz));
+                    rhogrid[FI(slab_x, slab_y, slab_zz)] += (mass * (1.0 - dx) * (1.0 - dy) * (dz));
+                  }
+
+                if(flag_slab_yy)
+                  {
+                    rhogrid[FI(slab_x, slab_yy, slab_z)] += (mass * (1.0 - dx) * (dy) * (1.0 - dz));
+                    rhogrid[FI(slab_x, slab_yy, slab_zz)] += (mass * (1.0 - dx) * (dy) * (dz));
+                  }
+              }
+
+            if(flag_slab_xx)
+              {
+                if(flag_slab_y)
+                  {
+                    rhogrid[FI(slab_xx, slab_y, slab_z)] += (mass * (dx) * (1.0 - dy) * (1.0 - dz));
+                    rhogrid[FI(slab_xx, slab_y, slab_zz)] += (mass * (dx) * (1.0 - dy) * (dz));
+                  }
+
+                if(flag_slab_yy)
+                  {
+                    rhogrid[FI(slab_xx, slab_yy, slab_z)] += (mass * (dx) * (dy) * (1.0 - dz));
+                    rhogrid[FI(slab_xx, slab_yy, slab_zz)] += (mass * (dx) * (dy) * (dz));
+                  }
+              }
+          }
+      }
+  }
+
+#else
+
+  struct data_cols
+  {
+    int col0, col1, col2, col3;
+    double dx, dy;
+  };
+
+  data_cols *aux = (data_cols *)Mem.mymalloc("aux", nimport * sizeof(data_cols));
+
+  for(int i = 0; i < nimport; i++)
+    {
+      MyIntPosType diff[3] = {partin[i].IntPos[0] - Sp->ReferenceIntPos[grnr][0], partin[i].IntPos[1] - Sp->ReferenceIntPos[grnr][1],
+                              partin[i].IntPos[2] - Sp->ReferenceIntPos[grnr][2]};
+
+      MySignedIntPosType *delta = (MySignedIntPosType *)diff;
+
+      aux[i].dx = to_slab_fac * (delta[0] - Sp->Corner[grnr][0]);
+      aux[i].dy = to_slab_fac * (delta[1] - Sp->Corner[grnr][1]);
+
+      int slab_x = (int)(aux[i].dx);
+      int slab_y = (int)(aux[i].dy);
+
+      aux[i].dx -= slab_x;
+      aux[i].dy -= slab_y;
+
+      int slab_xx = slab_x + 1;
+      int slab_yy = slab_y + 1;
+
+      aux[i].col0 = slab_x * GRID + slab_y;
+      aux[i].col1 = slab_x * GRID + slab_yy;
+      aux[i].col2 = slab_xx * GRID + slab_y;
+      aux[i].col3 = slab_xx * GRID + slab_yy;
+    }
+
+  {
+    int first_col, last_col, count_col;
+    subdivide_evenly(myplan.ncol_XY, 1, 0, &first_col, &count_col);
+    last_col = first_col + count_col - 1;
+    first_col += myplan.firstcol_XY;
+    last_col += myplan.firstcol_XY;
+
+    for(int i = 0; i < nimport; i++)
+      {
+        int flag0, flag1, flag2, flag3;
+        int col0 = aux[i].col0;
+        int col1 = aux[i].col1;
+        int col2 = aux[i].col2;
+        int col3 = aux[i].col3;
+
+        if(col0 >= first_col && col0 <= last_col)
+          flag0 = 1;
+        else
+          flag0 = 0;
+
+        if(col1 >= first_col && col1 <= last_col)
+          flag1 = 1;
+        else
+          flag1 = 0;
+
+        if(col2 >= first_col && col2 <= last_col)
+          flag2 = 1;
+        else
+          flag2 = 0;
+
+        if(col3 >= first_col && col3 <= last_col)
+          flag3 = 1;
+        else
+          flag3 = 0;
+
+        if(flag0 || flag1 || flag2 || flag3)
+          {
+            double mass = partin[i].Mass;
+
+            double dx = aux[i].dx;
+            double dy = aux[i].dy;
+
+            MySignedIntPosType deltaz = (MySignedIntPosType)(partin[i].IntPos[2] - Sp->ReferenceIntPos[grnr][2]);
+
+            double dz = to_slab_fac * (deltaz - Sp->Corner[grnr][2]);
+            int slab_z = (int)(dz);
+            dz -= slab_z;
+            int slab_zz = slab_z + 1;
+
+            if(flag0)
+              {
+                rhogrid[FC(col0, slab_z)] += (mass * (1.0 - dx) * (1.0 - dy) * (1.0 - dz));
+                rhogrid[FC(col0, slab_zz)] += (mass * (1.0 - dx) * (1.0 - dy) * (dz));
+              }
+
+            if(flag1)
+              {
+                rhogrid[FC(col1, slab_z)] += (mass * (1.0 - dx) * (dy) * (1.0 - dz));
+                rhogrid[FC(col1, slab_zz)] += (mass * (1.0 - dx) * (dy) * (dz));
+              }
+
+            if(flag2)
+              {
+                rhogrid[FC(col2, slab_z)] += (mass * (dx) * (1.0 - dy) * (1.0 - dz));
+                rhogrid[FC(col2, slab_zz)] += (mass * (dx) * (1.0 - dy) * (dz));
+              }
+
+            if(flag3)
+              {
+                rhogrid[FC(col3, slab_z)] += (mass * (dx) * (dy) * (1.0 - dz));
+                rhogrid[FC(col3, slab_zz)] += (mass * (dx) * (dy) * (dz));
+              }
+          }
+      }
+  }
+
+  Mem.myfree(aux);
+
+#endif
+}
+
+/* If dim<0, this function reads out the potential, otherwise Cartesian force components.
+ */
+void pm_nonperiodic::pmforce_nonperiodic_uniform_optimized_readout_forces_or_potential(int grnr, int dim)
+{
+#ifdef EVALPOTENTIAL
+  double fac = 1.0 / (Sp->FacIntToCoord * Sp->TotalMeshSize[grnr]) / pow(GRID, 3); /* to get potential */
+#endif
+
+  double to_slab_fac = GRID / ((double)Sp->TotalMeshSize[grnr]);
+
+  double *flistin = (double *)Mem.mymalloc("flistin", nimport * sizeof(double));
+  double *flistout = (double *)Mem.mymalloc("flistout", nexport * sizeof(double));
+
+  fft_real *grid;
+
+  if(dim < 0)
+    grid = rhogrid;
+  else
+    grid = forcegrid;
+
+#ifdef FFT_COLUMN_BASED
+  int columns = GRIDX * GRIDY;
+  int avg = (columns - 1) / NTask + 1;
+  int exc = NTask * avg - columns;
+  int tasklastsection = NTask - exc;
+  int pivotcol = tasklastsection * avg;
+#endif
+
+  for(size_t i = 0; i < nimport; i++)
+    {
+      flistin[i] = 0;
+
+      MyIntPosType diff[3] = {partin[i].IntPos[0] - Sp->ReferenceIntPos[grnr][0], partin[i].IntPos[1] - Sp->ReferenceIntPos[grnr][1],
+                              partin[i].IntPos[2] - Sp->ReferenceIntPos[grnr][2]};
+
+      MySignedIntPosType *delta = (MySignedIntPosType *)diff;
+
+      double dx = to_slab_fac * (delta[0] - Sp->Corner[grnr][0]);
+      double dy = to_slab_fac * (delta[1] - Sp->Corner[grnr][1]);
+      double dz = to_slab_fac * (delta[2] - Sp->Corner[grnr][2]);
+
+      int slab_x = (int)(dx);
+      int slab_y = (int)(dy);
+      int slab_z = (int)(dz);
+
+      dx -= slab_x;
+      dy -= slab_y;
+      dz -= slab_z;
+
+      int slab_xx = slab_x + 1;
+      int slab_yy = slab_y + 1;
+      int slab_zz = slab_z + 1;
+
+#ifndef FFT_COLUMN_BASED
+      if(myplan.slab_to_task[slab_x] == ThisTask)
+        {
+          slab_x -= myplan.first_slab_x_of_task[ThisTask];
+
+          flistin[i] += +grid[FI(slab_x, slab_y, slab_z)] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz) +
+                        grid[FI(slab_x, slab_y, slab_zz)] * (1.0 - dx) * (1.0 - dy) * (dz) +
+                        grid[FI(slab_x, slab_yy, slab_z)] * (1.0 - dx) * (dy) * (1.0 - dz) +
+                        grid[FI(slab_x, slab_yy, slab_zz)] * (1.0 - dx) * (dy) * (dz);
+        }
+
+      if(myplan.slab_to_task[slab_xx] == ThisTask)
+        {
+          slab_xx -= myplan.first_slab_x_of_task[ThisTask];
+
+          flistin[i] += +grid[FI(slab_xx, slab_y, slab_z)] * (dx) * (1.0 - dy) * (1.0 - dz) +
+                        grid[FI(slab_xx, slab_y, slab_zz)] * (dx) * (1.0 - dy) * (dz) +
+                        grid[FI(slab_xx, slab_yy, slab_z)] * (dx) * (dy) * (1.0 - dz) +
+                        grid[FI(slab_xx, slab_yy, slab_zz)] * (dx) * (dy) * (dz);
+        }
+#else
+      int column0 = slab_x * GRID + slab_y;
+      int column1 = slab_x * GRID + slab_yy;
+      int column2 = slab_xx * GRID + slab_y;
+      int column3 = slab_xx * GRID + slab_yy;
+
+      if(column0 >= myplan.firstcol_XY && column0 <= myplan.lastcol_XY)
+        {
+          flistin[i] += +grid[FC(column0, slab_z)] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz) +
+                        grid[FC(column0, slab_zz)] * (1.0 - dx) * (1.0 - dy) * (dz);
+        }
+      if(column1 >= myplan.firstcol_XY && column1 <= myplan.lastcol_XY)
+        {
+          flistin[i] +=
+              +grid[FC(column1, slab_z)] * (1.0 - dx) * (dy) * (1.0 - dz) + grid[FC(column1, slab_zz)] * (1.0 - dx) * (dy) * (dz);
+        }
+
+      if(column2 >= myplan.firstcol_XY && column2 <= myplan.lastcol_XY)
+        {
+          flistin[i] +=
+              +grid[FC(column2, slab_z)] * (dx) * (1.0 - dy) * (1.0 - dz) + grid[FC(column2, slab_zz)] * (dx) * (1.0 - dy) * (dz);
+        }
+
+      if(column3 >= myplan.firstcol_XY && column3 <= myplan.lastcol_XY)
+        {
+          flistin[i] += +grid[FC(column3, slab_z)] * (dx) * (dy) * (1.0 - dz) + grid[FC(column3, slab_zz)] * (dx) * (dy) * (dz);
+        }
+#endif
+    }
+
+  /* exchange the potential component data */
+  int flag_big = 0, flag_big_all;
+  for(int i = 0; i < NTask; i++)
+    if(Sndpm_count[i] * sizeof(double) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
+      flag_big = 1;
+
+  /* produce a flag if any of the send sizes is above our transfer limit, in this case we will
+   * transfer the data in chunks.
+   */
+  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
+
+  /* exchange the data */
+  myMPI_Alltoallv(flistin, Rcvpm_count, Rcvpm_offset, flistout, Sndpm_count, Sndpm_offset, sizeof(double), flag_big_all, Communicator);
+
+  /* now assign them to the correct particles */
+
+  size_t *send_count = Sndpm_count;
+  size_t *send_offset = Sndpm_offset;
+
+  for(int j = 0; j < NTask; j++)
+    send_count[j] = 0;
+
+  for(int idx = 0; idx < NSource; idx++)
+    {
+      int i = Sp->get_active_index(idx);
+
+      MyIntPosType diff[3] = {Sp->P[i].IntPos[0] - Sp->ReferenceIntPos[grnr][0], Sp->P[i].IntPos[1] - Sp->ReferenceIntPos[grnr][1],
+                              Sp->P[i].IntPos[2] - Sp->ReferenceIntPos[grnr][2]};
+
+      MySignedIntPosType *delta = (MySignedIntPosType *)diff;
+
+      if(delta[0] < Sp->Xmintot[grnr][0] || delta[0] >= Sp->Xmaxtot[grnr][0])
+        continue;
+
+      if(delta[1] < Sp->Xmintot[grnr][1] || delta[1] >= Sp->Xmaxtot[grnr][1])
+        continue;
+
+      if(delta[2] < Sp->Xmintot[grnr][2] || delta[2] >= Sp->Xmaxtot[grnr][2])
+        continue;
+
+      int slab_x = (int)(to_slab_fac * (delta[0] - Sp->Corner[grnr][0]));
+      int slab_xx = slab_x + 1;
+
+#ifndef FFT_COLUMN_BASED
+      int task0 = myplan.slab_to_task[slab_x];
+      int task1 = myplan.slab_to_task[slab_xx];
+
+      double value = flistout[send_offset[task0] + send_count[task0]++];
+
+      if(task0 != task1)
+        value += flistout[send_offset[task1] + send_count[task1]++];
+#else
+      int slab_y = (int)(to_slab_fac * (delta[1] - Sp->Corner[grnr][1]));
+      int slab_yy = slab_y + 1;
+
+      int column0 = slab_x * GRID + slab_y;
+      int column1 = slab_x * GRID + slab_yy;
+      int column2 = slab_xx * GRID + slab_y;
+      int column3 = slab_xx * GRID + slab_yy;
+
+      int task0, task1, task2, task3;
+
+      if(column0 < pivotcol)
+        task0 = column0 / avg;
+      else
+        task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;
+
+      if(column1 < pivotcol)
+        task1 = column1 / avg;
+      else
+        task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;
+
+      if(column2 < pivotcol)
+        task2 = column2 / avg;
+      else
+        task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;
+
+      if(column3 < pivotcol)
+        task3 = column3 / avg;
+      else
+        task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;
+
+      double value = flistout[send_offset[task0] + send_count[task0]++];
+
+      if(task1 != task0)
+        value += flistout[send_offset[task1] + send_count[task1]++];
+
+      if(task2 != task1 && task2 != task0)
+        value += flistout[send_offset[task2] + send_count[task2]++];
+
+      if(task3 != task0 && task3 != task1 && task3 != task2)
+        value += flistout[send_offset[task3] + send_count[task3]++];
+#endif
+
+#if !defined(HIERARCHICAL_GRAVITY) && defined(TREEPM_NOTIMESPLIT)
+      if(!Sp->TimeBinSynchronized[Sp->P[i].TimeBinGrav])
+        continue;
+#endif
+
+      if(dim < 0)
+        {
+#ifdef EVALPOTENTIAL
+          Sp->P[i].Potential += value * fac;
+#endif
+        }
+      else
+        {
+          Sp->P[i].GravAccel[dim] += value;
+        }
+    }
+
+  Mem.myfree(flistout);
+  Mem.myfree(flistin);
+}
+#endif
+
+/*! Calculates the long-range non-periodic forces using the PM method.  The
+ *  potential is Gaussian filtered with Asmth, given in mesh-cell units. The
+ *  potential is finite differenced using a 4-point finite differencing
+ *  formula to obtain the force fields, which are then interpolated to the
+ *  particle positions. We carry out a CIC charge assignment, and compute the
+ *  potential by Fourier transform methods. The CIC kernel is deconvolved.
+ */
+int pm_nonperiodic::pmforce_nonperiodic(int grnr)
+{
+  double tstart = Logs.second();
+
+  mpi_printf("PM-NONPERIODIC: Starting non-periodic PM calculation (Rcut=%g, grid=%d)  presently allocated=%g MB.\n", Sp->Rcut[grnr],
+             grnr, Mem.getAllocatedBytesInMB());
+
+  if(grnr == 1 && Sp->Rcut[1] > Sp->Rcut[0])
+    Terminate(
+        "We have Sp->Rcut[1]=%g  >  Sp->Rcut[0]=%g, which means that the high-res cut-off is larger than the normal one... this is "
+        "not good.",
+        Sp->Rcut[1], Sp->Rcut[0]);
+
+#ifdef HIERARCHICAL_GRAVITY
+  NSource = Sp->TimeBinsGravity.NActiveParticles;
+#else
+  NSource = Sp->NumPart;
+#endif
+
+#ifndef TREEPM_NOTIMESPLIT
+  if(NSource != Sp->NumPart)
+    Terminate("unexpected NSource != Sp->NumPart");
+#endif
+
+#ifndef NUMPART_PER_TASK_LARGE
+  if((((long long)Sp->NumPart) << 3) >= (((long long)1) << 31))
+    Terminate("We are dealing with a too large particle number per MPI rank - enabling NUMPART_PER_TASK_LARGE might help.");
+#endif
+
+  double fac = 1.0 / (Sp->FacIntToCoord * Sp->TotalMeshSize[grnr]) / pow(GRID, 3); /* to get potential */
+  fac *= 1 / (2 * (Sp->FacIntToCoord * Sp->TotalMeshSize[grnr]) / GRID);           /* for finite differencing */
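+  /* 1/GRID^3 undoes the unnormalized forward+backward FFT; the second factor is the 1/(2*cellsize) of the finite differencing */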
+
+#ifdef PM_ZOOM_OPTIMIZED
+  pmforce_nonperiodic_zoom_optimized_prepare_density(grnr);
+#else
+  pmforce_nonperiodic_uniform_optimized_prepare_density(grnr);
+#endif
+
+  /* allocate the memory to hold the FFT fields */
+  forcegrid = (fft_real *)Mem.mymalloc("forcegrid", maxfftsize * sizeof(fft_real));
+
+  workspace = forcegrid;
+
+#ifndef FFT_COLUMN_BASED
+  fft_of_rhogrid = (fft_complex *)&rhogrid[0];
+#else
+  fft_of_rhogrid = (fft_complex *)&workspace[0];
+#endif
+
+  /* Do the FFT of the density field */
+#ifndef FFT_COLUMN_BASED
+  my_slab_based_fft(&myplan, &rhogrid[0], &workspace[0], 1);
+#else
+  my_column_based_fft(&myplan, rhogrid, workspace, 1); /* result is in workspace, not in rhogrid ! */
+#endif
+
+  /* multiply with the Fourier transform of the Green's function (the kernel) to obtain the potential in Fourier space */
+
+#ifdef FFT_COLUMN_BASED
+  for(large_array_offset ip = 0; ip < myplan.second_transposed_ncells; ip++)
+    {
+#else
+  for(int x = 0; x < GRID; x++)
+    for(int y = myplan.slabstart_y; y < myplan.slabstart_y + myplan.nslab_y; y++)
+      for(int z = 0; z < GRIDz; z++)
+        {
+#endif
+
+#ifndef FFT_COLUMN_BASED
+      large_array_offset ip = ((large_array_offset)GRIDz) * (GRID * (y - myplan.slabstart_y) + x) + z;
+#endif
+
+      double re = fft_of_rhogrid[ip][0] * fft_of_kernel[grnr][ip][0] - fft_of_rhogrid[ip][1] * fft_of_kernel[grnr][ip][1];
+      double im = fft_of_rhogrid[ip][0] * fft_of_kernel[grnr][ip][1] + fft_of_rhogrid[ip][1] * fft_of_kernel[grnr][ip][0];
+
+      fft_of_rhogrid[ip][0] = re;
+      fft_of_rhogrid[ip][1] = im;
+    }
+
+    /* Do the inverse FFT to get the potential */
+
+#ifndef FFT_COLUMN_BASED
+  my_slab_based_fft(&myplan, rhogrid, workspace, -1);
+#else
+  my_column_based_fft(&myplan, workspace, rhogrid, -1);
+#endif
+
+  /* Now rhogrid holds the potential */
+
+#ifdef EVALPOTENTIAL
+#ifdef PM_ZOOM_OPTIMIZED
+  pmforce_nonperiodic_zoom_optimized_readout_forces_or_potential(grnr, -1);
+#else
+  pmforce_nonperiodic_uniform_optimized_readout_forces_or_potential(grnr, -1);
+#endif
+#endif
+
+  /* get the force components by finite differencing of the potential for each dimension,
+   * and send the results back to the right CPUs
+   */
+  for(int dim = 2; dim >= 0; dim--) /* Calculate each component of the force. */
+    {
+      /* we do the x component last, because for differencing the potential in the x-direction, we need to construct the transpose */
+#ifndef FFT_COLUMN_BASED
+      if(dim == 0)
+        my_slab_transposeA(&myplan, rhogrid, forcegrid); /* compute the transpose of the potential field for finite differencing */
+
+      for(int y = 2; y < GRID / 2 - 2; y++)
+        for(int x = 0; x < myplan.nslab_x; x++)
+          if(x + myplan.slabstart_x >= 2 && x + myplan.slabstart_x < GRID / 2 - 2)
+            for(int z = 2; z < GRID / 2 - 2; z++)
+              {
+                int yrr = y, yll = y, yr = y, yl = y;
+                int zrr = z, zll = z, zr = z, zl = z;
+
+                switch(dim)
+                  {
+                    case 0: /* note: for the x-direction, we difference the transposed direction (y) */
+                    case 1:
+                      yr  = y + 1;
+                      yl  = y - 1;
+                      yrr = y + 2;
+                      yll = y - 2;
+
+                      break;
+                    case 2:
+                      zr  = z + 1;
+                      zl  = z - 1;
+                      zrr = z + 2;
+                      zll = z - 2;
+
+                      break;
+                  }
+
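+                /* 4-point stencil:  F = -dphi/dq ~ [ (4/3)(phi(q-1)-phi(q+1)) - (1/6)(phi(q-2)-phi(q+2)) ] / (2h),
+                 * where the 1/(2h) is already folded into 'fac' */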
+                if(dim == 0)
+                  forcegrid[TI(x, y, z)] = fac * ((4.0 / 3) * (rhogrid[TI(x, yl, zl)] - rhogrid[TI(x, yr, zr)]) -
+                                                  (1.0 / 6) * (rhogrid[TI(x, yll, zll)] - rhogrid[TI(x, yrr, zrr)]));
+                else
+                  forcegrid[FI(x, y, z)] = fac * ((4.0 / 3) * (rhogrid[FI(x, yl, zl)] - rhogrid[FI(x, yr, zr)]) -
+                                                  (1.0 / 6) * (rhogrid[FI(x, yll, zll)] - rhogrid[FI(x, yrr, zrr)]));
+              }
+
+      if(dim == 0)
+        my_slab_transposeB(&myplan, forcegrid, rhogrid); /* reverse the transpose from above */
+#else
+      fft_real *scratch;
+
+      if(dim != 2)
+        {
+          scratch = (fft_real *)Mem.mymalloc("scratch", myplan.fftsize * sizeof(fft_real)); /* need a third field as scratch space */
+          memcpy(scratch, rhogrid, myplan.fftsize * sizeof(fft_real));
+
+          if(dim == 1)
+            my_fft_swap23(&myplan, scratch, forcegrid);
+          else
+            my_fft_swap13(&myplan, scratch, forcegrid);
+        }
+
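+      /* for dim != 2, the axis swaps above move the direction to be differenced onto the contiguous index,
+       * so the same 1D stencil below can be applied column by column for all three components */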
+      int ncols;
+      if(dim == 2)
+        ncols = myplan.ncol_XY;
+      else if(dim == 1)
+        ncols = myplan.ncol_XZ;
+      else
+        ncols = myplan.ncol_ZY;
+
+      for(int i = 0; i < ncols; i++)
+        {
+          fft_real *forcep, *potp;
+
+          if(dim != 2)
+            {
+              forcep = &scratch[GRID * i];
+              potp = &forcegrid[GRID * i];
+            }
+          else
+            {
+              forcep = &forcegrid[GRID2 * i];
+              potp = &rhogrid[GRID2 * i];
+            }
+
+          for(int z = 2; z < GRID / 2 - 2; z++)
+            {
+              int zr = z + 1;
+              int zl = z - 1;
+              int zrr = z + 2;
+              int zll = z - 2;
+
+              forcep[z] = fac * ((4.0 / 3) * (potp[zl] - potp[zr]) - (1.0 / 6) * (potp[zll] - potp[zrr]));
+            }
+        }
+
+      if(dim != 2)
+        {
+          if(dim == 1)
+            my_fft_swap23back(&myplan, scratch, forcegrid);
+          else
+            my_fft_swap13back(&myplan, scratch, forcegrid);
+
+          Mem.myfree(scratch);
+        }
+#endif
+
+#ifdef PM_ZOOM_OPTIMIZED
+      pmforce_nonperiodic_zoom_optimized_readout_forces_or_potential(grnr, dim);
+#else
+      pmforce_nonperiodic_uniform_optimized_readout_forces_or_potential(grnr, dim);
+#endif
+    }
+
+  /* free stuff */
+  Mem.myfree(forcegrid);
+  Mem.myfree(rhogrid);
+
+#ifdef PM_ZOOM_OPTIMIZED
+  Mem.myfree(localfield_recvcount);
+  Mem.myfree(localfield_offset);
+  Mem.myfree(localfield_sendcount);
+  Mem.myfree(localfield_first);
+  Mem.myfree(localfield_data);
+  Mem.myfree(localfield_globalindex);
+  Mem.myfree(part);
+#else
+  Mem.myfree(partin);
+  Mem.myfree(Rcvpm_offset);
+  Mem.myfree(Rcvpm_count);
+  Mem.myfree(Sndpm_offset);
+  Mem.myfree(Sndpm_count);
+#endif
+
+  double tend = Logs.second();
+
+  mpi_printf("PM-NONPERIODIC: done.  (took %g seconds)\n", Logs.timediff(tstart, tend));
+
+  return 0;
+}
+
+/*! This function sets up the Green's function for the non-periodic potential
+ *  in real space, and then converts it to Fourier space by means of an FFT.
+ */
+void pm_nonperiodic::pm_setup_nonperiodic_kernel(void)
+{
+  mpi_printf("PM-NONPERIODIC: Setting up non-periodic PM kernel(s) (GRID=%d)  presently allocated=%g MB).\n", (int)GRID,
+             Mem.getAllocatedBytesInMB());
+
+  /* now set up kernel and its Fourier transform */
+
+#if !defined(PERIODIC)
+  for(size_t i = 0; i < maxfftsize; i++) /* clear local field */
+    kernel[0][i] = 0;
+
+#ifndef FFT_COLUMN_BASED
+  for(int i = myplan.slabstart_x; i < (myplan.slabstart_x + myplan.nslab_x); i++)
+    for(int j = 0; j < GRID; j++)
+      {
+#else
+  for(int c = myplan.firstcol_XY; c < (myplan.firstcol_XY + myplan.ncol_XY); c++)
+    {
+      int i = c / GRID;
+      int j = c % GRID;
+#endif
+        for(int k = 0; k < GRID; k++)
+          {
+            double xx = ((double)i) / GRID;
+            double yy = ((double)j) / GRID;
+            double zz = ((double)k) / GRID;
+
+            if(xx >= 0.5)
+              xx -= 1.0;
+            if(yy >= 0.5)
+              yy -= 1.0;
+            if(zz >= 0.5)
+              zz -= 1.0;
+
+            double r = sqrt(xx * xx + yy * yy + zz * zz);
+
+            double u = 0.5 * r / (((double)ASMTH) / GRID);
+
+            double fac = 1 - erfc(u);
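+            /* fac = erf(u): only the long-range, Gaussian-filtered part of the 1/r potential enters the coarse kernel */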
+
+#ifndef FFT_COLUMN_BASED
+            size_t ip = FI(i - myplan.slabstart_x, j, k);
+#else
+          size_t ip = FC(c, k);
+#endif
+            if(r > 0)
+              kernel[0][ip] = -fac / r;
+            else
+              kernel[0][ip] = -1 / (sqrt(M_PI) * (((double)ASMTH) / GRID));
+          }
+      }
+
+  {
+    fft_real *workspc = (fft_real *)Mem.mymalloc("workspc", maxfftsize * sizeof(fft_real));
+    /* Do the FFT of the kernel */
+#ifndef FFT_COLUMN_BASED
+    my_slab_based_fft(&myplan, kernel[0], workspc, 1);
+#else
+    my_column_based_fft(&myplan, kernel[0], workspc, 1); /* result is in workspace, not in kernel */
+    memcpy(kernel[0], workspc, maxfftsize * sizeof(fft_real));
+#endif
+    Mem.myfree(workspc);
+  }
+
+#endif
+
+#if defined(PLACEHIGHRESREGION)
+
+  for(int i = 0; i < maxfftsize; i++) /* clear local field */
+    kernel[1][i] = 0;
+
+#ifndef FFT_COLUMN_BASED
+  for(int i = myplan.slabstart_x; i < (myplan.slabstart_x + myplan.nslab_x); i++)
+    for(int j = 0; j < GRID; j++)
+      {
+#else
+  for(int c = myplan.firstcol_XY; c < (myplan.firstcol_XY + myplan.ncol_XY); c++)
+    {
+      int i = c / GRID;
+      int j = c % GRID;
+#endif
+        for(int k = 0; k < GRID; k++)
+          {
+            double xx = ((double)i) / GRID;
+            double yy = ((double)j) / GRID;
+            double zz = ((double)k) / GRID;
+
+            if(xx >= 0.5)
+              xx -= 1.0;
+            if(yy >= 0.5)
+              yy -= 1.0;
+            if(zz >= 0.5)
+              zz -= 1.0;
+
+            double r = sqrt(xx * xx + yy * yy + zz * zz);
+
+            double u = 0.5 * r / (((double)ASMTH) / GRID);
+
+            double fac = erfc(u * Sp->Asmth[1] / Sp->Asmth[0]) - erfc(u);
+
+#ifndef FFT_COLUMN_BASED
+            size_t ip = FI(i - myplan.slabstart_x, j, k);
+#else
+          size_t ip = FC(c, k);
+#endif
+
+            if(r > 0)
+              kernel[1][ip] = -fac / r;
+            else
+              {
+                fac           = 1 - Sp->Asmth[1] / Sp->Asmth[0];
+                kernel[1][ip] = -fac / (sqrt(M_PI) * (((double)ASMTH) / GRID));
+              }
+          }
+#ifndef FFT_COLUMN_BASED
+      }
+#else
+    }
+#endif
+
+  {
+    fft_real *workspc = (fft_real *)Mem.mymalloc("workspc", maxfftsize * sizeof(fft_real));
+    /* Do the FFT of the kernel */
+#ifndef FFT_COLUMN_BASED
+    my_slab_based_fft(&myplan, kernel[1], workspc, 1);
+#else
+    my_column_based_fft(&myplan, kernel[1], workspc, 1); /* result is in workspace, not in kernel */
+    memcpy(kernel[1], workspc, maxfftsize * sizeof(fft_real));
+#endif
+    Mem.myfree(workspc);
+  }
+
+#endif
+
+  /* deconvolve the Green's function twice with the CIC kernel */
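+  /* In Fourier space the CIC assignment corresponds to a product of sinc factors, so the factor applied below is
+   *
+   *    ff = [ sinc(pi kx / GRID) * sinc(pi ky / GRID) * sinc(pi kz / GRID) ]^(-4),
+   *
+   * i.e. the squared CIC window is divided out twice: once for the mass assignment to the mesh, and once for the
+   * interpolation of the forces/potential back to the particle positions.
+   */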
+#ifdef FFT_COLUMN_BASED
+
+  for(large_array_offset ip = 0; ip < myplan.second_transposed_ncells; ip++)
+    {
+      large_array_offset ipcell = ip + myplan.transposed_firstcol * GRID;
+      int y                     = ipcell / (GRID * GRIDz);
+      int yr                    = ipcell % (GRID * GRIDz);
+      int z                     = yr / GRID;
+      int x                     = yr % GRID;
+#else
+  for(int x = 0; x < GRID; x++)
+    for(int y = myplan.slabstart_y; y < myplan.slabstart_y + myplan.nslab_y; y++)
+      for(int z = 0; z < GRIDz; z++)
+        {
+#endif
+      double kx, ky, kz;
+
+      if(x > GRID / 2)
+        kx = x - GRID;
+      else
+        kx = x;
+      if(y > GRID / 2)
+        ky = y - GRID;
+      else
+        ky = y;
+      if(z > GRID / 2)
+        kz = z - GRID;
+      else
+        kz = z;
+
+      double k2 = kx * kx + ky * ky + kz * kz;
+
+      if(k2 > 0)
+        {
+          double fx = 1, fy = 1, fz = 1;
+
+          if(kx != 0)
+            {
+              fx = (M_PI * kx) / GRID;
+              fx = sin(fx) / fx;
+            }
+          if(ky != 0)
+            {
+              fy = (M_PI * ky) / GRID;
+              fy = sin(fy) / fy;
+            }
+          if(kz != 0)
+            {
+              fz = (M_PI * kz) / GRID;
+              fz = sin(fz) / fz;
+            }
+
+          double ff = 1 / (fx * fy * fz);
+          ff        = ff * ff * ff * ff;
+
+#ifndef FFT_COLUMN_BASED
+          large_array_offset ip = ((large_array_offset)GRIDz) * (GRID * (y - myplan.slabstart_y) + x) + z;
+#endif
+#if !defined(PERIODIC)
+          fft_of_kernel[0][ip][0] *= ff;
+          fft_of_kernel[0][ip][1] *= ff;
+#endif
+#if defined(PLACEHIGHRESREGION)
+          fft_of_kernel[1][ip][0] *= ff;
+          fft_of_kernel[1][ip][1] *= ff;
+#endif
+        }
+    }
+
+  /* end deconvolution */
+}
+
+#endif
diff --git a/src/pm/pm_nonperiodic.h b/src/pm/pm_nonperiodic.h
new file mode 100644
index 0000000000000000000000000000000000000000..ecab45e4522bef2f08032aba5be7aef8deeea159
--- /dev/null
+++ b/src/pm/pm_nonperiodic.h
@@ -0,0 +1,168 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  pm_nonperiodic.h
+ *
+ *  \brief declaration of a class used for non-periodic long-range PM force calculation
+ */
+
+#ifndef PM_NONPERIODIC_H
+#define PM_NONPERIODIC_H
+
+#include "gadgetconfig.h"
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../data/simparticles.h"
+#include "../domain/domain.h"
+#include "../logs/timer.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../pm/pm_mpi_fft.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+
+class pm_nonperiodic : public pm_mpi_fft
+{
+ public:
+  pm_nonperiodic(MPI_Comm comm) : setcomm(comm), pm_mpi_fft(comm) {}
+
+#if defined(PMGRID) && (!defined(PERIODIC) || defined(PLACEHIGHRESREGION))
+
+ private:
+#if defined(LONG_X_BITS) || defined(LONG_Y_BITS) || defined(LONG_Z_BITS)
+#error "LONG_X/Y/Z_BITS not supported for the non-periodic FFT gravity code"
+#endif
+
+#if defined(GRAVITY_TALLBOX)
+#error "GRAVITY_TALLBOX not supported for the non-periodic FFT gravity code"
+#endif
+
+#if(HRPMGRID > 1024)
+  typedef long long large_array_offset; /* use a larger data type in this case so that we can always address all cells of the 3D grid
+                                           with a single index */
+#else
+  typedef int large_array_offset;
+#endif
+
+#ifdef NUMPART_PER_TASK_LARGE
+  typedef long long large_numpart_type; /* if there is a risk that the local particle number times 8 overflows a 32-bit integer, this
+                                           data type should be used */
+#else
+  typedef int large_numpart_type;
+#endif
+
+  /* short-cut macros for accessing different 3D arrays */
+
+  int NSource;
+
+  fft_plan myplan; /*!< In this structure, various bookkeeping variables for the distributed FFTs are stored */
+
+  /*! \var maxfftsize
+   *  \brief maximum size of the local fft grid among all tasks
+   */
+  size_t maxfftsize;
+
+  /*! \var rhogrid
+   *  \brief This array holds the local part of the density field and
+   *  after the FFTs the local part of the potential
+   *
+   *  \var forcegrid
+   *  \brief This array will contain the force field
+   *
+   *  \var workspace
+   *  \brief Workspace array used during the FFTs
+   */
+  fft_real *rhogrid, *forcegrid, *workspace;
+
+  /*! \brief Array containing the FFT of #rhogrid
+   *
+   *  This pointer points to the same array as #rhogrid,
+   *  because in-place FFTs are used.
+   */
+  fft_complex *fft_of_rhogrid;
+
+  fft_real *kernel[2];
+  fft_complex *fft_of_kernel[2];
+
+ public:
+  simparticles *Sp;
+
+  void pm_init_nonperiodic(simparticles *Sp_ptr);
+  void pm_init_regionsize(void);
+  int pmforce_nonperiodic(int grnr);
+  void pm_setup_nonperiodic_kernel(void);
+  void pmforce_nonperiodic_zoom_optimized_prepare_density(int grnr);
+
+ private:
+#ifdef PM_ZOOM_OPTIMIZED
+
+  void pmforce_nonperiodic_zoom_optimized_readout_forces_or_potential(int grnr, int dim);
+
+  /*! \brief This structure links the particles to the mesh cells, to which they contribute their mass
+   *
+   * Each particle will have eight items of this structure in the #part array.
+   * For each of the eight mesh cells the CIC assignment will contribute,
+   * one item of this struct exists.
+   */
+  struct part_slab_data
+  {
+    large_array_offset globalindex; /*!< index in the global density mesh */
+    large_numpart_type partindex;   /*!< contains the local particle index shifted by 2^3, the first three bits encode to which part of
+                                       the CIC assignment this item belongs to */
+    large_array_offset localindex;  /*!< index to a local copy of the corresponding mesh cell of the global density array (used during
+                                       local mass and force assignment) */
+  };
+  part_slab_data *part; /*!< array of part_slab_data linking the local particles to their mesh cells */
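+
+  /* As an illustration of the CIC assignment referred to above: a particle with fractional offsets (dx, dy, dz)
+   * within its mesh cell contributes to the eight surrounding cells with the trilinear weights
+   *
+   *    (1-dx)(1-dy)(1-dz), (1-dx)(1-dy)dz, (1-dx)dy(1-dz), (1-dx)dy dz,
+   *    dx(1-dy)(1-dz),     dx(1-dy)dz,     dx dy(1-dz),    dx dy dz,
+   *
+   * which sum to unity; the eight part_slab_data entries created per particle record the target cell of each of
+   * these contributions.
+   */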
+
+  size_t *localfield_sendcount, *localfield_first, *localfield_offset, *localfield_recvcount;
+  large_array_offset *localfield_globalindex, *import_globalindex;
+  fft_real *localfield_data, *import_data;
+  large_numpart_type num_on_grid;
+
+  /* realize the comparison function as a functor, so that it can have an internal state
+   * (here the data array for which we sort indices)
+   */
+  struct pm_nonperiodic_sortindex_comparator
+  {
+   private:
+    const part_slab_data *data;
+
+   public:
+    pm_nonperiodic_sortindex_comparator(const part_slab_data *data_) : data(data_) {}
+
+    bool operator()(const large_numpart_type &a, const large_numpart_type &b) const
+    {
+      return data[a].globalindex < data[b].globalindex;
+    }
+  };
+
+#else
+
+  /*
+   *  Here come the routines for a different communication algorithm that is better suited for homogeneously loaded boxes.
+   */
+  struct partbuf
+  {
+    MyIntPosType IntPos[3];
+    MyFloat Mass;
+  };
+  partbuf *partin, *partout;
+
+  size_t nimport, nexport;
+
+  size_t *Sndpm_count, *Sndpm_offset;
+  size_t *Rcvpm_count, *Rcvpm_offset;
+
+  void pmforce_nonperiodic_uniform_optimized_prepare_density(int grnr);
+  void pmforce_nonperiodic_uniform_optimized_readout_forces_or_potential(int grnr, int dim);
+
+#endif
+
+#endif
+};
+
+#endif
diff --git a/src/pm/pm_periodic.cc b/src/pm/pm_periodic.cc
new file mode 100644
index 0000000000000000000000000000000000000000..753015f518865409b05ef794ee85d10357205301
--- /dev/null
+++ b/src/pm/pm_periodic.cc
@@ -0,0 +1,2731 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  pm_periodic.cc
+ *
+ *  \brief routines for periodic PM-force calculation
+ */
+
+#include "gadgetconfig.h"
+
+#if defined(PMGRID) && defined(PERIODIC)
+
+#include <fftw3.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <algorithm>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../pm/pm.h"
+#include "../pm/pm_periodic.h"
+#include "../sort/cxxsort.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+
+/*!
+ * These routines support two different strategies for doing the particle data exchange to assemble the density field
+ * and to read out the forces and potentials:
+ *
+ * The default scheme sends the particle positions to the target slabs and bins them there. This usually works well for
+ * homogeneously loaded boxes, but can be problematic for zoom-in runs. In the latter case, PM_ZOOM_OPTIMIZED can be
+ * activated, where the data is binned on the originating processor, followed by assembly of the binned density field.
+ *
+ * In addition, the routines can be either used with a slab-based FFT (as is traditionally done in FFTW), or with a
+ * column-based FFT. The latter requires more communication and is hence usually slower than the slab-based one.
+ * But if the number of MPI ranks exceeds the number of cells per dimension, then the column-based one can still scale
+ * and offers a balanced memory consumption, whereas this is not the case for the slab-based approach. To select the
+ * column-based FFT, the switch FFT_COLUMN_BASED can be activated.
+ *
+ * The switches PM_ZOOM_OPTIMIZED and FFT_COLUMN_BASED may also be combined, such that there are 4 main modes of how the
+ * PM routines may operate.
+ *
+ * It is also possible to use non-cubical boxes, by means of setting one or several of the LONG_X, LONG_Y, and LONG_Z
+ * options in the config file. The values need to be integers, and then BoxSize is stretched by that factor in the
+ * corresponding dimension.
+ *
+ * Finally, one may also use the TreePM routine for simulations where gravity is periodic only in two spatial dimensions.
+ * The non-periodic dimension is selected via the GRAVITY_TALLBOX flag. Also in this case, arbitrarily stretched boxes can
+ * be used, and one can use PM_ZOOM_OPTIMIZED and/or FFT_COLUMN_BASED if desired.
+ *
+ * Much of the code is multi-threaded, so there should be some speed-up if OpenMP is used with NUM_THREADS > 1, but the
+ * benefit may be limited because the data transfer steps (which weigh in quite heavily) are not accelerated by this.
+ *
+ * If eight times the particle load per processor exceeds 2^31 ~ 2 billion, one should activate NUMPART_PER_TASK_LARGE.
+ * The code will check this condition and terminate if it is violated, so there should hopefully be little risk of
+ * accidentally forgetting this.
+ */
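+
+/* As an illustration of how the switches discussed above are selected (the grid size and which options are enabled
+ * are of course application dependent), a zoom-in run using the column-based FFT on a box stretched by a factor of
+ * two in the x-direction would enable in Config.sh something like:
+ *
+ *    PMGRID=512
+ *    PM_ZOOM_OPTIMIZED
+ *    FFT_COLUMN_BASED
+ *    LONG_X=2
+ */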
+
+#define GRIDz (GRIDZ / 2 + 1)
+#define GRID2 (2 * GRIDz)
+
+/* short-cut macros for accessing different 3D arrays */
+#define FI(x, y, z) (((large_array_offset)GRID2) * (GRIDY * (x) + (y)) + (z))
+#define FCxy(c, z) (((large_array_offset)GRID2) * ((c)-myplan.firstcol_XY) + (z))
+#define FCxz(c, y) (((large_array_offset)GRIDY) * ((c)-myplan.firstcol_XZ) + (y))
+#define FCzy(c, x) (((large_array_offset)GRIDX) * ((c)-myplan.firstcol_ZY) + (x))
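+
+/* Note that GRID2 = 2 * (GRIDZ/2 + 1) is the padded length of the z-dimension of the real arrays, as needed when the
+ * real-to-complex FFT is carried out in place: e.g. for GRIDZ = 128 there are GRIDz = 65 complex values but
+ * GRID2 = 130 real slots per (x,y) pencil. The FI()/FC..() macros above therefore address the padded real mesh,
+ * not a mesh of length GRIDZ in the z-direction.
+ */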
+
+#ifndef FFT_COLUMN_BASED
+#define NI(x, y, z) (((large_array_offset)GRIDZ) * ((y) + (x)*myplan.nslab_y) + (z))
+#endif
+
+/*! \brief This routine generates the FFTW-plans to carry out the FFTs later on.
+ *
+ *  Some auxiliary variables for bookkeeping are also initialized.
+ */
+void pm_periodic::pm_init_periodic(simparticles *Sp_ptr)
+{
+  Sp = Sp_ptr;
+
+  Sp->Asmth[0] = ASMTH * All.BoxSize / PMGRID;
+  Sp->Rcut[0]  = RCUT * Sp->Asmth[0];
+
+  /* Set up the FFTW-3 plan files. */
+  int ndimx[1] = {GRIDX}; /* dimension of the 1D transforms */
+  int ndimy[1] = {GRIDY}; /* dimension of the 1D transforms */
+  int ndimz[1] = {GRIDZ}; /* dimension of the 1D transforms */
+
+  int max_GRID2 = 2 * (std::max<int>(std::max<int>(GRIDX, GRIDY), GRIDZ) / 2 + 1);
+
+  /* temporarily allocate some arrays to make sure that out-of-place plans are created */
+  rhogrid   = (fft_real *)Mem.mymalloc("rhogrid", max_GRID2 * sizeof(fft_real));
+  forcegrid = (fft_real *)Mem.mymalloc("forcegrid", max_GRID2 * sizeof(fft_real));
+
+#ifdef DOUBLEPRECISION_FFTW
+  int alignflag = 0;
+#else
+  /* for single precision, the start of our FFT columns is presently only guaranteed to be 8-byte aligned */
+  int alignflag = FFTW_UNALIGNED;
+#endif
+
+  myplan.forward_plan_zdir = FFTW(plan_many_dft_r2c)(1, ndimz, 1, rhogrid, 0, 1, GRID2, (fft_complex *)forcegrid, 0, 1, GRIDz,
+                                                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
+
+#ifndef FFT_COLUMN_BASED
+  int stride = GRIDz;
+#else
+  int stride    = 1;
+#endif
+
+  myplan.forward_plan_ydir =
+      FFTW(plan_many_dft)(1, ndimy, 1, (fft_complex *)rhogrid, 0, stride, GRIDz * GRIDY, (fft_complex *)forcegrid, 0, stride,
+                          GRIDz * GRIDY, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
+
+  myplan.forward_plan_xdir =
+      FFTW(plan_many_dft)(1, ndimx, 1, (fft_complex *)rhogrid, 0, stride, GRIDz * GRIDX, (fft_complex *)forcegrid, 0, stride,
+                          GRIDz * GRIDX, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
+
+  myplan.backward_plan_xdir =
+      FFTW(plan_many_dft)(1, ndimx, 1, (fft_complex *)rhogrid, 0, stride, GRIDz * GRIDX, (fft_complex *)forcegrid, 0, stride,
+                          GRIDz * GRIDX, FFTW_BACKWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
+
+  myplan.backward_plan_ydir =
+      FFTW(plan_many_dft)(1, ndimy, 1, (fft_complex *)rhogrid, 0, stride, GRIDz * GRIDY, (fft_complex *)forcegrid, 0, stride,
+                          GRIDz * GRIDY, FFTW_BACKWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
+
+  myplan.backward_plan_zdir = FFTW(plan_many_dft_c2r)(1, ndimz, 1, (fft_complex *)rhogrid, 0, 1, GRIDz, forcegrid, 0, 1, GRID2,
+                                                      FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
+
+  Mem.myfree(forcegrid);
+  Mem.myfree(rhogrid);
+
+#ifndef FFT_COLUMN_BASED
+
+  my_slab_based_fft_init(&myplan, GRIDX, GRIDY, GRIDZ);
+
+  maxfftsize = std::max<int>(myplan.largest_x_slab * GRIDY, myplan.largest_y_slab * GRIDX) * ((size_t)GRID2);
+
+#else
+
+  my_column_based_fft_init(&myplan, GRIDX, GRIDY, GRIDZ);
+
+  maxfftsize = myplan.max_datasize;
+
+#endif
+
+#if defined(GRAVITY_TALLBOX)
+  kernel        = (fft_real *)Mem.mymalloc("kernel", maxfftsize * sizeof(fft_real));
+  fft_of_kernel = (fft_complex *)kernel;
+
+  pmforce_setup_tallbox_kernel();
+#endif
+}
+
+/* Below, the two functions
+ *
+ *           pmforce_ ...... _prepare_density()
+ * and
+ *           pmforce_ ...... _readout_forces_or_potential(int dim)
+ *
+ * are defined in two different versions, one that works better for uniform
+ * simulations, the other for zoom-in runs. Only one of the two sets is used,
+ * depending on the setting of PM_ZOOM_OPTIMIZED.
+ */
+
+#ifdef PM_ZOOM_OPTIMIZED
+
+void pm_periodic::pmforce_zoom_optimized_prepare_density(int mode, int *typelist)
+{
+  int level, recvTask;
+  MPI_Status status;
+
+  particle_data *P = Sp->P;
+
+  part = (part_slab_data *)Mem.mymalloc("part", 8 * (NSource * sizeof(part_slab_data)));
+  large_numpart_type *part_sortindex =
+      (large_numpart_type *)Mem.mymalloc("part_sortindex", 8 * (NSource * sizeof(large_numpart_type)));
+
+#ifdef FFT_COLUMN_BASED
+  int columns         = GRIDX * GRIDY;
+  int avg             = (columns - 1) / NTask + 1;
+  int exc             = NTask * avg - columns;
+  int tasklastsection = NTask - exc;
+  int pivotcol        = tasklastsection * avg;
+#endif
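+
+  /* The GRIDX*GRIDY mesh columns are distributed such that the first 'tasklastsection' tasks hold 'avg' columns each
+   * and the remaining tasks hold 'avg - 1' columns each; 'pivotcol' is the first column of the second group. For
+   * example, for columns = 10 and NTask = 4 one gets avg = 3, tasklastsection = 2, pivotcol = 6, i.e. tasks 0 and 1
+   * hold columns 0-2 and 3-5, while tasks 2 and 3 hold columns 6-7 and 8-9. The division further below inverts this
+   * mapping to find the task that stores a given column. */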
+
+  /* determine the cells each particle accesses */
+  for(int idx = 0; idx < NSource; idx++)
+    {
+      int i = Sp->get_active_index(idx);
+
+      if(P[i].Ti_Current != All.Ti_Current)
+        Sp->drift_particle(&P[i], &Sp->SphP[i], All.Ti_Current);
+
+      int slab_x, slab_y, slab_z;
+      if(mode == 2)
+        {
+          slab_x = (P[i].IntPos[0] * POWERSPEC_FOLDFAC) / INTCELL;
+          slab_y = (P[i].IntPos[1] * POWERSPEC_FOLDFAC) / INTCELL;
+          slab_z = (P[i].IntPos[2] * POWERSPEC_FOLDFAC) / INTCELL;
+        }
+      else if(mode == 3)
+        {
+          slab_x = (P[i].IntPos[0] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
+          slab_y = (P[i].IntPos[1] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
+          slab_z = (P[i].IntPos[2] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
+        }
+      else
+        {
+          slab_x = P[i].IntPos[0] / INTCELL;
+          slab_y = P[i].IntPos[1] / INTCELL;
+          slab_z = P[i].IntPos[2] / INTCELL;
+        }
+
+      large_numpart_type index_on_grid = ((large_numpart_type)idx) << 3;
+
+      for(int xx = 0; xx < 2; xx++)
+        for(int yy = 0; yy < 2; yy++)
+          for(int zz = 0; zz < 2; zz++)
+            {
+              int slab_xx = slab_x + xx;
+              int slab_yy = slab_y + yy;
+              int slab_zz = slab_z + zz;
+
+              if(slab_xx >= GRIDX)
+                slab_xx = 0;
+              if(slab_yy >= GRIDY)
+                slab_yy = 0;
+              if(slab_zz >= GRIDZ)
+                slab_zz = 0;
+
+              large_array_offset offset = FI(slab_xx, slab_yy, slab_zz);
+
+              part[index_on_grid].partindex   = (i << 3) + (xx << 2) + (yy << 1) + zz;
+              part[index_on_grid].globalindex = offset;
+              part_sortindex[index_on_grid]   = index_on_grid;
+              index_on_grid++;
+            }
+    }
+
+  /* note: num_on_grid will be  8 times larger than the particle number, but num_field_points will generally be much smaller */
+
+  large_array_offset num_field_points;
+  large_numpart_type num_on_grid = ((large_numpart_type)NSource) << 3;
+
+  /* bring the part-field into the order of the accessed cells. This allows the removal of duplicates */
+  // mycxxsort(part_sortindex, part_sortindex +  num_on_grid, pm_periodic_compare_sortindex, part);
+
+  mycxxsort(part_sortindex, part_sortindex + num_on_grid, pm_periodic_sortindex_comparator(part));
+
+  if(num_on_grid > 0)
+    num_field_points = 1;
+  else
+    num_field_points = 0;
+
+  /* determine the number of unique field points */
+  for(large_numpart_type i = 1; i < num_on_grid; i++)
+    {
+      if(part[part_sortindex[i]].globalindex != part[part_sortindex[i - 1]].globalindex)
+        num_field_points++;
+    }
+
+  /* allocate the local field */
+  localfield_globalindex = (large_array_offset *)Mem.mymalloc_movable(&localfield_globalindex, "localfield_globalindex",
+                                                                      num_field_points * sizeof(large_array_offset));
+  localfield_data        = (fft_real *)Mem.mymalloc_movable(&localfield_data, "localfield_data", num_field_points * sizeof(fft_real));
+  localfield_first       = (size_t *)Mem.mymalloc_movable(&localfield_first, "localfield_first", NTask * sizeof(size_t));
+  localfield_sendcount   = (size_t *)Mem.mymalloc_movable(&localfield_sendcount, "localfield_sendcount", NTask * sizeof(size_t));
+  localfield_offset      = (size_t *)Mem.mymalloc_movable(&localfield_offset, "localfield_offset", NTask * sizeof(size_t));
+  localfield_recvcount   = (size_t *)Mem.mymalloc_movable(&localfield_recvcount, "localfield_recvcount", NTask * sizeof(size_t));
+
+  for(int i = 0; i < NTask; i++)
+    {
+      localfield_first[i]     = 0;
+      localfield_sendcount[i] = 0;
+    }
+
+  /* establish the cross link between the part[ ]-array and the local list of
+   * mesh points. Also, count on which CPU the needed field points are stored.
+   */
+  num_field_points = 0;
+  for(large_numpart_type i = 0; i < num_on_grid; i++)
+    {
+      if(i > 0)
+        if(part[part_sortindex[i]].globalindex != part[part_sortindex[i - 1]].globalindex)
+          num_field_points++;
+
+      part[part_sortindex[i]].localindex = num_field_points;
+
+      if(i > 0)
+        if(part[part_sortindex[i]].globalindex == part[part_sortindex[i - 1]].globalindex)
+          continue;
+
+      localfield_globalindex[num_field_points] = part[part_sortindex[i]].globalindex;
+
+#ifndef FFT_COLUMN_BASED
+      int slab = part[part_sortindex[i]].globalindex / (GRIDY * GRID2);
+      int task = myplan.slab_to_task[slab];
+#else
+      int task, column = part[part_sortindex[i]].globalindex / (GRID2);
+
+      if(column < pivotcol)
+        task = column / avg;
+      else
+        task = (column - pivotcol) / (avg - 1) + tasklastsection;
+#endif
+
+      if(localfield_sendcount[task] == 0)
+        localfield_first[task] = num_field_points;
+
+      localfield_sendcount[task]++;
+    }
+  num_field_points++;
+
+  localfield_offset[0] = 0;
+  for(int i = 1; i < NTask; i++)
+    localfield_offset[i] = localfield_offset[i - 1] + localfield_sendcount[i - 1];
+
+  Mem.myfree_movable(part_sortindex);
+  part_sortindex = NULL;
+
+  /* now bin the local particle data onto the mesh list */
+  for(large_numpart_type i = 0; i < num_field_points; i++)
+    localfield_data[i] = 0;
+
+  for(large_numpart_type i = 0; i < num_on_grid; i += 8)
+    {
+      int pindex = (part[i].partindex >> 3);
+      MyIntPosType rmd_x, rmd_y, rmd_z;
+
+      if(mode == 2)
+        {
+          rmd_x = (P[pindex].IntPos[0] * POWERSPEC_FOLDFAC) % INTCELL;
+          rmd_y = (P[pindex].IntPos[1] * POWERSPEC_FOLDFAC) % INTCELL;
+          rmd_z = (P[pindex].IntPos[2] * POWERSPEC_FOLDFAC) % INTCELL;
+        }
+      else if(mode == 3)
+        {
+          rmd_x = (P[pindex].IntPos[0] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) % INTCELL;
+          rmd_y = (P[pindex].IntPos[1] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) % INTCELL;
+          rmd_z = (P[pindex].IntPos[2] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) % INTCELL;
+        }
+      else
+        {
+          rmd_x = P[pindex].IntPos[0] % INTCELL;
+          rmd_y = P[pindex].IntPos[1] % INTCELL;
+          rmd_z = P[pindex].IntPos[2] % INTCELL;
+        }
+
+      double dx = rmd_x * (1.0 / INTCELL);
+      double dy = rmd_y * (1.0 / INTCELL);
+      double dz = rmd_z * (1.0 / INTCELL);
+
+      double weight = P[pindex].getMass();
+
+      if(mode) /* only for power spectrum calculation */
+        if(typelist[P[pindex].getType()] == 0)
+          continue;
+
+      localfield_data[part[i + 0].localindex] += weight * (1.0 - dx) * (1.0 - dy) * (1.0 - dz);
+      localfield_data[part[i + 1].localindex] += weight * (1.0 - dx) * (1.0 - dy) * dz;
+      localfield_data[part[i + 2].localindex] += weight * (1.0 - dx) * dy * (1.0 - dz);
+      localfield_data[part[i + 3].localindex] += weight * (1.0 - dx) * dy * dz;
+      localfield_data[part[i + 4].localindex] += weight * (dx) * (1.0 - dy) * (1.0 - dz);
+      localfield_data[part[i + 5].localindex] += weight * (dx) * (1.0 - dy) * dz;
+      localfield_data[part[i + 6].localindex] += weight * (dx)*dy * (1.0 - dz);
+      localfield_data[part[i + 7].localindex] += weight * (dx)*dy * dz;
+    }
+
+  rhogrid = (fft_real *)Mem.mymalloc_clear("rhogrid", maxfftsize * sizeof(fft_real));
+
+  /* exchange data and add contributions to the local mesh patch */
+  MPI_Alltoall(localfield_sendcount, sizeof(size_t), MPI_BYTE, localfield_recvcount, sizeof(size_t), MPI_BYTE, Communicator);
+
+  for(level = 0; level < (1 << PTask); level++) /* note: for level=0, target is the same task */
+    {
+      recvTask = ThisTask ^ level;
+
+      if(recvTask < NTask)
+        {
+          if(level > 0)
+            {
+              import_data        = (fft_real *)Mem.mymalloc("import_data", localfield_recvcount[recvTask] * sizeof(fft_real));
+              import_globalindex = (large_array_offset *)Mem.mymalloc("import_globalindex",
+                                                                      localfield_recvcount[recvTask] * sizeof(large_array_offset));
+
+              if(localfield_sendcount[recvTask] > 0 || localfield_recvcount[recvTask] > 0)
+                {
+                  myMPI_Sendrecv(localfield_data + localfield_offset[recvTask], localfield_sendcount[recvTask] * sizeof(fft_real),
+                                 MPI_BYTE, recvTask, TAG_NONPERIOD_A, import_data, localfield_recvcount[recvTask] * sizeof(fft_real),
+                                 MPI_BYTE, recvTask, TAG_NONPERIOD_A, Communicator, &status);
+
+                  myMPI_Sendrecv(localfield_globalindex + localfield_offset[recvTask],
+                                 localfield_sendcount[recvTask] * sizeof(large_array_offset), MPI_BYTE, recvTask, TAG_NONPERIOD_B,
+                                 import_globalindex, localfield_recvcount[recvTask] * sizeof(large_array_offset), MPI_BYTE, recvTask,
+                                 TAG_NONPERIOD_B, Communicator, &status);
+                }
+            }
+          else
+            {
+              import_data        = localfield_data + localfield_offset[ThisTask];
+              import_globalindex = localfield_globalindex + localfield_offset[ThisTask];
+            }
+
+          /* note: here every element in rhogrid is only accessed once, so there should be no race condition */
+          for(size_t i = 0; i < localfield_recvcount[recvTask]; i++)
+            {
+              /* determine offset in local FFT slab */
+#ifndef FFT_COLUMN_BASED
+              large_array_offset offset =
+                  import_globalindex[i] - myplan.first_slab_x_of_task[ThisTask] * GRIDY * ((large_array_offset)GRID2);
+#else
+              large_array_offset offset = import_globalindex[i] - myplan.firstcol_XY * ((large_array_offset)GRID2);
+#endif
+              rhogrid[offset] += import_data[i];
+            }
+
+          if(level > 0)
+            {
+              Mem.myfree(import_globalindex);
+              Mem.myfree(import_data);
+            }
+        }
+    }
+}
+
+/* Function to read out the force component corresponding to spatial dimension 'dim'.
+ * If dim is negative, potential values are read out and assigned to particles.
+ */
+void pm_periodic::pmforce_zoom_optimized_readout_forces_or_potential(fft_real *grid, int dim)
+{
+  particle_data *P = Sp->P;
+
+#ifdef EVALPOTENTIAL
+#ifdef GRAVITY_TALLBOX
+  double fac = 1.0 / (((double)GRIDX) * GRIDY * GRIDZ); /* to get potential  */
+#else
+  double fac = 4.0 * M_PI * (LONG_X * LONG_Y * LONG_Z) / pow(All.BoxSize, 3); /* to get potential  */
+#endif
+#endif
+
+  for(int level = 0; level < (1 << PTask); level++) /* note: for level=0, target is the same task */
+    {
+      int recvTask = ThisTask ^ level;
+
+      if(recvTask < NTask)
+        {
+          if(level > 0)
+            {
+              import_data        = (fft_real *)Mem.mymalloc("import_data", localfield_recvcount[recvTask] * sizeof(fft_real));
+              import_globalindex = (large_array_offset *)Mem.mymalloc("import_globalindex",
+                                                                      localfield_recvcount[recvTask] * sizeof(large_array_offset));
+
+              if(localfield_sendcount[recvTask] > 0 || localfield_recvcount[recvTask] > 0)
+                {
+                  MPI_Status status;
+                  myMPI_Sendrecv(localfield_globalindex + localfield_offset[recvTask],
+                                 localfield_sendcount[recvTask] * sizeof(large_array_offset), MPI_BYTE, recvTask, TAG_NONPERIOD_C,
+                                 import_globalindex, localfield_recvcount[recvTask] * sizeof(large_array_offset), MPI_BYTE, recvTask,
+                                 TAG_NONPERIOD_C, Communicator, &status);
+                }
+            }
+          else
+            {
+              import_data        = localfield_data + localfield_offset[ThisTask];
+              import_globalindex = localfield_globalindex + localfield_offset[ThisTask];
+            }
+
+          for(size_t i = 0; i < localfield_recvcount[recvTask]; i++)
+            {
+#ifndef FFT_COLUMN_BASED
+              large_array_offset offset =
+                  import_globalindex[i] - myplan.first_slab_x_of_task[ThisTask] * GRIDY * ((large_array_offset)GRID2);
+#else
+              large_array_offset offset = import_globalindex[i] - myplan.firstcol_XY * ((large_array_offset)GRID2);
+#endif
+              import_data[i] = grid[offset];
+            }
+
+          if(level > 0)
+            {
+              MPI_Status status;
+              myMPI_Sendrecv(import_data, localfield_recvcount[recvTask] * sizeof(fft_real), MPI_BYTE, recvTask, TAG_NONPERIOD_A,
+                             localfield_data + localfield_offset[recvTask], localfield_sendcount[recvTask] * sizeof(fft_real),
+                             MPI_BYTE, recvTask, TAG_NONPERIOD_A, Communicator, &status);
+
+              Mem.myfree(import_globalindex);
+              Mem.myfree(import_data);
+            }
+        }
+    }
+
+  /* read out the force/potential values, which all have been assembled in localfield_data */
+  for(int idx = 0; idx < NSource; idx++)
+    {
+      int i = Sp->get_active_index(idx);
+
+#if !defined(HIERARCHICAL_GRAVITY) && defined(TREEPM_NOTIMESPLIT)
+      if(!Sp->TimeBinSynchronized[P[i].TimeBinGrav])
+        continue;
+#endif
+
+      large_numpart_type j = (idx << 3);
+
+      MyIntPosType rmd_x = P[i].IntPos[0] % INTCELL;
+      MyIntPosType rmd_y = P[i].IntPos[1] % INTCELL;
+      MyIntPosType rmd_z = P[i].IntPos[2] % INTCELL;
+
+      double dx = rmd_x * (1.0 / INTCELL);
+      double dy = rmd_y * (1.0 / INTCELL);
+      double dz = rmd_z * (1.0 / INTCELL);
+
+      double value = localfield_data[part[j + 0].localindex] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz) +
+                     localfield_data[part[j + 1].localindex] * (1.0 - dx) * (1.0 - dy) * dz +
+                     localfield_data[part[j + 2].localindex] * (1.0 - dx) * dy * (1.0 - dz) +
+                     localfield_data[part[j + 3].localindex] * (1.0 - dx) * dy * dz +
+                     localfield_data[part[j + 4].localindex] * (dx) * (1.0 - dy) * (1.0 - dz) +
+                     localfield_data[part[j + 5].localindex] * (dx) * (1.0 - dy) * dz +
+                     localfield_data[part[j + 6].localindex] * (dx)*dy * (1.0 - dz) +
+                     localfield_data[part[j + 7].localindex] * (dx)*dy * dz;
+
+      if(dim < 0)
+        {
+#ifdef EVALPOTENTIAL
+#if defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+          P[i].PM_Potential += value * fac;
+#else
+          P[i].Potential += value * fac;
+#endif
+#endif
+        }
+      else
+        {
+#if defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+          Sp->P[i].GravPM[dim] += value;
+#else
+          Sp->P[i].GravAccel[dim] += value;
+#endif
+        }
+    }
+}
+
+#else
+
+/*
+ *  Here come the routines for a different communication algorithm that is better suited for homogeneously loaded boxes.
+ */
+
+void pm_periodic::pmforce_uniform_optimized_prepare_density(int mode, int *typelist)
+{
+  Sndpm_count = (size_t *)Mem.mymalloc("Sndpm_count", NTask * sizeof(size_t));
+  Sndpm_offset = (size_t *)Mem.mymalloc("Sndpm_offset", NTask * sizeof(size_t));
+  Rcvpm_count = (size_t *)Mem.mymalloc("Rcvpm_count", NTask * sizeof(size_t));
+  Rcvpm_offset = (size_t *)Mem.mymalloc("Rcvpm_offset", NTask * sizeof(size_t));
+
+  particle_data *P = Sp->P;
+
+  /* determine the slabs/columns each particle accesses */
+
+#ifdef FFT_COLUMN_BASED
+  int columns = GRIDX * GRIDY;
+  int avg = (columns - 1) / NTask + 1;
+  int exc = NTask * avg - columns;
+  int tasklastsection = NTask - exc;
+  int pivotcol = tasklastsection * avg;
+#endif
+
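+  /* The loop below is carried out twice: in the first pass (rep == 0) only the number of particles to be sent to each
+   * task is counted; once the counts have been exchanged and the send/receive offsets are known, the export buffer
+   * partout is allocated and then filled in the second pass (rep == 1). */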
+  for(int rep = 0; rep < 2; rep++)
+    {
+      /* each pass needs to start from a cleared Sndpm_count[] array */
+      for(int j = 0; j < NTask; j++)
+        Sndpm_count[j] = 0;
+
+      for(int idx = 0; idx < NSource; idx++)
+        {
+          int i = Sp->get_active_index(idx);
+
+          if(P[i].Ti_Current != All.Ti_Current)
+            Sp->drift_particle(&Sp->P[i], &Sp->SphP[i], All.Ti_Current);
+
+          if(mode) /* only for power spectrum calculation */
+            if(typelist[P[i].getType()] == 0)
+              continue;
+
+          int slab_x;
+          if(mode == 2)
+            slab_x = (P[i].IntPos[0] * POWERSPEC_FOLDFAC) / INTCELL;
+          else if(mode == 3)
+            slab_x = (P[i].IntPos[0] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
+          else
+            slab_x = P[i].IntPos[0] / INTCELL;
+
+          int slab_xx = slab_x + 1;
+
+          if(slab_xx >= GRIDX)
+            slab_xx = 0;
+
+#ifndef FFT_COLUMN_BASED
+          if(rep == 0)
+            {
+              int task0 = myplan.slab_to_task[slab_x];
+              int task1 = myplan.slab_to_task[slab_xx];
+
+              Sndpm_count[task0]++;
+
+              if(task0 != task1)
+                Sndpm_count[task1]++;
+            }
+          else
+            {
+              int task0 = myplan.slab_to_task[slab_x];
+              int task1 = myplan.slab_to_task[slab_xx];
+
+              size_t ind0 = Sndpm_offset[task0] + Sndpm_count[task0]++;
+#ifndef LEAN
+              partout[ind0].Mass = P[i].getMass();
+#endif
+              for(int j = 0; j < 3; j++)
+                partout[ind0].IntPos[j] = P[i].IntPos[j];
+
+              if(task0 != task1)
+                {
+                  size_t ind1 = Sndpm_offset[task1] + Sndpm_count[task1]++;
+#ifndef LEAN
+                  partout[ind1].Mass = P[i].getMass();
+#endif
+                  for(int j = 0; j < 3; j++)
+                    partout[ind1].IntPos[j] = P[i].IntPos[j];
+                }
+            }
+
+#else
+          int slab_y;
+          if(mode == 2)
+            slab_y = (P[i].IntPos[1] * POWERSPEC_FOLDFAC) / INTCELL;
+          else if(mode == 3)
+            slab_y = (P[i].IntPos[1] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
+          else
+            slab_y = P[i].IntPos[1] / INTCELL;
+
+          int slab_yy = slab_y + 1;
+
+          if(slab_yy >= GRIDY)
+            slab_yy = 0;
+
+          int column0 = slab_x * GRIDY + slab_y;
+          int column1 = slab_x * GRIDY + slab_yy;
+          int column2 = slab_xx * GRIDY + slab_y;
+          int column3 = slab_xx * GRIDY + slab_yy;
+
+          int task0, task1, task2, task3;
+
+          if(column0 < pivotcol)
+            task0 = column0 / avg;
+          else
+            task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;
+
+          if(column1 < pivotcol)
+            task1 = column1 / avg;
+          else
+            task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;
+
+          if(column2 < pivotcol)
+            task2 = column2 / avg;
+          else
+            task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;
+
+          if(column3 < pivotcol)
+            task3 = column3 / avg;
+          else
+            task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;
+
+          if(rep == 0)
+            {
+              Sndpm_count[task0]++;
+              if(task1 != task0)
+                Sndpm_count[task1]++;
+              if(task2 != task1 && task2 != task0)
+                Sndpm_count[task2]++;
+              if(task3 != task0 && task3 != task1 && task3 != task2)
+                Sndpm_count[task3]++;
+            }
+          else
+            {
+              size_t ind0        = Sndpm_offset[task0] + Sndpm_count[task0]++;
+#ifndef LEAN
+              partout[ind0].Mass = P[i].getMass();
+#endif
+              for(int j = 0; j < 3; j++)
+                partout[ind0].IntPos[j] = P[i].IntPos[j];
+
+              if(task1 != task0)
+                {
+                  size_t ind1        = Sndpm_offset[task1] + Sndpm_count[task1]++;
+#ifndef LEAN
+                  partout[ind1].Mass = P[i].getMass();
+#endif
+                  for(int j = 0; j < 3; j++)
+                    partout[ind1].IntPos[j] = P[i].IntPos[j];
+                }
+              if(task2 != task1 && task2 != task0)
+                {
+                  size_t ind2        = Sndpm_offset[task2] + Sndpm_count[task2]++;
+#ifndef LEAN
+                  partout[ind2].Mass = P[i].getMass();
+#endif
+                  for(int j = 0; j < 3; j++)
+                    partout[ind2].IntPos[j] = P[i].IntPos[j];
+                }
+              if(task3 != task0 && task3 != task1 && task3 != task2)
+                {
+                  size_t ind3        = Sndpm_offset[task3] + Sndpm_count[task3]++;
+#ifndef LEAN
+                  partout[ind3].Mass = P[i].getMass();
+#endif
+                  for(int j = 0; j < 3; j++)
+                    partout[ind3].IntPos[j] = P[i].IntPos[j];
+                }
+            }
+#endif
+        }
+
+      if(rep == 0)
+        {
+          MPI_Alltoall(Sndpm_count, sizeof(size_t), MPI_BYTE, Rcvpm_count, sizeof(size_t), MPI_BYTE, Communicator);
+
+          nimport = 0, nexport = 0, Rcvpm_offset[0] = 0, Sndpm_offset[0] = 0;
+          for(int j = 0; j < NTask; j++)
+            {
+              nexport += Sndpm_count[j];
+              nimport += Rcvpm_count[j];
+
+              if(j > 0)
+                {
+                  Sndpm_offset[j] = Sndpm_offset[j - 1] + Sndpm_count[j - 1];
+                  Rcvpm_offset[j] = Rcvpm_offset[j - 1] + Rcvpm_count[j - 1];
+                }
+            }
+
+          /* allocate import and export buffer */
+          partin = (partbuf *)Mem.mymalloc_movable(&partin, "partin", nimport * sizeof(partbuf));
+          partout = (partbuf *)Mem.mymalloc("partout", nexport * sizeof(partbuf));
+        }
+    }
+
+  /* produce a flag if any of the send sizes is above our transfer limit, in this case we will
+   * transfer the data in chunks.
+   */
+  int flag_big = 0, flag_big_all;
+  for(int i = 0; i < NTask; i++)
+    if(Sndpm_count[i] * sizeof(partbuf) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
+      flag_big = 1;
+
+  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
+
+  /* exchange particle data */
+  myMPI_Alltoallv(partout, Sndpm_count, Sndpm_offset, partin, Rcvpm_count, Rcvpm_offset, sizeof(partbuf), flag_big_all, Communicator);
+
+  Mem.myfree(partout);
+
+  /* allocate cleared density field */
+  rhogrid = (fft_real *)Mem.mymalloc_movable_clear(&rhogrid, "rhogrid", maxfftsize * sizeof(fft_real));
+
+#ifndef FFT_COLUMN_BASED
+  /* bin particle data onto mesh, in multi-threaded fashion */
+
+  for(size_t i = 0; i < nimport; i++)
+    {
+      int slab_x, slab_y, slab_z;
+      MyIntPosType rmd_x, rmd_y, rmd_z;
+
+      if(mode == 2)
+        {
+          slab_x = (partin[i].IntPos[0] * POWERSPEC_FOLDFAC) / INTCELL;
+          rmd_x = (partin[i].IntPos[0] * POWERSPEC_FOLDFAC) % INTCELL;
+          slab_y = (partin[i].IntPos[1] * POWERSPEC_FOLDFAC) / INTCELL;
+          rmd_y = (partin[i].IntPos[1] * POWERSPEC_FOLDFAC) % INTCELL;
+          slab_z = (partin[i].IntPos[2] * POWERSPEC_FOLDFAC) / INTCELL;
+          rmd_z = (partin[i].IntPos[2] * POWERSPEC_FOLDFAC) % INTCELL;
+        }
+      else if(mode == 3)
+        {
+          slab_x = (partin[i].IntPos[0] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
+          rmd_x = (partin[i].IntPos[0] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) % INTCELL;
+          slab_y = (partin[i].IntPos[1] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
+          rmd_y = (partin[i].IntPos[1] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) % INTCELL;
+          slab_z = (partin[i].IntPos[2] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
+          rmd_z = (partin[i].IntPos[2] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) % INTCELL;
+        }
+      else
+        {
+          slab_x = partin[i].IntPos[0] / INTCELL;
+          rmd_x = partin[i].IntPos[0] % INTCELL;
+          slab_y = partin[i].IntPos[1] / INTCELL;
+          rmd_y = partin[i].IntPos[1] % INTCELL;
+          slab_z = partin[i].IntPos[2] / INTCELL;
+          rmd_z = partin[i].IntPos[2] % INTCELL;
+        }
+
+      double dx = rmd_x * (1.0 / INTCELL);
+      double dy = rmd_y * (1.0 / INTCELL);
+      double dz = rmd_z * (1.0 / INTCELL);
+
+      int slab_xx = slab_x + 1;
+      int slab_yy = slab_y + 1;
+      int slab_zz = slab_z + 1;
+
+      if(slab_xx >= GRIDX)
+        slab_xx = 0;
+      if(slab_yy >= GRIDY)
+        slab_yy = 0;
+      if(slab_zz >= GRIDZ)
+        slab_zz = 0;
+
+#ifdef LEAN
+      double mass = All.PartMass;
+#else
+      double mass = partin[i].Mass;
+#endif
+
+      if(myplan.slab_to_task[slab_x] == ThisTask)
+        {
+          slab_x -= myplan.first_slab_x_of_task[ThisTask];
+
+          rhogrid[FI(slab_x, slab_y, slab_z)] += (mass * (1.0 - dx) * (1.0 - dy) * (1.0 - dz));
+          rhogrid[FI(slab_x, slab_y, slab_zz)] += (mass * (1.0 - dx) * (1.0 - dy) * (dz));
+
+          rhogrid[FI(slab_x, slab_yy, slab_z)] += (mass * (1.0 - dx) * (dy) * (1.0 - dz));
+          rhogrid[FI(slab_x, slab_yy, slab_zz)] += (mass * (1.0 - dx) * (dy) * (dz));
+        }
+
+      if(myplan.slab_to_task[slab_xx] == ThisTask)
+        {
+          slab_xx -= myplan.first_slab_x_of_task[ThisTask];
+
+          rhogrid[FI(slab_xx, slab_y, slab_z)] += (mass * (dx) * (1.0 - dy) * (1.0 - dz));
+          rhogrid[FI(slab_xx, slab_y, slab_zz)] += (mass * (dx) * (1.0 - dy) * (dz));
+
+          rhogrid[FI(slab_xx, slab_yy, slab_z)] += (mass * (dx) * (dy) * (1.0 - dz));
+          rhogrid[FI(slab_xx, slab_yy, slab_zz)] += (mass * (dx) * (dy) * (dz));
+        }
+    }
+
+#else
+
+  int first_col = myplan.firstcol_XY;
+  int last_col = myplan.firstcol_XY + myplan.ncol_XY - 1;
+
+  for(size_t i = 0; i < nimport; i++)
+    {
+      int slab_x, slab_y;
+      MyIntPosType rmd_x, rmd_y;
+      if(mode == 2)
+        {
+          slab_x = (partin[i].IntPos[0] * POWERSPEC_FOLDFAC) / INTCELL;
+          rmd_x = (partin[i].IntPos[0] * POWERSPEC_FOLDFAC) % INTCELL;
+          slab_y = (partin[i].IntPos[1] * POWERSPEC_FOLDFAC) / INTCELL;
+          rmd_y = (partin[i].IntPos[1] * POWERSPEC_FOLDFAC) % INTCELL;
+        }
+      else if(mode == 3)
+        {
+          slab_x = (partin[i].IntPos[0] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
+          rmd_x = (partin[i].IntPos[0] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) % INTCELL;
+          slab_y = (partin[i].IntPos[1] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
+          rmd_y = (partin[i].IntPos[1] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) % INTCELL;
+        }
+      else
+        {
+          slab_x = partin[i].IntPos[0] / INTCELL;
+          rmd_x = partin[i].IntPos[0] % INTCELL;
+          slab_y = partin[i].IntPos[1] / INTCELL;
+          rmd_y = partin[i].IntPos[1] % INTCELL;
+        }
+
+      double dx = rmd_x * (1.0 / INTCELL);
+      double dy = rmd_y * (1.0 / INTCELL);
+
+      int slab_xx = slab_x + 1;
+      int slab_yy = slab_y + 1;
+
+      if(slab_xx >= GRIDX)
+        slab_xx = 0;
+
+      if(slab_yy >= GRIDY)
+        slab_yy = 0;
+
+      int col0 = slab_x * GRIDY + slab_y;
+      int col1 = slab_x * GRIDY + slab_yy;
+      int col2 = slab_xx * GRIDY + slab_y;
+      int col3 = slab_xx * GRIDY + slab_yy;
+
+#ifdef LEAN
+      double mass = All.PartMass;
+#else
+      double mass = partin[i].Mass;
+#endif
+
+      int slab_z;
+      MyIntPosType rmd_z;
+      if(mode == 2)
+        {
+          slab_z = (partin[i].IntPos[2] * POWERSPEC_FOLDFAC) / INTCELL;
+          rmd_z = (partin[i].IntPos[2] * POWERSPEC_FOLDFAC) % INTCELL;
+        }
+      else if(mode == 3)
+        {
+          slab_z = (partin[i].IntPos[2] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
+          rmd_z = (partin[i].IntPos[2] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) % INTCELL;
+        }
+      else
+        {
+          slab_z = partin[i].IntPos[2] / INTCELL;
+          rmd_z = partin[i].IntPos[2] % INTCELL;
+        }
+
+      double dz = rmd_z * (1.0 / INTCELL);
+
+      int slab_zz = slab_z + 1;
+
+      if(slab_zz >= GRIDZ)
+        slab_zz = 0;
+
+      if(col0 >= first_col && col0 <= last_col)
+        {
+          rhogrid[FCxy(col0, slab_z)] += (mass * (1.0 - dx) * (1.0 - dy) * (1.0 - dz));
+          rhogrid[FCxy(col0, slab_zz)] += (mass * (1.0 - dx) * (1.0 - dy) * (dz));
+        }
+
+      if(col1 >= first_col && col1 <= last_col)
+        {
+          rhogrid[FCxy(col1, slab_z)] += (mass * (1.0 - dx) * (dy) * (1.0 - dz));
+          rhogrid[FCxy(col1, slab_zz)] += (mass * (1.0 - dx) * (dy) * (dz));
+        }
+
+      if(col2 >= first_col && col2 <= last_col)
+        {
+          rhogrid[FCxy(col2, slab_z)] += (mass * (dx) * (1.0 - dy) * (1.0 - dz));
+          rhogrid[FCxy(col2, slab_zz)] += (mass * (dx) * (1.0 - dy) * (dz));
+        }
+
+      if(col3 >= first_col && col3 <= last_col)
+        {
+          rhogrid[FCxy(col3, slab_z)] += (mass * (dx) * (dy) * (1.0 - dz));
+          rhogrid[FCxy(col3, slab_zz)] += (mass * (dx) * (dy) * (dz));
+        }
+    }
+
+#endif
+}
+
+/* If dim<0, this function reads out the potential, otherwise Cartesian force components.
+ */
+void pm_periodic::pmforce_uniform_optimized_readout_forces_or_potential_xy(fft_real *grid, int dim)
+{
+  particle_data *P = Sp->P;
+
+#ifdef EVALPOTENTIAL
+#ifdef GRAVITY_TALLBOX
+  double fac = 1.0 / (((double)GRIDX) * GRIDY * GRIDZ); /* to get potential  */
+#else
+  double fac = 4 * M_PI * (LONG_X * LONG_Y * LONG_Z) / pow(All.BoxSize, 3); /* to get potential  */
+#endif
+#endif
+
+  MyFloat *flistin = (MyFloat *)Mem.mymalloc("flistin", nimport * sizeof(MyFloat));
+  MyFloat *flistout = (MyFloat *)Mem.mymalloc("flistout", nexport * sizeof(MyFloat));
+
+#ifdef FFT_COLUMN_BASED
+  int columns = GRIDX * GRIDY;
+  int avg = (columns - 1) / NTask + 1;
+  int exc = NTask * avg - columns;
+  int tasklastsection = NTask - exc;
+  int pivotcol = tasklastsection * avg;
+#endif
+
+  for(size_t i = 0; i < nimport; i++)
+    {
+      flistin[i] = 0;
+
+      int slab_x = partin[i].IntPos[0] / INTCELL;
+      int slab_y = partin[i].IntPos[1] / INTCELL;
+      int slab_z = partin[i].IntPos[2] / INTCELL;
+
+      MyIntPosType rmd_x = partin[i].IntPos[0] % INTCELL;
+      MyIntPosType rmd_y = partin[i].IntPos[1] % INTCELL;
+      MyIntPosType rmd_z = partin[i].IntPos[2] % INTCELL;
+
+      double dx = rmd_x * (1.0 / INTCELL);
+      double dy = rmd_y * (1.0 / INTCELL);
+      double dz = rmd_z * (1.0 / INTCELL);
+
+      int slab_xx = slab_x + 1;
+      int slab_yy = slab_y + 1;
+      int slab_zz = slab_z + 1;
+
+      if(slab_xx >= GRIDX)
+        slab_xx = 0;
+      if(slab_yy >= GRIDY)
+        slab_yy = 0;
+      if(slab_zz >= GRIDZ)
+        slab_zz = 0;
+
+#ifndef FFT_COLUMN_BASED
+      if(myplan.slab_to_task[slab_x] == ThisTask)
+        {
+          slab_x -= myplan.first_slab_x_of_task[ThisTask];
+
+          flistin[i] += grid[FI(slab_x, slab_y, slab_z)] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz) +
+                        grid[FI(slab_x, slab_y, slab_zz)] * (1.0 - dx) * (1.0 - dy) * (dz) +
+                        grid[FI(slab_x, slab_yy, slab_z)] * (1.0 - dx) * (dy) * (1.0 - dz) +
+                        grid[FI(slab_x, slab_yy, slab_zz)] * (1.0 - dx) * (dy) * (dz);
+        }
+
+      if(myplan.slab_to_task[slab_xx] == ThisTask)
+        {
+          slab_xx -= myplan.first_slab_x_of_task[ThisTask];
+
+          flistin[i] += grid[FI(slab_xx, slab_y, slab_z)] * (dx) * (1.0 - dy) * (1.0 - dz) +
+                        grid[FI(slab_xx, slab_y, slab_zz)] * (dx) * (1.0 - dy) * (dz) +
+                        grid[FI(slab_xx, slab_yy, slab_z)] * (dx) * (dy) * (1.0 - dz) +
+                        grid[FI(slab_xx, slab_yy, slab_zz)] * (dx) * (dy) * (dz);
+        }
+#else
+      int column0 = slab_x * GRIDY + slab_y;
+      int column1 = slab_x * GRIDY + slab_yy;
+      int column2 = slab_xx * GRIDY + slab_y;
+      int column3 = slab_xx * GRIDY + slab_yy;
+
+      if(column0 >= myplan.firstcol_XY && column0 <= myplan.lastcol_XY)
+        {
+          flistin[i] += grid[FCxy(column0, slab_z)] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz) +
+                        grid[FCxy(column0, slab_zz)] * (1.0 - dx) * (1.0 - dy) * (dz);
+        }
+      if(column1 >= myplan.firstcol_XY && column1 <= myplan.lastcol_XY)
+        {
+          flistin[i] +=
+              grid[FCxy(column1, slab_z)] * (1.0 - dx) * (dy) * (1.0 - dz) + grid[FCxy(column1, slab_zz)] * (1.0 - dx) * (dy) * (dz);
+        }
+
+      if(column2 >= myplan.firstcol_XY && column2 <= myplan.lastcol_XY)
+        {
+          flistin[i] +=
+              grid[FCxy(column2, slab_z)] * (dx) * (1.0 - dy) * (1.0 - dz) + grid[FCxy(column2, slab_zz)] * (dx) * (1.0 - dy) * (dz);
+        }
+
+      if(column3 >= myplan.firstcol_XY && column3 <= myplan.lastcol_XY)
+        {
+          flistin[i] += grid[FCxy(column3, slab_z)] * (dx) * (dy) * (1.0 - dz) + grid[FCxy(column3, slab_zz)] * (dx) * (dy) * (dz);
+        }
+#endif
+    }
+
+  /* exchange the potential component data */
+  int flag_big = 0, flag_big_all;
+  for(int i = 0; i < NTask; i++)
+    if(Sndpm_count[i] * sizeof(MyFloat) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
+      flag_big = 1;
+
+  /* produce a flag if any of the send sizes is above our transfer limit, in this case we will
+   * transfer the data in chunks.
+   */
+  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
+
+  /* exchange  data */
+  myMPI_Alltoallv(flistin, Rcvpm_count, Rcvpm_offset, flistout, Sndpm_count, Sndpm_offset, sizeof(MyFloat), flag_big_all,
+                  Communicator);
+
+  /* the Sndpm_count[] array needs to be cleared before it is reused as a running index below */
+  for(int j = 0; j < NTask; j++)
+    Sndpm_count[j] = 0;
+
+  for(int idx = 0; idx < NSource; idx++)
+    {
+      int i = Sp->get_active_index(idx);
+
+      int slab_x = P[i].IntPos[0] / INTCELL;
+      int slab_xx = slab_x + 1;
+
+      if(slab_xx >= GRIDX)
+        slab_xx = 0;
+
+#ifndef FFT_COLUMN_BASED
+      int task0 = myplan.slab_to_task[slab_x];
+      int task1 = myplan.slab_to_task[slab_xx];
+
+      double value = flistout[Sndpm_offset[task0] + Sndpm_count[task0]++];
+
+      if(task0 != task1)
+        value += flistout[Sndpm_offset[task1] + Sndpm_count[task1]++];
+#else
+      int slab_y = P[i].IntPos[1] / INTCELL;
+      int slab_yy = slab_y + 1;
+
+      if(slab_yy >= GRIDY)
+        slab_yy = 0;
+
+      int column0 = slab_x * GRIDY + slab_y;
+      int column1 = slab_x * GRIDY + slab_yy;
+      int column2 = slab_xx * GRIDY + slab_y;
+      int column3 = slab_xx * GRIDY + slab_yy;
+
+      int task0, task1, task2, task3;
+
+      if(column0 < pivotcol)
+        task0 = column0 / avg;
+      else
+        task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;
+
+      if(column1 < pivotcol)
+        task1 = column1 / avg;
+      else
+        task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;
+
+      if(column2 < pivotcol)
+        task2 = column2 / avg;
+      else
+        task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;
+
+      if(column3 < pivotcol)
+        task3 = column3 / avg;
+      else
+        task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;
+
+      double value = flistout[Sndpm_offset[task0] + Sndpm_count[task0]++];
+
+      if(task1 != task0)
+        value += flistout[Sndpm_offset[task1] + Sndpm_count[task1]++];
+
+      if(task2 != task1 && task2 != task0)
+        value += flistout[Sndpm_offset[task2] + Sndpm_count[task2]++];
+
+      if(task3 != task0 && task3 != task1 && task3 != task2)
+        value += flistout[Sndpm_offset[task3] + Sndpm_count[task3]++];
+#endif
+
+#if !defined(HIERARCHICAL_GRAVITY) && defined(TREEPM_NOTIMESPLIT)
+      if(!Sp->TimeBinSynchronized[Sp->P[i].TimeBinGrav])
+        continue;
+#endif
+
+      if(dim < 0)
+        {
+#ifdef EVALPOTENTIAL
+#if defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+          Sp->P[i].PM_Potential += value * fac;
+#else
+          Sp->P[i].Potential += value * fac;
+#endif
+#endif
+        }
+      else
+        {
+#if defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+          Sp->P[i].GravPM[dim] += value;
+#else
+          Sp->P[i].GravAccel[dim] += value;
+#endif
+        }
+    }
+
+  Mem.myfree(flistout);
+  Mem.myfree(flistin);
+}
+
+#ifdef FFT_COLUMN_BASED
+void pm_periodic::pmforce_uniform_optimized_readout_forces_or_potential_xz(fft_real *grid, int dim)
+{
+  if(dim != 1)
+    Terminate("bummer");
+
+  size_t *send_count = (size_t *)Mem.mymalloc("send_count", NTask * sizeof(size_t));
+  size_t *send_offset = (size_t *)Mem.mymalloc("send_offset", NTask * sizeof(size_t));
+  size_t *recv_count = (size_t *)Mem.mymalloc("recv_count", NTask * sizeof(size_t));
+  size_t *recv_offset = (size_t *)Mem.mymalloc("recv_offset", NTask * sizeof(size_t));
+
+  struct partbuf
+  {
+    MyIntPosType IntPos[3];
+  };
+
+  partbuf *partin, *partout;
+  size_t nimport = 0, nexport = 0;
+
+  particle_data *P = Sp->P;
+
+  int columns = GRIDX * GRID2;
+  int avg = (columns - 1) / NTask + 1;
+  int exc = NTask * avg - columns;
+  int tasklastsection = NTask - exc;
+  int pivotcol = tasklastsection * avg;
+
+  /* determine the slabs/columns each particle accesses */
+  for(int rep = 0; rep < 2; rep++)
+    {
+      for(int j = 0; j < NTask; j++)
+        send_count[j] = 0;
+
+      for(int idx = 0; idx < NSource; idx++)
+        {
+          int i = Sp->get_active_index(idx);
+
+          if(P[i].Ti_Current != All.Ti_Current)
+            Sp->drift_particle(&Sp->P[i], &Sp->SphP[i], All.Ti_Current);
+
+          int slab_x = P[i].IntPos[0] / INTCELL;
+          int slab_xx = slab_x + 1;
+
+          if(slab_xx >= GRIDX)
+            slab_xx = 0;
+
+          int slab_z = P[i].IntPos[2] / INTCELL;
+          int slab_zz = slab_z + 1;
+
+          if(slab_zz >= GRIDZ)
+            slab_zz = 0;
+
+          int column0 = slab_x * GRID2 + slab_z;
+          int column1 = slab_x * GRID2 + slab_zz;
+          int column2 = slab_xx * GRID2 + slab_z;
+          int column3 = slab_xx * GRID2 + slab_zz;
+
+          int task0, task1, task2, task3;
+
+          if(column0 < pivotcol)
+            task0 = column0 / avg;
+          else
+            task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;
+
+          if(column1 < pivotcol)
+            task1 = column1 / avg;
+          else
+            task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;
+
+          if(column2 < pivotcol)
+            task2 = column2 / avg;
+          else
+            task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;
+
+          if(column3 < pivotcol)
+            task3 = column3 / avg;
+          else
+            task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;
+
+          if(rep == 0)
+            {
+              send_count[task0]++;
+              if(task1 != task0)
+                send_count[task1]++;
+              if(task2 != task1 && task2 != task0)
+                send_count[task2]++;
+              if(task3 != task0 && task3 != task1 && task3 != task2)
+                send_count[task3]++;
+            }
+          else
+            {
+              size_t ind0 = send_offset[task0] + send_count[task0]++;
+              for(int j = 0; j < 3; j++)
+                partout[ind0].IntPos[j] = P[i].IntPos[j];
+
+              if(task1 != task0)
+                {
+                  size_t ind1 = send_offset[task1] + send_count[task1]++;
+                  for(int j = 0; j < 3; j++)
+                    partout[ind1].IntPos[j] = P[i].IntPos[j];
+                }
+              if(task2 != task1 && task2 != task0)
+                {
+                  size_t ind2 = send_offset[task2] + send_count[task2]++;
+
+                  for(int j = 0; j < 3; j++)
+                    partout[ind2].IntPos[j] = P[i].IntPos[j];
+                }
+              if(task3 != task0 && task3 != task1 && task3 != task2)
+                {
+                  size_t ind3 = send_offset[task3] + send_count[task3]++;
+
+                  for(int j = 0; j < 3; j++)
+                    partout[ind3].IntPos[j] = P[i].IntPos[j];
+                }
+            }
+        }
+
+      if(rep == 0)
+        {
+          MPI_Alltoall(send_count, sizeof(size_t), MPI_BYTE, recv_count, sizeof(size_t), MPI_BYTE, Communicator);
+
+          nimport = 0, nexport = 0;
+          recv_offset[0] = send_offset[0] = 0;
+
+          for(int j = 0; j < NTask; j++)
+            {
+              nexport += send_count[j];
+              nimport += recv_count[j];
+
+              if(j > 0)
+                {
+                  send_offset[j] = send_offset[j - 1] + send_count[j - 1];
+                  recv_offset[j] = recv_offset[j - 1] + recv_count[j - 1];
+                }
+            }
+
+          /* allocate import and export buffer */
+          partin = (partbuf *)Mem.mymalloc("partin", nimport * sizeof(partbuf));
+          partout = (partbuf *)Mem.mymalloc("partout", nexport * sizeof(partbuf));
+        }
+    }
+
+  /* produce a flag if any of the send sizes is above our transfer limit; in this case we will
+   * transfer the data in chunks.
+   */
+  int flag_big = 0, flag_big_all;
+  for(int i = 0; i < NTask; i++)
+    if(send_count[i] * sizeof(partbuf) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
+      flag_big = 1;
+
+  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
+
+  /* exchange particle data */
+  myMPI_Alltoallv(partout, send_count, send_offset, partin, recv_count, recv_offset, sizeof(partbuf), flag_big_all, Communicator);
+
+  Mem.myfree(partout);
+
+  MyFloat *flistin = (MyFloat *)Mem.mymalloc("flistin", nimport * sizeof(MyFloat));
+  MyFloat *flistout = (MyFloat *)Mem.mymalloc("flistout", nexport * sizeof(MyFloat));
+
+  for(size_t i = 0; i < nimport; i++)
+    {
+      flistin[i] = 0;
+
+      int slab_x = partin[i].IntPos[0] / INTCELL;
+      int slab_y = partin[i].IntPos[1] / INTCELL;
+      int slab_z = partin[i].IntPos[2] / INTCELL;
+
+      MyIntPosType rmd_x = partin[i].IntPos[0] % INTCELL;
+      MyIntPosType rmd_y = partin[i].IntPos[1] % INTCELL;
+      MyIntPosType rmd_z = partin[i].IntPos[2] % INTCELL;
+
+      double dx = rmd_x * (1.0 / INTCELL);
+      double dy = rmd_y * (1.0 / INTCELL);
+      double dz = rmd_z * (1.0 / INTCELL);
+
+      int slab_xx = slab_x + 1;
+      int slab_yy = slab_y + 1;
+      int slab_zz = slab_z + 1;
+
+      if(slab_xx >= GRIDX)
+        slab_xx = 0;
+      if(slab_yy >= GRIDY)
+        slab_yy = 0;
+      if(slab_zz >= GRIDZ)
+        slab_zz = 0;
+
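+      /* gather the CIC-interpolated value from the eight cells surrounding the particle: the four
+       * (x,z) column candidates are checked against the column range held locally, and within each
+       * column the two neighbouring y-cells are read with the corresponding trilinear weights
+       * (1-dx)(1-dy)(1-dz), etc. */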
+      int column0 = slab_x * GRID2 + slab_z;
+      int column1 = slab_x * GRID2 + slab_zz;
+      int column2 = slab_xx * GRID2 + slab_z;
+      int column3 = slab_xx * GRID2 + slab_zz;
+
+      if(column0 >= myplan.firstcol_XZ && column0 <= myplan.lastcol_XZ)
+        {
+          flistin[i] += grid[FCxz(column0, slab_y)] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz) +
+                        grid[FCxz(column0, slab_yy)] * (1.0 - dx) * (dy) * (1.0 - dz);
+        }
+      if(column1 >= myplan.firstcol_XZ && column1 <= myplan.lastcol_XZ)
+        {
+          flistin[i] +=
+              grid[FCxz(column1, slab_y)] * (1.0 - dx) * (1.0 - dy) * (dz) + grid[FCxz(column1, slab_yy)] * (1.0 - dx) * (dy) * (dz);
+        }
+
+      if(column2 >= myplan.firstcol_XZ && column2 <= myplan.lastcol_XZ)
+        {
+          flistin[i] +=
+              grid[FCxz(column2, slab_y)] * (dx) * (1.0 - dy) * (1.0 - dz) + grid[FCxz(column2, slab_yy)] * (dx) * (dy) * (1.0 - dz);
+        }
+
+      if(column3 >= myplan.firstcol_XZ && column3 <= myplan.lastcol_XZ)
+        {
+          flistin[i] += grid[FCxz(column3, slab_y)] * (dx) * (1.0 - dy) * (dz) + grid[FCxz(column3, slab_yy)] * (dx) * (dy) * (dz);
+        }
+    }
+
+  /* produce a flag if any of the send sizes is above our transfer limit; in this case we will
+   * transfer the data in chunks.
+   */
+  flag_big = 0;
+  for(int i = 0; i < NTask; i++)
+    if(send_count[i] * sizeof(MyFloat) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
+      flag_big = 1;
+
+  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
+
+  /* exchange data */
+  myMPI_Alltoallv(flistin, recv_count, recv_offset, flistout, send_count, send_offset, sizeof(MyFloat), flag_big_all, Communicator);
+
+  for(int j = 0; j < NTask; j++)
+    send_count[j] = 0;
+
+  /* now assign to original points */
+  for(int idx = 0; idx < NSource; idx++)
+    {
+      int i = Sp->get_active_index(idx);
+
+      int slab_x = P[i].IntPos[0] / INTCELL;
+      int slab_xx = slab_x + 1;
+
+      if(slab_xx >= GRIDX)
+        slab_xx = 0;
+
+      int slab_z = P[i].IntPos[2] / INTCELL;
+      int slab_zz = slab_z + 1;
+
+      if(slab_zz >= GRIDZ)
+        slab_zz = 0;
+
+      int column0 = slab_x * GRID2 + slab_z;
+      int column1 = slab_x * GRID2 + slab_zz;
+      int column2 = slab_xx * GRID2 + slab_z;
+      int column3 = slab_xx * GRID2 + slab_zz;
+
+      int task0, task1, task2, task3;
+
+      if(column0 < pivotcol)
+        task0 = column0 / avg;
+      else
+        task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;
+
+      if(column1 < pivotcol)
+        task1 = column1 / avg;
+      else
+        task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;
+
+      if(column2 < pivotcol)
+        task2 = column2 / avg;
+      else
+        task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;
+
+      if(column3 < pivotcol)
+        task3 = column3 / avg;
+      else
+        task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;
+
+      double value = flistout[send_offset[task0] + send_count[task0]++];
+
+      if(task1 != task0)
+        value += flistout[send_offset[task1] + send_count[task1]++];
+
+      if(task2 != task1 && task2 != task0)
+        value += flistout[send_offset[task2] + send_count[task2]++];
+
+      if(task3 != task0 && task3 != task1 && task3 != task2)
+        value += flistout[send_offset[task3] + send_count[task3]++];
+
+#if !defined(HIERARCHICAL_GRAVITY) && defined(TREEPM_NOTIMESPLIT)
+      if(!Sp->TimeBinSynchronized[Sp->P[i].TimeBinGrav])
+        continue;
+#endif
+
+#if defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+      Sp->P[i].GravPM[dim] += value;
+#else
+      Sp->P[i].GravAccel[dim] += value;
+#endif
+    }
+
+  Mem.myfree(flistout);
+  Mem.myfree(flistin);
+  Mem.myfree(partin);
+  Mem.myfree(recv_offset);
+  Mem.myfree(recv_count);
+  Mem.myfree(send_offset);
+  Mem.myfree(send_count);
+}
+
+void pm_periodic::pmforce_uniform_optimized_readout_forces_or_potential_zy(fft_real *grid, int dim)
+{
+  if(dim != 0)
+    Terminate("bummer");
+
+  size_t *send_count = (size_t *)Mem.mymalloc("send_count", NTask * sizeof(size_t));
+  size_t *send_offset = (size_t *)Mem.mymalloc("send_offset", NTask * sizeof(size_t));
+  size_t *recv_count = (size_t *)Mem.mymalloc("recv_count", NTask * sizeof(size_t));
+  size_t *recv_offset = (size_t *)Mem.mymalloc("recv_offset", NTask * sizeof(size_t));
+
+  struct partbuf
+  {
+    MyIntPosType IntPos[3];
+  };
+
+  partbuf *partin, *partout;
+  size_t nimport = 0, nexport = 0;
+
+  particle_data *P = Sp->P;
+
+  int columns = GRID2 * GRIDY;
+  int avg = (columns - 1) / NTask + 1;
+  int exc = NTask * avg - columns;
+  int tasklastsection = NTask - exc;
+  int pivotcol = tasklastsection * avg;
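+  /* same balanced column-to-task decomposition as in the xz variant above, here for the GRID2*GRIDY columns of the z-y layout */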
+
+  /* determine the slabs/columns each particle accesses */
+  for(int rep = 0; rep < 2; rep++)
+    {
+      for(int j = 0; j < NTask; j++)
+        send_count[j] = 0;
+
+      for(int idx = 0; idx < NSource; idx++)
+        {
+          int i = Sp->get_active_index(idx);
+
+          if(P[i].Ti_Current != All.Ti_Current)
+            Sp->drift_particle(&Sp->P[i], &Sp->SphP[i], All.Ti_Current);
+
+          int slab_z = P[i].IntPos[2] / INTCELL;
+          int slab_zz = slab_z + 1;
+
+          if(slab_zz >= GRIDZ)
+            slab_zz = 0;
+
+          int slab_y = P[i].IntPos[1] / INTCELL;
+          int slab_yy = slab_y + 1;
+
+          if(slab_yy >= GRIDY)
+            slab_yy = 0;
+
+          int column0 = slab_z * GRIDY + slab_y;
+          int column1 = slab_z * GRIDY + slab_yy;
+          int column2 = slab_zz * GRIDY + slab_y;
+          int column3 = slab_zz * GRIDY + slab_yy;
+
+          int task0, task1, task2, task3;
+
+          if(column0 < pivotcol)
+            task0 = column0 / avg;
+          else
+            task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;
+
+          if(column1 < pivotcol)
+            task1 = column1 / avg;
+          else
+            task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;
+
+          if(column2 < pivotcol)
+            task2 = column2 / avg;
+          else
+            task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;
+
+          if(column3 < pivotcol)
+            task3 = column3 / avg;
+          else
+            task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;
+
+          if(rep == 0)
+            {
+              send_count[task0]++;
+              if(task1 != task0)
+                send_count[task1]++;
+              if(task2 != task1 && task2 != task0)
+                send_count[task2]++;
+              if(task3 != task0 && task3 != task1 && task3 != task2)
+                send_count[task3]++;
+            }
+          else
+            {
+              size_t ind0 = send_offset[task0] + send_count[task0]++;
+              for(int j = 0; j < 3; j++)
+                partout[ind0].IntPos[j] = P[i].IntPos[j];
+
+              if(task1 != task0)
+                {
+                  size_t ind1 = send_offset[task1] + send_count[task1]++;
+                  for(int j = 0; j < 3; j++)
+                    partout[ind1].IntPos[j] = P[i].IntPos[j];
+                }
+              if(task2 != task1 && task2 != task0)
+                {
+                  size_t ind2 = send_offset[task2] + send_count[task2]++;
+
+                  for(int j = 0; j < 3; j++)
+                    partout[ind2].IntPos[j] = P[i].IntPos[j];
+                }
+              if(task3 != task0 && task3 != task1 && task3 != task2)
+                {
+                  size_t ind3 = send_offset[task3] + send_count[task3]++;
+
+                  for(int j = 0; j < 3; j++)
+                    partout[ind3].IntPos[j] = P[i].IntPos[j];
+                }
+            }
+        }
+
+      if(rep == 0)
+        {
+          MPI_Alltoall(send_count, sizeof(size_t), MPI_BYTE, recv_count, sizeof(size_t), MPI_BYTE, Communicator);
+
+          nimport = 0, nexport = 0;
+          recv_offset[0] = send_offset[0] = 0;
+
+          for(int j = 0; j < NTask; j++)
+            {
+              nexport += send_count[j];
+              nimport += recv_count[j];
+
+              if(j > 0)
+                {
+                  send_offset[j] = send_offset[j - 1] + send_count[j - 1];
+                  recv_offset[j] = recv_offset[j - 1] + recv_count[j - 1];
+                }
+            }
+
+          /* allocate import and export buffer */
+          partin = (partbuf *)Mem.mymalloc("partin", nimport * sizeof(partbuf));
+          partout = (partbuf *)Mem.mymalloc("partout", nexport * sizeof(partbuf));
+        }
+    }
+
+  /* produce a flag if any of the send sizes is above our transfer limit; in this case we will
+   * transfer the data in chunks.
+   */
+  int flag_big = 0, flag_big_all;
+  for(int i = 0; i < NTask; i++)
+    if(send_count[i] * sizeof(partbuf) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
+      flag_big = 1;
+
+  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
+
+  /* exchange particle data */
+  myMPI_Alltoallv(partout, send_count, send_offset, partin, recv_count, recv_offset, sizeof(partbuf), flag_big_all, Communicator);
+
+  Mem.myfree(partout);
+
+  MyFloat *flistin = (MyFloat *)Mem.mymalloc("flistin", nimport * sizeof(MyFloat));
+  MyFloat *flistout = (MyFloat *)Mem.mymalloc("flistout", nexport * sizeof(MyFloat));
+
+  for(size_t i = 0; i < nimport; i++)
+    {
+      flistin[i] = 0;
+
+      int slab_x = partin[i].IntPos[0] / INTCELL;
+      int slab_y = partin[i].IntPos[1] / INTCELL;
+      int slab_z = partin[i].IntPos[2] / INTCELL;
+
+      MyIntPosType rmd_x = partin[i].IntPos[0] % INTCELL;
+      MyIntPosType rmd_y = partin[i].IntPos[1] % INTCELL;
+      MyIntPosType rmd_z = partin[i].IntPos[2] % INTCELL;
+
+      double dx = rmd_x * (1.0 / INTCELL);
+      double dy = rmd_y * (1.0 / INTCELL);
+      double dz = rmd_z * (1.0 / INTCELL);
+
+      int slab_xx = slab_x + 1;
+      int slab_yy = slab_y + 1;
+      int slab_zz = slab_z + 1;
+
+      if(slab_xx >= GRIDX)
+        slab_xx = 0;
+      if(slab_yy >= GRIDY)
+        slab_yy = 0;
+      if(slab_zz >= GRIDZ)
+        slab_zz = 0;
+
+      int column0 = slab_z * GRIDY + slab_y;
+      int column1 = slab_z * GRIDY + slab_yy;
+      int column2 = slab_zz * GRIDY + slab_y;
+      int column3 = slab_zz * GRIDY + slab_yy;
+
+      if(column0 >= myplan.firstcol_ZY && column0 <= myplan.lastcol_ZY)
+        {
+          flistin[i] += grid[FCzy(column0, slab_x)] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz) +
+                        grid[FCzy(column0, slab_xx)] * (dx) * (1.0 - dy) * (1.0 - dz);
+        }
+      if(column1 >= myplan.firstcol_ZY && column1 <= myplan.lastcol_ZY)
+        {
+          flistin[i] +=
+              grid[FCzy(column1, slab_x)] * (1.0 - dx) * (dy) * (1.0 - dz) + grid[FCzy(column1, slab_xx)] * (dx) * (dy) * (1.0 - dz);
+        }
+
+      if(column2 >= myplan.firstcol_ZY && column2 <= myplan.lastcol_ZY)
+        {
+          flistin[i] +=
+              grid[FCzy(column2, slab_x)] * (1.0 - dx) * (1.0 - dy) * (dz) + grid[FCzy(column2, slab_xx)] * (dx) * (1.0 - dy) * (dz);
+        }
+
+      if(column3 >= myplan.firstcol_ZY && column3 <= myplan.lastcol_ZY)
+        {
+          flistin[i] += grid[FCzy(column3, slab_x)] * (1.0 - dx) * (dy) * (dz) + grid[FCzy(column3, slab_xx)] * (dx) * (dy) * (dz);
+        }
+    }
+
+  /* produce a flag if any of the send sizes is above our transfer limit; in this case we will
+   * transfer the data in chunks.
+   */
+  flag_big = 0;
+  for(int i = 0; i < NTask; i++)
+    if(send_count[i] * sizeof(MyFloat) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
+      flag_big = 1;
+
+  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
+
+  /* exchange data */
+  myMPI_Alltoallv(flistin, recv_count, recv_offset, flistout, send_count, send_offset, sizeof(MyFloat), flag_big_all, Communicator);
+
+  for(int j = 0; j < NTask; j++)
+    send_count[j] = 0;
+
+  /* now assign to original points */
+  for(int idx = 0; idx < NSource; idx++)
+    {
+      int i = Sp->get_active_index(idx);
+
+      int slab_z = P[i].IntPos[2] / INTCELL;
+      int slab_zz = slab_z + 1;
+
+      if(slab_zz >= GRIDZ)
+        slab_zz = 0;
+
+      int slab_y = P[i].IntPos[1] / INTCELL;
+      int slab_yy = slab_y + 1;
+
+      if(slab_yy >= GRIDY)
+        slab_yy = 0;
+
+      int column0 = slab_z * GRIDY + slab_y;
+      int column1 = slab_z * GRIDY + slab_yy;
+      int column2 = slab_zz * GRIDY + slab_y;
+      int column3 = slab_zz * GRIDY + slab_yy;
+
+      int task0, task1, task2, task3;
+
+      if(column0 < pivotcol)
+        task0 = column0 / avg;
+      else
+        task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;
+
+      if(column1 < pivotcol)
+        task1 = column1 / avg;
+      else
+        task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;
+
+      if(column2 < pivotcol)
+        task2 = column2 / avg;
+      else
+        task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;
+
+      if(column3 < pivotcol)
+        task3 = column3 / avg;
+      else
+        task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;
+
+      double value = flistout[send_offset[task0] + send_count[task0]++];
+
+      if(task1 != task0)
+        value += flistout[send_offset[task1] + send_count[task1]++];
+
+      if(task2 != task1 && task2 != task0)
+        value += flistout[send_offset[task2] + send_count[task2]++];
+
+      if(task3 != task0 && task3 != task1 && task3 != task2)
+        value += flistout[send_offset[task3] + send_count[task3]++];
+
+#if !defined(HIERARCHICAL_GRAVITY) && defined(TREEPM_NOTIMESPLIT)
+      if(!Sp->TimeBinSynchronized[Sp->P[i].TimeBinGrav])
+        continue;
+#endif
+
+#if defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+      Sp->P[i].GravPM[dim] += value;
+#else
+      Sp->P[i].GravAccel[dim] += value;
+#endif
+    }
+
+  Mem.myfree(flistout);
+  Mem.myfree(flistin);
+  Mem.myfree(partin);
+  Mem.myfree(recv_offset);
+  Mem.myfree(recv_count);
+  Mem.myfree(send_offset);
+  Mem.myfree(send_count);
+}
+#endif
+
+#endif
+
+/*! Calculates the long-range periodic force given the particle positions
+ *  using the PM method.  The force is Gaussian filtered with Asmth, given in
+ *  mesh-cell units. We carry out a CIC charge assignment, and compute the
+ *  potential by fast Fourier transform methods. The potential is finite-differenced
+ *  using a 4-point finite differencing formula, and the forces are
+ *  interpolated tri-linearly to the particle positions. The CIC kernel is
+ *  deconvolved.
+ *
+ *  For mode=0, a normal force calculation is carried out; for mode>0, only the density
+ *  field is constructed for a power spectrum measurement (mode=1, 2, 3 give the unfolded,
+ *  folded, and twice-folded spectrum, respectively). In the latter case, typelist flags
+ *  the particle types that should be included in the density field.
+ */
+void pm_periodic::pmforce_periodic(int mode, int *typelist)
+{
+  int x, y, z;
+
+  double tstart = Logs.second();
+
+  if(mode == 0)
+    mpi_printf("PM-PERIODIC: Starting periodic PM calculation. (Rcut=%g)  presently allocated=%g MB\n", Sp->Rcut[0],
+               Mem.getAllocatedBytesInMB());
+
+#ifdef HIERARCHICAL_GRAVITY
+  NSource = Sp->TimeBinsGravity.NActiveParticles;
+#else
+  NSource = Sp->NumPart;
+#endif
+
+#ifndef TREEPM_NOTIMESPLIT
+  if(NSource != Sp->NumPart)
+    Terminate("unexpected NSource != Sp->NumPart");
+#endif
+
+#ifndef NUMPART_PER_TASK_LARGE
+  if((((long long)Sp->NumPart) << 3) >= (((long long)1) << 31))
+    Terminate("We are dealing with a too large particle number per MPI rank - enabling NUMPART_PER_TASK_LARGE might help.");
+#endif
+
+  double asmth2 = Sp->Asmth[0] * Sp->Asmth[0];
+  double d      = All.BoxSize / PMGRID;
+  double dhalf  = 0.5 * d;
+
+#ifdef GRAVITY_TALLBOX
+  double fac = 1.0 / (((double)GRIDX) * GRIDY * GRIDZ); /* to get potential  */
+#else
+  double fac = 4 * M_PI * (LONG_X * LONG_Y * LONG_Z) / pow(All.BoxSize, 3); /* to get potential  */
+#endif
+
+  fac *= 1 / (2 * d); /* for finite differencing */
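+
+  /* the force components are later obtained from the potential with the 4-point stencil
+   *   a_x = -dphi/dx ~ [ (4/3)(phi(x-d) - phi(x+d)) - (1/6)(phi(x-2d) - phi(x+2d)) ] / (2d),
+   * which is why fac already contains the factor 1/(2d); the coefficients 4/3 and 1/6 appear
+   * explicitly in the differencing loops below.
+   */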
+
+#ifdef PM_ZOOM_OPTIMIZED
+  pmforce_zoom_optimized_prepare_density(mode, typelist);
+#else
+  pmforce_uniform_optimized_prepare_density(mode, typelist);
+#endif
+
+  /* note: after density, we still keep the field 'partin' from the density assignment,
+   * as we can use this later on to return potential and z-force
+   */
+
+  /* allocate the memory to hold the FFT fields */
+
+  forcegrid = (fft_real *)Mem.mymalloc_movable(&forcegrid, "forcegrid", maxfftsize * sizeof(fft_real));
+
+  workspace = forcegrid;
+
+#ifndef FFT_COLUMN_BASED
+  fft_of_rhogrid = (fft_complex *)&rhogrid[0];
+#else
+  fft_of_rhogrid = (fft_complex *)&workspace[0];
+#endif
+
+  /* Do the FFT of the density field */
+#ifndef FFT_COLUMN_BASED
+  my_slab_based_fft(&myplan, &rhogrid[0], &workspace[0], 1);
+#else
+  my_column_based_fft(&myplan, rhogrid, workspace, 1); /* result is in workspace, not in rhogrid ! */
+#endif
+
+  if(mode != 0)
+    {
+      pmforce_measure_powerspec(mode - 1, typelist);
+
+#if defined(FFT_COLUMN_BASED) && !defined(PM_ZOOM_OPTIMIZED)
+      Mem.myfree_movable(partin);
+      partin = NULL;
+#endif
+    }
+  else
+    {
+      /* multiply by Green's function in order to obtain the potential (or forces for spectral differencing) */
+
+      double kfacx = 2.0 * M_PI / (GRIDX * d);
+      double kfacy = 2.0 * M_PI / (GRIDY * d);
+      double kfacz = 2.0 * M_PI / (GRIDZ * d);
+
+#ifdef FFT_COLUMN_BASED
+      for(large_array_offset ip = 0; ip < myplan.second_transposed_ncells; ip++)
+        {
+          large_array_offset ipcell = ip + ((large_array_offset)myplan.second_transposed_firstcol) * GRIDX;
+          y                         = ipcell / (GRIDX * GRIDz);
+          int yr                    = ipcell % (GRIDX * GRIDz);
+          z                         = yr / GRIDX;
+          x                         = yr % GRIDX;
+#else
+      for(x = 0; x < GRIDX; x++)
+        for(y = myplan.slabstart_y; y < myplan.slabstart_y + myplan.nslab_y; y++)
+          for(z = 0; z < GRIDz; z++)
+            {
+#endif
+          int xx, yy, zz;
+
+          if(x >= (GRIDX / 2))
+            xx = x - GRIDX;
+          else
+            xx = x;
+          if(y >= (GRIDY / 2))
+            yy = y - GRIDY;
+          else
+            yy = y;
+          if(z >= (GRIDZ / 2))
+            zz = z - GRIDZ;
+          else
+            zz = z;
+
+          double kx = kfacx * xx;
+          double ky = kfacy * yy;
+          double kz = kfacz * zz;
+
+          double k2 = kx * kx + ky * ky + kz * kz;
+
+          double smth = 1.0, deconv = 1.0;
+
+          if(k2 > 0)
+            {
+              smth = -exp(-k2 * asmth2) / k2;
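+              /* -1/k^2 is the Fourier-space Green's function of the Poisson equation; the factor
+                 exp(-k^2 Asmth^2) restricts the PM contribution to the long-range force that
+                 complements the short-range tree force */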
+
+              /* do deconvolution */
+
+              double fx = 1, fy = 1, fz = 1;
+
+              if(xx != 0)
+                {
+                  fx = kx * dhalf;
+                  fx = sin(fx) / fx;
+                }
+              if(yy != 0)
+                {
+                  fy = ky * dhalf;
+                  fy = sin(fy) / fy;
+                }
+              if(zz != 0)
+                {
+                  fz = kz * dhalf;
+                  fz = sin(fz) / fz;
+                }
+
+              double ff = 1 / (fx * fy * fz);
+              deconv    = ff * ff * ff * ff;
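+              /* the Fourier transform of the CIC assignment kernel is (fx*fy*fz)^2; it is
+                 deconvolved twice, once for the mass assignment and once for the interpolation
+                 of the forces back to the particles, which yields the factor ff^4 */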
+
+              smth *= deconv; /* deconvolution */
+            }
+
+#ifndef FFT_COLUMN_BASED
+          large_array_offset ip = ((large_array_offset)GRIDz) * (GRIDX * (y - myplan.slabstart_y) + x) + z;
+#endif
+
+#ifdef GRAVITY_TALLBOX
+          double re = fft_of_rhogrid[ip][0] * fft_of_kernel[ip][0] - fft_of_rhogrid[ip][1] * fft_of_kernel[ip][1];
+          double im = fft_of_rhogrid[ip][0] * fft_of_kernel[ip][1] + fft_of_rhogrid[ip][1] * fft_of_kernel[ip][0];
+
+          fft_of_rhogrid[ip][0] = re * deconv * exp(-k2 * asmth2);
+          fft_of_rhogrid[ip][1] = im * deconv * exp(-k2 * asmth2);
+#else
+              fft_of_rhogrid[ip][0] *= smth;
+              fft_of_rhogrid[ip][1] *= smth;
+#endif
+        }
+
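+      /* set the k=0 mode (the mean density) to zero on the task that holds it, so that only
+         density fluctuations relative to the mean source the periodic force */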
+#ifndef GRAVITY_TALLBOX
+#ifdef FFT_COLUMN_BASED
+      if(myplan.second_transposed_firstcol == 0)
+        fft_of_rhogrid[0][0] = fft_of_rhogrid[0][1] = 0.0;
+#else
+      if(myplan.slabstart_y == 0)
+        fft_of_rhogrid[0][0] = fft_of_rhogrid[0][1] = 0.0;
+#endif
+#endif
+
+        /* Do the inverse FFT to get the potential/forces */
+
+#ifndef FFT_COLUMN_BASED
+      my_slab_based_fft(&myplan, &rhogrid[0], &workspace[0], -1);
+#else
+      my_column_based_fft(&myplan, workspace, rhogrid, -1);
+#endif
+
+      /* Now rhogrid holds the potential/forces */
+
+#ifdef EVALPOTENTIAL
+#ifdef PM_ZOOM_OPTIMIZED
+      pmforce_zoom_optimized_readout_forces_or_potential(rhogrid, -1);
+#else
+      pmforce_uniform_optimized_readout_forces_or_potential_xy(rhogrid, -1);
+#endif
+#endif
+
+      /* get the force components by finite differencing of the potential for each dimension,
+       * and send the results back to the right CPUs
+       */
+
+      /* we do the x component last, because for differencing the potential in the x-direction, we need to construct the
+       * transpose
+       */
+
+#ifndef FFT_COLUMN_BASED
+
+      /* z-direction */
+      for(y = 0; y < GRIDY; y++)
+        for(x = 0; x < myplan.nslab_x; x++)
+          for(z = 0; z < GRIDZ; z++)
+            {
+              int zr = z + 1, zl = z - 1, zrr = z + 2, zll = z - 2;
+              if(zr >= GRIDZ)
+                zr -= GRIDZ;
+              if(zrr >= GRIDZ)
+                zrr -= GRIDZ;
+              if(zl < 0)
+                zl += GRIDZ;
+              if(zll < 0)
+                zll += GRIDZ;
+
+              forcegrid[FI(x, y, z)] = fac * ((4.0 / 3) * (rhogrid[FI(x, y, zl)] - rhogrid[FI(x, y, zr)]) -
+                                              (1.0 / 6) * (rhogrid[FI(x, y, zll)] - rhogrid[FI(x, y, zrr)]));
+            }
+
+#ifdef PM_ZOOM_OPTIMIZED
+      pmforce_zoom_optimized_readout_forces_or_potential(forcegrid, 2);
+#else
+      pmforce_uniform_optimized_readout_forces_or_potential_xy(forcegrid, 2);
+#endif
+
+      /* y-direction */
+      for(y = 0; y < GRIDY; y++)
+        for(x = 0; x < myplan.nslab_x; x++)
+          for(z = 0; z < GRIDZ; z++)
+            {
+              int yr = y + 1, yl = y - 1, yrr = y + 2, yll = y - 2;
+              if(yr >= GRIDY)
+                yr -= GRIDY;
+              if(yrr >= GRIDY)
+                yrr -= GRIDY;
+              if(yl < 0)
+                yl += GRIDY;
+              if(yll < 0)
+                yll += GRIDY;
+
+              forcegrid[FI(x, y, z)] = fac * ((4.0 / 3) * (rhogrid[FI(x, yl, z)] - rhogrid[FI(x, yr, z)]) -
+                                              (1.0 / 6) * (rhogrid[FI(x, yll, z)] - rhogrid[FI(x, yrr, z)]));
+            }
+
+#ifdef PM_ZOOM_OPTIMIZED
+      pmforce_zoom_optimized_readout_forces_or_potential(forcegrid, 1);
+#else
+      pmforce_uniform_optimized_readout_forces_or_potential_xy(forcegrid, 1);
+#endif
+
+      /* x-direction */
+      my_slab_transposeA(&myplan, rhogrid, forcegrid); /* compute the transpose of the potential field for finite differencing */
+                                                       /* note: for the x-direction, we difference the transposed field */
+
+      for(x = 0; x < GRIDX; x++)
+        for(y = 0; y < myplan.nslab_y; y++)
+          for(z = 0; z < GRIDZ; z++)
+            {
+              int xrr = x + 2, xll = x - 2, xr = x + 1, xl = x - 1;
+              if(xr >= GRIDX)
+                xr -= GRIDX;
+              if(xrr >= GRIDX)
+                xrr -= GRIDX;
+              if(xl < 0)
+                xl += GRIDX;
+              if(xll < 0)
+                xll += GRIDX;
+
+              forcegrid[NI(x, y, z)] = fac * ((4.0 / 3) * (rhogrid[NI(xl, y, z)] - rhogrid[NI(xr, y, z)]) -
+                                              (1.0 / 6) * (rhogrid[NI(xll, y, z)] - rhogrid[NI(xrr, y, z)]));
+            }
+
+      my_slab_transposeB(&myplan, forcegrid, rhogrid); /* reverse the transpose from above */
+
+#ifdef PM_ZOOM_OPTIMIZED
+      pmforce_zoom_optimized_readout_forces_or_potential(forcegrid, 0);
+#else
+      pmforce_uniform_optimized_readout_forces_or_potential_xy(forcegrid, 0);
+#endif
+
+#else
+
+      /* z-direction */
+      for(large_array_offset i = 0; i < myplan.ncol_XY; i++)
+        {
+          fft_real *forcep = &forcegrid[GRID2 * i];
+          fft_real *potp = &rhogrid[GRID2 * i];
+
+          for(int z = 0; z < GRIDZ; z++)
+            {
+              int zr = z + 1;
+              int zl = z - 1;
+              int zrr = z + 2;
+              int zll = z - 2;
+
+              if(zr >= GRIDZ)
+                zr -= GRIDZ;
+              if(zrr >= GRIDZ)
+                zrr -= GRIDZ;
+              if(zl < 0)
+                zl += GRIDZ;
+              if(zll < 0)
+                zll += GRIDZ;
+
+              forcep[z] = fac * ((4.0 / 3) * (potp[zl] - potp[zr]) - (1.0 / 6) * (potp[zll] - potp[zrr]));
+            }
+        }
+
+#ifdef PM_ZOOM_OPTIMIZED
+      pmforce_zoom_optimized_readout_forces_or_potential(forcegrid, 2);
+#else
+      pmforce_uniform_optimized_readout_forces_or_potential_xy(forcegrid, 2);
+
+      /* at this point we can free partin */
+      Mem.myfree_movable(partin);
+      partin = NULL;
+#endif
+
+      /* y-direction */
+      my_fft_swap23(&myplan, rhogrid, forcegrid);  // rhogrid contains potential field, forcegrid the transposed field
+
+      /* make an in-place computation */
+      fft_real *column = (fft_real *)Mem.mymalloc("column", GRIDY * sizeof(fft_real));
+
+      for(large_array_offset i = 0; i < myplan.ncol_XZ; i++)
+        {
+          memcpy(column, &forcegrid[GRIDY * i], GRIDY * sizeof(fft_real));
+
+          fft_real *potp = column;
+          fft_real *forcep = &forcegrid[GRIDY * i];
+
+          for(int y = 0; y < GRIDY; y++)
+            {
+              int yr = y + 1;
+              int yl = y - 1;
+              int yrr = y + 2;
+              int yll = y - 2;
+
+              if(yr >= GRIDY)
+                yr -= GRIDY;
+              if(yrr >= GRIDY)
+                yrr -= GRIDY;
+              if(yl < 0)
+                yl += GRIDY;
+              if(yll < 0)
+                yll += GRIDY;
+
+              forcep[y] = fac * ((4.0 / 3) * (potp[yl] - potp[yr]) - (1.0 / 6) * (potp[yll] - potp[yrr]));
+            }
+        }
+
+      Mem.myfree(column);
+
+      /* now need to read out from forcegrid in a non-standard way */
+
+#ifdef PM_ZOOM_OPTIMIZED
+      /* need a third field as scratch space */
+      fft_real *scratch = (fft_real *)Mem.mymalloc("scratch", myplan.fftsize * sizeof(fft_real));
+
+      my_fft_swap23back(&myplan, forcegrid, scratch);
+      pmforce_zoom_optimized_readout_forces_or_potential(scratch, 1);
+
+      Mem.myfree(scratch);
+#else
+      pmforce_uniform_optimized_readout_forces_or_potential_xz(forcegrid, 1);
+#endif
+
+      /* x-direction */
+      my_fft_swap13(&myplan, rhogrid, forcegrid);  // rhogrid contains potential field
+
+      for(large_array_offset i = 0; i < myplan.ncol_ZY; i++)
+        {
+          fft_real *forcep = &rhogrid[GRIDX * i];
+          fft_real *potp = &forcegrid[GRIDX * i];
+
+          for(int x = 0; x < GRIDX; x++)
+            {
+              int xr = x + 1;
+              int xl = x - 1;
+              int xrr = x + 2;
+              int xll = x - 2;
+
+              if(xr >= GRIDX)
+                xr -= GRIDX;
+              if(xrr >= GRIDX)
+                xrr -= GRIDX;
+              if(xl < 0)
+                xl += GRIDX;
+              if(xll < 0)
+                xll += GRIDX;
+
+              forcep[x] = fac * ((4.0 / 3) * (potp[xl] - potp[xr]) - (1.0 / 6) * (potp[xll] - potp[xrr]));
+            }
+        }
+
+        /* now need to read out from forcegrid in a non-standard way */
+#ifdef PM_ZOOM_OPTIMIZED
+      my_fft_swap13back(&myplan, rhogrid, forcegrid);
+      pmforce_zoom_optimized_readout_forces_or_potential(forcegrid, 0);
+#else
+      pmforce_uniform_optimized_readout_forces_or_potential_zy(rhogrid, 0);
+#endif
+
+#endif
+    }
+
+  /* free stuff */
+
+  Mem.myfree(forcegrid);
+  Mem.myfree(rhogrid);
+
+#ifdef PM_ZOOM_OPTIMIZED
+  Mem.myfree(localfield_recvcount);
+  Mem.myfree(localfield_offset);
+  Mem.myfree(localfield_sendcount);
+  Mem.myfree(localfield_first);
+  Mem.myfree(localfield_data);
+  Mem.myfree(localfield_globalindex);
+  Mem.myfree(part);
+#else
+#ifndef FFT_COLUMN_BASED
+  Mem.myfree(partin);
+#endif
+  Mem.myfree(Rcvpm_offset);
+  Mem.myfree(Rcvpm_count);
+  Mem.myfree(Sndpm_offset);
+  Mem.myfree(Sndpm_count);
+#endif
+
+  double tend = Logs.second();
+
+  if(mode == 0)
+    mpi_printf("PM-PERIODIC: done.  (took %g seconds)\n", Logs.timediff(tstart, tend));
+}
+
+#ifdef GRAVITY_TALLBOX
+
+/*! This function sets up the Green's function for calculating the tall-box potential
+ *  in real space, with suitable zero padding in the direction of the tall box.
+ */
+void pm_periodic::pmforce_setup_tallbox_kernel(void)
+{
+  double d = All.BoxSize / PMGRID;
+
+  mpi_printf("PM-PERIODIC: Setting up tallbox kernel (GRIDX=%d, GRIDY=%d, GRIDZ=%d)\n", GRIDX, GRIDY, GRIDZ);
+
+  /* now set up kernel and its Fourier transform */
+
+  for(int i = 0; i < maxfftsize; i++) /* clear local field */
+    kernel[i] = 0;
+
+#ifndef FFT_COLUMN_BASED
+  for(int i = myplan.slabstart_x; i < (myplan.slabstart_x + myplan.nslab_x); i++)
+    for(int j = 0; j < GRIDY; j++)
+      {
+#else
+  for(int c = myplan.firstcol_XY; c < (myplan.firstcol_XY + myplan.ncol_XY); c++)
+    {
+      int i = c / GRIDY;
+      int j = c % GRIDY;
+#endif
+
+        for(int k = 0; k < GRIDZ; k++)
+          {
+            int ii, jj, kk;
+
+            if(i >= (GRIDX / 2))
+              ii = i - GRIDX;
+            else
+              ii = i;
+            if(j >= (GRIDY / 2))
+              jj = j - GRIDY;
+            else
+              jj = j;
+            if(k >= (GRIDZ / 2))
+              kk = k - GRIDZ;
+            else
+              kk = k;
+
+            double xx = ii * d;
+            double yy = jj * d;
+            double zz = kk * d;
+
+            double pot = pmperiodic_tallbox_long_range_potential(xx, yy, zz);
+
+#ifndef FFT_COLUMN_BASED
+            size_t ip = FI(i - myplan.slabstart_x, j, k);
+#else
+          size_t ip = FCxy(c, k);
+#endif
+            kernel[ip] = pot / All.BoxSize;
+          }
+
+#ifndef FFT_COLUMN_BASED
+      }
+#else
+    }
+#endif
+
+  /* Do the FFT of the kernel */
+
+  fft_real *workspc = (fft_real *)Mem.mymalloc("workspc", maxfftsize * sizeof(fft_real));
+
+#ifndef FFT_COLUMN_BASED
+  my_slab_based_fft(&myplan, kernel, workspc, 1);
+#else
+  my_column_based_fft(&myplan, kernel, workspc, 1); /* result is in workspace, not in kernel */
+  memcpy(kernel, workspc, maxfftsize * sizeof(fft_real));
+#endif
+
+  Mem.myfree(workspc);
+
+  mpi_printf("PM-PERIODIC: Done setting up tallbox kernel\n");
+}
+
+double pm_periodic::pmperiodic_tallbox_long_range_potential(double x, double y, double z)
+{
+  x /= All.BoxSize;
+  y /= All.BoxSize;
+  z /= All.BoxSize;
+
+  double r = sqrt(x * x + y * y + z * z);
+
+  if(r == 0)
+    return 0;
+
+  double xx, yy, zz;
+  switch(GRAVITY_TALLBOX)
+    {
+      case 0:
+        xx = y;
+        yy = z;
+        zz = x;
+        break;
+      case 1:
+        xx = x;
+        yy = z;
+        zz = y;
+        break;
+      case 2:
+        xx = x;
+        yy = y;
+        zz = z;
+        break;
+    }
+  x = xx;
+  y = yy;
+  z = zz;
+
+  /* the third dimension, z, is now the non-periodic one */
+
+  double leff  = sqrt(BOXX * BOXY);
+  double alpha = 2.0 / leff;
+
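+  /* Ewald-type summation for the geometry that is periodic in two dimensions only:
+   * sum1 accumulates the short-range part as a real-space sum over the periodic image
+   * positions, screened with erfc(alpha*r), while sum2 adds the long-range part as a
+   * sum over the 2D reciprocal lattice; alpha sets the split between the two sums, and
+   * qxmax/qymax and nxmax/nymax truncate them once the remaining terms are negligible.
+   */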
+  double sum1 = 0.0;
+
+  int qxmax = (int)(10.0 / (BOXX * alpha) + 0.5);
+  int qymax = (int)(10.0 / (BOXY * alpha) + 0.5);
+
+  int nxmax = (int)(4.0 * alpha * BOXX + 0.5);
+  int nymax = (int)(4.0 * alpha * BOXY + 0.5);
+
+  for(int nx = -qxmax; nx <= qxmax; nx++)
+    for(int ny = -qymax; ny <= qymax; ny++)
+      {
+        double dx = x - nx * BOXX;
+        double dy = y - ny * BOXY;
+        double r  = sqrt(dx * dx + dy * dy + z * z);
+        if(r > 0)
+          sum1 += erfc(alpha * r) / r;
+      }
+
+  double alpha2 = alpha * alpha;
+
+  double sum2 = 0.0;
+
+  for(int nx = -nxmax; nx <= nxmax; nx++)
+    for(int ny = -nymax; ny <= nymax; ny++)
+      {
+        if(nx != 0 || ny != 0)
+          {
+            double kx = (2.0 * M_PI / BOXX) * nx;
+            double ky = (2.0 * M_PI / BOXY) * ny;
+            double k2 = kx * kx + ky * ky;
+            double k  = sqrt(k2);
+
+            if(k * z > 0)
+              {
+                double ex = exp(-k * z);
+                if(ex > 0)
+                  sum2 += cos(kx * x + ky * y) * (erfc(k / (2 * alpha) + alpha * z) / ex + ex * erfc(k / (2 * alpha) - alpha * z)) / k;
+              }
+            else
+              {
+                double ex = exp(k * z);
+                if(ex > 0)
+                  sum2 += cos(kx * x + ky * y) * (ex * erfc(k / (2 * alpha) + alpha * z) + erfc(k / (2 * alpha) - alpha * z) / ex) / k;
+              }
+          }
+      }
+
+  sum2 *= M_PI / (BOXX * BOXY);
+
+  double psi = 2.0 * alpha / sqrt(M_PI) +
+               (2 * sqrt(M_PI) / (BOXX * BOXY) * (exp(-alpha2 * z * z) / alpha + sqrt(M_PI) * z * erf(alpha * z))) - (sum1 + sum2);
+
+  return psi;
+}
+#endif
+
+/*----------------------------------------------------------------------------------------------------*/
+/*           Here comes code for the power-spectrum computation                                       */
+/*----------------------------------------------------------------------------------------------------*/
+
+void pm_periodic::calculate_power_spectra(int num)
+{
+  int n_type[NTYPES];
+  long long ntot_type_all[NTYPES];
+  /* determine global and local particle numbers */
+  for(int n = 0; n < NTYPES; n++)
+    n_type[n] = 0;
+  for(int n = 0; n < Sp->NumPart; n++)
+    n_type[Sp->P[n].getType()]++;
+
+  sumup_large_ints(NTYPES, n_type, ntot_type_all, Communicator);
+
+  int typeflag[NTYPES];
+
+  for(int i = 0; i < NTYPES; i++)
+    typeflag[i] = 1;
+
+#ifdef HIERARCHICAL_GRAVITY
+  int flag_extra_allocate = 0;
+  if(Sp->TimeBinsGravity.ActiveParticleList == NULL)
+    {
+      flag_extra_allocate = 1;
+      Sp->TimeBinsGravity.timebins_allocate();
+    }
+
+  Sp->TimeBinsGravity.NActiveParticles = 0;
+  for(int i = 0; i < Sp->NumPart; i++)
+    Sp->TimeBinsGravity.ActiveParticleList[Sp->TimeBinsGravity.NActiveParticles++] = i;
+#endif
+
+  if(ThisTask == 0)
+    {
+      char buf[MAXLEN_PATH_EXTRA];
+      sprintf(buf, "%s/powerspecs", All.OutputDir);
+      mkdir(buf, 02755);
+    }
+
+  sprintf(power_spec_fname, "%s/powerspecs/powerspec_%03d.txt", All.OutputDir, num);
+
+  pmforce_do_powerspec(typeflag); /* calculate power spectrum for all particle types */
+
+  /* check whether more than one type is in use */
+  int count_types = 0;
+  for(int i = 0; i < NTYPES; i++)
+    if(ntot_type_all[i] > 0)
+      count_types++;
+
+  if(count_types > 1)
+    for(int i = 0; i < NTYPES; i++)
+      {
+        if(ntot_type_all[i] > 0)
+          {
+            for(int j = 0; j < NTYPES; j++)
+              typeflag[j] = 0;
+
+            typeflag[i] = 1;
+
+            sprintf(power_spec_fname, "%s/powerspecs/powerspec_type%d_%03d.txt", All.OutputDir, i, num);
+
+            pmforce_do_powerspec(typeflag); /* calculate power spectrum for type i */
+          }
+      }
+
+#ifdef HIERARCHICAL_GRAVITY
+  if(flag_extra_allocate)
+    Sp->TimeBinsGravity.timebins_free();
+#endif
+}
+
+void pm_periodic::pmforce_do_powerspec(int *typeflag)
+{
+  mpi_printf("POWERSPEC: Begin power spectrum. (typeflag=[");
+  for(int i = 0; i < NTYPES; i++)
+    mpi_printf(" %d ", typeflag[i]);
+  mpi_printf("])\n");
+
+  double tstart = Logs.second();
+
+  pmforce_periodic(1, typeflag); /* calculate regular power spectrum for selected particle types */
+
+  pmforce_periodic(2, typeflag); /* calculate folded power spectrum for selected particle types  */
+
+  pmforce_periodic(3, typeflag); /* calculate twice folded power spectrum for selected particle types  */
+
+  double tend = Logs.second();
+
+  mpi_printf("POWERSPEC: End power spectrum. took %g seconds\n", Logs.timediff(tstart, tend));
+}
+
+void pm_periodic::pmforce_measure_powerspec(int flag, int *typeflag)
+{
+  particle_data *P = Sp->P;
+
+  long long CountModes[BINS_PS];
+  double SumPowerUncorrected[BINS_PS]; /* without binning correction (as for shot noise) */
+  double PowerUncorrected[BINS_PS];    /* without binning correction */
+  double DeltaUncorrected[BINS_PS];    /* without binning correction */
+  double ShotLimit[BINS_PS];
+  double KWeightSum[BINS_PS];
+  double Kbin[BINS_PS];
+
+  double mass = 0, mass2 = 0, count = 0;
+  for(int i = 0; i < Sp->NumPart; i++)
+    if(typeflag[P[i].getType()])
+      {
+        mass += Sp->P[i].getMass();
+        mass2 += Sp->P[i].getMass() * Sp->P[i].getMass();
+        count += 1.0;
+      }
+
+  MPI_Allreduce(MPI_IN_PLACE, &mass, 1, MPI_DOUBLE, MPI_SUM, Communicator);
+  MPI_Allreduce(MPI_IN_PLACE, &mass2, 1, MPI_DOUBLE, MPI_SUM, Communicator);
+  MPI_Allreduce(MPI_IN_PLACE, &count, 1, MPI_DOUBLE, MPI_SUM, Communicator);
+
+  double d     = All.BoxSize / PMGRID;
+  double dhalf = 0.5 * d;
+
+  double fac = 1.0 / mass;
+
+  double K0     = 2 * M_PI / All.BoxSize;                                                        /* minimum k */
+  double K1     = 2 * M_PI / All.BoxSize * (POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC * PMGRID / 2); /* maximum k that can be measured */
+  double binfac = BINS_PS / (log(K1) - log(K0));
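+  /* the BINS_PS bins are spaced logarithmically in k between K0 and K1; a mode with wavenumber k
+     is accumulated in bin (int)(log(k / K0) * binfac) further below */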
+
+  double kfacx = 2.0 * M_PI * LONG_X / All.BoxSize;
+  double kfacy = 2.0 * M_PI * LONG_Y / All.BoxSize;
+  double kfacz = 2.0 * M_PI * LONG_Z / All.BoxSize;
+
+  for(int i = 0; i < BINS_PS; i++)
+    {
+      SumPowerUncorrected[i] = 0;
+      CountModes[i]          = 0;
+      KWeightSum[i]          = 0;
+    }
+
+#ifdef FFT_COLUMN_BASED
+  for(large_array_offset ip = 0; ip < myplan.second_transposed_ncells; ip++)
+    {
+      large_array_offset ipcell = ip + ((large_array_offset)myplan.second_transposed_firstcol) * GRIDX;
+      int y                     = ipcell / (GRIDX * GRIDz);
+      int yr                    = ipcell % (GRIDX * GRIDz);
+      int z                     = yr / GRIDX;
+      int x                     = yr % GRIDX;
+#else
+  for(int y = myplan.slabstart_y; y < myplan.slabstart_y + myplan.nslab_y; y++)
+    for(int x = 0; x < GRIDX; x++)
+      for(int z = 0; z < GRIDz; z++)
+        {
+#endif
+      int count_double;
+
+      if(z >= 1 &&
+         z < (GRIDZ + 1) / 2) /* these modes need to be counted twice due to the storage scheme for the FFT of a real field */
+        count_double = 1;
+      else
+        count_double = 0;
+
+      int xx, yy, zz;
+
+      if(x >= (GRIDX / 2))
+        xx = x - GRIDX;
+      else
+        xx = x;
+
+      if(y >= (GRIDY / 2))
+        yy = y - GRIDY;
+      else
+        yy = y;
+
+      if(z >= (GRIDZ / 2))
+        zz = z - GRIDZ;
+      else
+        zz = z;
+
+      double kx = kfacx * xx;
+      double ky = kfacy * yy;
+      double kz = kfacz * zz;
+
+      double k2 = kx * kx + ky * ky + kz * kz;
+
+      if(k2 > 0)
+        {
+          /* do deconvolution */
+          double fx = 1, fy = 1, fz = 1;
+
+          if(xx != 0)
+            {
+              fx = kx * dhalf;
+              fx = sin(fx) / fx;
+            }
+          if(yy != 0)
+            {
+              fy = ky * dhalf;
+              fy = sin(fy) / fy;
+            }
+          if(zz != 0)
+            {
+              fz = kz * dhalf;
+              fz = sin(fz) / fz;
+            }
+          double ff   = 1 / (fx * fy * fz);
+          double smth = ff * ff * ff * ff;
+          /*
+           * Note: The Fourier transform of the density field (rho_hat) must be multiplied by ff^2
+           * in order to do the deconvolution. That's why po = |rho_hat|^2 gains a factor of ff^4.
+           */
+          /* end deconvolution */
+
+#ifndef FFT_COLUMN_BASED
+          large_array_offset ip = ((large_array_offset)GRIDz) * (GRIDX * (y - myplan.slabstart_y) + x) + z;
+#endif
+
+          double po = (fft_of_rhogrid[ip][0] * fft_of_rhogrid[ip][0] + fft_of_rhogrid[ip][1] * fft_of_rhogrid[ip][1]);
+
+          po *= fac * fac * smth;
+
+          double k = sqrt(k2);
+
+          if(flag == 1)
+            k *= POWERSPEC_FOLDFAC;
+          else if(flag == 2)
+            k *= POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC;
+
+          if(k >= K0 && k < K1)
+            {
+              int bin = log(k / K0) * binfac;
+
+              SumPowerUncorrected[bin] += po;
+              CountModes[bin] += 1;
+              KWeightSum[bin] += log(k);
+
+              if(count_double)
+                {
+                  SumPowerUncorrected[bin] += po;
+                  CountModes[bin] += 1;
+                  KWeightSum[bin] += log(k);
+                }
+            }
+        }
+    }
+
+  MPI_Allreduce(MPI_IN_PLACE, SumPowerUncorrected, BINS_PS, MPI_DOUBLE, MPI_SUM, Communicator);
+  MPI_Allreduce(MPI_IN_PLACE, CountModes, BINS_PS, MPI_LONG_LONG, MPI_SUM, Communicator);
+  MPI_Allreduce(MPI_IN_PLACE, KWeightSum, BINS_PS, MPI_DOUBLE, MPI_SUM, Communicator);
+
+  int count_non_zero_bins = 0;
+  for(int i = 0; i < BINS_PS; i++)
+    {
+      if(CountModes[i] > 0)
+        {
+          Kbin[i] = exp(KWeightSum[i] / CountModes[i]);
+          count_non_zero_bins++;
+        }
+      else
+        Kbin[i] = exp((i + 0.5) / binfac + log(K0));
+
+      if(CountModes[i] > 0)
+        PowerUncorrected[i] = SumPowerUncorrected[i] / CountModes[i];
+      else
+        PowerUncorrected[i] = 0;
+
+      DeltaUncorrected[i] = 4 * M_PI * pow(Kbin[i], 3) / pow(2 * M_PI / All.BoxSize, 3) * PowerUncorrected[i];
+
+      ShotLimit[i] = 4 * M_PI * pow(Kbin[i], 3) / pow(2 * M_PI / All.BoxSize, 3) * (mass2 / (mass * mass));
+    }
+
+  /* store the result */
+  if(ThisTask == 0)
+    {
+      FILE *fd;
+
+      if(flag == 0)
+        {
+          if(!(fd = fopen(power_spec_fname, "w"))) /* store the unfolded spectrum */
+            Terminate("can't open file `%s`\n", power_spec_fname);
+        }
+      else if(flag == 1 || flag == 2)
+        {
+          if(!(fd = fopen(power_spec_fname, "a"))) /* append the file, store the folded spectrum */
+            Terminate("can't open file `%s`\n", power_spec_fname);
+        }
+      else
+        Terminate("Something wrong.\n");
+
+      fprintf(fd, "%g\n", All.Time);
+      fprintf(fd, "%d\n", count_non_zero_bins);
+      fprintf(fd, "%g\n", All.BoxSize);
+      fprintf(fd, "%d\n", (int)(PMGRID));
+      if(All.ComovingIntegrationOn)
+        fprintf(fd, "%g\n", All.ComovingIntegrationOn > 0 ? linear_growth_factor(All.Time, 1.0) : 1.0);
+
+      for(int i = 0; i < BINS_PS; i++)
+        if(CountModes[i] > 0)
+          fprintf(fd, "%g %g %g %g %g\n", Kbin[i], DeltaUncorrected[i], PowerUncorrected[i], (double)CountModes[i], ShotLimit[i]);
+
+      if(flag == 2)
+        {
+          fprintf(fd, "%g\n", mass);
+          fprintf(fd, "%g\n", count);
+          fprintf(fd, "%g\n", mass * mass / mass2);
+        }
+
+      fclose(fd);
+    }
+}
+
+#endif
diff --git a/src/pm/pm_periodic.h b/src/pm/pm_periodic.h
new file mode 100644
index 0000000000000000000000000000000000000000..6510c89159f721ccf232147cbe55748042d1224c
--- /dev/null
+++ b/src/pm/pm_periodic.h
@@ -0,0 +1,240 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  pm_periodic.h
+ *
+ *  \brief declaration of a class used for periodic PM-force calculations
+ */
+
+#ifndef PM_PERIODIC_H
+#define PM_PERIODIC_H
+
+#include "gadgetconfig.h"
+
+#include <gsl/gsl_integration.h>
+
+#if defined(PMGRID) || defined(NGENIC)
+
+#include <gsl/gsl_integration.h>
+#include <gsl/gsl_rng.h>
+#include <math.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../data/simparticles.h"
+#include "../domain/domain.h"
+#include "../logs/timer.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../pm/pm_mpi_fft.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+
+class pm_periodic : public pm_mpi_fft
+{
+ public:
+  pm_periodic(MPI_Comm comm) : setcomm(comm), pm_mpi_fft(comm) {}
+
+#if defined(PMGRID) && defined(PERIODIC)
+
+ private:
+#ifdef LONG_X_BITS
+#if PMGRID != ((PMGRID / LONG_X) * LONG_X)
+#error "PMGRID must be a multiple of the stretch factor in the x-direction"
+#endif
+#endif
+
+#ifdef LONG_Y_BITS
+#if PMGRID != ((PMGRID / LONG_Y) * LONG_Y)
+#error "PMGRID must be a multiple of the stretch factor in the y-direction"
+#endif
+#endif
+
+#ifdef LONG_Z_BITS
+#if PMGRID != ((PMGRID / LONG_Z) * LONG_Z)
+#error "PMGRID must be a multiple of the stretch factor in the x-direction"
+#endif
+#endif
+
+#define GRIDX ((PMGRID / LONG_X) * DBX + DBX_EXTRA)
+#define GRIDY ((PMGRID / LONG_Y) * DBY + DBY_EXTRA)
+#define GRIDZ ((PMGRID / LONG_Z) * DBZ + DBZ_EXTRA)
+
+#define INTCELL ((~((MyIntPosType)0)) / PMGRID + 1)
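+
+/* INTCELL is the number of integer coordinate units per mesh cell of the PMGRID mesh, so that
+   IntPos[...] / INTCELL gives the cell index of a particle and IntPos[...] % INTCELL its offset
+   within that cell (used to form the CIC weights) */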
+
+#if(GRIDX > 1024) || (GRIDY > 1024) || (GRIDZ > 1024)
+  typedef long long large_array_offset; /* use a larger data type in this case so that we can always address all cells of the 3D grid
+                                           with a single index */
+#else
+  typedef int large_array_offset;
+#endif
+
+#ifdef NUMPART_PER_TASK_LARGE
+  typedef long long large_numpart_type; /* if there is a risk that the local particle number times 8 overflows a 32-bit integer, this
+                                           data type should be used */
+#else
+  typedef int large_numpart_type;
+#endif
+
+  /* variables for power spectrum estimation */
+#ifndef BINS_PS
+#define BINS_PS 4000 /* number of bins for power spectrum computation */
+#endif
+#ifndef POWERSPEC_FOLDFAC
+#define POWERSPEC_FOLDFAC 16 /* folding factor to obtain an estimate of the power spectrum on very small scales */
+#endif
+
+  char power_spec_fname[MAXLEN_PATH_EXTRA];
+
+  int NSource;
+
+  void pmforce_measure_powerspec(int flag, int *typeflag);
+  void pmforce_do_powerspec(int *typeflag);
+#if defined(GRAVITY_TALLBOX)
+  void pmforce_setup_tallbox_kernel(void);
+  double pmperiodic_tallbox_long_range_potential(double x, double y, double z);
+#endif
+
+  fft_plan myplan; /*!< In this structure, various bookkeeping variables for the distributed FFTs are stored */
+
+  /*! \var maxfftsize
+   *  \brief maximum size of the local fft grid among all tasks
+   */
+  long long maxfftsize;
+
+  /*! \var rhogrid
+   *  \brief This array holds the local part of the density field and,
+   *  after the FFT, the local part of the potential
+   *
+   *  \var forcegrid
+   *  \brief This array will contain the force field
+   *
+   *  \var workspace
+   *  \brief Workspace array used during the FFTs
+   */
+  fft_real *rhogrid, *forcegrid, *workspace;
+
+  /*! \brief Array containing the FFT of #rhogrid
+   *
+   *  This pointer points to the same array as #rhogrid,
+   *  because in-place FFTs are used.
+   */
+  fft_complex *fft_of_rhogrid;
+
+#if defined(GRAVITY_TALLBOX)
+  fft_real *kernel; /*!< If the tallbox option is used, the code will construct and store the k-space Green's function by FFTing it from
+                       real space */
+  fft_complex *fft_of_kernel;
+#endif
+
+#ifdef PM_ZOOM_OPTIMIZED
+
+  /*! \brief This structure links the particles to the mesh cells, to which they contribute their mass
+   *
+   * Each particle will have eight items of this structure in the #part array.
+   * For each of the eight mesh cells the CIC assignment will contribute,
+   * one item of this struct exists.
+   */
+
+ public:
+  struct part_slab_data
+  {
+    large_array_offset globalindex; /*!< index in the global density mesh */
+    large_numpart_type partindex;   /*!< contains the local particle index multiplied by 2^3 (i.e. shifted left by three bits); the lowest
+                                       three bits encode which of the eight cells of the CIC assignment this item refers to */
+    large_array_offset localindex;  /*!< index to a local copy of the corresponding mesh cell of the global density array (used during
+                                       local mass and force assignment) */
+  };
+  part_slab_data *part; /*!< array of part_slab_data linking the local particles to their mesh cells */
+
+  /* realize the comparison function as a functor, so that it can have an internal state (here the data array for which we sort indices)
+   */
+  struct pm_periodic_sortindex_comparator
+  {
+   private:
+    const part_slab_data *data;
+
+   public:
+    pm_periodic_sortindex_comparator(const part_slab_data *data_) : data(data_) {}
+
+    bool operator()(const large_numpart_type &a, const large_numpart_type &b) const
+    {
+      return data[a].globalindex < data[b].globalindex;
+    }
+  };
+
+ private:
+  size_t *localfield_sendcount, *localfield_first, *localfield_offset, *localfield_recvcount;
+  large_array_offset *localfield_globalindex, *import_globalindex;
+  fft_real *localfield_data, *import_data;
+
+  void pmforce_zoom_optimized_prepare_density(int mode, int *typelist);
+  void pmforce_zoom_optimized_readout_forces_or_potential(fft_real *grid, int dim);
+
+#else
+
+  struct partbuf
+  {
+#ifndef LEAN
+    MyFloat Mass;
+#endif
+    MyIntPosType IntPos[3];
+  };
+  partbuf *partin, *partout;
+
+  size_t nimport, nexport;
+
+  size_t *Sndpm_count, *Sndpm_offset;
+  size_t *Rcvpm_count, *Rcvpm_offset;
+
+  void pmforce_uniform_optimized_prepare_density(int mode, int *typelist);
+
+  void pmforce_uniform_optimized_readout_forces_or_potential_xy(fft_real *grid, int dim);
+
+  void pmforce_uniform_optimized_readout_forces_or_potential_xz(fft_real *grid, int dim);
+  void pmforce_uniform_optimized_readout_forces_or_potential_zy(fft_real *grid, int dim);
+#endif
+
+ public:
+  simparticles *Sp;
+
+  void pm_init_periodic(simparticles *Sp_ptr);
+  void pmforce_periodic(int mode, int *typelist);
+
+  void calculate_power_spectra(int num);
+
+  static double growthfactor_integrand(double a, void *param)
+  {
+    return pow(a / (All.Omega0 + (1 - All.Omega0 - All.OmegaLambda) * a + All.OmegaLambda * a * a * a), 1.5);
+  }
+
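+  /* linear_growth(a) evaluates the standard growth-factor integral D(a) ~ H(a) * \int_0^a da' / [a' H(a')]^3
+     (up to an overall normalization that cancels in linear_growth_factor); it is used to record the growth
+     factor between the current epoch and a=1 in the power spectrum output */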
+  double linear_growth_factor(double astart, double aend) { return linear_growth(aend) / linear_growth(astart); }
+
+  double linear_growth(double a)
+  {
+    double hubble_a = sqrt(All.Omega0 / (a * a * a) + (1 - All.Omega0 - All.OmegaLambda) / (a * a) + All.OmegaLambda);
+
+    const int worksize = 100000;
+
+    double result, abserr;
+    gsl_function F;
+
+    gsl_integration_workspace *workspace = gsl_integration_workspace_alloc(worksize);
+    F.function                           = &growthfactor_integrand;
+
+    gsl_integration_qag(&F, 0, a, 0, 1.0e-8, worksize, GSL_INTEG_GAUSS41, workspace, &result, &abserr);
+
+    gsl_integration_workspace_free(workspace);
+
+    return hubble_a * result;
+  }
+#endif
+};
+
+#endif
+
+#endif
diff --git a/src/sort/cxxsort.h b/src/sort/cxxsort.h
new file mode 100644
index 0000000000000000000000000000000000000000..72416e5ea2140c06aa2d962862df5831f04a1c24
--- /dev/null
+++ b/src/sort/cxxsort.h
@@ -0,0 +1,58 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  cxxsort.h
+ *  \brief various sort routines
+ */
+
+#ifndef GADGET4_CXXSORT_H
+#define GADGET4_CXXSORT_H
+
+#include <algorithm>
+
+#include "../data/allvars.h"
+#include "../data/mymalloc.h"
+#include "../logs/logs.h"
+
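+/* recursive merge sort that alternates between the input range and a scratch buffer of equal size:
+   each level sorts the two halves into the opposite storage and then merges them into the requested
+   target, so the flag res_into_buf selects whether the sorted result of this level ends up in buf
+   or in [begin, end) */
+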
+template <typename T, typename Tcomp>
+void mycxxsort_internal_serial(T *begin, T *end, T *buf, bool res_into_buf, Tcomp comp)
+{
+  std::size_t n = end - begin;
+  if(n <= 1)
+    {
+      if((n == 1) && res_into_buf)
+        *buf = *begin;
+      return;
+    }
+
+  mycxxsort_internal_serial(begin, begin + n / 2, buf, !res_into_buf, comp);
+  mycxxsort_internal_serial(begin + n / 2, end, buf + n / 2, !res_into_buf, comp);
+
+  res_into_buf ? std::merge(begin, begin + n / 2, begin + n / 2, begin + n, buf, comp)
+               : std::merge(buf, buf + n / 2, buf + n / 2, buf + n, begin, comp);
+}
+
+template <typename T, typename Tcomp>
+double mycxxsort(T *begin, T *end, Tcomp comp)
+{
+  if(end - begin <= 1)
+    return 0.;
+
+  double t0 = Logs.second();
+
+  T *buf = (T *)Mem.mymalloc("buf", (end - begin) * sizeof(T));
+
+  mycxxsort_internal_serial(begin, end, buf, false, comp);
+
+  Mem.myfree(buf);
+
+  // FIXME: verification, temporary
+  //  if (!std::is_sorted(begin,end,comp)) Terminate ("sort error!");
+
+  return Logs.timediff(t0, Logs.second());
+}
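+
+/* A minimal usage sketch: mycxxsort() behaves like std::sort on a raw range, but draws its
+ * scratch space from the Mem allocator and returns the wall-clock time spent sorting. The
+ * guard macro CXXSORT_USAGE_EXAMPLE and the helper below are purely illustrative and are not
+ * referenced anywhere else in the code.
+ */
+#ifdef CXXSORT_USAGE_EXAMPLE
+inline double cxxsort_usage_example(void)
+{
+  int vals[8] = {5, 1, 7, 3, 2, 8, 6, 4};
+
+  /* sort ascending with a lambda comparator; afterwards vals = {1, 2, 3, 4, 5, 6, 7, 8} */
+  return mycxxsort(vals, vals + 8, [](int a, int b) { return a < b; });
+}
+#endif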
+
+#endif
diff --git a/src/sort/parallel_sort.h b/src/sort/parallel_sort.h
new file mode 100644
index 0000000000000000000000000000000000000000..7869b4174c136b15d1cefa7b86e6c34077ef8219
--- /dev/null
+++ b/src/sort/parallel_sort.h
@@ -0,0 +1,559 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  parallel_sort.h
+ *
+ *  \brief parallel sort routine that leaves the number of elements per processor invariant
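+ *
+ *  The splitting elements between the tasks are found by an iterative median-of-medians
+ *  search such that every task keeps exactly as many elements as it started with; the data
+ *  are then exchanged with a single MPI_Alltoallv and sorted locally once more.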
+ */
+
+#ifndef PARALLEL_SORT_H
+#define PARALLEL_SORT_H
+
+#include "cxxsort.h"
+
+#include "../data/mymalloc.h"
+
+//#define CHECK_LOCAL_RANK
+
+template <typename It, typename Comp>
+class IdxComp__
+{
+ private:
+  It begin;
+  Comp comp;
+
+ public:
+  IdxComp__(It begin_, Comp comp_) : begin(begin_), comp(comp_) {}
+  bool operator()(std::size_t a, std::size_t b) const { return comp(*(begin + a), *(begin + b)); }
+};
+
+/*! Performs an indirect sort on the supplied iterator range and returns in
+    \a idx an array containing the indices of the smallest, second smallest,
+    third smallest, etc. element, according to \a comp. */
+template <typename It, typename T2, typename Comp>
+inline void buildIndex(It begin, It end, T2 *idx, Comp comp)
+{
+  using namespace std;
+  T2 num = end - begin;
+  for(T2 i = 0; i < num; ++i)
+    idx[i] = i;
+  mycxxsort(idx, idx + num, IdxComp__<It, Comp>(begin, comp));
+}
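+
+/* For example, for the values {30, 10, 20} and comp given by operator<, buildIndex() leaves
+ * the input range untouched and fills idx = {1, 2, 0}, i.e. the positions of the smallest,
+ * second smallest and largest element. */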
+
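+/* Determines the local rank of the given element among the nmemb local elements in base[],
+ * i.e. how many local elements order below it, using a binary search restricted to the index
+ * range [left, right). Ties under comp are broken by comparing the element's original global
+ * rank (tie_breaking_rank) with the global rank mid + noffs_thistask of the local candidate.
+ * The result is returned in *loc.
+ */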
+template <typename T, typename Comp>
+void get_local_rank(const T &element, std::size_t tie_breaking_rank, const T *base, size_t nmemb, size_t noffs_thistask,
+                    long long left, long long right, size_t *loc, Comp comp)
+{
+  if(right < left)
+    Terminate("right < left");
+
+  if(left == 0 && right == (int)nmemb + 1)
+    {
+      if(comp(base[nmemb - 1], element))
+        {
+          *loc = nmemb;
+          return;
+        }
+      else if(comp(element, base[0]))
+        {
+          *loc = 0;
+          return;
+        }
+    }
+
+  if(right == left) /* looks like we already converged to the proper rank */
+    {
+      *loc = left;
+    }
+  else
+    {
+      if(comp(base[right - 1], element)) /* the last element is smaller, hence all elements are on the left */
+        *loc = (right - 1) + 1;
+      else if(comp(element, base[left])) /* the first element is already larger, hence no element is on the left */
+        *loc = left;
+      else
+        {
+          while(right > left)
+            {
+              long long mid = ((right - 1) + left) / 2;
+
+              int cmp = comp(base[mid], element) ? -1 : (comp(element, base[mid]) ? +1 : 0);
+              if(cmp == 0)
+                {
+                  if(mid + noffs_thistask < tie_breaking_rank)
+                    cmp = -1;
+                  else if(mid + noffs_thistask > tie_breaking_rank)
+                    cmp = +1;
+                }
+
+              if(cmp == 0) /* element has exactly been found */
+                {
+                  *loc = mid;
+                  break;
+                }
+
+              if((right - 1) == left) /* element is not on this CPU */
+                {
+                  if(cmp < 0)
+                    *loc = mid + 1;
+                  else
+                    *loc = mid;
+                  break;
+                }
+
+              if(cmp < 0)
+                {
+                  left = mid + 1;
+                }
+              else
+                {
+                  if((right - 1) == left + 1)
+                    {
+                      if(mid != left)
+                        Terminate("Can't be: -->left=%lld  right=%lld\n", left, right);
+
+                      *loc = left;
+                      break;
+                    }
+
+                  right = mid;
+                }
+            }
+        }
+    }
+}
+
+#ifdef CHECK_LOCAL_RANK
+template <typename T, typename Comp>
+inline void check_local_rank(const T &element,                /* element of which we want the rank */
+                             size_t tie_breaking_rank,        /* the initial global rank of this element (needed for breaking ties) */
+                             const T *base,                   /* base address of local data */
+                             size_t nmemb,                    /* number and size of local data */
+                             size_t noffs_thistask,           /* cumulative length of data on lower tasks */
+                             long long left, long long right, /* range of elements on local task that may hold the element */
+                             size_t loc, Comp comp)           /* user-specified  comparison function */
+{
+  long long count = 0;
+
+  for(size_t i = 0; i < nmemb; i++)
+    {
+      int cmp = comp(base[i], element) ? -1 : (comp(element, base[i]) ? +1 : 0);
+
+      if(cmp == 0)
+        {
+          if(noffs_thistask + i < tie_breaking_rank)
+            cmp = -1;
+        }
+
+      if(cmp < 0)
+        count++;
+    }
+
+  if(count != (long long)loc)
+    Terminate("Inconsistency: loc=%lld count=%lld  left=%lld right=%lld  nmemb=%lld\n", (long long)loc, count, left, right,
+              (long long)nmemb);
+}
+#endif
+
+template <typename T, typename Comp>
+inline double mycxxsort_parallel(T *begin, T *end, Comp comp, MPI_Comm comm)
+{
+  const int MAX_ITER_PARALLEL_SORT = 500;
+  int ranks_not_found, Local_ThisTask, Local_NTask, Color, new_max_loc;
+  size_t tie_breaking_rank, new_tie_breaking_rank, rank;
+  MPI_Comm MPI_CommLocal;
+
+  double ta    = Logs.second();
+  size_t nmemb = end - begin;
+  size_t size  = sizeof(T);
+  /* do a serial sort of the local data up front */
+  mycxxsort(begin, end, comp);
+
+  /* we create a communicator that contains just those tasks with nmemb > 0. This makes
+   *  it easier to deal with CPUs that do not hold any data.
+   */
+  if(nmemb)
+    Color = 1;
+  else
+    Color = 0;
+
+  int thistask;
+  MPI_Comm_rank(comm, &thistask);
+
+  MPI_Comm_split(comm, Color, thistask, &MPI_CommLocal);
+  MPI_Comm_rank(MPI_CommLocal, &Local_ThisTask);
+  MPI_Comm_size(MPI_CommLocal, &Local_NTask);
+
+  if(Local_NTask > 1 && Color == 1)
+    {
+      size_t *nlist = (size_t *)Mem.mymalloc("nlist", Local_NTask * sizeof(size_t));
+      size_t *noffs = (size_t *)Mem.mymalloc("noffs", Local_NTask * sizeof(size_t));
+
+      MPI_Allgather(&nmemb, sizeof(size_t), MPI_BYTE, nlist, sizeof(size_t), MPI_BYTE, MPI_CommLocal);
+
+      noffs[0] = 0;
+      for(int i = 1; i < Local_NTask; i++)
+        noffs[i] = noffs[i - 1] + nlist[i - 1];
+
+      T *element_guess                  = (T *)Mem.mymalloc("element_guess", Local_NTask * size);
+      size_t *element_tie_breaking_rank = (size_t *)Mem.mymalloc("element_tie_breaking_rank", Local_NTask * sizeof(size_t));
+      size_t *desired_glob_rank         = (size_t *)Mem.mymalloc("desired_glob_rank", Local_NTask * sizeof(size_t));
+      size_t *current_glob_rank         = (size_t *)Mem.mymalloc("current_glob_rank", Local_NTask * sizeof(size_t));
+      size_t *current_loc_rank          = (size_t *)Mem.mymalloc("current_loc_rank", Local_NTask * sizeof(size_t));
+      long long *range_left             = (long long *)Mem.mymalloc("range_left", Local_NTask * sizeof(long long));
+      long long *range_right            = (long long *)Mem.mymalloc("range_right", Local_NTask * sizeof(long long));
+      int *max_loc                      = (int *)Mem.mymalloc("max_loc", Local_NTask * sizeof(int));
+
+      size_t *list           = (size_t *)Mem.mymalloc("list", Local_NTask * sizeof(size_t));
+      size_t *range_len_list = (size_t *)Mem.mymalloc("range_len_list", Local_NTask * sizeof(long long));
+      T median_element;
+      T *median_element_list                = (T *)Mem.mymalloc("median_element_list", Local_NTask * size);
+      size_t *tie_breaking_rank_list        = (size_t *)Mem.mymalloc("tie_breaking_rank_list", Local_NTask * sizeof(size_t));
+      int *index_list                       = (int *)Mem.mymalloc("index_list", Local_NTask * sizeof(int));
+      int *max_loc_list                     = (int *)Mem.mymalloc("max_loc_list", Local_NTask * sizeof(int));
+      size_t *source_range_len_list         = (size_t *)Mem.mymalloc("source_range_len_list", Local_NTask * sizeof(long long));
+      size_t *source_tie_breaking_rank_list = (size_t *)Mem.mymalloc("source_tie_breaking_rank_list", Local_NTask * sizeof(long long));
+      T *source_median_element_list         = (T *)Mem.mymalloc("source_median_element_list", Local_NTask * size);
+      T new_element_guess;
+
+      for(int i = 0; i < Local_NTask - 1; i++)
+        {
+          desired_glob_rank[i] = noffs[i + 1];
+          current_glob_rank[i] = 0;
+          range_left[i]        = 0;     /* first element that it can be */
+          range_right[i]       = nmemb; /* first element that it can not be */
+        }
+
+      /* now we determine the first split element guess, which is the same for all divisions in the first iteration */
+
+      /* find the median of each processor, and then take the median among those values.
+       * This should work reasonably well even for extremely skewed distributions
+       */
+      long long range_len = range_right[0] - range_left[0];
+
+      if(range_len >= 1)
+        {
+          long long mid     = (range_left[0] + range_right[0]) / 2;
+          median_element    = begin[mid];
+          tie_breaking_rank = mid + noffs[Local_ThisTask];
+        }
+
+      MPI_Gather(&range_len, sizeof(long long), MPI_BYTE, range_len_list, sizeof(long long), MPI_BYTE, 0, MPI_CommLocal);
+      MPI_Gather(&median_element, size, MPI_BYTE, median_element_list, size, MPI_BYTE, 0, MPI_CommLocal);
+      MPI_Gather(&tie_breaking_rank, sizeof(size_t), MPI_BYTE, tie_breaking_rank_list, sizeof(size_t), MPI_BYTE, 0, MPI_CommLocal);
+
+      if(Local_ThisTask == 0)
+        {
+          for(int j = 0; j < Local_NTask; j++)
+            max_loc_list[j] = j;
+
+          /* eliminate the elements that are undefined because the corresponding CPU has zero range left */
+          int nleft = Local_NTask;
+
+          for(int j = 0; j < nleft; j++)
+            {
+              if(range_len_list[j] < 1)
+                {
+                  range_len_list[j] = range_len_list[nleft - 1];
+                  if(range_len_list[nleft - 1] >= 1 && j != (nleft - 1))
+                    {
+                      median_element_list[j]    = median_element_list[nleft - 1];
+                      tie_breaking_rank_list[j] = tie_breaking_rank_list[nleft - 1];
+                      max_loc_list[j]           = max_loc_list[nleft - 1];
+                    }
+
+                  nleft--;
+                  j--;
+                }
+            }
+
+          /* do a serial sort of the remaining elements (indirectly, so that we have the order of tie breaking list as well) */
+          buildIndex(median_element_list, median_element_list + nleft, index_list, comp);
+
+          /* now select the median of the medians */
+          int mid                      = nleft / 2;
+          element_guess[0]             = median_element_list[index_list[mid]];
+          element_tie_breaking_rank[0] = tie_breaking_rank_list[index_list[mid]];
+          max_loc[0]                   = max_loc_list[index_list[mid]];
+        }
+
+      MPI_Bcast(element_guess, size, MPI_BYTE, 0, MPI_CommLocal);
+      MPI_Bcast(&element_tie_breaking_rank[0], sizeof(size_t), MPI_BYTE, 0, MPI_CommLocal);
+      MPI_Bcast(&max_loc[0], 1, MPI_INT, 0, MPI_CommLocal);
+
+      for(int i = 1; i < Local_NTask - 1; i++)
+        {
+          element_guess[i]             = element_guess[0];
+          element_tie_breaking_rank[i] = element_tie_breaking_rank[0];
+          max_loc[i]                   = max_loc[0];
+        }
+
+      int iter = 0;
+
+      do
+        {
+          for(int i = 0; i < Local_NTask - 1; i++)
+            {
+              if(current_glob_rank[i] != desired_glob_rank[i])
+                {
+                  get_local_rank(element_guess[i], element_tie_breaking_rank[i], begin, nmemb, noffs[Local_ThisTask], range_left[i],
+                                 range_right[i], &current_loc_rank[i], comp);
+
+#ifdef CHECK_LOCAL_RANK
+                  check_local_rank(element_guess[i], element_tie_breaking_rank[i], begin, nmemb, noffs[Local_ThisTask], range_left[i],
+                                   range_right[i], current_loc_rank[i], comp);
+#endif
+                }
+            }
+
+          /* now compute the global ranks by summing the local ranks */
+          /* Note: the last element in current_loc_rank is not defined. It will be summed by the last processor, and stored in the last
+           * element of current_glob_rank */
+          MPI_Alltoall(current_loc_rank, sizeof(size_t), MPI_BYTE, list, sizeof(size_t), MPI_BYTE, MPI_CommLocal);
+          rank = 0;
+          for(int j = 0; j < Local_NTask; j++)
+            rank += list[j];
+          MPI_Allgather(&rank, sizeof(size_t), MPI_BYTE, current_glob_rank, sizeof(size_t), MPI_BYTE, MPI_CommLocal);
+
+          ranks_not_found = 0;
+          for(int i = 0; i < Local_NTask - 1; i++)
+            {
+              if(current_glob_rank[i] != desired_glob_rank[i]) /* here we're not yet done */
+                {
+                  ranks_not_found++;
+
+                  if(current_glob_rank[i] < desired_glob_rank[i])
+                    {
+                      range_left[i] = current_loc_rank[i];
+
+                      if(Local_ThisTask == max_loc[i])
+                        range_left[i]++;
+                    }
+
+                  if(current_glob_rank[i] > desired_glob_rank[i])
+                    range_right[i] = current_loc_rank[i];
+                }
+            }
+
+          /* now we need to determine new element guesses */
+          for(int i = 0; i < Local_NTask - 1; i++)
+            {
+              if(current_glob_rank[i] != desired_glob_rank[i]) /* here we're not yet done */
+                {
+                  /* find the median of each processor, and then take the median among those values.
+                   * This should work reasonably well even for extremely skewed distributions
+                   */
+                  source_range_len_list[i] = range_right[i] - range_left[i];
+
+                  if(source_range_len_list[i] >= 1)
+                    {
+                      long long middle                 = (range_left[i] + range_right[i]) / 2;
+                      source_median_element_list[i]    = begin[middle];
+                      source_tie_breaking_rank_list[i] = middle + noffs[Local_ThisTask];
+                    }
+                }
+            }
+
+          MPI_Alltoall(source_range_len_list, sizeof(long long), MPI_BYTE, range_len_list, sizeof(long long), MPI_BYTE, MPI_CommLocal);
+          MPI_Alltoall(source_median_element_list, size, MPI_BYTE, median_element_list, size, MPI_BYTE, MPI_CommLocal);
+          MPI_Alltoall(source_tie_breaking_rank_list, sizeof(size_t), MPI_BYTE, tie_breaking_rank_list, sizeof(size_t), MPI_BYTE,
+                       MPI_CommLocal);
+
+          if(Local_ThisTask < Local_NTask - 1)
+            {
+              if(current_glob_rank[Local_ThisTask] !=
+                 desired_glob_rank[Local_ThisTask]) /* in this case we're not yet done for this split point */
+                {
+                  for(int j = 0; j < Local_NTask; j++)
+                    max_loc_list[j] = j;
+
+                  /* eliminate the elements that are undefined because the corresponding CPU has zero range left */
+                  int nleft = Local_NTask;
+
+                  for(int j = 0; j < nleft; j++)
+                    {
+                      if(range_len_list[j] < 1)
+                        {
+                          range_len_list[j] = range_len_list[nleft - 1];
+                          if(range_len_list[nleft - 1] >= 1 && j != (nleft - 1))
+                            {
+                              median_element_list[j]    = median_element_list[nleft - 1];
+                              tie_breaking_rank_list[j] = tie_breaking_rank_list[nleft - 1];
+                              max_loc_list[j]           = max_loc_list[nleft - 1];
+                            }
+
+                          nleft--;
+                          j--;
+                        }
+                    }
+
+                  if((iter & 1))
+                    {
+                      size_t max_range = 0, maxj = 0;
+
+                      for(int j = 0; j < nleft; j++)
+                        if(range_len_list[j] > max_range)
+                          {
+                            max_range = range_len_list[j];
+                            maxj      = j;
+                          }
+
+                      /* now select the median element from the task which has the largest range */
+                      new_element_guess     = median_element_list[maxj];
+                      new_tie_breaking_rank = tie_breaking_rank_list[maxj];
+                      new_max_loc           = max_loc_list[maxj];
+                    }
+                  else
+                    {
+                      /* do a serial sort of the remaining elements (indirectly, so that we have the order of tie breaking list as
+                       * well) */
+                      buildIndex(median_element_list, median_element_list + nleft, index_list, comp);
+
+                      /* now select the median of the medians */
+                      int mid               = nleft / 2;
+                      new_element_guess     = median_element_list[index_list[mid]];
+                      new_tie_breaking_rank = tie_breaking_rank_list[index_list[mid]];
+                      new_max_loc           = max_loc_list[index_list[mid]];
+                    }
+                }
+              else
+                {
+                  /* in order to preserve existing guesses */
+                  new_element_guess     = element_guess[Local_ThisTask];
+                  new_tie_breaking_rank = element_tie_breaking_rank[Local_ThisTask];
+                  new_max_loc           = max_loc[Local_ThisTask];
+                }
+            }
+
+          MPI_Allgather(&new_element_guess, size, MPI_BYTE, element_guess, size, MPI_BYTE, MPI_CommLocal);
+          MPI_Allgather(&new_tie_breaking_rank, sizeof(size_t), MPI_BYTE, element_tie_breaking_rank, sizeof(size_t), MPI_BYTE,
+                        MPI_CommLocal);
+          MPI_Allgather(&new_max_loc, 1, MPI_INT, max_loc, 1, MPI_INT, MPI_CommLocal);
+
+          iter++;
+
+          if(iter > (MAX_ITER_PARALLEL_SORT - 100) && Local_ThisTask == 0)
+            {
+              printf("PSORT: iter=%d: ranks_not_found=%d  Local_NTask=%d\n", iter, ranks_not_found, Local_NTask);
+              myflush(stdout);
+              if(iter > MAX_ITER_PARALLEL_SORT)
+                Terminate("can't find the split points. That's odd");
+            }
+        }
+      while(ranks_not_found);
+
+      Mem.myfree(source_median_element_list);
+      Mem.myfree(source_tie_breaking_rank_list);
+      Mem.myfree(source_range_len_list);
+      Mem.myfree(max_loc_list);
+      Mem.myfree(index_list);
+      Mem.myfree(tie_breaking_rank_list);
+      Mem.myfree(median_element_list);
+
+      /* At this point we have found all the elements corresponding to the desired split points */
+      /* we can now go ahead and determine how many elements of the local CPU have to go to each other CPU */
+
+      if(nmemb * size > (1LL << 31))
+        Terminate("currently, local data must be smaller than 2 GB");
+      /* note: to lift this limitation, the send/recv count arrays would have to be made 64-bit,
+       * and the MPI data exchange through MPI_Alltoall would have to be modified such that buffers > 2 GB become possible
+       */
+
+      int *send_count  = (int *)Mem.mymalloc("send_count", Local_NTask * sizeof(int));
+      int *recv_count  = (int *)Mem.mymalloc("recv_count", Local_NTask * sizeof(int));
+      int *send_offset = (int *)Mem.mymalloc("send_offset", Local_NTask * sizeof(int));
+      int *recv_offset = (int *)Mem.mymalloc("recv_offset", Local_NTask * sizeof(int));
+
+      for(int i = 0; i < Local_NTask; i++)
+        send_count[i] = 0;
+
+      int target = 0;
+
+      for(size_t i = 0; i < nmemb; i++)
+        {
+          while(target < Local_NTask - 1)
+            {
+              int cmp = comp(begin[i], element_guess[target]) ? -1 : (comp(element_guess[target], begin[i]) ? +1 : 0);
+              if(cmp == 0)
+                {
+                  if(i + noffs[Local_ThisTask] < element_tie_breaking_rank[target])
+                    cmp = -1;
+                  else if(i + noffs[Local_ThisTask] > element_tie_breaking_rank[target])
+                    cmp = +1;
+                }
+              if(cmp >= 0)
+                target++;
+              else
+                break;
+            }
+          send_count[target]++;
+        }
+
+      MPI_Alltoall(send_count, 1, MPI_INT, recv_count, 1, MPI_INT, MPI_CommLocal);
+
+      size_t nimport = 0;
+
+      recv_offset[0] = 0;
+      send_offset[0] = 0;
+      for(int j = 0; j < Local_NTask; j++)
+        {
+          nimport += recv_count[j];
+
+          if(j > 0)
+            {
+              send_offset[j] = send_offset[j - 1] + send_count[j - 1];
+              recv_offset[j] = recv_offset[j - 1] + recv_count[j - 1];
+            }
+        }
+
+      if(nimport != nmemb)
+        Terminate("nimport=%lld != nmemb=%lld", (long long)nimport, (long long)nmemb);
+
+      for(int j = 0; j < Local_NTask; j++)
+        {
+          send_count[j] *= size;
+          recv_count[j] *= size;
+
+          send_offset[j] *= size;
+          recv_offset[j] *= size;
+        }
+
+      T *basetmp = (T *)Mem.mymalloc("basetmp", nmemb * size);
+
+      /* exchange the data */
+      MPI_Alltoallv(begin, send_count, send_offset, MPI_BYTE, basetmp, recv_count, recv_offset, MPI_BYTE, MPI_CommLocal);
+
+      memcpy(static_cast<void *>(begin), static_cast<void *>(basetmp), nmemb * size);
+      Mem.myfree(basetmp);
+
+      mycxxsort(begin, begin + nmemb, comp);
+
+      Mem.myfree(recv_offset);
+      Mem.myfree(send_offset);
+      Mem.myfree(recv_count);
+      Mem.myfree(send_count);
+
+      Mem.myfree(range_len_list);
+      Mem.myfree(list);
+      Mem.myfree(max_loc);
+      Mem.myfree(range_right);
+      Mem.myfree(range_left);
+      Mem.myfree(current_loc_rank);
+      Mem.myfree(current_glob_rank);
+      Mem.myfree(desired_glob_rank);
+      Mem.myfree(element_tie_breaking_rank);
+      Mem.myfree(element_guess);
+      Mem.myfree(noffs);
+      Mem.myfree(nlist);
+    }
+
+  MPI_Comm_free(&MPI_CommLocal);
+
+  double tb = Logs.second();
+  return Logs.timediff(ta, tb);
+}
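+
+/* A minimal usage sketch: the guard macro PARALLEL_SORT_USAGE_EXAMPLE and the helper below are
+ * purely illustrative and are not used anywhere else. Every rank passes its local slice and the
+ * shared communicator; afterwards each rank still holds nlocal elements, and visiting the ranks
+ * in order of increasing rank number traverses the globally sorted sequence.
+ */
+#ifdef PARALLEL_SORT_USAGE_EXAMPLE
+inline double parallel_sort_usage_example(double *data, size_t nlocal, MPI_Comm comm)
+{
+  /* sort the distributed array of doubles in ascending order; the return value is the elapsed time */
+  return mycxxsort_parallel(data, data + nlocal, [](double a, double b) { return a < b; }, comm);
+}
+#endif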
+
+#endif
diff --git a/src/sort/peano.cc b/src/sort/peano.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cf1ed755f74f2f9342dcb2ddd27c5597e4ab1841
--- /dev/null
+++ b/src/sort/peano.cc
@@ -0,0 +1,182 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  peano.cc
+ *
+ *  \brief routines for computing Peano-Hilbert keys and for bringing particles into this order
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <algorithm>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../sort/cxxsort.h"
+#include "../sort/peano.h"
+#include "../system/system.h"
+
+namespace
+{
+struct peano_hilbert_data
+{
+  peanokey key;
+  int index;
+};
+
+/*
+struct peano_comparator
+{
+bool operator() (const peano_hilbert_data & a, const peano_hilbert_data & b)
+{
+  return a.key < b.key;
+}
+};
+*/
+
+const unsigned char rottable3[48][8] = {
+    {36, 28, 25, 27, 10, 10, 25, 27}, {29, 11, 24, 24, 37, 11, 26, 26}, {8, 8, 25, 27, 30, 38, 25, 27},
+    {9, 39, 24, 24, 9, 31, 26, 26},   {40, 24, 44, 32, 40, 6, 44, 6},   {25, 7, 33, 7, 41, 41, 45, 45},
+    {4, 42, 4, 46, 26, 42, 34, 46},   {43, 43, 47, 47, 5, 27, 5, 35},   {33, 35, 36, 28, 33, 35, 2, 2},
+    {32, 32, 29, 3, 34, 34, 37, 3},   {33, 35, 0, 0, 33, 35, 30, 38},   {32, 32, 1, 39, 34, 34, 1, 31},
+    {24, 42, 32, 46, 14, 42, 14, 46}, {43, 43, 47, 47, 25, 15, 33, 15}, {40, 12, 44, 12, 40, 26, 44, 34},
+    {13, 27, 13, 35, 41, 41, 45, 45}, {28, 41, 28, 22, 38, 43, 38, 22}, {42, 40, 23, 23, 29, 39, 29, 39},
+    {41, 36, 20, 36, 43, 30, 20, 30}, {37, 31, 37, 31, 42, 40, 21, 21}, {28, 18, 28, 45, 38, 18, 38, 47},
+    {19, 19, 46, 44, 29, 39, 29, 39}, {16, 36, 45, 36, 16, 30, 47, 30}, {37, 31, 37, 31, 17, 17, 46, 44},
+    {12, 4, 1, 3, 34, 34, 1, 3},      {5, 35, 0, 0, 13, 35, 2, 2},      {32, 32, 1, 3, 6, 14, 1, 3},
+    {33, 15, 0, 0, 33, 7, 2, 2},      {16, 0, 20, 8, 16, 30, 20, 30},   {1, 31, 9, 31, 17, 17, 21, 21},
+    {28, 18, 28, 22, 2, 18, 10, 22},  {19, 19, 23, 23, 29, 3, 29, 11},  {9, 11, 12, 4, 9, 11, 26, 26},
+    {8, 8, 5, 27, 10, 10, 13, 27},    {9, 11, 24, 24, 9, 11, 6, 14},    {8, 8, 25, 15, 10, 10, 25, 7},
+    {0, 18, 8, 22, 38, 18, 38, 22},   {19, 19, 23, 23, 1, 39, 9, 39},   {16, 36, 20, 36, 16, 2, 20, 10},
+    {37, 3, 37, 11, 17, 17, 21, 21},  {4, 17, 4, 46, 14, 19, 14, 46},   {18, 16, 47, 47, 5, 15, 5, 15},
+    {17, 12, 44, 12, 19, 6, 44, 6},   {13, 7, 13, 7, 18, 16, 45, 45},   {4, 42, 4, 21, 14, 42, 14, 23},
+    {43, 43, 22, 20, 5, 15, 5, 15},   {40, 12, 21, 12, 40, 6, 23, 6},   {13, 7, 13, 7, 41, 41, 22, 20}};
+
+const unsigned char subpix3[48][8] = {
+    {0, 7, 1, 6, 3, 4, 2, 5}, {7, 4, 6, 5, 0, 3, 1, 2}, {4, 3, 5, 2, 7, 0, 6, 1}, {3, 0, 2, 1, 4, 7, 5, 6}, {1, 0, 6, 7, 2, 3, 5, 4},
+    {0, 3, 7, 4, 1, 2, 6, 5}, {3, 2, 4, 5, 0, 1, 7, 6}, {2, 1, 5, 6, 3, 0, 4, 7}, {6, 1, 7, 0, 5, 2, 4, 3}, {1, 2, 0, 3, 6, 5, 7, 4},
+    {2, 5, 3, 4, 1, 6, 0, 7}, {5, 6, 4, 7, 2, 1, 3, 0}, {7, 6, 0, 1, 4, 5, 3, 2}, {6, 5, 1, 2, 7, 4, 0, 3}, {5, 4, 2, 3, 6, 7, 1, 0},
+    {4, 7, 3, 0, 5, 6, 2, 1}, {6, 7, 5, 4, 1, 0, 2, 3}, {7, 0, 4, 3, 6, 1, 5, 2}, {0, 1, 3, 2, 7, 6, 4, 5}, {1, 6, 2, 5, 0, 7, 3, 4},
+    {2, 3, 1, 0, 5, 4, 6, 7}, {3, 4, 0, 7, 2, 5, 1, 6}, {4, 5, 7, 6, 3, 2, 0, 1}, {5, 2, 6, 1, 4, 3, 7, 0}, {7, 0, 6, 1, 4, 3, 5, 2},
+    {0, 3, 1, 2, 7, 4, 6, 5}, {3, 4, 2, 5, 0, 7, 1, 6}, {4, 7, 5, 6, 3, 0, 2, 1}, {6, 7, 1, 0, 5, 4, 2, 3}, {7, 4, 0, 3, 6, 5, 1, 2},
+    {4, 5, 3, 2, 7, 6, 0, 1}, {5, 6, 2, 1, 4, 7, 3, 0}, {1, 6, 0, 7, 2, 5, 3, 4}, {6, 5, 7, 4, 1, 2, 0, 3}, {5, 2, 4, 3, 6, 1, 7, 0},
+    {2, 1, 3, 0, 5, 6, 4, 7}, {0, 1, 7, 6, 3, 2, 4, 5}, {1, 2, 6, 5, 0, 3, 7, 4}, {2, 3, 5, 4, 1, 0, 6, 7}, {3, 0, 4, 7, 2, 1, 5, 6},
+    {1, 0, 2, 3, 6, 7, 5, 4}, {0, 7, 3, 4, 1, 6, 2, 5}, {7, 6, 4, 5, 0, 1, 3, 2}, {6, 1, 5, 2, 7, 0, 4, 3}, {5, 4, 6, 7, 2, 3, 1, 0},
+    {4, 3, 7, 0, 5, 2, 6, 1}, {3, 2, 0, 1, 4, 5, 7, 6}, {2, 5, 1, 6, 3, 4, 0, 7}};
+
+}  // unnamed namespace
+
+/*! This function computes a Peano-Hilbert key for an integer triplet (x,y,z),
+ *  with x,y,z in the range between 0 and 2^bits-1.
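+ *
+ *  The key consists of 3*bits octant digits: in each iteration the (hs, is, ls) fields of the
+ *  key are shifted left by three bits as if they formed a single long word, the Hilbert-ordered
+ *  octant of the current refinement level is appended at the low end, and the rotation state is
+ *  advanced through the rottable3 lookup table.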
+ */
+peanokey peano_hilbert_key(MyIntPosType x, MyIntPosType y, MyIntPosType z, int bits)
+{
+  unsigned char rotation = 0;
+  peanokey key           = {0, 0, 0};
+
+  for(MyIntPosType mask = ((MyIntPosType)1) << (bits - 1); mask > 0; mask >>= 1)
+    {
+      unsigned char pix = ((x & mask) ? 4 : 0) | ((y & mask) ? 2 : 0) | ((z & mask) ? 1 : 0);
+
+      key.hs <<= 3;
+      key.hs |= (key.is & (~((~((MyIntPosType)0)) >> 3))) >> (BITS_FOR_POSITIONS - 3);
+
+      key.is <<= 3;
+      key.is |= (key.ls & (~((~((MyIntPosType)0)) >> 3))) >> (BITS_FOR_POSITIONS - 3);
+
+      key.ls <<= 3;
+      key.ls |= subpix3[rotation][pix];
+
+      rotation = rottable3[rotation][pix];
+    }
+
+  return key;
+}
+
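+/* Maps a single octant index pix (0..7) to its position along the Hilbert curve for the current
+ * rotation state and advances *rotation in place; this performs one step of the per-level
+ * transformation used in peano_hilbert_key().
+ */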
+unsigned char peano_incremental_key(unsigned char pix, unsigned char *rotation)
+{
+  unsigned char outpix = subpix3[*rotation][pix];
+  *rotation            = rottable3[*rotation][pix];
+
+  return outpix;
+}
+
+const unsigned char irottable3[48][8] = {
+    {28, 27, 27, 10, 10, 25, 25, 36}, {29, 24, 24, 11, 11, 26, 26, 37}, {30, 25, 25, 8, 8, 27, 27, 38},
+    {31, 26, 26, 9, 9, 24, 24, 39},   {32, 44, 44, 6, 6, 40, 40, 24},   {33, 45, 45, 7, 7, 41, 41, 25},
+    {34, 46, 46, 4, 4, 42, 42, 26},   {35, 47, 47, 5, 5, 43, 43, 27},   {36, 33, 33, 2, 2, 35, 35, 28},
+    {37, 34, 34, 3, 3, 32, 32, 29},   {38, 35, 35, 0, 0, 33, 33, 30},   {39, 32, 32, 1, 1, 34, 34, 31},
+    {24, 42, 42, 14, 14, 46, 46, 32}, {25, 43, 43, 15, 15, 47, 47, 33}, {26, 40, 40, 12, 12, 44, 44, 34},
+    {27, 41, 41, 13, 13, 45, 45, 35}, {41, 28, 28, 22, 22, 38, 38, 43}, {42, 29, 29, 23, 23, 39, 39, 40},
+    {43, 30, 30, 20, 20, 36, 36, 41}, {40, 31, 31, 21, 21, 37, 37, 42}, {47, 38, 38, 18, 18, 28, 28, 45},
+    {44, 39, 39, 19, 19, 29, 29, 46}, {45, 36, 36, 16, 16, 30, 30, 47}, {46, 37, 37, 17, 17, 31, 31, 44},
+    {12, 1, 1, 34, 34, 3, 3, 4},      {13, 2, 2, 35, 35, 0, 0, 5},      {14, 3, 3, 32, 32, 1, 1, 6},
+    {15, 0, 0, 33, 33, 2, 2, 7},      {0, 16, 16, 30, 30, 20, 20, 8},   {1, 17, 17, 31, 31, 21, 21, 9},
+    {2, 18, 18, 28, 28, 22, 22, 10},  {3, 19, 19, 29, 29, 23, 23, 11},  {4, 11, 11, 26, 26, 9, 9, 12},
+    {5, 8, 8, 27, 27, 10, 10, 13},    {6, 9, 9, 24, 24, 11, 11, 14},    {7, 10, 10, 25, 25, 8, 8, 15},
+    {8, 22, 22, 38, 38, 18, 18, 0},   {9, 23, 23, 39, 39, 19, 19, 1},   {10, 20, 20, 36, 36, 16, 16, 2},
+    {11, 21, 21, 37, 37, 17, 17, 3},  {19, 14, 14, 46, 46, 4, 4, 17},   {16, 15, 15, 47, 47, 5, 5, 18},
+    {17, 12, 12, 44, 44, 6, 6, 19},   {18, 13, 13, 45, 45, 7, 7, 16},   {21, 4, 4, 42, 42, 14, 14, 23},
+    {22, 5, 5, 43, 43, 15, 15, 20},   {23, 6, 6, 40, 40, 12, 12, 21},   {20, 7, 7, 41, 41, 13, 13, 22}};
+
+const unsigned char ipixtable3[48][8] = {
+    {1, 3, 7, 5, 4, 6, 2, 0}, {0, 2, 3, 1, 5, 7, 6, 4}, {4, 6, 2, 0, 1, 3, 7, 5}, {5, 7, 6, 4, 0, 2, 3, 1}, {3, 2, 6, 7, 5, 4, 0, 1},
+    {2, 6, 7, 3, 1, 5, 4, 0}, {6, 7, 3, 2, 0, 1, 5, 4}, {7, 3, 2, 6, 4, 0, 1, 5}, {2, 0, 4, 6, 7, 5, 1, 3}, {6, 4, 5, 7, 3, 1, 0, 2},
+    {7, 5, 1, 3, 2, 0, 4, 6}, {3, 1, 0, 2, 6, 4, 5, 7}, {0, 1, 5, 4, 6, 7, 3, 2}, {4, 0, 1, 5, 7, 3, 2, 6}, {5, 4, 0, 1, 3, 2, 6, 7},
+    {1, 5, 4, 0, 2, 6, 7, 3}, {1, 0, 2, 3, 7, 6, 4, 5}, {0, 4, 6, 2, 3, 7, 5, 1}, {4, 5, 7, 6, 2, 3, 1, 0}, {5, 1, 3, 7, 6, 2, 0, 4},
+    {7, 6, 4, 5, 1, 0, 2, 3}, {3, 7, 5, 1, 0, 4, 6, 2}, {2, 3, 1, 0, 4, 5, 7, 6}, {6, 2, 0, 4, 5, 1, 3, 7}, {0, 2, 6, 4, 5, 7, 3, 1},
+    {4, 6, 7, 5, 1, 3, 2, 0}, {5, 7, 3, 1, 0, 2, 6, 4}, {1, 3, 2, 0, 4, 6, 7, 5}, {1, 0, 4, 5, 7, 6, 2, 3}, {0, 4, 5, 1, 3, 7, 6, 2},
+    {4, 5, 1, 0, 2, 3, 7, 6}, {5, 1, 0, 4, 6, 2, 3, 7}, {3, 1, 5, 7, 6, 4, 0, 2}, {2, 0, 1, 3, 7, 5, 4, 6}, {6, 4, 0, 2, 3, 1, 5, 7},
+    {7, 5, 4, 6, 2, 0, 1, 3}, {2, 3, 7, 6, 4, 5, 1, 0}, {6, 2, 3, 7, 5, 1, 0, 4}, {7, 6, 2, 3, 1, 0, 4, 5}, {3, 7, 6, 2, 0, 4, 5, 1},
+    {5, 4, 6, 7, 3, 2, 0, 1}, {1, 5, 7, 3, 2, 6, 4, 0}, {0, 1, 3, 2, 6, 7, 5, 4}, {4, 0, 2, 6, 7, 3, 1, 5}, {3, 2, 0, 1, 5, 4, 6, 7},
+    {2, 6, 4, 0, 1, 5, 7, 3}, {6, 7, 5, 4, 0, 1, 3, 2}, {7, 3, 1, 5, 4, 0, 2, 6},
+};
+
+void peano_hilbert_key_inverse(peanokey key, int bits, MyIntPosType *x, MyIntPosType *y, MyIntPosType *z)
+{
+  for(int i = bits; i < BITS_FOR_POSITIONS; i++)
+    {
+      key.hs <<= 3;
+      key.hs |= (key.is & (~((~((MyIntPosType)0)) >> 3))) >> (BITS_FOR_POSITIONS - 3);
+
+      key.is <<= 3;
+      key.is |= (key.ls & (~((~((MyIntPosType)0)) >> 3))) >> (BITS_FOR_POSITIONS - 3);
+
+      key.ls <<= 3;
+    }
+
+  int rot = 24;
+
+  *x = *y = *z = 0;
+
+  for(int i = 0; i < bits; i++)
+    {
+      unsigned int keypart = (key.hs & (~((~((MyIntPosType)0)) >> 3))) >> (BITS_FOR_POSITIONS - 3);
+
+      int quad = ipixtable3[rot][keypart];
+
+      *x  = (*x << 1) + (quad >> 2);
+      *y  = (*y << 1) + ((quad & 2) >> 1);
+      *z  = (*z << 1) + (quad & 1);
+      rot = irottable3[rot][keypart];
+
+      key.hs <<= 3;
+      key.hs |= (key.is & (~((~((MyIntPosType)0)) >> 3))) >> (BITS_FOR_POSITIONS - 3);
+
+      key.is <<= 3;
+      key.is |= (key.ls & (~((~((MyIntPosType)0)) >> 3))) >> (BITS_FOR_POSITIONS - 3);
+
+      key.ls <<= 3;
+    }
+}
diff --git a/src/sort/peano.h b/src/sort/peano.h
new file mode 100644
index 0000000000000000000000000000000000000000..b01bbf30966e73427e68667c327d98c76899e1aa
--- /dev/null
+++ b/src/sort/peano.h
@@ -0,0 +1,20 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  peano.h
+ *
+ *  \brief declaration of function prototypes used for Peano-Hilbert keys
+ */
+
+#ifndef SORT_H
+#define SORT_H
+
+peanokey peano_hilbert_key(MyIntPosType x, MyIntPosType y, MyIntPosType z, int bits);
+void peano_hilbert_key_inverse(peanokey key, int bits, MyIntPosType *x, MyIntPosType *y, MyIntPosType *z);
+
+unsigned char peano_incremental_key(unsigned char pix, unsigned char *rotation);
+
+#endif
diff --git a/src/sph/artificial_viscosity.cc b/src/sph/artificial_viscosity.cc
new file mode 100644
index 0000000000000000000000000000000000000000..921e425255cd4ad34eb852390a888a43bcb8a42b
--- /dev/null
+++ b/src/sph/artificial_viscosity.cc
@@ -0,0 +1,166 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  artificial_viscosity.cc
+ *
+ *  \brief Calculates time-dependent artificial viscosity parameter
+ */
+
+#include "gadgetconfig.h"
+
+#include <gsl/gsl_linalg.h>
+
+#include "../data/sph_particle_data.h"
+
+/*! This file contains the function for the time-dependent artificial viscosity
+ */
+
+#ifdef IMPROVED_VELOCITY_GRADIENTS
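+/* Replaces the raw kernel-weighted velocity moments stored in dvel[][] by matrix-corrected
+ * velocity gradients: the accumulated position moment matrix (the dpos.* terms) is inverted
+ * (trivially in 1D, analytically in 2D, via a GSL LU decomposition in 3D) and applied to the
+ * velocity moments. If the moment matrix is singular, the code falls back to the standard
+ * density-normalized estimates for DivVel and CurlVel (and Rot in 3D).
+ */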
+void sph_particle_data::set_velocity_gradients(void)
+{
+#ifdef ONEDIMS
+  if(fabs(dpos.dx_dx) > 0)
+    {
+      dvel[0][0] = dvel[0][0] / dpos.dx_dx;
+      DivVel     = dvel[0][0];
+    }
+  else
+    {
+      DivVel = dvel[0][0] / Density;
+    }
+#elif defined(TWODIMS)
+  double det = dpos.dx_dx * dpos.dy_dy - dpos.dx_dy * dpos.dx_dy;
+  if(fabs(det) > 0)
+    {
+      double m11_inv = dpos.dy_dy / det;
+      double m12_inv = -dpos.dx_dy / det;
+      double m21_inv = -dpos.dx_dy / det;
+      double m22_inv = dpos.dx_dx / det;
+
+      double y11 = dvel[0][0];
+      double y21 = dvel[0][1];
+      double y12 = dvel[1][0];
+      double y22 = dvel[1][1];
+
+      dvel[0][0] = m11_inv * y11 + m12_inv * y21;
+      dvel[0][1] = m21_inv * y11 + m22_inv * y21;
+      dvel[1][0] = m11_inv * y12 + m12_inv * y22;
+      dvel[1][1] = m21_inv * y12 + m22_inv * y22;
+      DivVel     = dvel[0][0] + dvel[1][1];
+      CurlVel    = fabs(dvel[0][1] - dvel[1][0]);
+    }
+  else
+    {
+      // TODO check
+      DivVel  = (dvel[0][0] + dvel[1][1]) / Density;
+      CurlVel = fabs((dvel[1][0] - dvel[0][1]) / Density);
+    }
+#else
+  gsl_matrix* distance_matrix = gsl_matrix_alloc(3, 3);
+  gsl_matrix_set(distance_matrix, 0, 0, dpos.dx_dx);
+  gsl_matrix_set(distance_matrix, 0, 1, dpos.dx_dy);
+  gsl_matrix_set(distance_matrix, 0, 2, dpos.dx_dz);
+  gsl_matrix_set(distance_matrix, 1, 0, dpos.dx_dy);
+  gsl_matrix_set(distance_matrix, 1, 1, dpos.dy_dy);
+  gsl_matrix_set(distance_matrix, 1, 2, dpos.dy_dz);
+  gsl_matrix_set(distance_matrix, 2, 0, dpos.dx_dz);
+  gsl_matrix_set(distance_matrix, 2, 1, dpos.dy_dz);
+  gsl_matrix_set(distance_matrix, 2, 2, dpos.dz_dz);
+
+  int sign                = 1;
+  gsl_permutation* permut = gsl_permutation_alloc(3);
+  gsl_linalg_LU_decomp(distance_matrix, permut, &sign);
+
+  double det = gsl_linalg_LU_det(distance_matrix, sign);
+
+  if(fabs(det) > 0)
+    {
+      gsl_matrix* inv_distance_matrix = gsl_matrix_alloc(3, 3);
+      gsl_linalg_LU_invert(distance_matrix, permut, inv_distance_matrix);
+
+      double m_inv[3][3];
+      for(int i = 0; i < 3; i++)
+        {
+          for(int j = 0; j < 3; j++)
+            {
+              m_inv[i][j] = gsl_matrix_get(inv_distance_matrix, i, j);
+            }
+        }
+
+      double y[3][3];
+      for(int i = 0; i < 3; i++)
+        {
+          for(int j = 0; j < 3; j++)
+            {
+              y[i][j] = dvel[i][j];
+            }
+        }
+
+      for(int i = 0; i < 3; i++)
+        {
+          for(int j = 0; j < 3; j++)
+            {
+              dvel[i][j] = 0;
+              for(int k = 0; k < 3; k++)
+                {
+                  dvel[i][j] += m_inv[j][k] * y[k][i];
+                }
+            }
+        }
+
+      DivVel  = dvel[0][0] + dvel[1][1] + dvel[2][2];
+      Rot[0]  = dvel[2][1] - dvel[1][2];
+      Rot[1]  = dvel[0][2] - dvel[2][0];
+      Rot[2]  = dvel[1][0] - dvel[0][1];
+      CurlVel = sqrt(Rot[0] * Rot[0] + Rot[1] * Rot[1] + Rot[2] * Rot[2]);
+
+      gsl_permutation_free(permut);
+      gsl_matrix_free(distance_matrix);
+      gsl_matrix_free(inv_distance_matrix);
+    }
+  else
+    {
+      DivVel  = (dvel[0][0] + dvel[1][1] + dvel[2][2]) / Density;
+      Rot[0]  = (dvel[2][1] - dvel[1][2]) / Density;
+      Rot[1]  = (dvel[0][2] - dvel[2][0]) / Density;
+      Rot[2]  = (dvel[1][0] - dvel[0][1]) / Density;
+      CurlVel = sqrt(Rot[0] * Rot[0] + Rot[1] * Rot[1] + Rot[2] * Rot[2]);
+    }
+
+#endif
+}
+#endif
+
+#ifdef TIMEDEP_ART_VISC
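+/* Updates the time-dependent viscosity coefficient Alpha of this particle. A shock indicator
+ * S = max(-d(DivVel)/dt, 0) defines the target value
+ *
+ *    alpha_tar = ArtBulkViscConst * h^2 S / (h^2 S + Csnd^2),
+ *
+ * towards which Alpha jumps immediately if Alpha < alpha_tar, and otherwise decays exponentially
+ * on a timescale of order 10 Hsml / decayVel. A Balsara-type factor DivVel^2 / (DivVel^2 + CurlVel^2)
+ * suppresses the viscosity in shear-dominated flows (unless NO_SHEAR_VISCOSITY_LIMITER is set),
+ * and the decayed value is floored at All.AlphaMin.
+ */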
+void sph_particle_data::set_viscosity_coefficient(double dt)
+{
+  double dDivVel_dt = dt > 0 ? (DivVel - DivVelOld) / (dt) : 0;
+  dDivVel_dt *= All.cf_a2inv;
+  double shockIndicator = -dDivVel_dt > 0 ? -dDivVel_dt : 0;
+  double hsml           = Hsml * All.cf_atime;
+  double Hsml2          = hsml * hsml;
+  double alpha_tar      = (Hsml2 * shockIndicator) / (Hsml2 * shockIndicator + Csnd * Csnd) * All.ArtBulkViscConst;  // TODO check max
+
+  double DivVel2       = DivVel * DivVel;
+  double CurlVel2      = CurlVel * CurlVel;
+  double CsndOverHsml2 = (Csnd / Hsml) * (Csnd / Hsml);
+  double limiter       = DivVel2 / (DivVel2 + CurlVel2 + 0.00001 * CsndOverHsml2);
+#ifdef NO_SHEAR_VISCOSITY_LIMITER
+  limiter = 1.;
+#endif
+
+  if(Alpha < alpha_tar)
+    {
+      Alpha = alpha_tar * limiter;
+      return;
+    }
+  double DecayTime = 10. * Hsml / decayVel;
+  Alpha            = limiter * (alpha_tar + (Alpha - alpha_tar) * exp(-dt / DecayTime));
+  if(Alpha < All.AlphaMin)
+    Alpha = All.AlphaMin;
+}
+
+#endif
diff --git a/src/sph/density.cc b/src/sph/density.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8073180bef50013053f020e3b87bdacfb8432aae
--- /dev/null
+++ b/src/sph/density.cc
@@ -0,0 +1,1056 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  density.cc
+ *
+ *  \brief SPH density computation and smoothing length determination
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../logs/logs.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../ngbtree/ngbtree.h"
+#include "../sort/cxxsort.h"
+#include "../sph/kernel.h"
+#include "../sph/sph.h"
+#include "../system/system.h"
+
+/*! This file contains the function for the "first SPH loop", where the SPH densities and some
+ *  auxiliary quantities are computed.  There is also functionality that
+ *  iteratively corrects the smoothing length to the desired value.
+ */
+
+/* This function checks whether there is a spatial overlap between the (rectangular) enclosing box
+ * of the particles contained in a node, and the search region.
+ */
+inline int sph::sph_density_evaluate_particle_node_opening_criterion(pinfo &pdat, ngbnode *nop)
+{
+  if(nop->level <= LEVEL_ALWAYS_OPEN)  // always open the root node (note: full node length does not fit in the integer type)
+    return NODE_OPEN;
+
+  if(nop->Ti_Current != All.Ti_Current)
+    nop->drift_node(All.Ti_Current, Tp);
+
+  MyIntPosType left[3], right[3];
+
+  left[0]  = Tp->nearest_image_intpos_to_intpos_X(nop->center_offset_min[0] + nop->center[0], pdat.search_min[0]);
+  right[0] = Tp->nearest_image_intpos_to_intpos_X(nop->center_offset_max[0] + nop->center[0], pdat.search_min[0]);
+
+  /* check whether we can stop walking along this branch */
+  if(left[0] > pdat.search_range[0] && right[0] > left[0])
+    return NODE_DISCARD;
+
+  left[1]  = Tp->nearest_image_intpos_to_intpos_Y(nop->center_offset_min[1] + nop->center[1], pdat.search_min[1]);
+  right[1] = Tp->nearest_image_intpos_to_intpos_Y(nop->center_offset_max[1] + nop->center[1], pdat.search_min[1]);
+
+  /* check whether we can stop walking along this branch */
+  if(left[1] > pdat.search_range[1] && right[1] > left[1])
+    return NODE_DISCARD;
+
+  left[2]  = Tp->nearest_image_intpos_to_intpos_Z(nop->center_offset_min[2] + nop->center[2], pdat.search_min[2]);
+  right[2] = Tp->nearest_image_intpos_to_intpos_Z(nop->center_offset_max[2] + nop->center[2], pdat.search_min[2]);
+
+  /* check whether we can stop walking along this branch */
+  if(left[2] > pdat.search_range[2] && right[2] > left[2])
+    return NODE_DISCARD;
+
+  return NODE_OPEN;
+}
+
+/* Check whether the potential neighbor referenced by p/p_type/shmrank is inside the smoothing length, and if so,
+ * add it to the interaction list built up for this particle.
+ */
+inline void sph::sph_density_check_particle_particle_interaction(pinfo &pdat, int p, int p_type, unsigned char shmrank)
+{
+#ifdef PRESERVE_SHMEM_BINARY_INVARIANCE
+  if(skip_actual_force_computation)
+    return;
+#endif
+
+  if(p_type == NODE_TYPE_LOCAL_PARTICLE) /* local particle */
+    {
+      particle_data *P        = get_Pp(p, shmrank);
+      sph_particle_data *SphP = get_SphPp(p, shmrank);
+
+      if(P->getType() > 0)
+        return;
+
+      if(P->get_Ti_Current() != All.Ti_Current)
+        Tp->drift_particle(P, SphP, All.Ti_Current);  // this function avoids race conditions
+
+      double posdiff[3];
+      Tp->nearest_image_intpos_to_pos(P->IntPos, pdat.searchcenter, posdiff); /* converts the integer distance to floating point */
+
+      if(posdiff[0] * posdiff[0] + posdiff[1] * posdiff[1] + posdiff[2] * posdiff[2] > pdat.hsml2)
+        return;
+
+      if(pdat.numngb >= MAX_NGBS)
+        Terminate("pdat.numngb >= MAX_NGBS");
+
+      int n = pdat.numngb++;
+
+      Ngbdensdat[n].IntPos  = P->IntPos;
+      Ngbdensdat[n].VelPred = SphP->VelPred;
+      Ngbdensdat[n].Mass    = P->getMass();
+#ifdef PRESSURE_ENTROPY_SPH
+      Ngbdensdat[n].EntropyToInvGammaPred = SphP->EntropyToInvGammaPred;
+#endif
+#ifdef TIMEDEP_ART_VISC
+      Ngbdensdat[n].Csnd = SphP->Csnd;
+#endif
+    }
+  else if(p_type == NODE_TYPE_FETCHED_PARTICLE)
+    {
+      foreign_sphpoint_data *foreignpoint = get_foreignpointsp(p - EndOfForeignNodes, shmrank);
+
+      /* converts the integer distance to floating point */
+      double posdiff[3];
+      Tp->nearest_image_intpos_to_pos(foreignpoint->IntPos, pdat.searchcenter, posdiff);
+
+      if(posdiff[0] * posdiff[0] + posdiff[1] * posdiff[1] + posdiff[2] * posdiff[2] > pdat.hsml2)
+        return;
+
+      if(pdat.numngb >= MAX_NGBS)
+        Terminate("pdat.numngb >= MAX_NGBS");
+
+      int n = pdat.numngb++;
+
+      Ngbdensdat[n].IntPos  = foreignpoint->IntPos;
+      Ngbdensdat[n].VelPred = foreignpoint->SphCore.VelPred;
+      Ngbdensdat[n].Mass    = foreignpoint->Mass;
+#ifdef PRESSURE_ENTROPY_SPH
+      Ngbdensdat[n].EntropyToInvGammaPred = foreignpoint->SphCore.EntropyToInvGammaPred;
+#endif
+#ifdef TIMEDEP_ART_VISC
+      Ngbdensdat[n].Csnd = foreignpoint->SphCore.Csnd;
+#endif
+    }
+  else
+    Terminate("unexpected");
+}
+
+/* Continues to walk the tree for the particle referenced in pdat by opening a node.
+ */
+inline void sph::sph_density_open_node(pinfo &pdat, ngbnode *nop, int mintopleafnode, int committed)
+{
+  /* open node */
+  int p                 = nop->nextnode;
+  unsigned char shmrank = nop->nextnode_shmrank;
+
+  while(p != nop->sibling || (shmrank != nop->sibling_shmrank && nop->sibling >= MaxPart + D->NTopnodes))
+    {
+      if(p < 0)
+        Terminate(
+            "p=%d < 0  nop->sibling=%d nop->nextnode=%d shmrank=%d nop->sibling_shmrank=%d nop->foreigntask=%d  "
+            "first_nontoplevelnode=%d",
+            p, nop->sibling, nop->nextnode, shmrank, nop->sibling_shmrank, nop->OriginTask, MaxPart + D->NTopnodes);
+
+      int next;
+      unsigned char next_shmrank;
+      char type;
+
+      if(p < MaxPart) /* a local particle */
+        {
+          /* note: here shmrank cannot change */
+          next         = get_nextnodep(shmrank)[p];
+          next_shmrank = shmrank;
+          type         = NODE_TYPE_LOCAL_PARTICLE;
+        }
+      else if(p < MaxPart + MaxNodes) /* an internal node  */
+        {
+          ngbnode *nop = get_nodep(p, shmrank);
+          next         = nop->sibling;
+          next_shmrank = nop->sibling_shmrank;
+          type         = NODE_TYPE_LOCAL_NODE;
+        }
+      else if(p >= ImportedNodeOffset && p < EndOfTreePoints) /* an imported Treepoint particle  */
+        {
+          Terminate("not expected for SPH");
+        }
+      else if(p >= EndOfTreePoints && p < EndOfForeignNodes) /* an imported tree node */
+        {
+          ngbnode *nop = get_nodep(p, shmrank);
+          next         = nop->sibling;
+          next_shmrank = nop->sibling_shmrank;
+          type         = NODE_TYPE_FETCHED_NODE;
+        }
+      else if(p >= EndOfForeignNodes) /* an imported particle below an imported tree node */
+        {
+          foreign_sphpoint_data *foreignpoint = get_foreignpointsp(p - EndOfForeignNodes, shmrank);
+
+          next         = foreignpoint->Nextnode;
+          next_shmrank = foreignpoint->Nextnode_shmrank;
+          type         = NODE_TYPE_FETCHED_PARTICLE;
+        }
+      else
+        {
+          /* a pseudo point */
+          Terminate(
+              "should not happen: p=%d MaxPart=%d MaxNodes=%d  ImportedNodeOffset=%d  EndOfTreePoints=%d  EndOfForeignNodes=%d "
+              "shmrank=%d  nop->nextnode=%d  nop->cannot_be_opened_locally=%d  nop->not_empty=%d  nop-TopNodes=%d",
+              p, MaxPart, MaxNodes, ImportedNodeOffset, EndOfTreePoints, EndOfForeignNodes, shmrank, nop->nextnode,
+              (int)nop->cannot_be_opened_locally, (int)nop->not_empty, (int)(nop - TopNodes));
+        }
+
+      sph_density_interact(pdat, p, type, shmrank, mintopleafnode, committed);
+
+      p       = next;
+      shmrank = next_shmrank;
+    }
+}
+
+/* Take care of SPH density interaction between the particle referenced in pdat, and the node
+ * referenced through no/shmrank. The node can either be a normal node or an imported node from another
+ * shared memory machine, and the node can either be on the present MPI rank, or from another MPI rank in the
+ * local shared memory machine.
+ */
+inline void sph::sph_density_interact(pinfo &pdat, int no, char no_type, unsigned char shmrank, int mintopleafnode, int committed)
+{
+  if(no_type <= NODE_TYPE_FETCHED_PARTICLE)  // we are interacting with a particle
+    {
+      sph_density_check_particle_particle_interaction(pdat, no, no_type, shmrank);
+    }
+  else  // we are interacting with a node
+    {
+      ngbnode *nop = get_nodep(no, shmrank);
+
+      if(nop->not_empty == 0)
+        return;
+
+      if(no < MaxPart + MaxNodes)                // we have a top-level node
+        if(nop->nextnode >= MaxPart + MaxNodes)  // if the next node is not a top-level node, we have a top-level leaf node
+          mintopleafnode = no;
+
+      int openflag = sph_density_evaluate_particle_node_opening_criterion(pdat, nop);
+
+      if(openflag == NODE_OPEN) /* we need to open it */
+        {
+          if(nop->cannot_be_opened_locally.load(std::memory_order_acquire))
+            {
+              // are we in the same shared memory node?
+              if(Shmem.GetNodeIDForSimulCommRank[nop->OriginTask] == Shmem.GetNodeIDForSimulCommRank[D->ThisTask])
+                {
+                  Terminate("this should not happen any more");
+                }
+              else
+                {
+                  tree_add_to_fetch_stack(nop, no, shmrank);  // will only add unique copies
+
+                  tree_add_to_work_stack(pdat.target, no, shmrank, mintopleafnode);
+                }
+            }
+          else
+            {
+              int min_buffer_space =
+                  std::min<int>(MaxOnWorkStack - (NumOnWorkStack + NewOnWorkStack), MaxOnFetchStack - NumOnFetchStack);
+
+              if(min_buffer_space >= committed + 8 * TREE_NUM_BEFORE_NODESPLIT)
+                sph_density_open_node(pdat, nop, mintopleafnode, committed + 8 * TREE_NUM_BEFORE_NODESPLIT);
+              else
+                tree_add_to_work_stack(pdat.target, no, shmrank, mintopleafnode);
+            }
+        }
+    }
+}
+
+/* Internal driver routine to compute densities for the particles with indices listed in the targetlist
+ * array.
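+ *
+ * All targets are first pushed onto a work stack and their tree walks are processed with the
+ * local tree data; whenever a walk hits a node that can only be opened on another shared-memory
+ * node, that node is queued on a fetch stack and the particle is re-queued on the work stack.
+ * After each sweep the missing nodes are imported with tree_fetch_foreign_nodes(), and the
+ * procedure repeats until the work stack is empty.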
+ */
+void sph::densities_determine(int ntarget, int *targetlist)
+{
+  Ngbdensdat = (ngbdata_density *)Mem.mymalloc("Ngbdensdat", MAX_NGBS * sizeof(ngbdata_density));
+
+  NumOnWorkStack         = 0;
+  AllocWorkStackBaseLow  = std::max<int>(1.5 * (Tp->NumPart + NumPartImported), TREE_MIN_WORKSTACK_SIZE);
+  AllocWorkStackBaseHigh = AllocWorkStackBaseLow + TREE_EXPECTED_CYCLES * TREE_MIN_WORKSTACK_SIZE;
+  MaxOnWorkStack         = AllocWorkStackBaseLow;
+
+  WorkStack = (workstack_data *)Mem.mymalloc("WorkStack", AllocWorkStackBaseHigh * sizeof(workstack_data));
+
+  for(int i = 0; i < ntarget; i++)
+    {
+      int target = targetlist[i];
+
+      clear_density_result(&Tp->SphP[target]);
+
+      WorkStack[NumOnWorkStack].Target         = target;
+      WorkStack[NumOnWorkStack].Node           = MaxPart;
+      WorkStack[NumOnWorkStack].ShmRank        = Shmem.Island_ThisTask;
+      WorkStack[NumOnWorkStack].MinTopLeafNode = MaxPart + D->NTopnodes;
+      NumOnWorkStack++;
+    }
+
+#ifdef PRESERVE_SHMEM_BINARY_INVARIANCE
+  workstack_data *WorkStackBak = (workstack_data *)Mem.mymalloc("WorkStackBak", NumOnWorkStack * sizeof(workstack_data));
+  int NumOnWorkStackBak        = NumOnWorkStack;
+  memcpy(WorkStackBak, WorkStack, NumOnWorkStack * sizeof(workstack_data));
+#endif
+
+  // set a default size of the fetch stack (a fraction of the local particle number; this may still be somewhat too large)
+  MaxOnFetchStack = std::max<int>(0.1 * (Tp->NumPart + NumPartImported), TREE_MIN_WORKSTACK_SIZE);
+  StackToFetch    = (fetch_data *)Mem.mymalloc_movable(&StackToFetch, "StackToFetch", MaxOnFetchStack * sizeof(fetch_data));
+
+#ifdef PRESERVE_SHMEM_BINARY_INVARIANCE
+  for(int rep = 0; rep < 2; rep++)
+    {
+      if(rep == 0)
+        {
+          skip_actual_force_computation = true;
+        }
+      else
+        {
+          skip_actual_force_computation = false;
+          NumOnWorkStack                = NumOnWorkStackBak;
+          memcpy(WorkStack, WorkStackBak, NumOnWorkStack * sizeof(workstack_data));
+        }
+#endif
+
+      while(NumOnWorkStack > 0)  // repeat until we are out of work
+        {
+          NewOnWorkStack  = 0;  // gives the new entries
+          NumOnFetchStack = 0;
+          MaxOnWorkStack  = std::min<int>(AllocWorkStackBaseLow + max_ncycles * TREE_MIN_WORKSTACK_SIZE, AllocWorkStackBaseHigh);
+
+          TIMER_START(CPU_DENSWALK);
+
+          int item = 0;
+
+          while(item < NumOnWorkStack)
+            {
+              int committed = 8 * TREE_NUM_BEFORE_NODESPLIT;
+              int min_buffer_space =
+                  std::min<int>(MaxOnWorkStack - (NumOnWorkStack + NewOnWorkStack), MaxOnFetchStack - NumOnFetchStack);
+              if(min_buffer_space >= committed)
+                {
+                  int target     = WorkStack[item].Target;
+                  int no         = WorkStack[item].Node;
+                  int shmrank    = WorkStack[item].ShmRank;
+                  int mintopleaf = WorkStack[item].MinTopLeafNode;
+                  item++;
+
+                  pinfo pdat;
+                  get_pinfo(target, pdat);
+
+                  if(no == MaxPart)
+                    {
+                      // we have a pristine particle that's processed for the first time
+                      sph_density_interact(pdat, no, NODE_TYPE_LOCAL_NODE, shmrank, mintopleaf, committed);
+                    }
+                  else
+                    {
+                      // we have a node that we previously could not open
+                      ngbnode *nop = get_nodep(no, shmrank);
+
+                      if(nop->cannot_be_opened_locally)
+                        {
+                          Terminate("item=%d:  no=%d  now we should be able to open it!", item, no);
+                        }
+                      else
+                        sph_density_open_node(pdat, nop, mintopleaf, committed);
+                    }
+
+                  density_evaluate_kernel(pdat);
+                }
+              else
+                break;
+            }
+
+          if(item == 0 && NumOnWorkStack > 0)
+            Terminate("Can't even process a single particle");
+
+          TIMER_STOP(CPU_DENSWALK);
+
+          TIMER_START(CPU_DENSFETCH);
+
+          tree_fetch_foreign_nodes(FETCH_SPH_DENSITY);
+
+          TIMER_STOP(CPU_DENSFETCH);
+
+          /* now reorder the workstack such that we are first going to do residual pristine particles, and then
+           * imported nodes that hang below the first leaf nodes */
+          NumOnWorkStack = NumOnWorkStack - item + NewOnWorkStack;
+          memmove(WorkStack, WorkStack + item, NumOnWorkStack * sizeof(workstack_data));
+
+          /* now let's sort such that we can go deep on top-level node branches, allowing us to clear them out eventually */
+          mycxxsort(WorkStack, WorkStack + NumOnWorkStack, compare_workstack);
+
+          max_ncycles++;
+        }
+#ifdef PRESERVE_SHMEM_BINARY_INVARIANCE
+    }
+#endif
+
+  Mem.myfree(StackToFetch);
+#ifdef PRESERVE_SHMEM_BINARY_INVARIANCE
+  Mem.myfree(WorkStackBak);
+#endif
+  Mem.myfree(WorkStack);
+  Mem.myfree(Ngbdensdat);
+}
+
+/* This first makes sure that all active SPH particles are drifted to the current time,
+ * and then calls the SPH density computation for them.
+ */
+void sph::compute_densities(void)
+{
+  if(Tp->TimeBinsHydro.GlobalNActiveParticles > 0)
+    {
+      /* now drift the active hydro particles if not done already */
+      for(int i = 0; i < Tp->TimeBinsHydro.NActiveParticles; i++)
+        {
+          particle_data *P        = &Tp->P[Tp->TimeBinsHydro.ActiveParticleList[i]];
+          sph_particle_data *SphP = &Tp->SphP[Tp->TimeBinsHydro.ActiveParticleList[i]];
+
+          Tp->drift_particle(P, SphP, All.Ti_Current);
+#ifdef TIMEDEP_ART_VISC
+          SphP->DivVelOld = SphP->DivVel;
+#endif
+        }
+
+      /* compute density (and updates pressure) */
+      density(Tp->TimeBinsHydro.ActiveParticleList, Tp->TimeBinsHydro.NActiveParticles);
+    }
+}
+
+/* Compute the SPH densities for all particles listed in the index-array. The routine finds a suitable
+ * smoothing length by a bisection algorithm.
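+ * The iteration below tightens the brackets Left[] and Right[] on each smoothing length until the
+ * effective neighbour number NumNgb = NORM_COEFF * Hsml^NUMDIMS * Density / mass lies within
+ * DesNumNgb +/- MaxNumNgbDeviation, or until the bracketing interval has collapsed to a relative
+ * width below 1e-3.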
+ */
+void sph::density(int *list, int ntarget)
+{
+  TIMER_STORE;
+  TIMER_START(CPU_DENSITY);
+
+  D->mpi_printf("SPH-DENSITY: Begin density calculation. (presently allocated=%g MB)\n", Mem.getAllocatedBytesInMB());
+  D->mpi_printf("SPH-DENSITY: Ndensities=%llu (task zero has: NumGas=%d, Ndensities=%d)\n", Tp->TimeBinsHydro.GlobalNActiveParticles,
+                Tp->NumGas, ntarget);
+
+  double ta = Logs.second();
+
+  /* Create list of targets. We do this here to simplify the treatment later on */
+  int *targetList = (int *)Mem.mymalloc("TargetList", Tp->NumGas * sizeof(int));
+  MyFloat *Left   = (MyFloat *)Mem.mymalloc("Left", Tp->NumGas * sizeof(MyFloat));
+  MyFloat *Right  = (MyFloat *)Mem.mymalloc("Right", Tp->NumGas * sizeof(MyFloat));
+
+  int ndensities = ntarget;
+
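+  /* Left[] and Right[] will hold lower and upper brackets for each smoothing length; a value of
+   * zero means that the corresponding bound has not been established yet */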
+  for(int i = 0; i < ntarget; i++)
+    {
+      int target    = list[i];
+      targetList[i] = target;
+      Left[target] = Right[target] = 0.0;
+    }
+
+  int iter = 0;
+
+  // let's grab at most half the still available memory for imported points and nodes
+  int nspace = (0.5 * Mem.FreeBytes) / (sizeof(ngbnode) + 8 * sizeof(foreign_sphpoint_data));
+
+  MaxForeignNodes  = nspace;
+  MaxForeignPoints = 8 * nspace;
+  NumForeignNodes  = 0;
+  NumForeignPoints = 0;
+
+  sum_NumForeignNodes  = 0;
+  sum_NumForeignPoints = 0;
+
+  /* the following two arrays hold imported tree nodes and imported points to augment the local tree */
+  Foreign_Nodes  = (ngbnode *)Mem.mymalloc_movable(&Foreign_Nodes, "Foreign_Nodes", MaxForeignNodes * sizeof(ngbnode));
+  Foreign_Points = (foreign_sphpoint_data *)Mem.mymalloc_movable(&Foreign_Points, "Foreign_Points",
+                                                                 MaxForeignPoints * sizeof(foreign_sphpoint_data));
+
+  tree_initialize_leaf_node_access_info();
+
+  max_ncycles = 0;
+
+  prepare_shared_memory_access();
+
+  do
+    {
+      double t0 = Logs.second();
+
+      /* now do the primary work with this call */
+
+      densities_determine(ndensities, targetList);
+
+      /* do final operations on results */
+      int npleft = 0;
+
+      for(int i = 0; i < ndensities; i++)
+        {
+          int target = targetList[i];
+
+          if(target >= 0)
+            {
+              if(Tp->P[target].getType() != 0)
+                Terminate("P[target].getType() != 0");
+
+              sph_particle_data *SphP = Tp->SphP;
+              if(SphP[target].Density > 0)
+                {
+#ifdef WENDLAND_BIAS_CORRECTION
+                  SphP[target].Density -= get_density_bias(SphP[target].Hsml, Tp->P[target].getMass(), All.DesNumNgb);
+#endif
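+                  /* turn the accumulated sum over m_j * dW/dh into the usual grad-h correction factor,
+                   * i.e. f = 1 / (1 + (Hsml / (NUMDIMS * Density)) * dDensity/dHsml) */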
+                  SphP[target].DhsmlDensityFactor *= SphP[target].Hsml / (NUMDIMS * SphP[target].Density);
+                  if(SphP[target].DhsmlDensityFactor >
+                     -0.9) /* note: this would be -1 if only a single particle at zero lag is found */
+                    SphP[target].DhsmlDensityFactor = 1 / (1 + SphP[target].DhsmlDensityFactor);
+                  else
+                    SphP[target].DhsmlDensityFactor = 1;
+
+#ifndef IMPROVED_VELOCITY_GRADIENTS
+                  SphP[target].CurlVel = sqrt(SphP[target].Rot[0] * SphP[target].Rot[0] + SphP[target].Rot[1] * SphP[target].Rot[1] +
+                                              SphP[target].Rot[2] * SphP[target].Rot[2]) /
+                                         SphP[target].Density;
+
+                  SphP[target].DivVel /= SphP[target].Density;
+#else
+                  SphP[target].set_velocity_gradients();
+#endif
+                  SphP[target].DtHsml    = (1.0 / NUMDIMS) * SphP[target].DivVel * SphP[target].Hsml;
+                  SphP[target].DtDensity = -SphP[target].DivVel * SphP[target].Density;
+
+#ifndef PRESSURE_ENTROPY_SPH
+                  SphP[target].set_thermodynamic_variables();
+#endif
+                }
+
+#ifdef PRESSURE_ENTROPY_SPH
+              if(SphP[target].EntropyToInvGammaPred > 0 && SphP[target].PressureSphDensity > 0)
+                {
+                  SphP[target].DhsmlDerivedDensityFactor *=
+                      SphP[target].Hsml / (NUMDIMS * SphP[target].Density * SphP[target].EntropyToInvGammaPred);
+                  SphP[target].DhsmlDerivedDensityFactor *= -SphP[target].DhsmlDensityFactor;
+                  SphP[target].PressureSphDensity /= SphP[target].EntropyToInvGammaPred;
+#ifdef WENDLAND_BIAS_CORRECTION /* Dehnen & Aly 2012, eq (18), (19) */
+                  SphP[target].PressureSphDensity -= get_density_bias(SphP[target].Hsml, Tp->P[target].getMass(), All.DesNumNgb);
+#endif
+                  SphP[target].DtPressureSphDensity = -SphP[target].DivVel * SphP[target].PressureSphDensity;
+                  SphP[target].set_thermodynamic_variables();
+                }
+              else
+                {
+                  SphP[target].DhsmlDerivedDensityFactor = 0;
+                  SphP[target].EntropyToInvGammaPred     = 0;
+                  SphP[target].PressureSphDensity        = 0;
+                }
+
+#endif
+#ifdef TIMEDEP_ART_VISC
+              double dt = (Tp->P[target].getTimeBinHydro() ? (((integertime)1) << Tp->P[target].getTimeBinHydro()) : 0) *
+                          All.Timebase_interval;
+              double dtime = All.cf_atime * dt / All.cf_atime_hubble_a;
+              SphP[target].set_viscosity_coefficient(dtime);
+#endif
+              /* now check whether we had enough neighbours */
+              double desnumngb    = All.DesNumNgb;
+              double desnumngbdev = All.MaxNumNgbDeviation;
+
+              double hfac = 1;
+              for(int k = 0; k < NUMDIMS; k++)
+                {
+                  hfac *= SphP[target].Hsml;
+                }
+
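+              /* effective (kernel-weighted) neighbour number implied by the density estimate;
+               * NORM_COEFF is presumably the kernel volume normalization (e.g. 4*pi/3 in 3D) */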
+              SphP[target].NumNgb = NORM_COEFF * hfac * SphP[target].Density / Tp->P[target].getMass();
+              if(SphP[target].NumNgb < (desnumngb - desnumngbdev) || (SphP[target].NumNgb > (desnumngb + desnumngbdev)))
+                {
+                  if(Left[target] > 0 && Right[target] > 0)
+                    if((Right[target] - Left[target]) < 1.0e-3 * Left[target])
+                      {
+                        /* this one should be ok */
+                        continue;
+                      }
+
+                  /* need to redo this particle */
+                  targetList[npleft++] = target;
+
+                  if(SphP[target].NumNgb < (desnumngb - desnumngbdev))
+                    Left[target] = std::max<double>(SphP[target].Hsml, Left[target]);
+                  else
+                    {
+                      if(Right[target] != 0)
+                        {
+                          if(SphP[target].Hsml < Right[target])
+                            Right[target] = SphP[target].Hsml;
+                        }
+                      else
+                        Right[target] = SphP[target].Hsml;
+                    }
+
+                  if(iter >= MAXITER - 10)
+                    {
+                      double pos[3];
+                      Tp->intpos_to_pos(Tp->P[target].IntPos, pos); /* converts the integer coordinates to floating point */
+
+                      printf("target=%d Hsml=%g  task=%d ID=%llu Left=%g Right=%g Ngbs=%g Right-Left=%g\n   pos=(%g|%g|%g)\n", target,
+                             SphP[target].Hsml, D->ThisTask, (unsigned long long)Tp->P[target].ID.get(), Left[target], Right[target],
+                             SphP[target].NumNgb, Right[target] - Left[target], pos[0], pos[1], pos[2]);
+                      myflush(stdout);
+                    }
+
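+                  /* when both brackets are known, bisect in h^3, i.e. take the midpoint of the
+                   * enclosed volume rather than of the smoothing length itself */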
+                  if(Right[target] > 0 && Left[target] > 0)
+                    SphP[target].Hsml = pow(0.5 * (pow(Left[target], 3) + pow(Right[target], 3)), 1.0 / 3);
+                  else
+                    {
+                      if(Right[target] == 0 && Left[target] == 0)
+                        Terminate("Right[i] == 0 && Left[i] == 0 SphP[i].Hsml=%g\n", SphP[target].Hsml);
+
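+                      /* only one of the two brackets is known: take a Newton-Raphson-like step based
+                       * on the neighbour mismatch, but never change the smoothing volume by more than
+                       * roughly a factor of two per iteration (1.26 is approximately 2^(1/3)) */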
+                      if(Right[target] == 0 && Left[target] > 0)
+                        {
+                          if(Tp->P[target].getType() == 0 && fabs(SphP[target].NumNgb - desnumngb) < 0.5 * desnumngb)
+                            {
+                              double fac = 1 - (SphP[target].NumNgb - desnumngb) / (NUMDIMS * SphP[target].NumNgb) *
+                                                   SphP[target].DhsmlDensityFactor;
+
+                              if(fac < 1.26)
+                                SphP[target].Hsml *= fac;
+                              else
+                                SphP[target].Hsml *= 1.26;
+                            }
+                          else
+                            SphP[target].Hsml *= 1.26;
+                        }
+
+                      if(Right[target] > 0 && Left[target] == 0)
+                        {
+                          if(Tp->P[target].getType() == 0 && fabs(SphP[target].NumNgb - desnumngb) < 0.5 * desnumngb && iter < 4)
+                            {
+                              double fac = 1 - (SphP[target].NumNgb - desnumngb) / (NUMDIMS * SphP[target].NumNgb) *
+                                                   SphP[target].DhsmlDensityFactor;
+
+                              if(fac > 1 / 1.26)
+                                SphP[target].Hsml *= fac;
+                              else
+                                SphP[target].Hsml /= 1.26;
+                            }
+                          else
+                            SphP[target].Hsml /= 1.26;
+                        }
+                    }
+                }
+            }
+        }
+
+      ndensities = npleft;
+
+      double t1 = Logs.second();
+
+      if(npleft > 0)
+        {
+          iter++;
+
+          D->mpi_printf("SPH-DENSITY: ngb iteration %4d: took %8.3f  , need to repeat for %012lld local particles.\n", iter,
+                        Logs.timediff(t0, t1), npleft);
+
+          if(iter > MAXITER)
+            Terminate("failed to converge in neighbour iteration in density()\n");
+        }
+      else
+        D->mpi_printf("SPH-DENSITY: ngb iteration %4d: took %8.3f\n", ++iter, Logs.timediff(t0, t1));
+    }
+  while(ndensities > 0);
+
+  TIMER_START(CPU_DENSIMBALANCE);
+
+  MPI_Allreduce(MPI_IN_PLACE, &max_ncycles, 1, MPI_INT, MPI_MAX, D->Communicator);
+
+  TIMER_STOP(CPU_DENSIMBALANCE);
+
+  cleanup_shared_memory_access();
+
+  /* free temporary buffers */
+
+  Mem.myfree(Foreign_Points);
+  Mem.myfree(Foreign_Nodes);
+
+  Mem.myfree(Right);
+  Mem.myfree(Left);
+  Mem.myfree(targetList);
+
+  double tb = Logs.second();
+
+  TIMER_STOPSTART(CPU_DENSITY, CPU_LOGS);
+
+  D->mpi_printf("SPH-DENSITY: density computation done. took %8.3f\n", Logs.timediff(ta, tb));
+
+  struct detailed_timings
+  {
+    double tree, wait, fetch, all;
+    double numnodes;
+    double NumForeignNodes, NumForeignPoints;
+    double fillfacFgnNodes, fillfacFgnPoints;
+  };
+  detailed_timings timer, tisum, timax;
+
+  timer.tree             = TIMER_DIFF(CPU_DENSWALK);
+  timer.wait             = TIMER_DIFF(CPU_DENSIMBALANCE);
+  timer.fetch            = TIMER_DIFF(CPU_DENSFETCH);
+  timer.all              = timer.tree + timer.wait + timer.fetch + TIMER_DIFF(CPU_DENSITY);
+  timer.numnodes         = NumNodes;
+  timer.NumForeignNodes  = NumForeignNodes;
+  timer.NumForeignPoints = NumForeignPoints;
+  timer.fillfacFgnNodes  = NumForeignNodes / ((double)MaxForeignNodes);
+  timer.fillfacFgnPoints = NumForeignPoints / ((double)MaxForeignPoints);
+
+  MPI_Reduce((double *)&timer, (double *)&tisum, (int)(sizeof(detailed_timings) / sizeof(double)), MPI_DOUBLE, MPI_SUM, 0,
+             D->Communicator);
+  MPI_Reduce((double *)&timer, (double *)&timax, (int)(sizeof(detailed_timings) / sizeof(double)), MPI_DOUBLE, MPI_MAX, 0,
+             D->Communicator);
+
+  All.TotNumDensity += Tp->TimeBinsHydro.GlobalNActiveParticles;
+
+  if(D->ThisTask == 0)
+    {
+      fprintf(Logs.FdDensity, "Nf=%9lld  highest active timebin=%d  total-Nf=%lld\n", Tp->TimeBinsHydro.GlobalNActiveParticles,
+              All.HighestActiveTimeBin, All.TotNumDensity);
+      fprintf(Logs.FdDensity, "   work-load balance: %g   part/sec: raw=%g, effective=%g\n",
+              timax.tree / ((tisum.tree + 1e-20) / D->NTask), Tp->TimeBinsGravity.GlobalNActiveParticles / (tisum.tree + 1.0e-20),
+              Tp->TimeBinsGravity.GlobalNActiveParticles / ((timax.tree + 1.0e-20) * D->NTask));
+      fprintf(Logs.FdDensity,
+              "   maximum number of nodes: %g, filled: %g  NumForeignNodes: max=%g avg=%g fill=%g NumForeignPoints: max=%g avg=%g "
+              "fill=%g  cycles=%d\n",
+              timax.numnodes, timax.numnodes / MaxNodes, timax.NumForeignNodes, tisum.NumForeignNodes / D->NTask,
+              timax.fillfacFgnNodes, timax.NumForeignPoints, tisum.NumForeignPoints / D->NTask, timax.fillfacFgnPoints, max_ncycles);
+      fprintf(Logs.FdDensity, "   avg times: <all>=%g  <tree>=%g  <wait>=%g  <fetch>=%g  sec\n", tisum.all / D->NTask,
+              tisum.tree / D->NTask, tisum.wait / D->NTask, tisum.fetch / D->NTask);
+      myflush(Logs.FdDensity);
+    }
+
+  TIMER_STOP(CPU_LOGS);
+}
+
+#ifdef EXPLICIT_VECTORIZATION
+
+/* Main SPH compute kernel, in a version that is explicitly vectorized. Several neighbours are
+ * processed through one vector. The calculation should be semantically equivalent to the standard
+ * looped version without explicit vector instructions.
+ */
+void sph::density_evaluate_kernel(pinfo &pdat)
+{
+  particle_data *targetP        = &Tp->P[pdat.target];
+  sph_particle_data *targetSphP = &Tp->SphP[pdat.target];
+
+  double shinv, shinv3, shinv4;
+  kernel_hinv(targetSphP->Hsml, &shinv, &shinv3, &shinv4);
+
+  Vec4d hinv(shinv);
+  Vec4d hinv3(shinv3);
+  Vec4d hinv4(shinv4);
+
+  Vec4d dwnorm(NORM * shinv3);
+  Vec4d dwknorm(NORM * shinv4);
+
+  Vec4d v_i[NUMDIMS];
+  for(int i = 0; i < NUMDIMS; i++)
+    {
+      v_i[i] = targetSphP->VelPred[i];
+    }
+
+  Vec4d cs_i(targetSphP->Csnd);
+  const int vector_length = 4;
+  const int array_length  = (pdat.numngb + vector_length - 1) & (-vector_length);
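+  /* array_length is numngb rounded up to the next multiple of 4: adding (vector_length - 1) and
+   * masking with -vector_length (all bits set except the two lowest) clears the remainder */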
+
+  for(int n = pdat.numngb; n < array_length; n++) /* fill up neighbour array so that sensible data is accessed */
+    Ngbdensdat[n] = Ngbdensdat[0];
+
+  for(int n = 0; n < array_length; n += vector_length)
+    {
+      struct ngbdata_density *ngb0 = &Ngbdensdat[n + 0];
+      struct ngbdata_density *ngb1 = &Ngbdensdat[n + 1];
+      struct ngbdata_density *ngb2 = &Ngbdensdat[n + 2];
+      struct ngbdata_density *ngb3 = &Ngbdensdat[n + 3];
+
+      Vec4d dpos[NUMDIMS];
+#if defined(LONG_X_BITS) || defined(LONG_Y_BITS) || defined(LONG_Z_BITS)
+      double posdiff[vector_length][3]; /* one entry per vector lane */
+      for(int i = 0; i < 4; i++)
+        {
+          Tp->nearest_image_intpos_to_pos(targetP->IntPos, Ngbdensdat[n + i].IntPos, &(posdiff[i][0]));
+        }
+
+      for(int i = 0; i < NUMDIMS; i++)
+        {
+          dpos[i] = Vec4d(posdiff[0][i], posdiff[1][i], posdiff[2][i], posdiff[3][i]);
+        }
+#else
+      for(int i = 0; i < NUMDIMS; i++)
+        {
+          dpos[i] = Tp->nearest_image_intpos_to_doublepos_vectorial(targetP->IntPos[i], ngb0->IntPos[i], ngb1->IntPos[i],
+                                                                    ngb2->IntPos[i], ngb3->IntPos[i]);
+        }
+#endif
+
+      Vec4d v_j[NUMDIMS];
+      for(int i = 0; i < NUMDIMS; i++)
+        {
+          v_j[i] = Vec4d(ngb0->VelPred[i], ngb1->VelPred[i], ngb2->VelPred[i], ngb3->VelPred[i]);
+        }
+
+      Vec4d mass_j(ngb0->Mass, ngb1->Mass, ngb2->Mass, ngb3->Mass);
+      Vec4d r2(0);
+
+      for(int i = 0; i < NUMDIMS; i++)
+        {
+          r2 += dpos[i] * dpos[i];
+        }
+
+      Vec4d r = sqrt(r2);
+
+      Vec4d u = r * hinv;
+
+      /* now calculate the kernel */
+      Vec4d wk, dwk;
+      kernel_main_vector(u, dwnorm, dwknorm, &wk, &dwk);
+
+      if(n + vector_length > pdat.numngb) /* we have excess elements */
+        {
+          mass_j.cutoff(vector_length - (array_length - pdat.numngb));
+          wk.cutoff(vector_length - (array_length - pdat.numngb));
+        }
+
+      Vec4d mj_wk = mass_j * wk;
+
+      targetSphP->Density += horizontal_add(mj_wk);
+
+#ifdef PRESSURE_ENTROPY_SPH
+      Vec4d entr_j(ngb0->EntropyToInvGammaPred, ngb1->EntropyToInvGammaPred, ngb2->EntropyToInvGammaPred, ngb3->EntropyToInvGammaPred);
+
+      targetSphP->PressureSphDensity += horizontal_add(mj_wk * entr_j);
+
+      targetSphP->DhsmlDerivedDensityFactor += horizontal_add(-mass_j * entr_j * (NUMDIMS * hinv * wk + u * dwk));
+#endif
+
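+      /* accumulate sum_j m_j * dW/dh, using dW/dh = -(NUMDIMS * W + u * dW/du) / h; this sum is
+       * converted into the grad-h correction factor later in density() */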
+      targetSphP->DhsmlDensityFactor += horizontal_add(-mass_j * (NUMDIMS * hinv * wk + u * dwk));
+
+      Vec4db decision = (r > 0);
+
+      r = select(decision, r, 1.0); /* note, for r=0, we have dwk=0 */
+
+      Vec4d mj_dwk_r = mass_j * dwk / r;
+
+      Vec4d dv[NUMDIMS];
+      for(int i = 0; i < NUMDIMS; i++)
+        {
+          dv[i] = v_i[i] - v_j[i];
+        }
+
+#ifndef IMPROVED_VELOCITY_GRADIENTS
+      Vec4d dpos_times_dv(0);
+      for(int i = 0; i < NUMDIMS; i++)
+        dpos_times_dv += dpos[i] * dv[i];
+
+      targetSphP->DivVel += horizontal_add(-mj_dwk_r * dpos_times_dv);
+
+#ifdef TWODIMS
+      targetSphP->Rot[2] += horizontal_add(mj_dwk_r * (dpos[1] * dv[0] - dpos[0] * dv[1]));
+#endif
+#ifdef THREEDIMS
+      targetSphP->Rot[0] += horizontal_add(mj_dwk_r * (dpos[2] * dv[1] - dpos[1] * dv[2]));
+      targetSphP->Rot[1] += horizontal_add(mj_dwk_r * (dpos[0] * dv[2] - dpos[2] * dv[0]));
+      targetSphP->Rot[2] += horizontal_add(mj_dwk_r * (dpos[1] * dv[0] - dpos[0] * dv[1]));
+#endif
+#else
+      for(int i = 0; i < NUMDIMS; i++)
+        {
+          for(int j = 0; j < NUMDIMS; j++)
+            {
+              targetSphP->dvel[i][j] -= horizontal_add(mj_dwk_r * dv[i] * dpos[j]);
+            }
+        }
+      targetSphP->dpos.dx_dx -= horizontal_add(mj_dwk_r * dpos[0] * dpos[0]);
+      targetSphP->dpos.dx_dy -= horizontal_add(mj_dwk_r * dpos[0] * dpos[1]);
+      targetSphP->dpos.dx_dz -= horizontal_add(mj_dwk_r * dpos[0] * dpos[2]);
+      targetSphP->dpos.dy_dy -= horizontal_add(mj_dwk_r * dpos[1] * dpos[1]);
+      targetSphP->dpos.dy_dz -= horizontal_add(mj_dwk_r * dpos[1] * dpos[2]);
+      targetSphP->dpos.dz_dz -= horizontal_add(mj_dwk_r * dpos[2] * dpos[2]);
+#endif
+#ifdef TIMEDEP_ART_VISC
+      Vec4d vdotr2 = 0;
+      for(int i = 0; i < NUMDIMS; i++)
+        {
+          vdotr2 += dpos[i] * dv[i];
+        }
+
+      Vec4d cs_j(ngb0->Csnd, ngb1->Csnd, ngb2->Csnd, ngb3->Csnd);
+      Vec4d cs_sum = cs_i + cs_j;
+
+      Vec4d decay_vel_2 = cs_sum - vdotr2;
+
+      decision = (vdotr2 > 0);
+
+      Vec4d decay_vel = select(decision, cs_sum, decay_vel_2);
+
+      // find maximum element in vector
+      Vec4d h2 = permute4d<2, 3, 0, 1>(decay_vel);
+      Vec4d h3 = max(decay_vel, h2);
+      Vec4d h4 = permute4d<1, 0, 3, 2>(h3);
+      Vec4d h5 = max(h3, h4);
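+      /* after the two shuffle/max steps every lane of h5 holds the maximum of the four decay velocities */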
+
+      if(h5[0] > targetSphP->decayVel)
+        targetSphP->decayVel = h5[0];
+#endif
+    }
+}
+
+#else
+
+/* Main SPH compute kernel. This function carries out the SPH computations for the neighbouring particle
+ * data stored in the Ngbdensdat[] array. The results are added to the particle referenced through the pdat
+ * structure.
+ */
+void sph::density_evaluate_kernel(pinfo &pdat)
+{
+  particle_data *targetP = &Tp->P[pdat.target];
+  sph_particle_data *targetSphP = &Tp->SphP[pdat.target];
+
+  kernel_density kernel;
+
+  kernel_hinv(targetSphP->Hsml, &kernel.hinv, &kernel.hinv3, &kernel.hinv4);
+
+  for(int n = 0; n < pdat.numngb; n++)
+    {
+      struct ngbdata_density *ngb = &Ngbdensdat[n];
+
+      /*  note: in periodic case, closest image will be found through integer wrap around  */
+
+      double posdiff[3];
+      Tp->nearest_image_intpos_to_pos(targetP->IntPos, ngb->IntPos, posdiff); /* converts the integer distance to floating point */
+
+      for(int i = 0; i < NUMDIMS; i++)
+        {
+          kernel.dpos[i] = posdiff[i];
+        }
+
+      double r2 = 0;
+
+      for(int i = 0; i < NUMDIMS; i++)
+        {
+          r2 += kernel.dpos[i] * kernel.dpos[i];
+        }
+
+      kernel.r = sqrt(r2);
+
+      double u = kernel.r * kernel.hinv;
+
+      kernel_main(u, kernel.hinv3, kernel.hinv4, &kernel.wk, &kernel.dwk, COMPUTE_WK_AND_DWK);
+
+      double mass_j = ngb->Mass;
+      kernel.mj_wk = (mass_j * kernel.wk);
+
+      targetSphP->Density += kernel.mj_wk;
+
+#ifdef PRESSURE_ENTROPY_SPH
+      targetSphP->PressureSphDensity += kernel.mj_wk * ngb->EntropyToInvGammaPred;
+      targetSphP->DhsmlDerivedDensityFactor +=
+          (-mass_j * ngb->EntropyToInvGammaPred * (NUMDIMS * kernel.hinv * kernel.wk + u * kernel.dwk));
+
+#endif
+
+      targetSphP->DhsmlDensityFactor += (-mass_j * (NUMDIMS * kernel.hinv * kernel.wk + u * kernel.dwk));
+
+      if(kernel.r > 0)
+        {
+          kernel.mj_dwk_r = mass_j * kernel.dwk / kernel.r;
+
+          for(int i = 0; i < NUMDIMS; i++)
+            {
+              kernel.dv[i] = targetSphP->VelPred[i] - ngb->VelPred[i];
+            }
+
+#ifndef IMPROVED_VELOCITY_GRADIENTS
+          double dpos_times_dv = 0;
+          for(int i = 0; i < NUMDIMS; i++)
+            dpos_times_dv += kernel.dpos[i] * kernel.dv[i];
+
+          targetSphP->DivVel += (-kernel.mj_dwk_r * (dpos_times_dv));
+#ifdef TWODIMS
+          targetSphP->Rot[2] += (kernel.mj_dwk_r * (kernel.dpos[1] * kernel.dv[0] - kernel.dpos[0] * kernel.dv[1]));
+#endif
+#ifdef THREEDIMS
+          targetSphP->Rot[0] += (kernel.mj_dwk_r * (kernel.dpos[2] * kernel.dv[1] - kernel.dpos[1] * kernel.dv[2]));
+          targetSphP->Rot[1] += (kernel.mj_dwk_r * (kernel.dpos[0] * kernel.dv[2] - kernel.dpos[2] * kernel.dv[0]));
+          targetSphP->Rot[2] += (kernel.mj_dwk_r * (kernel.dpos[1] * kernel.dv[0] - kernel.dpos[0] * kernel.dv[1]));
+#endif
+#else
+          for(int i = 0; i < NUMDIMS; i++)
+            {
+              for(int j = 0; j < NUMDIMS; j++)
+                {
+                  targetSphP->dvel[i][j] -= kernel.mj_dwk_r * kernel.dv[i] * kernel.dpos[j];
+                }
+            }
+
+          targetSphP->dpos.dx_dx -= kernel.mj_dwk_r * kernel.dpos[0] * kernel.dpos[0];
+          targetSphP->dpos.dx_dy -= kernel.mj_dwk_r * kernel.dpos[0] * kernel.dpos[1];
+          targetSphP->dpos.dx_dz -= kernel.mj_dwk_r * kernel.dpos[0] * kernel.dpos[2];
+          targetSphP->dpos.dy_dy -= kernel.mj_dwk_r * kernel.dpos[1] * kernel.dpos[1];
+          targetSphP->dpos.dy_dz -= kernel.mj_dwk_r * kernel.dpos[1] * kernel.dpos[2];
+          targetSphP->dpos.dz_dz -= kernel.mj_dwk_r * kernel.dpos[2] * kernel.dpos[2];
+
+#endif
+#ifdef TIMEDEP_ART_VISC
+          double vdotr2 = 0;
+          for(int i = 0; i < NUMDIMS; i++)
+            {
+              vdotr2 += kernel.dpos[i] * kernel.dv[i];
+            }
+
+          double decay_vel;
+          if(vdotr2 < 0)
+            decay_vel = targetSphP->Csnd + ngb->Csnd - vdotr2;
+          else
+            decay_vel = targetSphP->Csnd + ngb->Csnd;
+          if(decay_vel > targetSphP->decayVel)
+            targetSphP->decayVel = decay_vel;
+#endif
+        }
+    }
+}
+#endif
+
+/* this routine clears the fields in the SphP particle structure that are additively computed by the SPH density loop
+ * by summing over neighbours
+ */
+inline void sph::clear_density_result(sph_particle_data *SphP)
+{
+  SphP->Density            = 0;
+  SphP->DhsmlDensityFactor = 0;
+  SphP->DivVel             = 0;
+
+  for(int k = 0; k < 3; k++)
+    SphP->Rot[k] = 0;
+
+#ifdef PRESSURE_ENTROPY_SPH
+  SphP->PressureSphDensity        = 0;
+  SphP->DhsmlDerivedDensityFactor = 0;
+#endif
+#ifdef IMPROVED_VELOCITY_GRADIENTS
+  SphP->dpos = {0};
+  for(int i = 0; i < NUMDIMS; i++)
+    {
+      for(int j = 0; j < NUMDIMS; j++)
+        {
+          SphP->dvel[i][j] = 0;
+        }
+    }
+#endif
+#ifdef TIMEDEP_ART_VISC
+  SphP->decayVel = 0;
+#endif
+}
diff --git a/src/sph/hydra.cc b/src/sph/hydra.cc
new file mode 100644
index 0000000000000000000000000000000000000000..17d13c00c657c7741eccccbd9efdf207c0953d69
--- /dev/null
+++ b/src/sph/hydra.cc
@@ -0,0 +1,994 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  hydra.cc
+ *
+ *  \brief computation of SPH forces and rate of entropy generation
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <algorithm>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../logs/logs.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../ngbtree/ngbtree.h"
+#include "../sort/cxxsort.h"
+#include "../sph/kernel.h"
+#include "../sph/sph.h"
+#include "../system/system.h"
+
+/*! This file contains the "second SPH loop", where the SPH forces are
+ *  computed, and where the rate of change of entropy due to the shock heating
+ *  (via artificial viscosity) is computed.
+ */
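+
+/* In schematic form (for the default entropy formulation), the kernels below accumulate for each
+ * active particle i
+ *
+ *   dv_i/dt = - sum_j m_j [ f_i P_i/rho_i^2 dW_ij(h_i)/dr + f_j P_j/rho_j^2 dW_ij(h_j)/dr + visc_ij dW_ij/dr ] * (r_ij/|r_ij|)
+ *
+ * where f_i, f_j are the grad-h correction factors and visc_ij is an artificial viscosity that acts
+ * only on approaching pairs; the entropy rate receives only the viscous (shock heating) contribution.
+ */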
+
+inline int sph::sph_hydro_evaluate_particle_node_opening_criterion(pinfo &pdat, ngbnode *nop)
+{
+  if(nop->level <= LEVEL_ALWAYS_OPEN)  // always open the root node (note: full node length does not fit in the integer type)
+    return NODE_OPEN;
+
+  if(nop->Ti_Current != All.Ti_Current)
+    nop->drift_node(All.Ti_Current, Tp);
+
+  MyNgbTreeFloat dist = std::max<MyNgbTreeFloat>(nop->MaxHsml, pdat.hsml);
+
+  MyIntPosType search_min[3], search_range[3];
+
+  MyIntPosType inthsml = dist * Tp->FacCoordToInt;
+
+  for(int i = 0; i < 3; i++)
+    {
+      search_min[i]   = pdat.searchcenter[i] - inthsml;
+      search_range[i] = inthsml + inthsml;
+    }
+
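+  /* the tests below check, dimension by dimension and using periodic integer wrap-around, whether
+   * the node's extent overlaps the search box of half-size max(pdat.hsml, MaxHsml) around the
+   * particle; if any dimension shows no overlap the node can be discarded */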
+  MyIntPosType left[3], right[3];
+
+  left[0]  = Tp->nearest_image_intpos_to_intpos_X(nop->center_offset_min[0] + nop->center[0], search_min[0]);
+  right[0] = Tp->nearest_image_intpos_to_intpos_X(nop->center_offset_max[0] + nop->center[0], search_min[0]);
+
+  /* check whether we can stop walking along this branch */
+  if(left[0] > search_range[0] && right[0] > left[0])
+    return NODE_DISCARD;
+
+  left[1]  = Tp->nearest_image_intpos_to_intpos_Y(nop->center_offset_min[1] + nop->center[1], search_min[1]);
+  right[1] = Tp->nearest_image_intpos_to_intpos_Y(nop->center_offset_max[1] + nop->center[1], search_min[1]);
+
+  /* check whether we can stop walking along this branch */
+  if(left[1] > search_range[1] && right[1] > left[1])
+    return NODE_DISCARD;
+
+  left[2]  = Tp->nearest_image_intpos_to_intpos_Z(nop->center_offset_min[2] + nop->center[2], search_min[2]);
+  right[2] = Tp->nearest_image_intpos_to_intpos_Z(nop->center_offset_max[2] + nop->center[2], search_min[2]);
+
+  /* check whether we can stop walking along this branch */
+  if(left[2] > search_range[2] && right[2] > left[2])
+    return NODE_DISCARD;
+
+  return NODE_OPEN;
+}
+
+inline void sph::sph_hydro_check_particle_particle_interaction(pinfo &pdat, int p, int p_type, unsigned char shmrank)
+{
+#ifdef PRESERVE_SHMEM_BINARY_INVARIANCE
+  if(skip_actual_force_computation)
+    return;
+#endif
+
+  if(p_type == NODE_TYPE_LOCAL_PARTICLE) /* local particle */
+    {
+      particle_data *P        = get_Pp(p, shmrank);
+      sph_particle_data *SphP = get_SphPp(p, shmrank);
+
+      if(P->getType() > 0)
+        return;
+
+      if(P->get_Ti_Current() != All.Ti_Current)
+        Tp->drift_particle(P, SphP, All.Ti_Current);  // this function avoids race conditions
+
+      MyNgbTreeFloat dist   = std::max<MyNgbTreeFloat>(SphP->Hsml, pdat.hsml);
+      MyNgbTreeFloat distsq = dist * dist;
+
+      double posdiff[3];
+      Tp->nearest_image_intpos_to_pos(P->IntPos, pdat.searchcenter, posdiff); /* converts the integer distance to floating point */
+
+      double rad2 = posdiff[0] * posdiff[0] + posdiff[1] * posdiff[1] + posdiff[2] * posdiff[2];
+      if(rad2 > distsq || rad2 == 0)
+        return;
+
+      if(pdat.numngb >= MAX_NGBS)
+        Terminate("pdat.numngb >= MAX_NGBS");
+
+      int n = pdat.numngb++;
+
+      Ngbhydrodat[n].SphCore = SphP;
+      Ngbhydrodat[n].IntPos  = P->IntPos;
+      Ngbhydrodat[n].Mass    = P->getMass();
+#ifndef LEAN
+      Ngbhydrodat[n].TimeBinHydro = P->TimeBinHydro;
+#endif
+    }
+  else if(p_type == NODE_TYPE_FETCHED_PARTICLE)
+    {
+      foreign_sphpoint_data *foreignpoint = get_foreignpointsp(p - EndOfForeignNodes, shmrank);
+
+      MyNgbTreeFloat dist   = std::max<MyNgbTreeFloat>(foreignpoint->SphCore.Hsml, pdat.hsml);
+      MyNgbTreeFloat distsq = dist * dist;
+
+      /* converts the integer distance to floating point */
+      double posdiff[3];
+      Tp->nearest_image_intpos_to_pos(foreignpoint->IntPos, pdat.searchcenter, posdiff);
+
+      double rad2 = posdiff[0] * posdiff[0] + posdiff[1] * posdiff[1] + posdiff[2] * posdiff[2];
+      if(rad2 > distsq || rad2 == 0)
+        return;
+
+      if(pdat.numngb >= MAX_NGBS)
+        Terminate("pdat.numngb >= MAX_NGBS");
+
+      int n = pdat.numngb++;
+
+      Ngbhydrodat[n].SphCore      = &foreignpoint->SphCore;
+      Ngbhydrodat[n].IntPos       = foreignpoint->IntPos;
+      Ngbhydrodat[n].Mass         = foreignpoint->Mass;
+      Ngbhydrodat[n].TimeBinHydro = foreignpoint->TimeBinHydro;
+    }
+  else
+    Terminate("unexpected");
+}
+
+inline void sph::sph_hydro_open_node(pinfo &pdat, ngbnode *nop, int mintopleafnode, int committed)
+{
+  /* open node */
+  int p                 = nop->nextnode;
+  unsigned char shmrank = nop->nextnode_shmrank;
+
+  while(p != nop->sibling || (shmrank != nop->sibling_shmrank && nop->sibling >= MaxPart + D->NTopnodes))
+    {
+      if(p < 0)
+        Terminate(
+            "p=%d < 0  nop->sibling=%d nop->nextnode=%d shmrank=%d nop->sibling_shmrank=%d nop->foreigntask=%d  "
+            "first_nontoplevelnode=%d",
+            p, nop->sibling, nop->nextnode, shmrank, nop->sibling_shmrank, nop->OriginTask, MaxPart + D->NTopnodes);
+
+      int next;
+      unsigned char next_shmrank;
+      char type;
+
+      if(p < MaxPart) /* a local particle */
+        {
+          /* note: here shmrank cannot change */
+          next         = get_nextnodep(shmrank)[p];
+          next_shmrank = shmrank;
+          type         = NODE_TYPE_LOCAL_PARTICLE;
+        }
+      else if(p < MaxPart + MaxNodes) /* an internal node  */
+        {
+          ngbnode *nop = get_nodep(p, shmrank);
+          next         = nop->sibling;
+          next_shmrank = nop->sibling_shmrank;
+          type         = NODE_TYPE_LOCAL_NODE;
+        }
+      else if(p >= ImportedNodeOffset && p < EndOfTreePoints) /* an imported Treepoint particle  */
+        {
+          Terminate("not expected for SPH");
+        }
+      else if(p >= EndOfTreePoints && p < EndOfForeignNodes) /* an imported tree node */
+        {
+          ngbnode *nop = get_nodep(p, shmrank);
+          next         = nop->sibling;
+          next_shmrank = nop->sibling_shmrank;
+          type         = NODE_TYPE_FETCHED_NODE;
+        }
+      else if(p >= EndOfForeignNodes) /* an imported particle below an imported tree node */
+        {
+          foreign_sphpoint_data *foreignpoint = get_foreignpointsp(p - EndOfForeignNodes, shmrank);
+
+          next         = foreignpoint->Nextnode;
+          next_shmrank = foreignpoint->Nextnode_shmrank;
+          type         = NODE_TYPE_FETCHED_PARTICLE;
+        }
+      else
+        {
+          /* a pseudo point */
+          Terminate(
+              "should not happen: p=%d MaxPart=%d MaxNodes=%d  ImportedNodeOffset=%d  EndOfTreePoints=%d  EndOfForeignNodes=%d "
+              "shmrank=%d",
+              p, MaxPart, MaxNodes, ImportedNodeOffset, EndOfTreePoints, EndOfForeignNodes, shmrank);
+        }
+
+      sph_hydro_interact(pdat, p, type, shmrank, mintopleafnode, committed);
+
+      p       = next;
+      shmrank = next_shmrank;
+    }
+}
+
+inline void sph::sph_hydro_interact(pinfo &pdat, int no, char no_type, unsigned char shmrank, int mintopleafnode, int committed)
+{
+  if(no_type <= NODE_TYPE_FETCHED_PARTICLE)  // we are interacting with a particle
+    {
+      sph_hydro_check_particle_particle_interaction(pdat, no, no_type, shmrank);
+    }
+  else  // we are interacting with a node
+    {
+      ngbnode *nop = get_nodep(no, shmrank);
+
+      if(nop->not_empty == 0)
+        return;
+
+      if(no < MaxPart + MaxNodes)                // we have a top-level node
+        if(nop->nextnode >= MaxPart + MaxNodes)  // if the next node is not a top-level node, this is a leaf of the top-level tree
+          mintopleafnode = no;
+
+      int openflag = sph_hydro_evaluate_particle_node_opening_criterion(pdat, nop);
+
+      if(openflag == NODE_OPEN) /* we need to open it */
+        {
+          if(nop->cannot_be_opened_locally.load(std::memory_order_acquire))
+            {
+              // are we in the same shared memory node?
+              if(Shmem.GetNodeIDForSimulCommRank[nop->OriginTask] == Shmem.GetNodeIDForSimulCommRank[D->ThisTask])
+                {
+                  Terminate("this should not happen any more");
+                }
+              else
+                {
+                  tree_add_to_fetch_stack(nop, no, shmrank);  // will only add unique copies
+
+                  tree_add_to_work_stack(pdat.target, no, shmrank, mintopleafnode);
+                }
+            }
+          else
+            {
+              int min_buffer_space =
+                  std::min<int>(MaxOnWorkStack - (NumOnWorkStack + NewOnWorkStack), MaxOnFetchStack - NumOnFetchStack);
+
+              if(min_buffer_space >= committed + 8 * TREE_NUM_BEFORE_NODESPLIT)
+                sph_hydro_open_node(pdat, nop, mintopleafnode, committed + 8 * TREE_NUM_BEFORE_NODESPLIT);
+              else
+                tree_add_to_work_stack(pdat.target, no, shmrank, mintopleafnode);
+            }
+        }
+    }
+}
+
+void sph::hydro_forces_determine(int ntarget, int *targetlist)
+{
+  TIMER_STORE;
+  TIMER_START(CPU_HYDRO);
+
+  D->mpi_printf("SPH-HYDRO: Begin hydro-force calculation.  (presently allocated=%g MB)\n", Mem.getAllocatedBytesInMB());
+  D->mpi_printf("SPH-HYDRO: global Nhydro=%llu (task zero: NumGas=%d, Nhydro=%d)\n", Tp->TimeBinsHydro.GlobalNActiveParticles,
+                Tp->NumGas, ntarget);
+
+  double ta = Logs.second();
+
+  // let's grab at most about a third of the still available memory for imported points and nodes
+  int nspace = (0.33 * Mem.FreeBytes) / (sizeof(ngbnode) + 8 * sizeof(foreign_sphpoint_data));
+
+  MaxForeignNodes  = nspace;
+  MaxForeignPoints = 8 * nspace;
+  NumForeignNodes  = 0;
+  NumForeignPoints = 0;
+
+  sum_NumForeignNodes  = 0;
+  sum_NumForeignPoints = 0;
+
+  /* the following two arrays hold imported tree nodes and imported points to augment the local tree */
+  Foreign_Nodes  = (ngbnode *)Mem.mymalloc_movable(&Foreign_Nodes, "Foreign_Nodes", MaxForeignNodes * sizeof(ngbnode));
+  Foreign_Points = (foreign_sphpoint_data *)Mem.mymalloc_movable(&Foreign_Points, "Foreign_Points",
+                                                                 MaxForeignPoints * sizeof(foreign_sphpoint_data));
+
+  tree_initialize_leaf_node_access_info();
+
+  max_ncycles = 0;
+
+  prepare_shared_memory_access();
+
+  if(All.ComovingIntegrationOn)
+    {
+      fac_mu       = pow(All.Time, 3 * (GAMMA - 1) / 2) / All.Time;
+      fac_vsic_fix = All.cf_hubble_a * pow(All.Time, 3 * GAMMA_MINUS1);
+    }
+  else
+    {
+      fac_mu       = 1.0;
+      fac_vsic_fix = 1.0;
+    }
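+
+  /* comoving-integration prefactors for the velocity-dependent viscosity terms (they enter mu_ij and
+   * the viscosity time-step limiter below); both reduce to unity for non-comoving runs */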
+
+  Ngbhydrodat = (ngbdata_hydro *)Mem.mymalloc("Ngbhydrodat", MAX_NGBS * sizeof(ngbdata_hydro));
+
+  NumOnWorkStack         = 0;
+  AllocWorkStackBaseLow  = std::max<int>(1.5 * (Tp->NumPart + NumPartImported), TREE_MIN_WORKSTACK_SIZE);
+  AllocWorkStackBaseHigh = AllocWorkStackBaseLow + TREE_EXPECTED_CYCLES * TREE_MIN_WORKSTACK_SIZE;
+  MaxOnWorkStack         = AllocWorkStackBaseLow;
+
+  WorkStack = (workstack_data *)Mem.mymalloc("WorkStack", AllocWorkStackBaseHigh * sizeof(workstack_data));
+
+  for(int i = 0; i < ntarget; i++)
+    {
+      int target = targetlist[i];
+
+      clear_hydro_result(&Tp->SphP[target]);
+
+      WorkStack[NumOnWorkStack].Target         = target;
+      WorkStack[NumOnWorkStack].Node           = MaxPart;
+      WorkStack[NumOnWorkStack].ShmRank        = Shmem.Island_ThisTask;
+      WorkStack[NumOnWorkStack].MinTopLeafNode = MaxPart + D->NTopnodes;
+      NumOnWorkStack++;
+    }
+
+#ifdef PRESERVE_SHMEM_BINARY_INVARIANCE
+  workstack_data *WorkStackBak = (workstack_data *)Mem.mymalloc("WorkStackBak", NumOnWorkStack * sizeof(workstack_data));
+  int NumOnWorkStackBak        = NumOnWorkStack;
+  memcpy(WorkStackBak, WorkStack, NumOnWorkStack * sizeof(workstack_data));
+#endif
+
+  // set a default size of the fetch stack equal to 10% of the local particle number (this may still be somewhat too large)
+  MaxOnFetchStack = std::max<int>(0.1 * (Tp->NumPart + NumPartImported), TREE_MIN_WORKSTACK_SIZE);
+  StackToFetch    = (fetch_data *)Mem.mymalloc_movable(&StackToFetch, "StackToFetch", MaxOnFetchStack * sizeof(fetch_data));
+
+#ifdef PRESERVE_SHMEM_BINARY_INVARIANCE
+  for(int rep = 0; rep < 2; rep++)
+    {
+      if(rep == 0)
+        {
+          skip_actual_force_computation = true;
+        }
+      else
+        {
+          skip_actual_force_computation = false;
+          NumOnWorkStack                = NumOnWorkStackBak;
+          memcpy(WorkStack, WorkStackBak, NumOnWorkStack * sizeof(workstack_data));
+        }
+#endif
+
+      while(NumOnWorkStack > 0)  // repeat until we are out of work
+        {
+          NewOnWorkStack  = 0;  // gives the new entries
+          NumOnFetchStack = 0;
+          MaxOnWorkStack  = std::min<int>(AllocWorkStackBaseLow + max_ncycles * TREE_MIN_WORKSTACK_SIZE, AllocWorkStackBaseHigh);
+
+          TIMER_START(CPU_HYDROWALK);
+
+          int item = 0;
+
+          while(item < NumOnWorkStack)
+            {
+              int committed = 8 * TREE_NUM_BEFORE_NODESPLIT;
+              int min_buffer_space =
+                  std::min<int>(MaxOnWorkStack - (NumOnWorkStack + NewOnWorkStack), MaxOnFetchStack - NumOnFetchStack);
+              if(min_buffer_space >= committed)
+                {
+                  int target     = WorkStack[item].Target;
+                  int no         = WorkStack[item].Node;
+                  int shmrank    = WorkStack[item].ShmRank;
+                  int mintopleaf = WorkStack[item].MinTopLeafNode;
+                  item++;
+
+                  pinfo pdat;
+                  get_pinfo(target, pdat);
+
+                  if(no == MaxPart)
+                    {
+                      // we have a pristine particle that's processed for the first time
+                      sph_hydro_interact(pdat, no, NODE_TYPE_LOCAL_NODE, shmrank, mintopleaf, committed);
+                    }
+                  else
+                    {
+                      // we have a node that we previously could not open
+                      ngbnode *nop = get_nodep(no, shmrank);
+
+                      if(nop->cannot_be_opened_locally)
+                        {
+                          Terminate("item=%d:  no=%d  now we should be able to open it!", item, no);
+                        }
+                      else
+                        sph_hydro_open_node(pdat, nop, mintopleaf, committed);
+                    }
+
+                  hydro_evaluate_kernel(pdat);
+                }
+              else
+                break;
+            }
+
+          if(item == 0 && NumOnWorkStack > 0)
+            Terminate("Can't even process a single particle");
+
+          TIMER_STOP(CPU_HYDROWALK);
+
+          TIMER_START(CPU_HYDROFETCH);
+
+          tree_fetch_foreign_nodes(FETCH_SPH_HYDRO);
+
+          TIMER_STOP(CPU_HYDROFETCH);
+
+          /* now reorder the workstack such that we are first going to do residual pristine particles, and then
+           * imported nodes that hang below the first leaf nodes */
+          NumOnWorkStack = NumOnWorkStack - item + NewOnWorkStack;
+          memmove(WorkStack, WorkStack + item, NumOnWorkStack * sizeof(workstack_data));
+
+          /* now let's sort such that we can go deep on top-level node branches, allowing us to clear them out eventually */
+          mycxxsort(WorkStack, WorkStack + NumOnWorkStack, compare_workstack);
+
+          max_ncycles++;
+        }
+
+#ifdef PRESERVE_SHMEM_BINARY_INVARIANCE
+    }
+#endif
+
+  Mem.myfree(StackToFetch);
+#ifdef PRESERVE_SHMEM_BINARY_INVARIANCE
+  Mem.myfree(WorkStackBak);
+#endif
+  Mem.myfree(WorkStack);
+  Mem.myfree(Ngbhydrodat);
+
+  /* now factor in a prefactor for the computed rates */
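+  /* the neighbour loop accumulated only the viscous heating term in DtEntropy; the factor
+   * GAMMA_MINUS1 / rho^(GAMMA-1) (together with the comoving factor cf_atime2_hubble_a) converts this
+   * into the rate of change of the entropic function used by the code */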
+  for(int i = 0; i < ntarget; i++)
+    {
+      int target = targetlist[i];
+
+      double fac = GAMMA_MINUS1 / (All.cf_atime2_hubble_a * pow(Tp->SphP[target].Density, GAMMA_MINUS1));
+
+      Tp->SphP[target].DtEntropy *= fac;
+    }
+
+  /* Now the tree-based hydrodynamical force computation is finished,
+   * output some performance metrics
+   */
+
+  TIMER_START(CPU_HYDROIMBALANCE);
+
+  MPI_Allreduce(MPI_IN_PLACE, &max_ncycles, 1, MPI_INT, MPI_MAX, D->Communicator);
+
+  TIMER_STOP(CPU_HYDROIMBALANCE);
+
+  cleanup_shared_memory_access();
+
+  /* free temporary buffers */
+  Mem.myfree(Foreign_Points);
+  Mem.myfree(Foreign_Nodes);
+
+  double tb = Logs.second();
+
+  TIMER_STOPSTART(CPU_HYDRO, CPU_LOGS);
+
+  D->mpi_printf("SPH-HYDRO: hydro-force computation done. took %8.3f\n", Logs.timediff(ta, tb));
+
+  struct detailed_timings
+  {
+    double tree, wait, fetch, all;
+    double numnodes;
+    double NumForeignNodes, NumForeignPoints;
+    double fillfacFgnNodes, fillfacFgnPoints;
+  };
+  detailed_timings timer, tisum, timax;
+
+  timer.tree             = TIMER_DIFF(CPU_HYDROWALK);
+  timer.wait             = TIMER_DIFF(CPU_HYDROIMBALANCE);
+  timer.fetch            = TIMER_DIFF(CPU_HYDROFETCH);
+  timer.all              = timer.tree + timer.wait + timer.fetch + TIMER_DIFF(CPU_HYDRO);
+  timer.numnodes         = NumNodes;
+  timer.NumForeignNodes  = NumForeignNodes;
+  timer.NumForeignPoints = NumForeignPoints;
+  timer.fillfacFgnNodes  = NumForeignNodes / ((double)MaxForeignNodes);
+  timer.fillfacFgnPoints = NumForeignPoints / ((double)MaxForeignPoints);
+
+  MPI_Reduce((double *)&timer, (double *)&tisum, (int)(sizeof(detailed_timings) / sizeof(double)), MPI_DOUBLE, MPI_SUM, 0,
+             D->Communicator);
+  MPI_Reduce((double *)&timer, (double *)&timax, (int)(sizeof(detailed_timings) / sizeof(double)), MPI_DOUBLE, MPI_MAX, 0,
+             D->Communicator);
+
+  All.TotNumHydro += Tp->TimeBinsHydro.GlobalNActiveParticles;
+
+  if(D->ThisTask == 0)
+    {
+      fprintf(Logs.FdHydro, "Nf=%9lld  highest active timebin=%d  total-Nf=%lld\n", Tp->TimeBinsHydro.GlobalNActiveParticles,
+              All.HighestActiveTimeBin, All.TotNumHydro);
+      fprintf(Logs.FdHydro, "   work-load balance: %g   part/sec: raw=%g, effective=%g\n",
+              timax.tree / ((tisum.tree + 1e-20) / D->NTask), Tp->TimeBinsGravity.GlobalNActiveParticles / (tisum.tree + 1.0e-20),
+              Tp->TimeBinsGravity.GlobalNActiveParticles / ((timax.tree + 1.0e-20) * D->NTask));
+      fprintf(Logs.FdHydro,
+              "   maximum number of nodes: %g, filled: %g  NumForeignNodes: max=%g avg=%g fill=%g NumForeignPoints: max=%g avg=%g "
+              "fill=%g  cycles=%d\n",
+              timax.numnodes, timax.numnodes / MaxNodes, timax.NumForeignNodes, tisum.NumForeignNodes / D->NTask,
+              timax.fillfacFgnNodes, timax.NumForeignPoints, tisum.NumForeignPoints / D->NTask, timax.fillfacFgnPoints, max_ncycles);
+      fprintf(Logs.FdHydro, "   avg times: <all>=%g  <tree>=%g  <wait>=%g  <fetch>=%g  sec\n", tisum.all / D->NTask,
+              tisum.tree / D->NTask, tisum.wait / D->NTask, tisum.fetch / D->NTask);
+      myflush(Logs.FdHydro);
+    }
+
+  TIMER_STOP(CPU_LOGS);
+}
+
+#ifdef EXPLICIT_VECTORIZATION
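+
+/* Explicitly vectorized version of the SPH force kernel: four neighbours from the Ngbhydrodat[]
+ * array are processed per loop iteration, and the hydrodynamical acceleration, entropy rate and
+ * maximum signal velocity of the target particle are accumulated. It should be semantically
+ * equivalent to the scalar version further below.
+ */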
+void sph::hydro_evaluate_kernel(pinfo &pdat)
+{
+#ifndef LEAN
+  particle_data *P_i        = &Tp->P[pdat.target];
+  sph_particle_data *SphP_i = &Tp->SphP[pdat.target];
+
+  /* the particle needs to be active */
+  if(P_i->getTimeBinHydro() > All.HighestSynchronizedTimeBin)
+    Terminate("encountered an inactive particle in the hydro force kernel");
+
+  double shinv, shinv3, shinv4;
+  kernel_hinv(SphP_i->Hsml, &shinv, &shinv3, &shinv4);
+
+  Vec4d hinv(shinv);
+  Vec4d hinv3(shinv3);
+  Vec4d hinv4(shinv4);
+
+  Vec4d dwnorm(NORM * shinv3);
+  Vec4d dwknorm(NORM * shinv4);
+
+  Vec4d rho_i(SphP_i->Density);
+
+#ifdef PRESSURE_ENTROPY_SPH
+  Vec4d p_over_rho2_i((double)SphP_i->Pressure / ((double)SphP_i->PressureSphDensity * (double)SphP_i->PressureSphDensity));
+#else
+  Vec4d p_over_rho2_i((double)SphP_i->Pressure / ((double)SphP_i->Density * (double)SphP_i->Density));
+#endif
+
+  Vec4d sound_i(SphP_i->Csnd);
+  Vec4d h_i(SphP_i->Hsml);
+
+  Vec4d v_i[3];
+  for(int i = 0; i < NUMDIMS; i++)
+    {
+      v_i[i] = SphP_i->VelPred[i];
+    }
+  Vec4d DhsmlDensityFactor_i(SphP_i->DhsmlDensityFactor);
+#ifdef PRESSURE_ENTROPY_SPH
+  Vec4d DhsmlDerivedDensityFactor_i(SphP_i->DhsmlDerivedDensityFactor);
+  Vec4d EntropyToInvGammaPred_i(SphP_i->EntropyToInvGammaPred);
+#endif
+
+#if !defined(NO_SHEAR_VISCOSITY_LIMITER) && !defined(TIMEDEP_ART_VISC)
+  Vec4d f_i(fabs(SphP_i->DivVel) / (fabs(SphP_i->DivVel) + SphP_i->CurlVel + 0.0001 * SphP_i->Csnd / SphP_i->Hsml / fac_mu));
+#endif
+
+#ifdef TIMEDEP_ART_VISC
+  Vec4d alpha_i(SphP_i->Alpha);
+#endif
+  /* Now start the actual SPH computation for this particle */
+
+  double dacc[3]     = {0};
+  double dentr       = 0;
+  Vec4d MaxSignalVel = sound_i;
+
+  const int vector_length = 4;
+  const int array_length  = (pdat.numngb + vector_length - 1) & (-vector_length);
+
+  for(int n = pdat.numngb; n < array_length; n++) /* fill up neighbour array so that sensible data is accessed */
+    Ngbhydrodat[n] = Ngbhydrodat[0];
+
+  for(int n = 0; n < array_length; n += vector_length)
+    {
+      sph_particle_data_hydrocore *ngb0 = Ngbhydrodat[n + 0].SphCore;
+      sph_particle_data_hydrocore *ngb1 = Ngbhydrodat[n + 1].SphCore;
+      sph_particle_data_hydrocore *ngb2 = Ngbhydrodat[n + 2].SphCore;
+      sph_particle_data_hydrocore *ngb3 = Ngbhydrodat[n + 3].SphCore;
+
+      ngbdata_hydro *P0_j = &Ngbhydrodat[n + 0];
+      ngbdata_hydro *P1_j = &Ngbhydrodat[n + 1];
+      ngbdata_hydro *P2_j = &Ngbhydrodat[n + 2];
+      ngbdata_hydro *P3_j = &Ngbhydrodat[n + 3];
+
+      /* converts the integer distance to floating point */
+      Vec4d dpos[NUMDIMS];
+      double posdiff[vector_length][3]; /* one entry per vector lane */
+      for(int i = 0; i < 4; i++)
+        {
+          Tp->nearest_image_intpos_to_pos(P_i->IntPos, Ngbhydrodat[n + i].IntPos, &(posdiff[i][0]));
+        }
+
+      for(int i = 0; i < NUMDIMS; i++)
+        {
+          dpos[i] = Vec4d(posdiff[0][i], posdiff[1][i], posdiff[2][i], posdiff[3][i]);
+        }
+
+      Vec4d r2(0);
+
+      for(int i = 0; i < NUMDIMS; i++)
+        {
+          r2 += dpos[i] * dpos[i];
+        }
+
+      Vec4d r = sqrt(r2);
+
+      Vec4d v_j[NUMDIMS];
+      for(int i = 0; i < NUMDIMS; i++)
+        {
+          v_j[i] = Vec4d(ngb0->VelPred[i], ngb1->VelPred[i], ngb2->VelPred[i], ngb3->VelPred[i]);
+        }
+
+      Vec4d pressure(ngb0->Pressure, ngb1->Pressure, ngb2->Pressure, ngb3->Pressure);
+      Vec4d rho_j(ngb0->Density, ngb1->Density, ngb2->Density, ngb3->Density);
+#ifdef PRESSURE_ENTROPY_SPH
+      Vec4d rho_press_j(ngb0->PressureSphDensity, ngb1->PressureSphDensity, ngb2->PressureSphDensity, ngb3->PressureSphDensity);
+      Vec4d p_over_rho2_j = pressure / (rho_press_j * rho_press_j);
+#else
+      Vec4d p_over_rho2_j = pressure / (rho_j * rho_j);
+#endif
+
+      Vec4d wk_i, dwk_i;
+      Vec4d u = r * hinv;
+      kernel_main_vector(u, dwnorm, dwknorm, &wk_i, &dwk_i);
+      Vec4db decision = (r < h_i);
+      Vec4d fac       = select(decision, 1., 0.);
+      wk_i *= fac;
+      dwk_i *= fac;
+
+      Vec4d h_j(ngb0->Hsml, ngb1->Hsml, ngb2->Hsml, ngb3->Hsml);
+      Vec4d hinv_j = 1 / h_j;
+#ifdef THREEDIMS
+      Vec4d hinv3_j = hinv_j * hinv_j * hinv_j;
+#endif
+
+#ifdef TWODIMS
+      Vec4d hinv3_j = hinv_j * hinv_j;
+#endif
+
+#ifdef ONEDIMS
+      Vec4d hinv3_j = hinv_j;
+#endif
+      Vec4d hinv4_j = hinv3_j * hinv_j;
+
+      Vec4d wk_j, dwk_j;
+      u = r * hinv_j;
+      kernel_main_vector(u, NORM * hinv3_j, NORM * hinv4_j, &wk_j, &dwk_j);
+      decision = (r < h_j);
+      fac      = select(decision, 1., 0.);
+      wk_j *= fac;
+      dwk_j *= fac;
+
+      Vec4d sound_j(ngb0->Csnd, ngb1->Csnd, ngb2->Csnd, ngb3->Csnd);
+      Vec4d vsig = sound_i + sound_j;
+      if(n + vector_length > pdat.numngb)
+        {
+          wk_i.cutoff(vector_length - (array_length - pdat.numngb));
+          dwk_i.cutoff(vector_length - (array_length - pdat.numngb));
+          wk_j.cutoff(vector_length - (array_length - pdat.numngb));
+          dwk_j.cutoff(vector_length - (array_length - pdat.numngb));
+          vsig.cutoff(vector_length - (array_length - pdat.numngb));
+        }
+
+      Vec4d dwk_ij = 0.5 * (dwk_i + dwk_j);
+
+      MaxSignalVel = max(MaxSignalVel, vsig);
+
+      Vec4d visc(0);
+
+      Vec4d dv[NUMDIMS];
+      for(int i = 0; i < NUMDIMS; i++)
+        {
+          dv[i] = v_i[i] - v_j[i];
+        }
+
+      Vec4d vdotr2(0);
+      for(int i = 0; i < NUMDIMS; i++)
+        {
+          vdotr2 += dv[i] * dpos[i];
+        }
+
+      if(All.ComovingIntegrationOn)
+        vdotr2 += All.cf_atime2_hubble_a * r2;
+
+      decision            = (vdotr2 < 0);
+      Vec4d viscosity_fac = select(decision, 1, 0);
+
+      /* ... artificial viscosity */
+
+      Vec4d mu_ij = fac_mu * vdotr2 / r;
+
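+      /* for approaching pairs (vdotr2 < 0, hence mu_ij < 0) this raises the signal velocity to
+       * c_i + c_j - 3 mu_ij, the value that enters the viscosity below */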
+      vsig -= 3 * mu_ij;
+
+#if defined(NO_SHEAR_VISCOSITY_LIMITER) || defined(TIMEDEP_ART_VISC)
+      Vec4d f_i(1);
+      Vec4d f_j(1);
+#else
+      Vec4d DivVel_j(ngb0->DivVel, ngb1->DivVel, ngb2->DivVel, ngb3->DivVel);
+      Vec4d CurlVel_j(ngb0->CurlVel, ngb1->CurlVel, ngb2->CurlVel, ngb3->CurlVel);
+      Vec4d f_j = abs(DivVel_j) / (abs(DivVel_j) + CurlVel_j + 0.0001 * sound_j / fac_mu * hinv_j);
+#endif
+
+#ifdef TIMEDEP_ART_VISC
+      Vec4d alpha_j(ngb0->Alpha, ngb1->Alpha, ngb2->Alpha, ngb3->Alpha);
+      Vec4d BulkVisc_ij = 0.5 * (alpha_i + alpha_j);
+
+#else
+      Vec4d BulkVisc_ij(All.ArtBulkViscConst);
+#endif
+      Vec4d rho_ij_inv = 2.0 / (rho_i + rho_j);
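+      /* artificial viscosity term built from the signal velocity, the mean inverse density and the
+       * shear-limiter factors f_i, f_j; it acts on approaching pairs only through the viscosity_fac
+       * mask applied further below */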
+      visc             = 0.25 * BulkVisc_ij * vsig * (-mu_ij) * rho_ij_inv * (f_i + f_j);
+      Vec4d mass_j(P0_j->Mass, P1_j->Mass, P2_j->Mass, P3_j->Mass);
+#ifdef VISCOSITY_LIMITER_FOR_LARGE_TIMESTEPS
+      Vec4i timeBin_i(P_i->TimeBinHydro);
+      Vec4i timeBin_j(P0_j->TimeBinHydro, P1_j->TimeBinHydro, P2_j->TimeBinHydro, P3_j->TimeBinHydro);
+
+      Vec4i timebin = max(timeBin_i, timeBin_j);
+      Vec4i integer_time(((integertime)1) << timebin[0], ((integertime)1) << timebin[1], ((integertime)1) << timebin[2],
+                         ((integertime)1) << timebin[3]);
+
+      Vec4ib decision_i    = (timebin != 0);
+      Vec4i factor_timebin = select(decision_i, Vec4i(1), Vec4i(0));
+      Vec4d dt             = to_double(2 * integer_time * factor_timebin) * All.Timebase_interval;
+
+      decision = (dt > 0 && dwk_ij < 0);
+
+      Vec4d visc_alternative = 0.5 * fac_vsic_fix * vdotr2 / ((P_i->getMass() + mass_j) * dwk_ij * r * dt);
+
+      Vec4d visc2 = select(decision, visc_alternative, visc);
+      visc        = min(visc, visc2);
+#endif
+
+      Vec4d hfc_visc = mass_j * visc * dwk_ij / r * viscosity_fac;
+
+#ifndef PRESSURE_ENTROPY_SPH
+      /* Formulation derived from the Lagrangian */
+      dwk_i *= DhsmlDensityFactor_i;
+      Vec4d DhsmlDensityFactor_j(ngb0->DhsmlDensityFactor, ngb1->DhsmlDensityFactor, ngb2->DhsmlDensityFactor,
+                                 ngb3->DhsmlDensityFactor);
+      dwk_j *= DhsmlDensityFactor_j;
+
+      Vec4d hfc = mass_j * (p_over_rho2_i * dwk_i + p_over_rho2_j * dwk_j) / r + hfc_visc;
+#else
+      Vec4d EntropyToInvGammaPred_j(ngb0->EntropyToInvGammaPred, ngb1->EntropyToInvGammaPred, ngb2->EntropyToInvGammaPred,
+                                    ngb3->EntropyToInvGammaPred);
+      Vec4d DhsmlDerivedDensityFactor_j(ngb0->DhsmlDerivedDensityFactor, ngb1->DhsmlDerivedDensityFactor,
+                                        ngb2->DhsmlDerivedDensityFactor, ngb3->DhsmlDerivedDensityFactor);
+      /* leading order term */
+      Vec4d hfc = mass_j *
+                  (p_over_rho2_i * dwk_i * EntropyToInvGammaPred_j / EntropyToInvGammaPred_i +
+                   p_over_rho2_j * dwk_j * EntropyToInvGammaPred_i / EntropyToInvGammaPred_j) /
+                  r;
+
+      /* grad-h term */
+      hfc += mass_j *
+             (p_over_rho2_i * dwk_i * SphP_i->DhsmlDerivedDensityFactor + p_over_rho2_j * dwk_j * DhsmlDerivedDensityFactor_j) / r;
+
+      /* add viscous term */
+      hfc += hfc_visc;
+#endif
+
+      for(int i = 0; i < NUMDIMS; i++)
+        {
+          dacc[i] += horizontal_add(-hfc * dpos[i]);
+        }
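+      /* only the viscous part of the interaction generates entropy: half of the pairwise dissipation
+       * m_j * visc * dW_ij * vdotr2 / r is attributed to particle i */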
+      dentr += horizontal_add(0.5 * (hfc_visc)*vdotr2);
+    }
+
+  SphP_i->HydroAccel[0] += dacc[0];
+  SphP_i->HydroAccel[1] += dacc[1];
+  SphP_i->HydroAccel[2] += dacc[2];
+  SphP_i->DtEntropy += dentr;
+
+  for(int i = 0; i < 4; i++)
+    {
+      if(SphP_i->MaxSignalVel < MaxSignalVel[i])
+        SphP_i->MaxSignalVel = MaxSignalVel[i];
+    }
+#endif
+}
+
+#else
+
+/*! This function is the 'core' of the SPH force computation. It carries out the SPH force
+ *  calculation for the target particle referenced through pdat, using the neighbour data
+ *  collected in the Ngbhydrodat[] array.
+ */
+void sph::hydro_evaluate_kernel(pinfo &pdat)
+{
+#ifndef LEAN
+  particle_data *P_i        = &Tp->P[pdat.target];
+  sph_particle_data *SphP_i = &Tp->SphP[pdat.target];
+
+  /* the particle needs to be active */
+  if(P_i->getTimeBinHydro() > All.HighestSynchronizedTimeBin)
+    Terminate("encountered an inactive particle in the hydro force kernel");
+
+#ifdef PRESSURE_ENTROPY_SPH
+  double p_over_rho2_i = (double)SphP_i->Pressure / ((double)SphP_i->PressureSphDensity * (double)SphP_i->PressureSphDensity);
+#else
+  double p_over_rho2_i = (double)SphP_i->Pressure / ((double)SphP_i->Density * (double)SphP_i->Density);
+#endif
+
+  kernel_hydra kernel;
+
+  kernel.sound_i = SphP_i->Csnd;
+  kernel.h_i     = SphP_i->Hsml;
+
+  /* Now start the actual SPH computation for this particle */
+
+  double daccx        = 0;
+  double daccy        = 0;
+  double daccz        = 0;
+  double dentr        = 0;
+  double MaxSignalVel = kernel.sound_i;
+
+  for(int n = 0; n < pdat.numngb; n++)
+    {
+      sph_particle_data_hydrocore *SphP_j = Ngbhydrodat[n].SphCore;
+      ngbdata_hydro *P_j                  = &Ngbhydrodat[n];
+
+      /* converts the integer distance to floating point */
+      double posdiff[3];
+      Tp->nearest_image_intpos_to_pos(P_i->IntPos, P_j->IntPos, posdiff);
+
+      kernel.dx = posdiff[0];
+      kernel.dy = posdiff[1];
+      kernel.dz = posdiff[2];
+
+      double r2  = kernel.dx * kernel.dx + kernel.dy * kernel.dy + kernel.dz * kernel.dz;
+      kernel.h_j = SphP_j->Hsml;
+
+      if(r2 < kernel.h_i * kernel.h_i || r2 < kernel.h_j * kernel.h_j)
+        {
+          kernel.r = sqrt(r2);
+          if(kernel.r > 0)
+            {
+#ifdef PRESSURE_ENTROPY_SPH
+              double p_over_rho2_j =
+                  (double)SphP_j->Pressure / ((double)SphP_j->PressureSphDensity * (double)SphP_j->PressureSphDensity);
+#else
+              double p_over_rho2_j = (double)SphP_j->Pressure / ((double)SphP_j->Density * (double)SphP_j->Density);
+#endif
+
+              kernel.sound_j = SphP_j->Csnd;
+
+              kernel.dvx        = SphP_i->VelPred[0] - SphP_j->VelPred[0];
+              kernel.dvy        = SphP_i->VelPred[1] - SphP_j->VelPred[1];
+              kernel.dvz        = SphP_i->VelPred[2] - SphP_j->VelPred[2];
+              kernel.vdotr2     = kernel.dx * kernel.dvx + kernel.dy * kernel.dvy + kernel.dz * kernel.dvz;
+              kernel.rho_ij_inv = 2.0 / (SphP_i->Density + SphP_j->Density);
+
+              if(All.ComovingIntegrationOn)
+                kernel.vdotr2 += All.cf_atime2_hubble_a * r2;
+
+              double hinv, hinv3, hinv4;
+              if(kernel.r < kernel.h_i)
+                {
+                  kernel_hinv(kernel.h_i, &hinv, &hinv3, &hinv4);
+                  double u = kernel.r * hinv;
+                  kernel_main(u, hinv3, hinv4, &kernel.wk_i, &kernel.dwk_i, COMPUTE_DWK);
+                }
+              else
+                {
+                  kernel.dwk_i = 0;
+                  kernel.wk_i  = 0;
+                }
+
+              if(kernel.r < kernel.h_j)
+                {
+                  kernel_hinv(kernel.h_j, &hinv, &hinv3, &hinv4);
+                  double u = kernel.r * hinv;
+                  kernel_main(u, hinv3, hinv4, &kernel.wk_j, &kernel.dwk_j, COMPUTE_DWK);
+                }
+              else
+                {
+                  kernel.dwk_j = 0;
+                  kernel.wk_j  = 0;
+                }
+
+              kernel.dwk_ij = 0.5 * (kernel.dwk_i + kernel.dwk_j);
+
+              kernel.vsig = kernel.sound_i + kernel.sound_j;
+
+              if(kernel.vsig > MaxSignalVel)
+                MaxSignalVel = kernel.vsig;
+
+              double visc = 0;
+
+              if(kernel.vdotr2 < 0) /* ... artificial viscosity */
+                {
+                  double mu_ij = fac_mu * kernel.vdotr2 / kernel.r;
+
+                  kernel.vsig -= 3 * mu_ij;
+
+#if defined(NO_SHEAR_VISCOSITY_LIMITER) || defined(TIMEDEP_ART_VISC)
+                  double f_i         = 1.;
+                  double f_j         = 1.;
+#else
+                  double f_i =
+                      fabs(SphP_i->DivVel) / (fabs(SphP_i->DivVel) + SphP_i->CurlVel + 0.0001 * SphP_i->Csnd / SphP_i->Hsml / fac_mu);
+
+                  double f_j =
+                      fabs(SphP_j->DivVel) / (fabs(SphP_j->DivVel) + SphP_j->CurlVel + 0.0001 * kernel.sound_j / fac_mu / kernel.h_j);
+#endif
+
+#ifdef TIMEDEP_ART_VISC
+                  double BulkVisc_ij = 0.5 * (SphP_i->Alpha + SphP_j->Alpha);
+
+#else
+                  double BulkVisc_ij = All.ArtBulkViscConst;
+#endif
+
+                  visc        = 0.25 * BulkVisc_ij * kernel.vsig * (-mu_ij) * kernel.rho_ij_inv * (f_i + f_j);
+#ifdef VISCOSITY_LIMITER_FOR_LARGE_TIMESTEPS
+                  int timebin = std::max<int>(P_i->TimeBinHydro, P_j->TimeBinHydro);
+
+                  double dt = 2 * (timebin ? (((integertime)1) << timebin) : 0) * All.Timebase_interval;
+
+                  if(dt > 0 && kernel.dwk_ij < 0)
+                    {
+                      visc = std::min<double>(
+                          visc, 0.5 * fac_vsic_fix * kernel.vdotr2 / ((P_i->getMass() + P_j->Mass) * kernel.dwk_ij * kernel.r * dt));
+                    }
+#endif
+                }
+
+              double hfc_visc = P_j->Mass * visc * kernel.dwk_ij / kernel.r;
+
+#ifndef PRESSURE_ENTROPY_SPH
+              /* Formulation derived from the Lagrangian */
+              kernel.dwk_i *= SphP_i->DhsmlDensityFactor;
+              kernel.dwk_j *= SphP_j->DhsmlDensityFactor;
+
+              double hfc = P_j->Mass * (p_over_rho2_i * kernel.dwk_i + p_over_rho2_j * kernel.dwk_j) / kernel.r + hfc_visc;
+#else
+              /* leading order term */
+              double hfc = P_j->Mass *
+                           (p_over_rho2_i * kernel.dwk_i * SphP_j->EntropyToInvGammaPred / SphP_i->EntropyToInvGammaPred +
+                            p_over_rho2_j * kernel.dwk_j * SphP_i->EntropyToInvGammaPred / SphP_j->EntropyToInvGammaPred) /
+                           kernel.r;
+
+              /* grad-h term */
+              hfc += P_j->Mass *
+                     (p_over_rho2_i * kernel.dwk_i * SphP_i->DhsmlDerivedDensityFactor +
+                      p_over_rho2_j * kernel.dwk_j * SphP_j->DhsmlDerivedDensityFactor) /
+                     kernel.r;
+
+              /* add viscous term */
+              hfc += hfc_visc;
+#endif
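+
+              /* Note: the pair force accumulated just below corresponds to the standard symmetrized SPH momentum equation,
+               *   dv_i/dt = - sum_j m_j [ f_i P_i/rho_i^2 dW(r,h_i)/dr + f_j P_j/rho_j^2 dW(r,h_j)/dr ] (r_i - r_j)/r + visc,
+               * with f the grad-h correction factor (DhsmlDensityFactor), while the PRESSURE_ENTROPY_SPH branch uses the
+               * analogous entropy-weighted factors of the pressure-entropy formulation (Hopkins 2013). */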
+
+              daccx += (-hfc * kernel.dx);
+              daccy += (-hfc * kernel.dy);
+              daccz += (-hfc * kernel.dz);
+              dentr += (0.5 * (hfc_visc)*kernel.vdotr2);
+            }
+        }
+    }
+
+  SphP_i->HydroAccel[0] += daccx;
+  SphP_i->HydroAccel[1] += daccy;
+  SphP_i->HydroAccel[2] += daccz;
+  SphP_i->DtEntropy += dentr;
+
+  if(SphP_i->MaxSignalVel < MaxSignalVel)
+    SphP_i->MaxSignalVel = MaxSignalVel;
+#endif
+}
+#endif
+
+/* this routine clears the fields in the SphP particle structure that are additively computed by the SPH hydro force loop
+ * by summing over neighbours
+ */
+inline void sph::clear_hydro_result(sph_particle_data *SphP)
+{
+  for(int k = 0; k < 3; k++)
+    SphP->HydroAccel[k] = 0;
+
+  SphP->DtEntropy    = 0;
+  SphP->MaxSignalVel = 0;
+}
diff --git a/src/sph/init_entropy.cc b/src/sph/init_entropy.cc
new file mode 100644
index 0000000000000000000000000000000000000000..231eeb4a5c2671650b116c1c4a90b3d50aeb5f8c
--- /dev/null
+++ b/src/sph/init_entropy.cc
@@ -0,0 +1,189 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  init_entropy.cc
+ *
+ *  \brief initialization code for the entropy variable of the particles
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../io/io.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../ngbtree/ngbtree.h"
+#include "../sort/peano.h"
+#include "../sph/kernel.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+
+#ifdef PRESSURE_ENTROPY_SPH
+
+#define MAX_ITER_ENTROPY 100
+#define ENTROPY_TOLERANCE 1.0e-5
+
+/*! \brief computes the SPH entropy from the internal energies for the pressure-entropy formulation of SPH
+ */
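+/* Note that in the pressure-entropy formulation the kernel-estimated (entropy-weighted) density itself depends
+ * on the entropy values of the neighbours, so the conversion from internal energies cannot be done in a single
+ * pass. The loop below therefore iterates: after every call to densities_determine(), the entropy is recomputed
+ * from the fixed internal energies stored in EntropyPred, until the relative entropy change of each particle
+ * drops below ENTROPY_TOLERANCE.
+ */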
+void sph::init_entropy(void)
+{
+  TIMER_STORE;
+  TIMER_START(CPU_DENSITY);
+
+  D->mpi_printf("SPH-INIT-ENTROPY: Begin entropy calculation.  (presently allocated=%g MB)\n", Mem.getAllocatedBytesInMB());
+
+  /* Create list of targets. We do this here to simplify the treatment later on */
+  int *targetList = (int *)Mem.mymalloc("targetlist", Tp->NumGas * sizeof(int));
+
+  int ndensities = 0;
+
+  for(int i = 0; i < Tp->TimeBinsHydro.NActiveParticles; i++)
+    {
+      int target = Tp->TimeBinsHydro.ActiveParticleList[i];
+      if(target < 0 || target >= Tp->NumGas)
+        Terminate("target=%d i=%d\n", target, i);
+      targetList[ndensities++] = target;
+    }
+
+  int iter = 0;
+
+  // let's grab at most about a third of the still available memory for imported points and nodes
+  int nspace = (0.33 * Mem.FreeBytes) / (sizeof(ngbnode) + 8 * sizeof(foreign_sphpoint_data));
+
+  MaxForeignNodes  = nspace;
+  MaxForeignPoints = 8 * nspace;
+  NumForeignNodes  = 0;
+  NumForeignPoints = 0;
+
+  sum_NumForeignNodes  = 0;
+  sum_NumForeignPoints = 0;
+
+  /* the following two arrays hold imported tree nodes and imported points to augment the local tree */
+  Foreign_Nodes  = (ngbnode *)Mem.mymalloc_movable(&Foreign_Nodes, "Foreign_Nodes", MaxForeignNodes * sizeof(ngbnode));
+  Foreign_Points = (foreign_sphpoint_data *)Mem.mymalloc_movable(&Foreign_Points, "Foreign_Points",
+                                                                 MaxForeignPoints * sizeof(foreign_sphpoint_data));
+
+  tree_initialize_leaf_node_access_info();
+
+  max_ncycles = 0;
+
+  prepare_shared_memory_access();
+
+  double tstart = Logs.second();
+
+  do
+    {
+      double t0 = Logs.second();
+
+      /* now do the primary work with this call */
+      densities_determine(ndensities, targetList);
+
+      /* do final operations on results */
+
+      double entropy_old;
+
+      int npleft = 0;
+
+      for(int i = 0; i < ndensities; i++)
+        {
+          int target = targetList[i];
+          if(target >= 0)
+            {
+              if(Tp->P[target].getType() != 0)
+                Terminate("P[target].getType() != 0");
+
+              sph_particle_data *SphP = Tp->SphP;
+
+              if(SphP[target].EntropyToInvGammaPred > 0 && SphP[target].Density > 0)
+                {
+                  entropy_old = SphP[target].Entropy;
+                  SphP[target].PressureSphDensity /= SphP[target].EntropyToInvGammaPred;
+                  SphP[target].Entropy =
+                      GAMMA_MINUS1 * SphP[target].EntropyPred / pow(SphP[target].PressureSphDensity * All.cf_a3inv, GAMMA_MINUS1);
+                  SphP[target].EntropyToInvGammaPred = pow(SphP[target].Entropy, 1.0 / GAMMA);
+                }
+              else
+                {
+                  entropy_old                        = SphP[target].Entropy;
+                  SphP[target].PressureSphDensity    = 0;
+                  SphP[target].Entropy               = 0;
+                  SphP[target].EntropyToInvGammaPred = 0;
+                }
+              /* entropy has not converged yet */
+              if(fabs(entropy_old - SphP[target].Entropy) > ENTROPY_TOLERANCE * entropy_old)
+                targetList[npleft++] = target;
+            }
+        }
+
+      ndensities = npleft;
+
+      double t1 = Logs.second();
+
+      if(npleft > 0)
+        {
+          iter++;
+
+          D->mpi_printf("SPH-INIT-ENTROPY: ngb iteration %4d: took %8.3f  , need to repeat for %012lld local particles.\n", iter,
+                        Logs.timediff(t0, t1), npleft);
+
+          if(iter > MAXITER)
+            Terminate("failed to converge in neighbour iteration in density()\n");
+        }
+      else
+        D->mpi_printf("SPH-INIT-ENTROPY: ngb iteration %4d: took %8.3f\n", ++iter, Logs.timediff(t0, t1));
+    }
+  while(ndensities > 0);
+
+  MPI_Allreduce(MPI_IN_PLACE, &max_ncycles, 1, MPI_INT, MPI_MAX, D->Communicator);
+
+  TIMER_STOP(CPU_DENSITY);
+
+  cleanup_shared_memory_access();
+
+  /* free temporary buffers */
+
+  Mem.myfree(Foreign_Points);
+  Mem.myfree(Foreign_Nodes);
+
+  Mem.myfree(targetList);
+
+  double tb = Logs.second();
+
+  D->mpi_printf("SPH-INIT-ENTROPY: entropy calculation is done. took: %8.3f\n", Logs.timediff(tstart, tb));
+}
+
+/*! \brief This function is used to find the initial entropy^invgamma for each SPH
+ *  particle in the pressure-entropy formulation of SPH in case the ICs
+ *  file contains internal energies.
+ */
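+/* The first guess below uses the standard relation between entropy and internal energy,
+ *   Entropy = (GAMMA-1) * u / (Density * a^-3)^(GAMMA-1),    EntropyToInvGammaPred = Entropy^(1/GAMMA),
+ * where the internal energy u is temporarily held in EntropyPred (see the comment inside the function). */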
+void sph::setup_entropy_to_invgamma(void)
+{
+  All.set_cosmo_factors_for_current_time();
+
+  /* Initialize entropy and entropy^invgamma with a first guess coming from the standard SPH density estimate. */
+  /* EntropyPred is untouched since it contains the internal energies needed for the iterative process; it    */
+  /* will be set in init.cc to the correct value.                                                              */
+  for(int i = 0; i < Tp->NumGas; i++)
+    {
+      Tp->SphP[i].Entropy = GAMMA_MINUS1 * Tp->SphP[i].EntropyPred / pow(Tp->SphP[i].Density * All.cf_a3inv, GAMMA_MINUS1);
+      Tp->SphP[i].EntropyToInvGammaPred = pow(Tp->SphP[i].Entropy, 1.0 / GAMMA);
+    }
+
+  init_entropy();
+}
+#endif
diff --git a/src/sph/kernel.h b/src/sph/kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..a6512c39fa849429b0a6b8c553e942175339ef3d
--- /dev/null
+++ b/src/sph/kernel.h
@@ -0,0 +1,358 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  kernel.h
+ *
+ *  \brief collects definitions of different SPH kernels
+ */
+
+#ifndef KERNEL_H
+#define KERNEL_H
+
+struct kernel_density
+{
+  double dpos[NUMDIMS];
+  double r;
+  double dv[NUMDIMS];
+  double wk, dwk;
+  double hinv, hinv3, hinv4;
+  double mj_wk, mj_dwk_r;
+};
+
+struct kernel_hydra
+{
+  double dx, dy, dz;
+  double r, vsig, sound_i, sound_j;
+  double dvx, dvy, dvz, vdotr2;
+  double wk_i, wk_j, dwk_i, dwk_j;
+  double h_i, h_j, dwk_ij, rho_ij_inv;
+};
+
+#if !defined(CUBIC_SPLINE_KERNEL) && !defined(WENDLAND_C2_KERNEL) && !defined(WENDLAND_C4_KERNEL) && !defined(WENDLAND_C6_KERNEL)
+#define CUBIC_SPLINE_KERNEL /* fall back to cubic spline kernel */
+#endif
+
+/* fall back to three dimensions */
+#if !defined(TWODIMS) && !defined(ONEDIMS)
+#define THREEDIMS
+#endif
+
+/* Norms */
+#ifdef CUBIC_SPLINE_KERNEL
+
+#ifdef THREEDIMS
+#define NORM (8.0 / M_PI) /*!< For 3D-normalized kernel */
+#endif
+
+#ifdef TWODIMS
+#define NORM (40.0 / (7.0 * M_PI)) /*!< For 2D-normalized kernel */
+#endif
+
+#ifdef ONEDIMS
+#define NORM (4.0 / 3.0) /*!< For 1D-normalized kernel */
+#endif
+
+#endif /* CUBIC_SPLINE_KERNEL */
+
+#ifdef WENDLAND_C2_KERNEL
+
+#ifdef THREEDIMS
+#define NORM (21.0 / (2.0 * M_PI)) /*!< For 3D-normalized kernel */
+#endif
+
+#ifdef TWODIMS
+#define NORM (7.0 / M_PI) /*!< For 2D-normalized kernel */
+#endif
+
+#ifdef ONEDIMS
+#define NORM (5.0 / 4.0) /*!< For 1D-normalized kernel */
+#endif
+
+#endif /* WENDLAND_C2_KERNEL */
+
+#ifdef WENDLAND_C4_KERNEL
+
+#ifdef THREEDIMS
+#define NORM (495.0 / (32.0 * M_PI)) /*!< For 3D-normalized kernel */
+#endif
+
+#ifdef TWODIMS
+#define NORM (9.0 / M_PI) /*!< For 2D-normalized kernel */
+#endif
+
+#ifdef ONEDIMS
+#define NORM (3.0 / 2.0) /*!< For 1D-normalized kernel */
+#endif
+
+#endif /* WENDLAND_C4_KERNEL */
+
+#ifdef WENDLAND_C6_KERNEL
+
+#ifdef THREEDIMS
+#define NORM (1365.0 / (64.0 * M_PI)) /*!< For 3D-normalized kernel */
+#endif
+
+#ifdef TWODIMS
+#define NORM (78.0 / (7.0 * M_PI)) /*!< For 2D-normalized kernel */
+#endif
+
+#ifdef ONEDIMS
+#define NORM (55.0 / 32.0) /*!< For 1D-normalized kernel */
+#endif
+
+#endif /* WENDLAND_C6_KERNEL */
+
+#define COMPUTE_WK -1
+#define COMPUTE_WK_AND_DWK 0
+#define COMPUTE_DWK 1
+
+static inline void kernel_hinv(double h, double *hinv, double *hinv3, double *hinv4)
+{
+  *hinv = 1.0 / h;
+
+#ifdef THREEDIMS
+  *hinv3 = *hinv * *hinv * *hinv;
+#endif
+
+#ifdef TWODIMS
+  *hinv3 = *hinv * *hinv;
+#endif
+
+#ifdef ONEDIMS
+  *hinv3 = *hinv;
+#endif
+
+  *hinv4 = *hinv3 * *hinv;
+}
+
+/* Attention: Here we assume that the kernel is only called
+   with u in the range 0..1, as done in the hydro and density loops!
+   Call with mode COMPUTE_WK_AND_DWK to calculate both wk and dwk,
+   call with mode COMPUTE_WK to calculate only wk,
+   and call with mode COMPUTE_DWK to calculate only dwk. */
+static inline void kernel_main(double u, double hinv3, double hinv4, double *wk, double *dwk, int mode)
+{
+#ifdef CUBIC_SPLINE_KERNEL
+#if defined(WENDLAND_C2_KERNEL) || defined(WENDLAND_C4_KERNEL) || defined(WENDLAND_C6_KERNEL)
+#error "Only one SPH kernel can be used"
+#endif
+  if(u < 0.5)
+    {
+      if(mode >= COMPUTE_WK_AND_DWK)
+        *dwk = u * (18.0 * u - 12.0);
+      if(mode <= COMPUTE_WK_AND_DWK)
+        *wk = (1.0 + 6.0 * (u - 1.0) * u * u);
+    }
+  else
+    {
+      double t1 = (1.0 - u);
+      double t2 = t1 * t1;
+      if(mode >= COMPUTE_WK_AND_DWK)
+        *dwk = -6.0 * t2;
+      if(mode <= COMPUTE_WK_AND_DWK)
+        *wk = 2.0 * t2 * t1;
+    }
+#endif
+
+#ifdef WENDLAND_C2_KERNEL /* Dehnen & Aly 2012 */
+#ifdef ONEDIMS
+  double t1 = (1.0 - u);
+  double t2 = (t1 * t1);
+
+  if(mode >= COMPUTE_WK_AND_DWK)
+    *dwk = -12.0 * u * t2;
+  if(mode <= COMPUTE_WK_AND_DWK)
+    *wk = t2 * t1 * (1.0 + u * 3.0);
+
+#else /* 2d or 3d */
+  double t1 = (1.0 - u);
+  double t2 = (t1 * t1);
+  double t4 = t2 * t2;
+  if(mode >= COMPUTE_WK_AND_DWK)
+    *dwk = -20.0 * u * t2 * t1;
+  if(mode <= COMPUTE_WK_AND_DWK)
+    *wk = t4 * (1.0 + u * 4.0);
+
+#endif
+#endif /* WENDLAND_C2_KERNEL */
+
+#ifdef WENDLAND_C4_KERNEL /* Dehnen & Aly 2012 */
+#ifdef ONEDIMS
+  double t1 = (1.0 - u);
+  double t2 = t1 * t1;
+  double t4 = t2 * t2;
+  double t5 = t4 * t1;
+
+  if(mode >= COMPUTE_WK_AND_DWK)
+    *dwk = -14.0 * t4 * (4.0 * u + 1) * u;
+  if(mode <= COMPUTE_WK_AND_DWK)
+    *wk = t5 * (1.0 + u * (5.0 + 8.0 * u));
+
+#else /* 2d or 3d */
+  double t1 = (1.0 - u);
+  double t2 = (t1 * t1);
+  double t4 = t2 * t2;
+  double t6 = t2 * t2 * t2;
+  if(mode >= COMPUTE_WK_AND_DWK)
+    *dwk = -56.0 / 3.0 * u * t4 * t1 * (5.0 * u + 1);
+  if(mode <= COMPUTE_WK_AND_DWK)
+    *wk = t6 * (1.0 + u * (6.0 + 35.0 / 3.0 * u));
+
+#endif
+#endif /* WENDLAND_C4_KERNEL */
+
+#ifdef WENDLAND_C6_KERNEL /* Dehnen & Aly 2012 */
+#ifdef ONEDIMS
+  double t1 = (1.0 - u);
+  double t2 = (t1 * t1);
+  double t4 = t2 * t2;
+  double t6 = t4 * t2;
+  double t7 = t4 * t2 * t1;
+  if(mode >= COMPUTE_WK_AND_DWK)
+    *dwk = -6.0 * u * t6 * (3.0 + u * (18.0 + 35.0 * u));
+  if(mode <= COMPUTE_WK_AND_DWK)
+    *wk = t7 * (1.0 + u * (7.0 + u * (19.0 + 21.0 * u)));
+
+#else /* 2d or 3d */
+  double t1 = (1.0 - u);
+  double t2 = (t1 * t1);
+  double t4 = t2 * t2;
+  double t7 = t4 * t2 * t1;
+  double t8 = t4 * t4;
+  if(mode >= COMPUTE_WK_AND_DWK)
+    *dwk = -22.0 * u * (1.0 + u * (7.0 + 16.0 * u)) * t7;
+  if(mode <= COMPUTE_WK_AND_DWK)
+    *wk = t8 * (1.0 + u * (8.0 + u * (25.0 + 32.0 * u)));
+
+#endif
+#endif /* WENDLAND_C6_KERNEL */
+  if(mode >= COMPUTE_WK_AND_DWK)
+    *dwk *= NORM * hinv4;
+  if(mode <= COMPUTE_WK_AND_DWK)
+    *wk *= NORM * hinv3;
+}
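+
+/* A minimal usage sketch (the variable names r, h, wk, dwk are placeholders for illustration): to evaluate the
+ * kernel and its derivative for a neighbour at distance r from a particle with smoothing length h, one would do
+ *
+ *   double hinv, hinv3, hinv4, wk, dwk;
+ *   kernel_hinv(h, &hinv, &hinv3, &hinv4);
+ *   double u = r * hinv;                       // requires r <= h, i.e. 0 <= u <= 1
+ *   kernel_main(u, hinv3, hinv4, &wk, &dwk, COMPUTE_WK_AND_DWK);
+ *
+ * which mirrors the calling pattern of the density and hydro force loops. */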
+
+#if defined(WENDLAND_BIAS_CORRECTION) && (!(defined(WENDLAND_C2_KERNEL) || defined(WENDLAND_C4_KERNEL) || defined(WENDLAND_C6_KERNEL)))
+#error "WENDLAND_BIAS_CORRECTION only works with a Wendland kernel"
+#endif
+
+#if defined(WENDLAND_BIAS_CORRECTION) && (defined(WENDLAND_C2_KERNEL) || defined(WENDLAND_C4_KERNEL) || defined(WENDLAND_C6_KERNEL))
+
+#if defined(ONEDIMS) || defined(TWODIMS)
+#error "WENDLAND_BIAS_CORRECTION is only implemented for 3D"
+#endif
+
+static inline void get_bias_correction_parameters(double *alpha, double *eps100)
+{
+#ifdef WENDLAND_C2_KERNEL
+  *eps100 = 0.0294;
+  *alpha  = 0.977;
+#endif
+#ifdef WENDLAND_C4_KERNEL
+  *eps100 = 0.01342;
+  *alpha  = 1.579;
+#endif
+#ifdef WENDLAND_C6_KERNEL
+  *eps100 = 0.0116;
+  *alpha  = 2.236;
+#endif
+}
+static inline double get_density_bias(double hsml, double mass, int DesNumNgb)
+{
+  kernel_density kernel;
+  kernel_hinv(hsml, &kernel.hinv, &kernel.hinv3, &kernel.hinv4);
+  kernel_main(0, kernel.hinv3, kernel.hinv4, &kernel.wk, &kernel.dwk, COMPUTE_WK);
+  double alpha  = 0;
+  double eps100 = 0;
+  get_bias_correction_parameters(&alpha, &eps100);
+  double wc_correction = eps100 * pow(DesNumNgb * 0.01, -alpha) * mass * kernel.wk;
+  return wc_correction;
+}
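+
+/* Usage sketch (hypothetical names): the returned correction is meant to be subtracted from the kernel-summed
+ * density estimate to remove the positive self-contribution bias of the Wendland kernels, following the fits of
+ * Dehnen & Aly (2012), e.g.
+ *
+ *   rho_i -= get_density_bias(hsml_i, mass_i, desired_number_of_neighbours);
+ */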
+#endif
+
+#ifdef EXPLICIT_VECTORIZATION
+static inline void kernel_main_vector(Vec4d u, Vec4d hinv3, Vec4d hinv4, Vec4d *wk, Vec4d *dwk)
+{
+#ifdef CUBIC_SPLINE_KERNEL
+  Vec4d ucompl    = u - 1.0;
+  Vec4db decision = (u < 0.5);
+  Vec4d wksub     = 6.0 * u;
+
+  Vec4d ucompsq = ucompl * ucompl;
+  Vec4d uucomp  = u * ucompl;
+  Vec4d wk1     = 1.0 + uucomp * wksub;
+  Vec4d dwk1    = wksub + uucomp * 18.0;
+  Vec4d wk2     = ucompsq * -2.0 * ucompl;
+  Vec4d dwk2    = ucompsq * -6.0;
+  *wk           = select(decision, wk1, wk2);
+  *dwk          = select(decision, dwk1, dwk2);
+#endif
+
+#ifdef WENDLAND_C2_KERNEL /* Dehnen & Aly 2012 */
+#ifdef ONEDIMS
+  Vec4d t1 = 1.0 - u;
+
+  Vec4d t2 = t1 * t1;
+
+  *dwk = -12.0 * u * t2;
+  *wk  = t2 * t1 * (1.0 + u * 3.0);
+
+#else /* 2d or 3d */
+  Vec4d t1 = (1.0 - u);
+  Vec4d t2 = (t1 * t1);
+
+  *dwk     = -20.0 * u * t2 * t1;
+  *wk      = t2 * t2 * (1.0 + u * 4.0);
+#endif
+#endif /* WENDLAND_C2_KERNEL */
+
+#ifdef WENDLAND_C4_KERNEL /* Dehnen & Aly 2012 */
+#ifdef ONEDIMS
+  Vec4d t1 = (1.0 - u);
+  Vec4d t2 = t1 * t1;
+  Vec4d t4 = t2 * t2;
+
+  *dwk = -14.0 * t4 * (4.0 * u + 1) * u;
+  *wk  = t1 * t4 * (1.0 + u * (5.0 + 8.0 * u));
+
+#else /* 2d or 3d */
+  Vec4d t1 = (1.0 - u);
+  Vec4d t2 = (t1 * t1);
+  Vec4d t4 = t2 * t2;
+  Vec4d t6 = t2 * t2 * t2;
+  *dwk     = -56.0 / 3.0 * u * t4 * t1 * (5.0 * u + 1);
+
+  *wk      = t6 * (1.0 + u * (6.0 + 35.0 / 3.0 * u));
+
+#endif
+#endif /* WENDLAND_C4_KERNEL */
+
+#ifdef WENDLAND_C6_KERNEL /* Dehnen & Aly 2012 */
+#ifdef ONEDIMS
+  Vec4d t1 = (1.0 - u);
+  Vec4d t2 = (t1 * t1);
+  Vec4d t4 = t2 * t2;
+  Vec4d t6 = t4 * t2;
+  Vec4d t7 = t4 * t2 * t1;
+  *dwk     = -6.0 * u * t6 * (3.0 + u * (18.0 + 35.0 * u));
+  *wk      = t7 * (1.0 + u * (7.0 + u * (19.0 + 21.0 * u)));
+
+#else /* 2d or 3d */
+  Vec4d t1 = (1.0 - u);
+  Vec4d t2 = (t1 * t1);
+  Vec4d t4 = t2 * t2;
+  Vec4d t7 = t4 * t2 * t1;
+  Vec4d t8 = t4 * t4;
+  *dwk     = -22.0 * u * (1.0 + u * (7.0 + 16.0 * u)) * t7;
+  *wk      = t8 * (1.0 + u * (8.0 + u * (25.0 + 32.0 * u)));
+
+#endif
+#endif /* WENDLAND_C6_KERNEL */
+  *dwk *= hinv4;
+  *wk *= hinv3;
+}
+#endif /* EXPLICIT_VECTORIZATION */
+#endif
diff --git a/src/sph/sph.h b/src/sph/sph.h
new file mode 100644
index 0000000000000000000000000000000000000000..bf6a6ed314382874ff9783752ed4c2f2f0d783f4
--- /dev/null
+++ b/src/sph/sph.h
@@ -0,0 +1,130 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  sph.h
+ *
+ *  \brief defines a class for the SPH computations
+ */
+
+#ifndef SPH_H
+#define SPH_H
+
+#include "../mpi_utils/shared_mem_handler.h"
+#include "../ngbtree/ngbtree.h"
+
+#define MAX_NGBS 100000
+
+class sph : public ngbtree
+{
+ public:
+  void compute_densities(void);
+  void density(int *targetlist, int ntarget);
+  void hydro_forces_determine(int ntarget, int *targetlist);
+  void tree_based_timesteps(void);
+
+#ifdef PRESSURE_ENTROPY_SPH
+  void setup_entropy_to_invgamma(void);
+#endif
+
+  double fac_mu;
+
+ private:
+  int max_ncycles;
+
+  double fac_vsic_fix;
+
+  double MaxBoxDist;
+
+#ifdef PRESERVE_SHMEM_BINARY_INVARIANCE
+  bool skip_actual_force_computation;
+#endif
+
+  struct pinfo
+  {
+    int target;
+    int numngb;
+
+    MyIntPosType *searchcenter;
+    MyIntPosType search_min[3], search_range[3];
+    MyIntPosType inthsml;
+    MyNgbTreeFloat hsml;
+    MyNgbTreeFloat hsml2;
+  };
+
+  inline void get_pinfo(int i, pinfo &pdat)
+  {
+    pdat.target = i;
+
+    pdat.searchcenter = Tp->P[i].IntPos;
+    pdat.hsml         = Tp->SphP[i].Hsml;
+    pdat.hsml2        = pdat.hsml * pdat.hsml;
+    pdat.inthsml      = pdat.hsml * Tp->FacCoordToInt;
+
+    for(int i = 0; i < 3; i++)
+      {
+        pdat.search_min[i]   = pdat.searchcenter[i] - pdat.inthsml;
+        pdat.search_range[i] = pdat.inthsml + pdat.inthsml;
+      }
+
+    pdat.numngb = 0;
+  }
+
+  struct ngbdata_density
+  {
+    MyIntPosType *IntPos;
+    MyFloat *VelPred;
+    MyDouble Mass;
+#ifdef PRESSURE_ENTROPY_SPH
+    MyDouble EntropyToInvGammaPred;
+#endif
+#ifdef TIMEDEP_ART_VISC
+    MyDouble Csnd;
+#endif
+  };
+
+  ngbdata_density *Ngbdensdat;
+
+  struct ngbdata_hydro
+  {
+    MyIntPosType *IntPos;
+    sph_particle_data_hydrocore *SphCore;
+
+    MyDouble Mass;
+    signed char TimeBinHydro;
+  };
+
+  ngbdata_hydro *Ngbhydrodat;
+
+  inline foreign_sphpoint_data *get_foreignpointsp(int n, unsigned char shmrank)
+  {
+    return (foreign_sphpoint_data *)((char *)TreeSharedMemBaseAddr[shmrank] + TreeForeign_Points_offsets[shmrank]) + n;
+  }
+  void densities_determine(int ntarget, int *targetlist);
+  void density_evaluate_kernel(pinfo &pdat);
+  void sph_density_interact(pinfo &pdat, int no, char no_type, unsigned char shmrank, int mintopleafnode, int committed);
+  inline void sph_density_open_node(pinfo &pdat, ngbnode *nop, int mintopleafnode, int committed);
+  inline int sph_density_evaluate_particle_node_opening_criterion(pinfo &pdat, ngbnode *nop);
+  inline void sph_density_check_particle_particle_interaction(pinfo &pdat, int p, int p_type, unsigned char shmrank);
+  inline void clear_density_result(sph_particle_data *SphP);
+
+  void hydro_evaluate_kernel(pinfo &pdat);
+  inline void sph_hydro_interact(pinfo &pdat, int no, char no_type, unsigned char shmrank, int mintopleafnode, int committed);
+  inline void sph_hydro_open_node(pinfo &pdat, ngbnode *nop, int mintopleafnode, int committed);
+  inline int sph_hydro_evaluate_particle_node_opening_criterion(pinfo &pdat, ngbnode *nop);
+  inline void sph_hydro_check_particle_particle_interaction(pinfo &pdat, int p, int p_type, unsigned char shmrank);
+  inline void clear_hydro_result(sph_particle_data *SphP);
+
+  inline void sph_treetimestep_interact(pinfo &pdat, int no, char no_type, unsigned char shmrank, int mintopleafnode, int committed);
+  inline void sph_treetimestep_open_node(pinfo &pdat, ngbnode *nop, int mintopleafnode, int committed);
+  inline int sph_treetimestep_evaluate_particle_node_opening_criterion(pinfo &pdat, ngbnode *nop);
+  inline void sph_treetimestep_check_particle_particle_interaction(pinfo &pdat, int p, int p_type, unsigned char shmrank);
+
+#ifdef PRESSURE_ENTROPY_SPH
+  void init_entropy(void);
+#endif
+};
+
+#endif
diff --git a/src/subfind/subfind.cc b/src/subfind/subfind.cc
new file mode 100644
index 0000000000000000000000000000000000000000..be21714bde8f4b0ffb11c4d05473767f04bb7ec6
--- /dev/null
+++ b/src/subfind/subfind.cc
@@ -0,0 +1,926 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  subfind.cc
+ *
+ *  \brief main routines for carrying out the SUBFIND or SUBFIND_HBT algorithms on a set of FOF groups
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef SUBFIND
+
+#include <mpi.h>
+#include <unistd.h>
+#include <algorithm>
+#include <climits>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../fof/fof.h"
+#include "../fof/fof_io.h"
+#include "../gravtree/gravtree.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mergertree/mergertree.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/cxxsort.h"
+#include "../sort/parallel_sort.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+
+/* main routine of SUBFIND algorithm, for output number 'num'.
+ */
+template <typename partset>
+void fof<partset>::subfind_find_subhalos(int num, const char *basename, const char *grpcat_dirbasename)
+{
+  TIMER_START(CPU_SUBFIND);
+
+  double tstart = Logs.second();
+
+  mpi_printf("\nSUBFIND: We now execute a parallel version of SUBFIND.\n");
+
+#ifdef MERGERTREE
+  double lensum = 0, partcount = 0;
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+      if(Tp->P[i].PrevSizeOfSubhalo.get() > 0)
+        partcount += 1;
+      lensum += Tp->P[i].PrevSizeOfSubhalo.get();
+      Tp->PS[i].SizeOfSubhalo.set(0);  // initialize
+    }
+  MPI_Allreduce(MPI_IN_PLACE, &partcount, 1, MPI_DOUBLE, MPI_SUM, Communicator);
+  MPI_Allreduce(MPI_IN_PLACE, &lensum, 1, MPI_DOUBLE, MPI_SUM, Communicator);
+  mpi_printf("SUBFIND: Previous subhalo catalogue had approximately a size %g, and the summed squared subhalo size was %g\n",
+             partcount, lensum);
+#endif
+
+#ifndef SUBFIND_HBT
+
+  /* let's determine the total matter density around each particle/cell. For each point,
+   * we determine a smoothing radius based on a certain number of dark matter particles
+   * from the primary link type.
+   */
+  FoFGravTree.treeallocate(Tp->NumPart, Tp, FoFDomain);
+
+  TIMER_STOP(CPU_SUBFIND);
+  FoFGravTree.treebuild(Tp->NumPart, NULL);
+  TIMER_START(CPU_SUBFIND);
+
+  subfind_density_hsml_guess();
+
+  /* find smoothing lengths for primary and secondary points in groups (counting primary particles only for setting the smoothing
+   * length) and then estimate total matter density around them
+   */
+  double cputime = subfind_density();
+
+  mpi_printf("SUBFIND: iteration to correct primary neighbor count and density estimate took %g sec\n", cputime);
+
+  FoFGravTree.treefree();
+#endif
+
+  All.set_cosmo_factors_for_current_time();
+
+  /* fill internal energy into auxiliary PS[] array */
+#ifndef LEAN
+  for(int i = 0; i < Tp->NumPart; i++)
+    if(Tp->P[i].getType() == 0)
+      Tp->PS[i].Utherm = Tp->get_utherm_from_entropy(i);
+    else
+      Tp->PS[i].Utherm = 0;
+#endif
+
+  double GroupSizeThresholdFactor = 0.55;
+  int ncount = 0, nprocs = 0;
+  long long seriallen = 0;
+  long long sum_seriallen;
+
+  /* Let's set a fiducial size limit for the maximum group size before we select the collective subfind algorithm */
+  do
+    {
+      GroupSizeThresholdFactor += 0.05;
+      ncount = nprocs = seriallen = 0;
+
+      MaxSerialGroupLen = (int)(GroupSizeThresholdFactor * Tp->TotNumPart / NTask);
+
+      /* count how many groups are above this limit, and how many processors we need for them */
+      for(int i = 0; i < Ngroups; i++)
+        if(Group[i].Len > MaxSerialGroupLen)
+          {
+            ncount++;
+            nprocs += ((Group[i].Len - 1) / MaxSerialGroupLen) + 1;
+          }
+        else
+          seriallen += Group[i].Len;
+
+      MPI_Allreduce(&ncount, &Ncollective, 1, MPI_INT, MPI_SUM, Communicator);
+      MPI_Allreduce(&nprocs, &NprocsCollective, 1, MPI_INT, MPI_SUM, Communicator);
+      sumup_longs(1, &seriallen, &sum_seriallen, Communicator);
+    }
+  while(NprocsCollective + ((sum_seriallen > 0) ? 1 : 0) > NTask);
+
+  mpi_printf("SUBFIND: Number of FOF halos treated with collective SubFind algorithm = %d\n", Ncollective);
+  mpi_printf("SUBFIND: Number of processors used in different partitions for the collective SubFind code = %d\n", NprocsCollective);
+  mpi_printf("SUBFIND: (The adopted size-limit for the collective algorithm was %d particles, for threshold size factor %g)\n",
+             MaxSerialGroupLen, GroupSizeThresholdFactor);
+  mpi_printf("SUBFIND: The other %lld FOF halos are treated in parallel with serial code\n", TotNgroups - Ncollective);
+
+  /* set up a global table that informs about the processor assignment of the groups that are treated collectively */
+  ProcAssign = (proc_assign_data *)Mem.mymalloc_movable(&ProcAssign, "ProcAssign", Ncollective * sizeof(proc_assign_data));
+  proc_assign_data *locProcAssign = (proc_assign_data *)Mem.mymalloc("locProcAssign", ncount * sizeof(proc_assign_data));
+
+  ncount = 0;
+  for(int i = 0; i < Ngroups; i++)
+    if(Group[i].Len > MaxSerialGroupLen)
+      {
+        locProcAssign[ncount].GroupNr = Group[i].GroupNr;
+        locProcAssign[ncount].Len     = Group[i].Len;
+        ncount++;
+      }
+
+  /* gather the information on the collective groups across all CPUs */
+  int *recvcounts = (int *)Mem.mymalloc("recvcounts", sizeof(int) * NTask);
+  int *bytecounts = (int *)Mem.mymalloc("bytecounts", sizeof(int) * NTask);
+  int *byteoffset = (int *)Mem.mymalloc("byteoffset", sizeof(int) * NTask);
+
+  MPI_Allgather(&ncount, 1, MPI_INT, recvcounts, 1, MPI_INT, Communicator);
+
+  for(int task = 0; task < NTask; task++)
+    bytecounts[task] = recvcounts[task] * sizeof(proc_assign_data);
+
+  byteoffset[0] = 0;
+  for(int task = 1; task < NTask; task++)
+    byteoffset[task] = byteoffset[task - 1] + bytecounts[task - 1];
+
+  MPI_Allgatherv(locProcAssign, bytecounts[ThisTask], MPI_BYTE, ProcAssign, bytecounts, byteoffset, MPI_BYTE, Communicator);
+
+  Mem.myfree(byteoffset);
+  Mem.myfree(bytecounts);
+  Mem.myfree(recvcounts);
+  Mem.myfree(locProcAssign);
+
+  /* make sure the table is sorted in ascending group-number order */
+  mycxxsort(ProcAssign, ProcAssign + Ncollective, subfind_compare_procassign_GroupNr);
+
+  /* assign the processor sets for the collective groups and set disjoint color-flag to later split the processors into different
+   * communicators */
+  nprocs         = 0;
+  CommSplitColor = ThisTask; /* by default, this places every processor in its own processor group */
+
+  /* now we assign the same color for groups of CPUs that each do one halo collectively */
+  for(int i = 0; i < Ncollective; i++)
+    {
+      ProcAssign[i].FirstTask = nprocs;
+      ProcAssign[i].NTask     = ((ProcAssign[i].Len - 1) / MaxSerialGroupLen) + 1;
+      nprocs += ProcAssign[i].NTask;
+
+      if(ThisTask >= ProcAssign[i].FirstTask && ThisTask < (ProcAssign[i].FirstTask + ProcAssign[i].NTask))
+        CommSplitColor = i;
+    }
+
+  /* Now assign a target task for each local group. For collective groups, the target task is the master in the CPU set, whereas
+   * the serial ones are distributed in a round-robin fashion to the remaining CPUs.
+   */
+  for(int i = 0; i < Ngroups; i++)
+    {
+      if(Group[i].Len > MaxSerialGroupLen) /* we have a collective group */
+        {
+          if(Group[i].GroupNr >= Ncollective || Group[i].GroupNr < 0)
+            Terminate("odd");
+          Group[i].TargetTask = ProcAssign[Group[i].GroupNr].FirstTask;
+        }
+      else
+        Group[i].TargetTask = ((Group[i].GroupNr - Ncollective) % (NTask - NprocsCollective)) + NprocsCollective;
+    }
+
+  /* distribute the groups */
+  subfind_distribute_groups();
+
+  /* sort the local groups by group number */
+  mycxxsort(Group, Group + Ngroups, fof_compare_Group_GroupNr);
+
+  /* assign target CPUs for the particles in groups */
+  /* the particles not in groups will be distributed such that a close to uniform particle load results */
+  double t0           = Logs.second();
+  int *count_loc_task = (int *)Mem.mymalloc_clear("count_loc_task", NTask * sizeof(int));
+  int *count_task     = (int *)Mem.mymalloc("count_task", NTask * sizeof(int));
+  int *count_free     = (int *)Mem.mymalloc("count_free", NTask * sizeof(int));
+  int count_loc_free  = 0;
+
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+      if(Tp->PS[i].GroupNr.get() < HALONR_MAX) /* particle is in a group */
+        {
+          if(Tp->PS[i].GroupNr.get() < (MyIDType)Ncollective) /* we are in a collectively treated group */
+            Tp->PS[i].TargetTask = ProcAssign[Tp->PS[i].GroupNr.get()].FirstTask + (i % ProcAssign[Tp->PS[i].GroupNr.get()].NTask);
+          else /* otherwise the whole group will be treated by a single CPU */
+            Tp->PS[i].TargetTask = ((Tp->PS[i].GroupNr.get() - Ncollective) % (NTask - NprocsCollective)) + NprocsCollective;
+
+          count_loc_task[Tp->PS[i].TargetTask]++;
+        }
+      else
+        {
+          Tp->PS[i].TargetTask = ThisTask;  // default is that we stay
+          count_loc_free++;
+        }
+
+      Tp->PS[i].TargetIndex = 0; /* unimportant here */
+    }
+
+  /* get a list of how many unassigned and thus in principle movable particles every processor has */
+  MPI_Allgather(&count_loc_free, 1, MPI_INT, count_free, 1, MPI_INT, Communicator);
+
+  /* determine how many particles from groups will end up on each processor */
+  MPI_Allreduce(count_loc_task, count_task, NTask, MPI_INT, MPI_SUM, Communicator);
+
+  /* get the total particle count */
+  long long sum = 0;
+  for(int i = 0; i < NTask; i++)
+    sum += count_task[i] + count_free[i];
+
+  /* find out the average maximum load, and what the local head-room to it would be */
+  int maxload = (sum + NTask - 1) / NTask;
+  for(int i = 0; i < NTask; i++)
+    {
+      count_task[i] = maxload - count_task[i]; /* this is the amount that can fit on this task */
+      if(count_task[i] < 0)
+        count_task[i] = 0;
+    }
+
+  /* let's see how many particles can stay without degrading the load balance */
+  for(int i = 0; i < NTask; i++)
+    {
+      if(count_free[i] <= count_task[i])
+        {
+          count_task[i] -= count_free[i];
+          count_free[i] = 0;
+        }
+      else
+        {
+          count_free[i] -= count_task[i];
+          count_task[i] = 0;
+        }
+    }
+
+  /* now we determine the optimum target task for the subset of local particles that should be moved to another rank for better load
+   * balance */
+  int current_task = 0;
+  for(int i = 0; i < ThisTask; i++)
+    {
+      while(count_free[i] > 0 && current_task < NTask)
+        {
+          if(count_free[i] < count_task[current_task])
+            {
+              count_task[current_task] -= count_free[i];
+              count_free[i] = 0;
+            }
+          else
+            {
+              count_free[i] -= count_task[current_task];
+              count_task[current_task] = 0;
+              current_task++;
+            }
+        }
+    }
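+
+  /* at this point, count_task[] holds the spare capacity left over after all lower-ranked tasks have (notionally)
+   * placed their movable particles, and current_task points to the first task that still has room; the loop below
+   * fills this remaining capacity with our own unassigned particles */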
+
+  /* move a subset of particles such that close to uniform load is achieved */
+  for(int i = 0; i < Tp->NumPart && count_free[ThisTask] > 0; i++)
+    {
+      if(Tp->PS[i].GroupNr.get() == HALONR_MAX)
+        {
+          /* particle not in a group. Could in principle stay but we move it such that a good load balance is obtained. */
+          while(count_task[current_task] == 0 && current_task < NTask - 1)
+            current_task++;
+
+          Tp->PS[i].TargetTask = current_task;
+          count_task[current_task]--;
+          count_free[ThisTask]--;
+        }
+    }
+
+  Mem.myfree(count_free);
+  Mem.myfree(count_task);
+  Mem.myfree(count_loc_task);
+
+  /* report current balance */
+  double balance = subfind_get_particle_balance();
+  mpi_printf("SUBFIND: particle balance=%g\n", balance);
+
+  /* distribute particles such that groups are completely on the CPU(s) that do the corresponding group(s) */
+  FoFDomain->particle_exchange_based_on_PS(Communicator);
+  double t1 = Logs.second();
+  mpi_printf("SUBFIND: subfind_exchange() took %g sec\n", Logs.timediff(t0, t1));
+
+  /* report balance that has been achieved */
+  balance = subfind_get_particle_balance();
+  mpi_printf("SUBFIND: particle balance for processing=%g\n", balance);
+
+  /* let's estimate the maximum number of substructures we may find and need to store on the local CPU */
+  if(ThisTask < NprocsCollective)
+    {
+      MaxNsubhalos = (ProcAssign[CommSplitColor].Len / ProcAssign[CommSplitColor].NTask) / All.DesLinkNgb;
+    }
+  else
+    {
+      long long nlocid = 0;
+      for(int i = 0; i < Ngroups; i++)
+        nlocid += Group[i].Len;
+
+      MaxNsubhalos = nlocid / All.DesLinkNgb; /* should be a quite conservative upper limit */
+    }
+
+  Mem.myfree(ProcAssign);
+
+  // some log-variables
+  count_decisions           = 0;
+  count_different_decisions = 0;
+
+  /* allocate storage space for locally found subhalos */
+  Nsubhalos = 0;
+  Subhalo   = (subhalo_properties *)Mem.mymalloc_movable(&Subhalo, "Subhalo", MaxNsubhalos * sizeof(subhalo_properties));
+
+  /* we can now split the communicator to give each collectively treated group its own processor set */
+  MPI_Comm_split(Communicator, CommSplitColor, ThisTask, &SubComm);
+  MPI_Comm_size(SubComm, &SubNTask);
+  MPI_Comm_rank(SubComm, &SubThisTask);
+
+  double ti0 = Logs.second();
+  {
+    domain<partset> SubDomain(SubComm, Tp);
+
+    if(SubDomain.NumNodes != 0)
+      Terminate("SubDomain.NumNodes=%d\n", SubDomain.NumNodes);
+
+    if(CommSplitColor < Ncollective)
+      {
+        /* for the collective algorithm, we now do a further reshuffling to distribute the particles
+         * in density-order to the processors. This is only a performance optimization for the distributed
+         * link-lists used later on, reducing the number of MPI messages that need to be sent.
+         */
+
+        /* allocated storage for an auxiliary array needed for sorting */
+        sort_as_data *as = (sort_as_data *)Mem.mymalloc_movable(&as, "as", Tp->NumPart * sizeof(sort_as_data));
+
+        for(int i = 0; i < Tp->NumPart; i++)
+          {
+            as[i].density = Tp->PS[i].u.s.u.DM_Density;
+            as[i].origin  = (((long long)SubThisTask) << 32) + i;
+          }
+
+        /* sort according to density  */
+        mycxxsort_parallel(as, as + Tp->NumPart, subfind_compare_as_density, SubComm);
+
+        for(int i = 0; i < Tp->NumPart; i++)
+          as[i].targettask = SubThisTask;
+
+        /* revert to original order */
+        mycxxsort_parallel(as, as + Tp->NumPart, subfind_compare_as_origin, SubComm);
+
+        for(int i = 0; i < Tp->NumPart; i++)
+          Tp->PS[i].TargetTask = as[i].targettask;
+
+        Mem.myfree(as);
+
+        SubDomain.particle_exchange_based_on_PS(SubComm);
+
+        if(SubDomain.NumNodes != 0)
+          Terminate("SubDomain.NumNodes=%d\n", SubDomain.NumNodes);
+
+        /* now call the routine that does the actual processing of the groups */
+        subfind_processing(&SubDomain, COLL_SUBFIND); /* we are one of the CPUs that does a collective group */
+      }
+    else
+      subfind_processing(&SubDomain, SERIAL_SUBFIND); /* we have several groups in full to be done by the local CPU */
+  }
+  double ti1 = Logs.second();
+  mpi_printf("SUBFIND: Processing overall took  (total time=%g sec)\n", Logs.timediff(ti0, ti1));
+
+  /* free the communicator again */
+  MPI_Comm_free(&SubComm);
+
+#ifndef SUBFIND_HBT
+  /* report some statistics */
+  MPI_Allreduce(MPI_IN_PLACE, &count_decisions, 1, MPI_LONG_LONG, MPI_SUM, Communicator);
+  MPI_Allreduce(MPI_IN_PLACE, &count_different_decisions, 1, MPI_LONG_LONG, MPI_SUM, Communicator);
+  mpi_printf("SUBFIND: %lld out of %lld decisions, fraction %g, where influenced by previous subhalo size\n",
+             count_different_decisions, count_decisions, ((double)count_different_decisions) / count_decisions);
+
+#endif
+
+  /* reestablish consistent global values for Sp.MaxPart/MaxPartSph in case they have diverged
+   * in the subcommunicators
+   */
+  int max_load, max_loadsph;
+  MPI_Allreduce(&Tp->MaxPart, &max_load, 1, MPI_INT, MPI_MAX, Communicator);
+  MPI_Allreduce(&Tp->MaxPartSph, &max_loadsph, 1, MPI_INT, MPI_MAX, Communicator);
+
+  /* do resize */
+  Tp->reallocate_memory_maxpart(max_load);
+  Tp->PS = (subfind_data *)Mem.myrealloc_movable(Tp->PS, Tp->MaxPart * sizeof(subfind_data));
+
+  Tp->reallocate_memory_maxpartsph(max_loadsph);
+
+  /* distribute particles back to original CPU */
+  t0 = Logs.second();
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+      Tp->PS[i].TargetTask  = Tp->PS[i].OriginTask;
+      Tp->PS[i].TargetIndex = Tp->PS[i].OriginIndex;
+    }
+
+  FoFDomain->particle_exchange_based_on_PS(Communicator);
+  t1 = Logs.second();
+  if(ThisTask == 0)
+    printf("SUBFIND: subfind_exchange() (for return to original CPU)  took %g sec\n", Logs.timediff(t0, t1));
+
+  /* Now do the spherical overdensity calculations around group centers.
+   * For this, we need a search tree with all particles again.
+   */
+
+  FoFGravTree.treeallocate(Tp->NumPart, Tp, FoFDomain);
+
+  TIMER_STOP(CPU_SUBFIND);
+  FoFGravTree.treebuild(Tp->NumPart, NULL);
+  TIMER_START(CPU_SUBFIND);
+
+  /* compute spherical overdensities for FOF groups */
+  double cputimeso = subfind_overdensity();
+  mpi_printf("SUBFIND: determining spherical overdensity masses took %g sec\n", cputimeso);
+
+  FoFGravTree.treefree();
+
+  /* sort the groups according to group/subhalo-number */
+  t0 = Logs.second();
+  mycxxsort_parallel(Group, Group + Ngroups, fof_compare_Group_GroupNr, Communicator);
+  mycxxsort_parallel(Subhalo, Subhalo + Nsubhalos, subfind_compare_Subhalo_GroupNr_SubRankInGr, Communicator);
+  t1 = Logs.second();
+  mpi_printf("SUBFIND: assembled and ordered groups and subhalos (took %g sec)\n", Logs.timediff(t0, t1));
+
+  sumup_large_ints(1, &Nsubhalos, &TotNsubhalos, Communicator);
+
+  /* determine largest subhalo and total particle/cell count in substructures */
+  long long lenmax = 0, glob_lenmax;
+  long long totlen = 0;
+  long long totsublength;
+  for(int i = 0; i < Nsubhalos; i++)
+    {
+      totlen += Subhalo[i].Len;
+
+      if(Subhalo[i].Len > lenmax)
+        lenmax = Subhalo[i].Len;
+    }
+  sumup_longs(1, &totlen, &totsublength, Communicator);
+  MPI_Reduce(&lenmax, &glob_lenmax, 1, MPI_LONG_LONG, MPI_MAX, 0, Communicator);
+
+  /* set binding energy of unbound particles to zero (was overwritten with Hsml before) */
+  for(int i = 0; i < Tp->NumPart; i++)
+    if(Tp->PS[i].SubRankInGr == INT_MAX)
+      Tp->PS[i].v.DM_BindingEnergy = 0;
+
+  TIMER_START(CPU_SNAPSHOT);
+
+  /* now do the output of the subhalo catalogue */
+  subfind_save_final(num, basename, grpcat_dirbasename);
+
+  TIMER_STOP(CPU_SNAPSHOT);
+
+  double tend = Logs.second();
+
+  if(ThisTask == 0)
+    {
+      printf("SUBFIND: Finished with SUBFIND.  (total time=%g sec)\n", Logs.timediff(tstart, tend));
+      printf("SUBFIND: Total number of subhalos with at least %d particles: %lld\n", All.DesLinkNgb, TotNsubhalos);
+      if(TotNsubhalos > 0)
+        {
+          printf("SUBFIND: Largest subhalo has %lld particles/cells.\n", glob_lenmax);
+          printf("SUBFIND: Total number of particles/cells in subhalos: %lld\n", totsublength);
+        }
+    }
+
+  Mem.myfree_movable(Subhalo);
+
+  TIMER_STOP(CPU_SUBFIND);
+}
+
+template <typename partset>
+void fof<partset>::subfind_save_final(int num, const char *basename, const char *grpcat_dirbasename)
+{
+  double t0 = Logs.second();
+
+  long long totsubs = 0;
+
+  /* fill in the FirstSub-values */
+  for(int i = 0; i < Ngroups; i++)
+    {
+      if(i > 0)
+        Group[i].FirstSub = Group[i - 1].FirstSub + Group[i - 1].Nsubs;
+      else
+        Group[i].FirstSub = 0;
+      totsubs += Group[i].Nsubs;
+    }
+
+  long long *Send_count  = (long long *)Mem.mymalloc("Send_count", sizeof(long long) * this->NTask);
+  long long *Send_offset = (long long *)Mem.mymalloc("Send_offset", sizeof(long long) * this->NTask);
+
+  MPI_Allgather(&totsubs, 1, MPI_LONG_LONG, Send_count, 1, MPI_LONG_LONG, Communicator);
+
+  Send_offset[0] = 0;
+
+  for(int i = 1; i < NTask; i++)
+    Send_offset[i] = Send_offset[i - 1] + Send_count[i - 1];
+
+  for(int i = 0; i < Ngroups; i++)
+    {
+      if(Group[i].Nsubs > 0)
+        Group[i].FirstSub += Send_offset[ThisTask];
+      else
+        Group[i].FirstSub = -1;
+    }
+
+  Mem.myfree(Send_offset);
+  Mem.myfree(Send_count);
+
+  subfind_assign_subhalo_offsettype();
+
+  fof_io<partset> FoF_IO{this, this->Communicator, All.SnapFormat};
+  FoF_IO.fof_subfind_save_groups(num, basename, grpcat_dirbasename);
+
+  double t1 = Logs.second();
+
+  mpi_printf("SUBFIND: Subgroup catalogues saved. took = %g sec\n", Logs.timediff(t0, t1));
+}
+
+template <typename partset>
+void fof<partset>::subfind_assign_subhalo_offsettype(void)
+{
+  int *Send_count  = (int *)Mem.mymalloc("Send_count", sizeof(int) * this->NTask);
+  int *Send_offset = (int *)Mem.mymalloc("Send_offset", sizeof(int) * this->NTask);
+  int *Recv_count  = (int *)Mem.mymalloc("Recv_count", sizeof(int) * this->NTask);
+  int *Recv_offset = (int *)Mem.mymalloc("Recv_offset", sizeof(int) * this->NTask);
+
+  if(Nsubhalos > 0)
+    {
+      Subhalo[0].SubRankInGr = 0;
+      for(int j = 0; j < NTYPES; j++)
+        Subhalo[0].OffsetType[j] = 0;
+    }
+
+  for(int i = 1; i < Nsubhalos; i++)
+    if(Subhalo[i].GroupNr != Subhalo[i - 1].GroupNr)
+      {
+        Subhalo[i].SubRankInGr = 0;
+        for(int j = 0; j < NTYPES; j++)
+          Subhalo[i].OffsetType[j] = 0;
+      }
+    else
+      {
+        Subhalo[i].SubRankInGr = Subhalo[i - 1].SubRankInGr + 1;
+        for(int j = 0; j < NTYPES; j++)
+          Subhalo[i].OffsetType[j] = Subhalo[i - 1].OffsetType[j] + Subhalo[i - 1].LenType[j];
+      }
+
+  struct subnr_info
+  {
+    int grnr;
+    int cnt;
+    long long stype_cum[NTYPES];
+  };
+  subnr_info subnr_data;
+
+  if(Nsubhalos > 0)
+    {
+      subnr_data.grnr = Subhalo[Nsubhalos - 1].GroupNr;
+      subnr_data.cnt  = Subhalo[Nsubhalos - 1].SubRankInGr + 1;
+      for(int j = 0; j < NTYPES; j++)
+        subnr_data.stype_cum[j] = Subhalo[Nsubhalos - 1].OffsetType[j] + Subhalo[Nsubhalos - 1].LenType[j];
+    }
+  else
+    subnr_data.grnr = INT_MAX;
+
+  subnr_info *info_all = (subnr_info *)Mem.mymalloc("info_all", NTask * sizeof(subnr_info));
+  MPI_Allgather(&subnr_data, sizeof(subnr_info), MPI_BYTE, info_all, sizeof(subnr_info), MPI_BYTE, Communicator);
+
+  if(Nsubhalos > 0)
+    {
+      int cnt = 0;
+      long long stype_cum[NTYPES];
+      for(int j = 0; j < NTYPES; j++)
+        stype_cum[j] = 0;
+
+      for(int i = ThisTask - 1; i >= 0; i--)
+        if(info_all[i].grnr == Subhalo[0].GroupNr)
+          {
+            cnt += info_all[i].cnt;
+            for(int j = 0; j < NTYPES; j++)
+              stype_cum[j] += info_all[i].stype_cum[j];
+          }
+
+      for(int i = 0; i < Nsubhalos; i++)
+        if(Subhalo[i].GroupNr == Subhalo[0].GroupNr)
+          {
+            Subhalo[i].SubRankInGr += cnt;
+            for(int j = 0; j < NTYPES; j++)
+              Subhalo[i].OffsetType[j] += stype_cum[j];
+          }
+        else
+          break;
+    }
+
+  Mem.myfree(info_all);
+
+  /* now need to send the subhalos to the processor that holds the parent group to inquire about the group
+   * offset and then add this in.
+   */
+
+  /* tell everybody how many groups each processor holds */
+  int *gcount = (int *)Mem.mymalloc("gcount", NTask * sizeof(int));
+  MPI_Allgather(&Ngroups, 1, MPI_INT, gcount, 1, MPI_INT, Communicator);
+
+  int nexport = 0, nimport = 0;
+
+  struct group_info
+  {
+    int grindex;
+    int subindex;
+    long long OffsetType[NTYPES];
+  };
+  group_info *export_group_data = NULL, *import_group_data = NULL;
+
+  for(int mode = 0; mode < 2; mode++)
+    {
+      for(int i = 0; i < NTask; i++)
+        Send_count[i] = 0;
+
+      int target                   = 0;
+      long long first_gr_in_target = 0; /* this is the first group (by global group number) held on the current target processor */
+
+      for(int i = 0; i < Nsubhalos; i++)
+        {
+          while(Subhalo[i].GroupNr >= first_gr_in_target + gcount[target])
+            {
+              if(target >= NTask)
+                Terminate("target=%d  i=%d Nsubhalos=%d  Subhalo[i],GroupNr=%lld\n", target, i, Nsubhalos, Subhalo[i].GroupNr);
+
+              first_gr_in_target += gcount[target];
+              target++;
+            }
+
+          if(mode == 0)
+            Send_count[target]++;
+          else
+            {
+              int off                         = Send_offset[target] + Send_count[target]++;
+              export_group_data[off].grindex  = Subhalo[i].GroupNr - first_gr_in_target;
+              export_group_data[off].subindex = i;
+            }
+        }
+
+      if(mode == 0)
+        {
+          MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
+          Recv_offset[0] = Send_offset[0] = 0;
+          for(int j = 0; j < NTask; j++)
+            {
+              nimport += Recv_count[j];
+              nexport += Send_count[j];
+              if(j > 0)
+                {
+                  Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
+                  Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
+                }
+            }
+
+          export_group_data = (group_info *)Mem.mymalloc("export_group_data", nexport * sizeof(group_info));
+          import_group_data = (group_info *)Mem.mymalloc("import_group_data", nimport * sizeof(group_info));
+        }
+    }
+
+  for(int ngrp = 0; ngrp < (1 << PTask); ngrp++) /* note: here we also have a transfer from each task to itself (for ngrp=0) */
+    {
+      int recvTask = ThisTask ^ ngrp;
+      if(recvTask < NTask)
+        if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+          MPI_Sendrecv(&export_group_data[Send_offset[recvTask]], Send_count[recvTask] * sizeof(group_info), MPI_BYTE, recvTask,
+                       TAG_DENS_B, &import_group_data[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(group_info), MPI_BYTE,
+                       recvTask, TAG_DENS_B, Communicator, MPI_STATUS_IGNORE);
+    }
+
+  /* now let's go through the imported groups and assign the offsets */
+  for(int i = 0; i < nimport; i++)
+    for(int j = 0; j < NTYPES; j++)
+      import_group_data[i].OffsetType[j] = Group[import_group_data[i].grindex].OffsetType[j];
+
+  /* send stuff back */
+  for(int ngrp = 0; ngrp < (1 << PTask); ngrp++) /* note: here we also have a transfer from each task to itself (for ngrp=0) */
+    {
+      int recvTask = ThisTask ^ ngrp;
+      if(recvTask < NTask)
+        if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+          MPI_Sendrecv(&import_group_data[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(group_info), MPI_BYTE, recvTask,
+                       TAG_DENS_B, &export_group_data[Send_offset[recvTask]], Send_count[recvTask] * sizeof(group_info), MPI_BYTE,
+                       recvTask, TAG_DENS_B, Communicator, MPI_STATUS_IGNORE);
+    }
+
+  /* add it in to subhalo offsets */
+  for(int i = 0; i < nexport; i++)
+    for(int j = 0; j < NTYPES; j++)
+      Subhalo[export_group_data[i].subindex].OffsetType[j] += export_group_data[i].OffsetType[j];
+
+  Mem.myfree(import_group_data);
+  Mem.myfree(export_group_data);
+  Mem.myfree(gcount);
+
+  Mem.myfree(Recv_offset);
+  Mem.myfree(Recv_count);
+  Mem.myfree(Send_offset);
+  Mem.myfree(Send_count);
+}
+
+template <typename partset>
+void fof<partset>::subfind_redetermine_groupnr(void)
+{
+  /* tell everybody how many groups we have locally */
+  int *ngroups_all = (int *)Mem.mymalloc("ngroups_all", NTask * sizeof(int));
+  MPI_Allgather(&Ngroups, 1, MPI_INT, ngroups_all, 1, MPI_INT, Communicator);
+
+  int nsubs_local = 0;
+  for(int i = 0; i < Ngroups; i++)
+    nsubs_local += Group[i].Nsubs;
+
+  /* accumulate the corresponding subhalo numbers */
+  int *nsubs_all = (int *)Mem.mymalloc("nsubs_all", NTask * sizeof(int));
+  MPI_Allgather(&nsubs_local, 1, MPI_INT, nsubs_all, 1, MPI_INT, Communicator);
+
+  /* Finally, also tell everybody how many subhalos we have locally */
+  int *nsubhalos_all = (int *)Mem.mymalloc("nsubhalos_all", NTask * sizeof(int));
+  MPI_Allgather(&Nsubhalos, 1, MPI_INT, nsubhalos_all, 1, MPI_INT, Communicator);
+
+  long long subhalo_previous = 0;
+  for(int i = 0; i < ThisTask; i++)
+    subhalo_previous += nsubhalos_all[i];
+
+  int nexport = 0, nimport = 0;
+
+  struct group_info
+  {
+    long long subhalonr;
+    long long groupnr;
+    int subindex;
+  };
+  group_info *export_group_data = NULL, *import_group_data = NULL;
+
+  int *Send_count  = (int *)Mem.mymalloc("Send_count", sizeof(int) * this->NTask);
+  int *Send_offset = (int *)Mem.mymalloc("Send_offset", sizeof(int) * this->NTask);
+  int *Recv_count  = (int *)Mem.mymalloc("Recv_count", sizeof(int) * this->NTask);
+  int *Recv_offset = (int *)Mem.mymalloc("Recv_offset", sizeof(int) * this->NTask);
+
+  for(int mode = 0; mode < 2; mode++)
+    {
+      for(int i = 0; i < NTask; i++)
+        Send_count[i] = 0;
+
+      int target               = 0;
+      long long nsubs_previous = 0;
+
+      for(int i = 0; i < Nsubhalos; i++)
+        {
+          long long subhalonr = subhalo_previous + i;
+
+          while(subhalonr >= nsubs_previous + nsubs_all[target])
+            {
+              if(target >= NTask)
+                Terminate("target=%d\n", target);
+
+              nsubs_previous += nsubs_all[target];
+              target++;
+            }
+
+          if(mode == 0)
+            Send_count[target]++;
+          else
+            {
+              int off                          = Send_offset[target] + Send_count[target]++;
+              export_group_data[off].subhalonr = subhalonr;
+              export_group_data[off].subindex  = i;
+            }
+        }
+
+      if(mode == 0)
+        {
+          MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
+          Recv_offset[0] = Send_offset[0] = 0;
+          for(int j = 0; j < NTask; j++)
+            {
+              nimport += Recv_count[j];
+              nexport += Send_count[j];
+              if(j > 0)
+                {
+                  Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
+                  Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
+                }
+            }
+
+          export_group_data = (group_info *)Mem.mymalloc("export_group_data", nexport * sizeof(group_info));
+          import_group_data = (group_info *)Mem.mymalloc("import_group_data", nimport * sizeof(group_info));
+        }
+    }
+
+  for(int ngrp = 0; ngrp < (1 << PTask); ngrp++) /* note: here we also have a transfer from each task to itself (for ngrp=0) */
+    {
+      int recvTask = ThisTask ^ ngrp;
+      if(recvTask < NTask)
+        if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+          MPI_Sendrecv(&export_group_data[Send_offset[recvTask]], Send_count[recvTask] * sizeof(group_info), MPI_BYTE, recvTask,
+                       TAG_DENS_B, &import_group_data[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(group_info), MPI_BYTE,
+                       recvTask, TAG_DENS_B, Communicator, MPI_STATUS_IGNORE);
+    }
+
+  /* now let's go through the imported subhalos and assign the group numbers */
+
+  long long nsubs = 0;
+  for(int i = 0; i < ThisTask; i++)
+    nsubs += nsubs_all[i];
+
+  long long group_previous = 0;
+  for(int i = 0; i < ThisTask; i++)
+    group_previous += ngroups_all[i];
+
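+  /* the imported subhalo numbers arrive sorted in ascending order, so a single pointer gr that
+     only advances is sufficient to locate the group whose subhalo range contains each of them */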
+  int gr = 0;
+  for(int i = 0; i < nimport; i++)
+    {
+      while(import_group_data[i].subhalonr >= nsubs + Group[gr].Nsubs)
+        {
+          nsubs += Group[gr].Nsubs;
+          gr++;
+
+          if(gr >= Ngroups)
+            Terminate("i=%d|%d gr=%d  >= Ngroups=%d\n", i, nimport, gr, Ngroups);
+        }
+
+      import_group_data[i].groupnr = group_previous + gr;
+    }
+
+  /* send stuff back */
+  for(int ngrp = 0; ngrp < (1 << PTask); ngrp++) /* note: here we also have a transfer from each task to itself (for ngrp=0) */
+    {
+      int recvTask = ThisTask ^ ngrp;
+      if(recvTask < NTask)
+        if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+          MPI_Sendrecv(&import_group_data[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(group_info), MPI_BYTE, recvTask,
+                       TAG_DENS_B, &export_group_data[Send_offset[recvTask]], Send_count[recvTask] * sizeof(group_info), MPI_BYTE,
+                       recvTask, TAG_DENS_B, Communicator, MPI_STATUS_IGNORE);
+    }
+
+  /* assign the groupnr */
+  for(int i = 0; i < nexport; i++)
+    {
+      /*
+      if(Subhalo[export_group_data[i].subindex].GroupNr != export_group_data[i].groupnr)
+        Terminate("Subhalo[export_group_data[i].subindex].GroupNr=%lld  export_group_data[i].groupnr=%lld\n",
+                  Subhalo[export_group_data[i].subindex].GroupNr, export_group_data[i].groupnr);
+      */
+      Subhalo[export_group_data[i].subindex].GroupNr = export_group_data[i].groupnr;
+    }
+
+  Mem.myfree(import_group_data);
+  Mem.myfree(export_group_data);
+  Mem.myfree(Recv_offset);
+  Mem.myfree(Recv_count);
+  Mem.myfree(Send_offset);
+  Mem.myfree(Send_count);
+
+  Mem.myfree(nsubhalos_all);
+  Mem.myfree(nsubs_all);
+  Mem.myfree(ngroups_all);
+}
+
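+/* returns the ratio of the maximum particle number on any task to the average particle number
+ * per task, i.e. a simple measure of the current particle load imbalance */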
+template <typename partset>
+double fof<partset>::subfind_get_particle_balance(void)
+{
+  int maxpart;
+  long long sum;
+  MPI_Allreduce(&Tp->NumPart, &maxpart, 1, MPI_INT, MPI_MAX, Communicator);
+  sumup_large_ints(1, &Tp->NumPart, &sum, Communicator);
+  return maxpart / (((double)sum) / NTask);
+}
+
+/* now make sure that the following classes are really instantiated, otherwise we may get a linking problem */
+#include "../data/simparticles.h"
+template class fof<simparticles>;
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES_GROUPS)
+#include "../data/lcparticles.h"
+template class fof<lcparticles>;
+#endif
+
+#endif
diff --git a/src/subfind/subfind.h b/src/subfind/subfind.h
new file mode 100644
index 0000000000000000000000000000000000000000..13e8b823e2c7e7dd9af1becae2e9d10239fb5eed
--- /dev/null
+++ b/src/subfind/subfind.h
@@ -0,0 +1,54 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  subfind.h
+ *
+ *  \brief defines various constants used by the Subfind code
+ */
+
+#ifndef SUBFIND_H
+#define SUBFIND_H
+
+#ifdef SUBFIND
+
+#define FIND_SMOOTHING_LENGTHS 0
+#define FIND_TOTAL_DENSITIES 1
+
+#define RECOMPUTE_ALL 0
+#define UPDATE_ALL 1
+
+#define MAX_ITER_UNBIND 500
+
+#define TAG_POLLING_DONE 201
+#define TAG_SET_ALL 202
+#define TAG_GET_NGB_INDICES 204
+#define TAG_GET_TAILANDLEN 205
+#define TAG_GET_TAILANDLEN_DATA 206
+#define TAG_SET_TAILANDLEN 207
+#define TAG_SET_HEADANDNEXT 209
+#define TAG_SETHEADGETNEXT_DATA 210
+#define TAG_SET_NEXT 211
+#define TAG_SETHEADGETNEXT 213
+#define TAG_GET_NEXT 215
+#define TAG_GET_NEXT_DATA 216
+#define TAG_GET_HEAD 217
+#define TAG_GET_HEAD_DATA 218
+#define TAG_ADD_PARTICLE 219
+#define TAG_ADDBOUND 220
+#define TAG_NID 222
+#define TAG_NID_DATA 223
+#define TAG_SETRANK 224
+#define TAG_SETRANK_OUT 226
+#define TAG_GET_RANK 227
+#define TAG_GET_RANK_DATA 228
+#define TAG_MARK_PARTICLE 229
+#define TAG_SET_NEWTAIL 230
+#define TAG_GET_OLDTAIL 231
+#define TAG_GET_TWOHEADS 232
+#define TAG_GET_TWOHEADS_DATA 233
+
+#endif
+#endif
diff --git a/src/subfind/subfind_density.cc b/src/subfind/subfind_density.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e148ddda5f90e410b4b90a68ceacb1e15fda58ef
--- /dev/null
+++ b/src/subfind/subfind_density.cc
@@ -0,0 +1,542 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  subfind_density.cc
+ *
+ *  \brief local matter density calculation for Subfind algorithm
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef SUBFIND
+#ifndef SUBFIND_HBT
+
+#include <gsl/gsl_math.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <algorithm>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../fof/fof.h"
+#include "../gravtree/gravtree.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/generic_comm.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+
+// Structure for communication during the density computation. Holds data that is sent to other processors.
+struct subdens_in : data_in_generic
+{
+  MyIntPosType IntPos[3];
+  MyFloat Hsml;
+};
+
+/* local data structure that holds results acquired on remote processors */
+struct subdens_out
+{
+  int Ngb;
+  MyFloat Rho;
+#ifdef SUBFIND_STORE_LOCAL_DENSITY
+  MyFloat VelDisp, Vx, Vy, Vz;
+#endif
+};
+
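+/* scratch arrays for the iterative density estimate: DM_NumNgb holds the current neighbour
+ * count of every local particle, and (if enabled) Vx/Vy/Vz accumulate the summed neighbour
+ * velocities needed for the velocity dispersion estimate */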
+static int *DM_NumNgb;
+#ifdef SUBFIND_STORE_LOCAL_DENSITY
+static MyFloat *Vx, *Vy, *Vz;
+#endif
+
+template <typename T_tree, typename T_domain, typename T_partset>
+class subdens_comm : public generic_comm<subdens_in, subdens_out, T_tree, T_domain, T_partset>
+{
+ public:
+  typedef generic_comm<subdens_in, subdens_out, T_tree, T_domain, T_partset> gcomm;
+  using gcomm::D;
+  using gcomm::Thread;
+  using gcomm::Tp;  // This makes sure that we can access Tp from the base class without having to use "this->Tp"
+  using gcomm::Tree;
+
+  /* need to call the base class constructor explicitly */
+  subdens_comm(T_domain *dptr, T_tree *tptr, T_partset *pptr) : gcomm(dptr, tptr, pptr) {}
+
+  /* routine that fills the relevant particle/cell data into the input structure defined above */
+  void particle2in(subdens_in *in, int i) override
+  {
+    in->IntPos[0] = Tp->P[i].IntPos[0];
+    in->IntPos[1] = Tp->P[i].IntPos[1];
+    in->IntPos[2] = Tp->P[i].IntPos[2];
+    in->Hsml      = Tp->PS[i].v.DM_Hsml;
+  }
+
+  /* routine to store or combine result data */
+  void out2particle(subdens_out *out, int i, int mode) override
+  {
+    if(mode == MODE_LOCAL_PARTICLES) /* initial store */
+      {
+        DM_NumNgb[i]               = out->Ngb;
+        Tp->PS[i].u.s.u.DM_Density = out->Rho;
+#ifdef SUBFIND_STORE_LOCAL_DENSITY
+        Vx[i]                    = out->Vx;
+        Vy[i]                    = out->Vy;
+        Vz[i]                    = out->Vz;
+        Tp->PS[i].SubfindVelDisp = out->VelDisp;
+#endif
+      }
+    else /* combine */
+      {
+        DM_NumNgb[i] += out->Ngb;
+        Tp->PS[i].u.s.u.DM_Density += out->Rho;
+#ifdef SUBFIND_STORE_LOCAL_DENSITY
+        Vx[i] += out->Vx;
+        Vy[i] += out->Vy;
+        Vz[i] += out->Vz;
+        Tp->PS[i].SubfindVelDisp += out->VelDisp;
+#endif
+      }
+  }
+
+  /*! This function represents the core of the SPH density computation. The
+   *  target particle may either be local, or reside in the communication
+   *  buffer.
+   */
+  int evaluate(int target, int mode, int thread_id, int action, subdens_in *in, int numnodes, node_info *firstnode,
+               subdens_out &out) override
+  {
+    MyIntPosType *intpos = in->IntPos;
+    double hsml          = in->Hsml;
+
+    double h2    = hsml * hsml;
+    double hinv  = 1.0 / hsml;
+    double hinv3 = hinv * hinv * hinv;
+
+    int numngb    = 0;
+    double rhosum = 0;
+#ifdef SUBFIND_STORE_LOCAL_DENSITY
+    double vxsum = 0;
+    double vysum = 0;
+    double vzsum = 0;
+    double v2sum = 0;
+#endif
+
+    for(int k = 0; k < numnodes; k++)
+      {
+        int no;
+        if(mode == MODE_LOCAL_PARTICLES)
+          {
+            no = Tree->MaxPart; /* root node */
+          }
+        else
+          {
+            no = firstnode[k].Node;
+            no = Tree->get_nodep(no)->nextnode; /* open it */
+          }
+
+        unsigned int shmrank = Tree->TreeSharedMem_ThisTask;
+
+        while(no >= 0)
+          {
+            int p, type;
+            double mass, r2;
+            particle_data *P;
+
+            if(no < Tree->MaxPart) /* single particle */
+              {
+                p = no;
+                P = Tree->get_Pp(no, shmrank);
+
+                no = Tree->get_nextnodep(shmrank)[no]; /* note: here shmrank cannot change */
+
+                double dxyz[3];
+                Tp->nearest_image_intpos_to_pos(P->IntPos, intpos, dxyz); /* converts the integer distance to floating point */
+
+                r2 = dxyz[0] * dxyz[0];
+                if(r2 > h2)
+                  continue;
+
+                r2 += dxyz[1] * dxyz[1];
+                if(r2 > h2)
+                  continue;
+
+                r2 += dxyz[2] * dxyz[2];
+                if(r2 > h2)
+                  continue;
+
+                mass = P->getMass();
+                type = P->getType();
+              }
+            else if(no < Tree->MaxPart + Tree->MaxNodes) /* internal node */
+              {
+                if(mode == MODE_IMPORTED_PARTICLES)
+                  {
+                    if(no < Tree->FirstNonTopLevelNode) /* we reached a top-level node again, which means that we are done with the
+                                                           branch */
+                      break;
+                  }
+
+                gravnode *current = Tree->get_nodep(no, shmrank);
+
+                no      = current->sibling; /* in case the node can be discarded */
+                shmrank = current->sibling_shmrank;
+
+                double dxyz[3];
+                Tp->nearest_image_intpos_to_pos(current->center.da, intpos,
+                                                dxyz); /* converts the integer distance to floating point */
+
+                double lenhalf = (((MyIntPosType)1) << (BITS_FOR_POSITIONS - 1 - current->level)) * Tp->FacIntToCoord;
+
+                double dist = hsml + lenhalf;
+
+                if(fabs(dxyz[0]) > dist)
+                  continue;
+                if(fabs(dxyz[1]) > dist)
+                  continue;
+                if(fabs(dxyz[2]) > dist)
+                  continue;
+
+                /* now test against the minimal sphere enclosing everything */
+                dist += FACT1 * 2.0 * lenhalf;
+                if(dxyz[0] * dxyz[0] + dxyz[1] * dxyz[1] + dxyz[2] * dxyz[2] > dist * dist)
+                  continue;
+
+                no      = current->nextnode; /* ok, we need to open the node */
+                shmrank = current->nextnode_shmrank;
+
+                continue;
+              }
+            else if(no >= Tree->ImportedNodeOffset) /* point from imported nodelist */
+              {
+                int n = no - Tree->ImportedNodeOffset;
+                no    = Tree->Nextnode[no - Tree->MaxNodes];
+                /* note: here shmrank cannot change */
+
+                double dxyz[3];
+                Tp->nearest_image_intpos_to_pos(Tree->Points[n].IntPos, intpos,
+                                                dxyz); /* converts the integer distance to floating point */
+
+                r2 = dxyz[0] * dxyz[0];
+                if(r2 > h2)
+                  continue;
+
+                r2 += dxyz[1] * dxyz[1];
+                if(r2 > h2)
+                  continue;
+
+                r2 += dxyz[2] * dxyz[2];
+                if(r2 > h2)
+                  continue;
+
+                mass = Tree->Points[n].Mass;
+                type = Tree->Points[n].Type;
+
+                p = -1;
+              }
+            else /* pseudo particle */
+              {
+                if(mode == MODE_LOCAL_PARTICLES)
+                  if(target >= 0) /* if no target is given, export will not occur */
+                    Tree->tree_export_node_threads(no, target, &Thread);
+
+                no = Tree->Nextnode[no - Tree->MaxNodes];
+                continue;
+              }
+
+            if(r2 < h2)
+              {
+                if(is_type_primary_link_type(type))
+                  {
+                    numngb++;
+
+                    if(p < 0)
+                      Terminate("this should not occur");
+
+#ifdef SUBFIND_STORE_LOCAL_DENSITY
+                    vxsum += P->Vel[0];
+                    vysum += P->Vel[1];
+                    vzsum += P->Vel[2];
+                    v2sum += P->Vel[0] * P->Vel[0] + P->Vel[1] * P->Vel[1] + P->Vel[2] * P->Vel[2];
+#endif
+                  }
+
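+                /* accumulate this neighbour's density contribution with the cubic-spline SPH
+                 * kernel; with the usual 3D normalisation the two branches correspond to
+                 * W(u) = 8/(pi h^3) (1 - 6 u^2 + 6 u^3)  for u < 1/2  and
+                 * W(u) = 16/(pi h^3) (1 - u)^3           for 1/2 <= u < 1,  where u = r/h */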
+                if(is_type_primary_link_type(type) || is_type_secondary_link_type(type))
+                  {
+                    double r = sqrt(r2);
+
+                    double u = r * hinv, wk;
+
+                    if(u < 0.5)
+                      wk = hinv3 * (KERNEL_COEFF_1 + KERNEL_COEFF_2 * (u - 1) * u * u);
+                    else
+                      wk = hinv3 * KERNEL_COEFF_5 * (1.0 - u) * (1.0 - u) * (1.0 - u);
+
+                    rhosum += mass * wk;
+                  }
+              }
+          }
+      }
+
+    out.Ngb = numngb;
+    out.Rho = rhosum;
+#ifdef SUBFIND_STORE_LOCAL_DENSITY
+    out.Vx      = vxsum;
+    out.Vy      = vysum;
+    out.Vz      = vzsum;
+    out.VelDisp = v2sum;
+#endif
+    return 0;
+  }
+};
+
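+/* Iteratively adjusts the smoothing length DM_Hsml of every selected particle by bisection
+ * until the number of neighbours of primary link type matches All.DesNumNgb to within
+ * All.MaxNumNgbDeviation, and computes the corresponding local dark matter density.
+ * The return value is the wall-clock time spent in this routine. */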
+template <typename partset>
+double fof<partset>::subfind_density(void)
+{
+  mpi_printf("SUBFIND: finding total densities around all particles\n");
+
+  double tstart = Logs.second();
+
+  DM_NumNgb      = (int *)Mem.mymalloc_movable(&DM_NumNgb, "DM_NumNgb", sizeof(int) * Tp->NumPart);
+  MyFloat *Left  = (MyFloat *)Mem.mymalloc_movable(&Left, "Left", sizeof(MyFloat) * Tp->NumPart);
+  MyFloat *Right = (MyFloat *)Mem.mymalloc_movable(&Right, "Right", sizeof(MyFloat) * Tp->NumPart);
+
+  int *targetlist = (int *)Mem.mymalloc_movable(&targetlist, "targetlist", sizeof(int) * Tp->NumPart);
+
+#ifdef SUBFIND_STORE_LOCAL_DENSITY
+  Vx = (MyFloat *)Mem.mymalloc("Vx", sizeof(MyFloat) * Tp->NumPart);
+  Vy = (MyFloat *)Mem.mymalloc("Vy", sizeof(MyFloat) * Tp->NumPart);
+  Vz = (MyFloat *)Mem.mymalloc("Vz", sizeof(MyFloat) * Tp->NumPart);
+#endif
+
+  int ntodo = 0;
+
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+      Left[i] = Right[i] = 0;
+      DM_NumNgb[i]       = 0;
+
+      Tp->PS[i].u.s.u.DM_Density = 0;
+
+#ifdef SUBFIND_STORE_LOCAL_DENSITY
+      Tp->PS[i].SubfindHsml    = 0;
+      Tp->PS[i].SubfindDensity = 0;
+      Tp->PS[i].SubfindVelDisp = 0;
+      if(is_type_primary_link_type(Tp->P[i].getType()) || is_type_secondary_link_type(Tp->P[i].getType()))
+        targetlist[ntodo++] = i;
+#else
+      if(Tp->PS[i].GroupNr.get() != HALONR_MAX) /* do it only for particles that are in a group */
+        targetlist[ntodo++] = i;
+#endif
+    }
+
+  subdens_comm<gravtree<partset>, domain<partset>, partset> commpattern{FoFDomain, &FoFGravTree, Tp};
+
+  /* we will repeat the whole thing for those particles where we didn't find enough neighbours */
+  long long ntot;
+  int iter = 0;
+  do
+    {
+      double t0 = Logs.second();
+
+      commpattern.execute(ntodo, targetlist, MODE_DEFAULT);
+
+      /* do final operations on results */
+      int npleft = 0;
+      for(int n = 0; n < ntodo; n++)
+        {
+          int i = targetlist[n];
+
+          /* now check whether we had enough neighbours */
+          if(abs(DM_NumNgb[i] - All.DesNumNgb) > All.MaxNumNgbDeviation &&
+             ((Right[i] - Left[i]) > 1.0e-4 * Left[i] || Left[i] == 0 || Right[i] == 0))
+            {
+              /* need to redo this particle */
+              targetlist[npleft++] = i;
+
+              if(DM_NumNgb[i] < All.DesNumNgb)
+                Left[i] = (MyFloat)std::max<double>(Tp->PS[i].v.DM_Hsml, Left[i]);
+              else
+                {
+                  if(Right[i] != 0)
+                    {
+                      if(Tp->PS[i].v.DM_Hsml < Right[i])
+                        Right[i] = Tp->PS[i].v.DM_Hsml;
+                    }
+                  else
+                    Right[i] = Tp->PS[i].v.DM_Hsml;
+                }
+
+              if(iter >= MAXITER - 10)
+                {
+                  double pos[3];
+                  Tp->intpos_to_pos(Tp->P[i].IntPos, pos);
+
+                  printf("SUBFIND: i=%d task=%d ID=%lld Hsml=%g Left=%g Right=%g Ngbs=%g Right-Left=%g\n   pos=(%g|%g|%g)\n", i,
+                         ThisTask, (long long)Tp->P[i].ID.get(), Tp->PS[i].v.DM_Hsml, Left[i], Right[i], (double)DM_NumNgb[i],
+                         Right[i] - Left[i], pos[0], pos[1], pos[2]);
+                  myflush(stdout);
+                }
+
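+              /* if both bounds are known, pick the radius whose enclosed volume is the mean of
+                 the two bounding volumes; otherwise grow or shrink the current guess by a
+                 factor 1.26, i.e. roughly a factor of two in volume */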
+              if(Right[i] > 0 && Left[i] > 0)
+                Tp->PS[i].v.DM_Hsml = (MyFloat)pow(0.5 * (pow(Left[i], 3) + pow(Right[i], 3)), 1.0 / 3.0);
+              else
+                {
+                  if(Right[i] == 0 && Left[i] == 0)
+                    Terminate("can't occur");
+
+                  if(Right[i] == 0 && Left[i] > 0)
+                    Tp->PS[i].v.DM_Hsml *= 1.26;
+
+                  if(Right[i] > 0 && Left[i] == 0)
+                    Tp->PS[i].v.DM_Hsml /= 1.26;
+                }
+            }
+        }
+
+      ntodo = npleft;
+
+      sumup_large_ints(1, &npleft, &ntot, Communicator);
+
+      double t1 = Logs.second();
+
+      if(ntot > 0)
+        {
+          iter++;
+
+          if(iter > 0)
+            mpi_printf("SUBFIND: ngb iteration %2d: need to repeat for %15lld particles. (took %g sec)\n", iter, ntot,
+                       Logs.timediff(t0, t1));
+
+          if(iter > MAXITER)
+            Terminate("failed to converge in neighbour iteration in subfind_density()\n");
+        }
+    }
+  while(ntot > 0);
+
+#ifdef SUBFIND_STORE_LOCAL_DENSITY
+
+  double vel_to_phys = subfind_vel_to_phys_factor();
+
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+      if(DM_NumNgb[i] > 0)
+        {
+          Vx[i] /= DM_NumNgb[i];
+          Vy[i] /= DM_NumNgb[i];
+          Vz[i] /= DM_NumNgb[i];
+          Tp->PS[i].SubfindVelDisp /= DM_NumNgb[i];
+          Tp->PS[i].SubfindVelDisp = vel_to_phys * sqrt(Tp->PS[i].SubfindVelDisp - Vx[i] * Vx[i] - Vy[i] * Vy[i] - Vz[i] * Vz[i]);
+        }
+      else
+        Tp->PS[i].SubfindVelDisp = 0;
+    }
+
+  Mem.myfree_movable(Vz);
+  Mem.myfree_movable(Vy);
+  Mem.myfree_movable(Vx);
+#endif
+
+  Mem.myfree_movable(targetlist);
+  Mem.myfree_movable(Right);
+  Mem.myfree_movable(Left);
+  Mem.myfree_movable(DM_NumNgb);
+
+#ifdef SUBFIND_STORE_LOCAL_DENSITY
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+      Tp->PS[i].SubfindHsml    = Tp->PS[i].v.DM_Hsml;
+      Tp->PS[i].SubfindDensity = Tp->PS[i].u.s.u.DM_Density;
+    }
+#endif
+
+  double tend = Logs.second();
+  return Logs.timediff(tstart, tend);
+}
+
+template <>
+double fof<simparticles>::subfind_vel_to_phys_factor(void)
+{
+  if(All.ComovingIntegrationOn)
+    return 1.0 / All.Time;
+  else
+    return 1.0;
+}
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES_GROUPS)
+template <>
+double fof<lcparticles>::subfind_vel_to_phys_factor(void)
+{
+  return 1.0;
+}
+#endif
+
+template <typename partset>
+void fof<partset>::subfind_density_hsml_guess(void) /* set the initial guess for the smoothing length */
+{
+  double hsml_prev = 0;
+
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+      if(is_type_primary_link_type(Tp->P[i].getType()))
+        {
+          int no = FoFGravTree.Father[i];
+
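+          /* walk up the tree from the particle's father node until the node holds at least
+             about 8 * DesNumNgb particle masses, then estimate Hsml from the node size scaled
+             by the mass fraction that DesNumNgb neighbours of this mass would represent */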
+          while(8.0 * All.DesNumNgb * Tp->P[i].getMass() > FoFGravTree.get_nodep(no)->mass)
+            {
+              int p = FoFGravTree.get_nodep(no)->father;
+
+              if(p < 0)
+                break;
+
+              no = p;
+            }
+
+          double len = (((MyIntPosType)1) << (BITS_FOR_POSITIONS - FoFGravTree.get_nodep(no)->level)) * Tp->FacIntToCoord;
+
+          Tp->PS[i].v.DM_Hsml = hsml_prev =
+              (pow(3.0 / (4.0 * M_PI) * All.DesNumNgb * Tp->P[i].getMass() / FoFGravTree.get_nodep(no)->mass, 1.0 / 3.0) * len);
+
+          if(Tp->PS[i].v.DM_Hsml == 0)
+            {
+              double pos[3];
+              Tp->intpos_to_pos(Tp->P[i].IntPos, pos);
+
+              Terminate(
+                  "zero hsml guess: Hsml=0 task=%d i=%d no=%d Nodes[no].len=%g Nodes[no].mass=%g P[i].Mass=%g type=%d ID=%llu  "
+                  "pos=(%g|%g|%g)\n",
+                  ThisTask, i, no, len, FoFGravTree.get_nodep(no)->mass, Tp->P[i].getMass(), Tp->P[i].getType(),
+                  (long long)Tp->P[i].ID.get(), pos[0], pos[1], pos[2]);
+            }
+        }
+      else
+        {
+          if(hsml_prev)
+            Tp->PS[i].v.DM_Hsml = hsml_prev;
+          else
+            Tp->PS[i].v.DM_Hsml = All.SofteningTable[Tp->P[i].getType()];
+        }
+    }
+}
+
+/* now make sure that the following classes are really instantiated, otherwise we may get a linking problem */
+#include "../data/simparticles.h"
+template class fof<simparticles>;
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES_GROUPS)
+#include "../data/lcparticles.h"
+template class fof<lcparticles>;
+#endif
+
+#endif
+#endif
diff --git a/src/subfind/subfind_distribute.cc b/src/subfind/subfind_distribute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9314bd00182a7edb0d4065ad74aa3183ce9859f2
--- /dev/null
+++ b/src/subfind/subfind_distribute.cc
@@ -0,0 +1,304 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  subfind_distribute.cc
+ *
+ *  \brief code to distribute particle data for Subfind processing
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef SUBFIND
+
+#include <mpi.h>
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../fof/fof.h"
+#include "../gravtree/gravtree.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/cxxsort.h"
+#include "../subfind/subfind.h"
+
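+/* Moves the group catalogue entries to the processors given in Group[].TargetTask, using a
+ * collective counting step followed by a pairwise (hypercube) data exchange. */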
+template <typename partset>
+void fof<partset>::subfind_distribute_groups(void)
+{
+  double t0 = Logs.second();
+
+  int *Send_count  = (int *)Mem.mymalloc_movable(&Send_count, "Send_count", sizeof(int) * NTask);
+  int *Send_offset = (int *)Mem.mymalloc_movable(&Send_offset, "Send_offset", sizeof(int) * NTask);
+  int *Recv_count  = (int *)Mem.mymalloc_movable(&Recv_count, "Recv_count", sizeof(int) * NTask);
+  int *Recv_offset = (int *)Mem.mymalloc_movable(&Recv_offset, "Recv_offset", sizeof(int) * NTask);
+
+  /* count how many groups have to be sent to each task */
+  for(int i = 0; i < NTask; i++)
+    Send_count[i] = 0;
+
+  for(int i = 0; i < Ngroups; i++)
+    {
+      int target = Group[i].TargetTask;
+
+      if(target < 0 || target >= NTask)
+        Terminate("target < 0 || target >= NTask");
+
+      if(target != ThisTask)
+        Send_count[target]++;
+    }
+
+  MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
+
+  Recv_offset[0] = Send_offset[0] = 0;
+  int nexport = 0, nimport = 0;
+
+  for(int i = 0; i < NTask; i++)
+    {
+      nimport += Recv_count[i];
+      nexport += Send_count[i];
+
+      if(i > 0)
+        {
+          Send_offset[i] = Send_offset[i - 1] + Send_count[i - 1];
+          Recv_offset[i] = Recv_offset[i - 1] + Recv_count[i - 1];
+        }
+    }
+
+  group_properties *send_Group =
+      (group_properties *)Mem.mymalloc_movable(&send_Group, "send_Group", nexport * sizeof(group_properties));
+
+  for(int i = 0; i < NTask; i++)
+    Send_count[i] = 0;
+
+  for(int i = 0; i < Ngroups; i++)
+    {
+      int target = Group[i].TargetTask;
+
+      if(target != ThisTask)
+        {
+          send_Group[Send_offset[target] + Send_count[target]] = Group[i];
+          Send_count[target]++;
+
+          Group[i] = Group[Ngroups - 1];
+          Ngroups--;
+          i--;
+        }
+    }
+
+  if(Ngroups + nimport > MaxNgroups)
+    {
+      MaxNgroups = Ngroups + nimport;
+      Group      = (group_properties *)Mem.myrealloc_movable(Group, sizeof(group_properties) * MaxNgroups);
+    }
+
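+  /* pairwise exchange over a hypercube pattern: in round ngrp each task communicates with the
+     partner ThisTask ^ ngrp */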
+  for(int ngrp = 1; ngrp < (1 << PTask); ngrp++)
+    {
+      int recvTask = ThisTask ^ ngrp;
+
+      if(recvTask < NTask)
+        {
+          if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+            {
+              /* get the group info */
+              MPI_Sendrecv(&send_Group[Send_offset[recvTask]], Send_count[recvTask] * sizeof(group_properties), MPI_BYTE, recvTask,
+                           TAG_DENS_A, &Group[Ngroups + Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(group_properties),
+                           MPI_BYTE, recvTask, TAG_DENS_A, Communicator, MPI_STATUS_IGNORE);
+            }
+        }
+    }
+
+  Ngroups += nimport;
+
+  Mem.myfree_movable(send_Group);
+
+  Mem.myfree(Recv_offset);
+  Mem.myfree(Recv_count);
+  Mem.myfree(Send_offset);
+  Mem.myfree(Send_count);
+
+  double t1 = Logs.second();
+
+  mpi_printf("SUBFIND: subfind_distribute_groups() took %g sec\n", Logs.timediff(t0, t1));
+}
+
+/* This function redistributes the particles in P[] and PS[] according to what is stored in
+ * PS[].TargetTask, and PS[].TargetIndex. NOTE: The associated SphP[] is not moved, i.e. the
+ * association is broken until the particles are moved back into the original order.
+ */
+template <typename partset>
+void fof<partset>::subfind_distribute_particles(MPI_Comm Communicator)
+{
+  int *Send_count  = (int *)Mem.mymalloc_movable(&Send_count, "Send_count", sizeof(int) * NTask);
+  int *Send_offset = (int *)Mem.mymalloc_movable(&Send_offset, "Send_offset", sizeof(int) * NTask);
+  int *Recv_count  = (int *)Mem.mymalloc_movable(&Recv_count, "Recv_count", sizeof(int) * NTask);
+  int *Recv_offset = (int *)Mem.mymalloc_movable(&Recv_offset, "Recv_offset", sizeof(int) * NTask);
+
+  int CommThisTask, CommNTask;
+  MPI_Comm_size(Communicator, &CommNTask);
+  MPI_Comm_rank(Communicator, &CommThisTask);
+
+  for(int n = 0; n < CommNTask; n++)
+    Send_count[n] = 0;
+
+  for(int n = 0; n < Tp->NumPart; n++)
+    {
+      int target = Tp->PS[n].TargetTask;
+
+      if(target != CommThisTask)
+        {
+          if(target < 0 || target >= CommNTask)
+            Terminate("n=%d targettask=%d", n, target);
+
+          Send_count[target]++;
+        }
+    }
+
+  MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
+
+  int nimport = 0, nexport = 0;
+  Recv_offset[0] = 0, Send_offset[0] = 0;
+
+  for(int j = 0; j < CommNTask; j++)
+    {
+      nexport += Send_count[j];
+      nimport += Recv_count[j];
+
+      if(j > 0)
+        {
+          Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
+          Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
+        }
+    }
+
+  /* for resize */
+  int load = (Tp->NumPart + (nimport - nexport)), max_load;
+  MPI_Allreduce(&load, &max_load, 1, MPI_INT, MPI_MAX, Communicator);
+
+  typename partset::pdata *partBuf =
+      (typename partset::pdata *)Mem.mymalloc_movable(&partBuf, "partBuf", nexport * sizeof(typename partset::pdata));
+  subfind_data *subBuf = (subfind_data *)Mem.mymalloc_movable(&subBuf, "subBuf", nexport * sizeof(subfind_data));
+
+  for(int i = 0; i < CommNTask; i++)
+    Send_count[i] = 0;
+
+  for(int n = 0; n < Tp->NumPart; n++)
+    {
+      int target = Tp->PS[n].TargetTask;
+
+      if(target != CommThisTask)
+        {
+          partBuf[Send_offset[target] + Send_count[target]] = Tp->P[n];
+          subBuf[Send_offset[target] + Send_count[target]]  = Tp->PS[n];
+
+          Tp->P[n]  = Tp->P[Tp->NumPart - 1];
+          Tp->PS[n] = Tp->PS[Tp->NumPart - 1];
+
+          Send_count[target]++;
+          Tp->NumPart--;
+          n--;
+        }
+    }
+
+  /* do resize */
+  if(max_load > (1.0 - ALLOC_TOLERANCE) * Tp->MaxPart || max_load < (1.0 - 3 * ALLOC_TOLERANCE) * Tp->MaxPart)
+    Tp->reallocate_memory_maxpart(max_load / (1.0 - 2 * ALLOC_TOLERANCE));
+
+  Tp->PS = (subfind_data *)Mem.myrealloc_movable(Tp->PS, load * sizeof(subfind_data));
+
+  for(int i = 0; i < CommNTask; i++)
+    Recv_offset[i] += Tp->NumPart;
+
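+  /* exchange the particle and subfind data, either with non-blocking sends/receives or with a
+     pairwise hypercube exchange, depending on whether ISEND_IRECV_IN_DOMAIN is set */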
+#ifdef ISEND_IRECV_IN_DOMAIN
+
+  MPI_Request *requests = (MPI_Request *)Mem.mymalloc("requests", 8 * CommNTask * sizeof(MPI_Request));
+  int n_requests        = 0;
+
+  for(int ngrp = 1; ngrp < (1 << PTask); ngrp++)
+    {
+      int target = CommThisTask ^ ngrp;
+
+      if(target < CommNTask)
+        {
+          if(Recv_count[target] > 0)
+            {
+              MPI_Irecv(Tp->P + Recv_offset[target], Recv_count[target] * sizeof(particle_data), MPI_BYTE, target, TAG_PDATA,
+                        Communicator, &requests[n_requests++]);
+              MPI_Irecv(Tp->PS + Recv_offset[target], Recv_count[target] * sizeof(subfind_data), MPI_BYTE, target, TAG_KEY,
+                        Communicator, &requests[n_requests++]);
+            }
+        }
+    }
+
+  for(int ngrp = 1; ngrp < (1 << PTask); ngrp++)
+    {
+      int target = CommThisTask ^ ngrp;
+
+      if(target < CommNTask)
+        {
+          if(Send_count[target] > 0)
+            {
+              MPI_Issend(partBuf + Send_offset[target], Send_count[target] * sizeof(particle_data), MPI_BYTE, target, TAG_PDATA,
+                         Communicator, &requests[n_requests++]);
+              MPI_Issend(subBuf + Send_offset[target], Send_count[target] * sizeof(subfind_data), MPI_BYTE, target, TAG_KEY,
+                         Communicator, &requests[n_requests++]);
+            }
+        }
+    }
+
+  MPI_Waitall(n_requests, requests, MPI_STATUSES_IGNORE);
+  Mem.myfree(requests);
+
+#else
+  for(int ngrp = 1; ngrp < (1 << PTask); ngrp++)
+    {
+      int target = CommThisTask ^ ngrp;
+
+      if(target < CommNTask)
+        {
+          if(Send_count[target] > 0 || Recv_count[target] > 0)
+            {
+              MPI_Sendrecv(partBuf + Send_offset[target], Send_count[target] * sizeof(particle_data), MPI_BYTE, target, TAG_PDATA,
+                           Tp->P + Recv_offset[target], Recv_count[target] * sizeof(particle_data), MPI_BYTE, target, TAG_PDATA,
+                           Communicator, MPI_STATUS_IGNORE);
+
+              MPI_Sendrecv(subBuf + Send_offset[target], Send_count[target] * sizeof(subfind_data), MPI_BYTE, target, TAG_KEY,
+                           Tp->PS + Recv_offset[target], Recv_count[target] * sizeof(subfind_data), MPI_BYTE, target, TAG_KEY,
+                           Communicator, MPI_STATUS_IGNORE);
+            }
+        }
+    }
+#endif
+
+  Tp->NumPart += nimport;
+  Mem.myfree_movable(subBuf);
+  Mem.myfree_movable(partBuf);
+
+  /* finally, establish the desired local order according to PS[].TargetIndex */
+  FoFDomain->reorder_P_PS(0, Tp->NumPart);
+
+  Mem.myfree(Recv_offset);
+  Mem.myfree(Recv_count);
+  Mem.myfree(Send_offset);
+  Mem.myfree(Send_count);
+}
+
+/* now make sure that the following classes are really instantiated, otherwise we may get a linking problem */
+#include "../data/simparticles.h"
+template class fof<simparticles>;
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES_GROUPS)
+#include "../data/lcparticles.h"
+template class fof<lcparticles>;
+#endif
+
+#endif
diff --git a/src/subfind/subfind_excursionset.cc b/src/subfind/subfind_excursionset.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a06e61f251366bbea3b4b25e3fa77bd6d11a3f99
--- /dev/null
+++ b/src/subfind/subfind_excursionset.cc
@@ -0,0 +1,1821 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  subfind_excursionset.cc
+ *
+ *  \brief main routines for processing a halo with the classic Subfind algorithm
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef SUBFIND
+#ifndef SUBFIND_HBT
+
+#include <gsl/gsl_math.h>
+#include <mpi.h>
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../fof/fof.h"
+#include "../gravtree/gravtree.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/cxxsort.h"
+#include "../sort/parallel_sort.h"
+#include "../sort/peano.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+
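+/* bit used to temporarily flag entries of the origintask/submark fields during the unbinding
+   bookkeeping */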
+#define HIGHBIT (1 << 30)
+
+template <typename partset>
+void fof<partset>::subfind_process_single_group(domain<partset> *SubDomain, domain<partset> *SingleDomain, domain_options mode, int gr)
+{
+  /* set up an inverse look-up of the position of local particles in the index list, for later use
+   */
+  for(int i = 0; i < NumPartGroup; i++)
+    Tp->PS[IndexList[i]].InvIndex = i;
+
+  /* allocate storage for an auxiliary array needed for sorting */
+  sd = (sort_density_data *)Mem.mymalloc_movable(&sd, "sd", NumPartGroup * sizeof(sort_density_data));
+
+  /* construct a tree for all particles in the halo */
+
+  FoFGravTree.treeallocate(Tp->NumPart, Tp, SubDomain);
+  FoFGravTree.treebuild(NumPartGroup, IndexList);
+
+  /* determine the radius that encloses a certain number of link particles */
+  subfind_find_linkngb(SubDomain, NumPartGroup, IndexList);
+
+  /* now determine the indices of the nearest two denser neighbours within this link region */
+
+  /* first, allocate some auxiliary arrays for storing this info */
+
+  Tp->R2Loc = (typename partset::nearest_r2_data *)Mem.mymalloc_movable(&Tp->R2Loc, "R2Loc",
+                                                                        Tp->NumPart * sizeof(typename partset::nearest_r2_data));
+
+  /* find the nearest two denser particles around each point in the group */
+  subfind_find_nearesttwo(SubDomain, NumPartGroup, IndexList);
+
+  /* create an array that we can conveniently sort according to density in group subsets */
+  for(int i = 0; i < NumPartGroup; i++)
+    {
+      if(IndexList[i] >= Tp->NumPart || IndexList[i] < 0)
+        Terminate("Really?");
+
+      sd[i].density    = Tp->PS[IndexList[i]].u.s.u.DM_Density;
+      sd[i].ngbcount   = Tp->PS[IndexList[i]].nearest.count;
+      sd[i].index      = {SubThisTask, i};
+      sd[i].ngb_index1 = Tp->PS[IndexList[i]].nearest.index[0];
+      sd[i].ngb_index2 = Tp->PS[IndexList[i]].nearest.index[1];
+#ifdef MERGERTREE
+      sd[i].PrevSizeOfSubhalo = Tp->P[IndexList[i]].PrevSizeOfSubhalo;
+#else
+      sd[i].PrevSizeOfSubhalo.set(0);
+#endif
+    }
+  Mem.myfree(Tp->R2Loc);
+
+  /* sort sd according to densities (in parallel if needed) */
+  mycxxsort_parallel(sd, sd + NumPartGroup, subfind_compare_densities, SubComm);
+
+  subfind_collective_printf("SUBFIND: root-task=%d: parallel sort of 'sd' done.\n", ThisTask);
+
+  /* can now release the tree */
+  FoFGravTree.treefree();
+
+  /* allocate and initialize distributed link list for storing subhalo connectivity */
+  SFHead    = (location *)Mem.mymalloc_movable(&SFHead, "SFHead", NumPartGroup * sizeof(location));
+  SFNext    = (location *)Mem.mymalloc_movable(&SFNext, "SFNext", NumPartGroup * sizeof(location));
+  SFTail    = (location *)Mem.mymalloc_movable(&SFTail, "SFTail", NumPartGroup * sizeof(location));
+  SFLen     = (MyLenType *)Mem.mymalloc_movable(&SFLen, "SFLen", NumPartGroup * sizeof(MyLenType));
+  SFPrevLen = (double *)Mem.mymalloc_movable(&SFPrevLen, "SFPrevLen", NumPartGroup * sizeof(double));
+
+  for(int i = 0; i < NumPartGroup; i++)
+    {
+      SFHead[i]    = {-1, -1};
+      SFNext[i]    = {-1, -1};
+      SFTail[i]    = {-1, -1};
+      SFLen[i]     = 0;
+      SFPrevLen[i] = 0;
+    }
+
+  /* allocate a list to store subhalo candidates */
+  max_coll_candidates = std::max<int>((NumPartGroup / 50), 200);
+  coll_candidates =
+      (coll_cand_dat *)Mem.mymalloc_movable(&coll_candidates, "coll_candidates", max_coll_candidates * sizeof(coll_cand_dat));
+
+  /* initialize the number of current candidates */
+  count_cand = 0;
+
+  /* get total group length */
+  long long totgrouplen;
+  sumup_large_ints(1, &NumPartGroup, &totgrouplen, SubComm);
+
+  /* determine subhalo candidates */
+  subfind_col_find_coll_candidates(totgrouplen);
+
+  /* establish total number of candidates */
+  long long totcand;
+  sumup_large_ints(1, &count_cand, &totcand, SubComm);
+
+  subfind_collective_printf("SUBFIND: root-task=%d: total number of subhalo coll_candidates=%lld\n", ThisTask, totcand);
+
+  for(int i = 0; i < NumPartGroup; i++)
+    SFTail[i] = {-1, -1};
+
+  /* default is to be not nested */
+  for(int i = 0; i < count_cand; i++)
+    coll_candidates[i].parent = 0;
+
+  int count_leaves     = 0;
+  long long nremaining = totcand;
+
+  do
+    {
+      /* Let's see which coll_candidates can be unbound independently of each other.
+       * These are the candidates that contain no other embedded candidate, which is the case for most of them.
+       */
+      double t0                          = Logs.second();
+      coll_cand_dat *tmp_coll_candidates = 0;
+      if(SubThisTask == 0)
+        tmp_coll_candidates = (coll_cand_dat *)Mem.mymalloc("tmp_coll_candidates", totcand * sizeof(coll_cand_dat));
+
+      int *countlist = (int *)Mem.mymalloc("countlist", SubNTask * sizeof(int));
+      int *offset    = (int *)Mem.mymalloc("offset", SubNTask * sizeof(int));
+
+      int count = count_cand * sizeof(coll_cand_dat);
+      MPI_Allgather(&count, 1, MPI_INT, countlist, 1, MPI_INT, SubComm);
+
+      offset[0] = 0;
+      for(int i = 1; i < SubNTask; i++)
+        offset[i] = offset[i - 1] + countlist[i - 1];
+
+      /* assemble a list of the candidates on subtask 0 */
+      MPI_Gatherv(coll_candidates, countlist[SubThisTask], MPI_BYTE, tmp_coll_candidates, countlist, offset, MPI_BYTE, 0, SubComm);
+
+      if(SubThisTask == 0)
+        {
+          for(int k = 0; k < totcand; k++)
+            {
+              tmp_coll_candidates[k].nsub  = k;
+              tmp_coll_candidates[k].subnr = k;
+            }
+
+          mycxxsort(tmp_coll_candidates, tmp_coll_candidates + totcand, subfind_compare_coll_candidates_rank);
+          for(int k = 0; k < totcand; k++)
+            {
+              if(tmp_coll_candidates[k].parent >= 0)
+                {
+                  tmp_coll_candidates[k].parent = 0;
+
+                  for(int j = k + 1; j < totcand; j++)
+                    {
+                      if(tmp_coll_candidates[j].rank > tmp_coll_candidates[k].rank + tmp_coll_candidates[k].len)
+                        break;
+
+                      if(tmp_coll_candidates[j].parent < 0) /* ignore these */
+                        continue;
+
+                      if(tmp_coll_candidates[k].rank + tmp_coll_candidates[k].len >=
+                         tmp_coll_candidates[j].rank + tmp_coll_candidates[j].len)
+                        {
+                          tmp_coll_candidates[k].parent++; /* here we count the number of enclosed subhalo candidates */
+                        }
+                      else
+                        {
+                          Terminate("k=%d|%lld has rank=%d and len=%d.  j=%d has rank=%d and len=%d\n", k, totcand,
+                                    (int)tmp_coll_candidates[k].rank, (int)tmp_coll_candidates[k].len, j,
+                                    (int)tmp_coll_candidates[j].rank, (int)tmp_coll_candidates[j].len);
+                        }
+                    }
+                }
+            }
+
+          mycxxsort(tmp_coll_candidates, tmp_coll_candidates + totcand, subfind_compare_coll_candidates_subnr);
+        }
+
+      /* send the stuff back */
+      MPI_Scatterv(tmp_coll_candidates, countlist, offset, MPI_BYTE, coll_candidates, countlist[SubThisTask], MPI_BYTE, 0, SubComm);
+
+      Mem.myfree(offset);
+      Mem.myfree(countlist);
+
+      if(SubThisTask == 0)
+        Mem.myfree(tmp_coll_candidates);
+
+      count_leaves   = 0;
+      int max_length = 0;
+      for(int i = 0; i < count_cand; i++)
+        if(coll_candidates[i].parent == 0) /* if it's not a nested one, the candidate is eligible for independent/parallel unbinding */
+          {
+            /* if it seems large (heuristic criterion), let's rather do it collectively */
+            if(coll_candidates[i].len > 0.20 * Tp->TotNumPart / NTask)
+              coll_candidates[i].parent++; /* this trick will ensure that it is not considered in this round */
+            else
+              {
+                if(coll_candidates[i].len > max_length)
+                  max_length = coll_candidates[i].len;
+
+                count_leaves++;
+              }
+          }
+
+      /* get total count of these eligible subhalos, and their maximum length */
+      MPI_Allreduce(MPI_IN_PLACE, &count_leaves, 1, MPI_INT, MPI_SUM, SubComm);
+      MPI_Allreduce(MPI_IN_PLACE, &max_length, 1, MPI_INT, MPI_MAX, SubComm);
+
+      double t1 = Logs.second();
+
+      subfind_collective_printf(
+          "SUBFIND: root-task=%d: number of subhalo coll_candidates that can be done independently=%d. (Largest size %d, finding took "
+          "%g sec)\n",
+          ThisTask, count_leaves, max_length, Logs.timediff(t0, t1));
+
+      /* if there are none left, we break and do the rest collectively */
+      if(count_leaves <= 0)
+        {
+          subfind_collective_printf("SUBFIND: root-task=%d: too few, let's do the rest of %lld collectively\n", ThisTask, nremaining);
+          break;
+        }
+
+      /* seems large, let's rather do it collectively */
+      if(max_length > 0.5 * Tp->TotNumPart / NTask)
+        {
+          subfind_collective_printf("SUBFIND: root-task=%d: too big coll_candidates, I do the rest collectively\n", ThisTask);
+          break;
+        }
+
+      nremaining -= count_leaves;
+
+      /* set default values */
+      for(int i = 0; i < Tp->NumPart; i++)
+        {
+          Tp->PS[i].u.s.origintask  = SubThisTask;
+          Tp->PS[i].u.s.originindex = i;
+
+          Tp->PS[i].TargetTask = SubThisTask;
+          Tp->PS[i].submark    = HIGHBIT;
+        }
+
+      for(int i = 0; i < NumPartGroup; i++)
+        {
+          if(SFTail[i].index >= 0) /* this means this particle is already bound to a substructure */
+            Tp->PS[IndexList[i]].u.s.origintask |= HIGHBIT;
+        }
+
+      /* we now mark the particles that are in subhalo candidates that can be processed independently in parallel */
+      int nsubs = 0;
+      for(int master = 0; master < SubNTask; master++)
+        {
+          int ncand = count_cand;
+          MPI_Bcast(&ncand, sizeof(ncand), MPI_BYTE, master, SubComm);
+
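+          /* loop over the candidates announced by this master task; for every candidate without
+             embedded daughters the master walks its distributed link list and marks the member
+             particles with the running substructure index nsubs, while all other tasks serve
+             remote link-list requests in subfind_poll_for_requests() */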
+          for(int k = 0; k < ncand; k++)
+            {
+              MyLenType len;
+              int parent;
+              if(SubThisTask == master)
+                {
+                  len    = coll_candidates[k].len;
+                  parent = coll_candidates[k].parent; /* at this point, this actually holds the count of embedded (daughter) candidates */
+                }
+
+              MPI_Bcast(&len, sizeof(len), MPI_BYTE, master, SubComm);
+              MPI_Bcast(&parent, sizeof(parent), MPI_BYTE, master, SubComm);
+              MPI_Barrier(SubComm);
+
+              if(parent == 0)
+                {
+                  if(SubThisTask != master)
+                    subfind_poll_for_requests();
+                  else
+                    {
+                      location p = coll_candidates[k].head;
+                      for(MyLenType i = 0; i < coll_candidates[k].len; i++)
+                        {
+                          subfind_distlinklist_mark_particle(p, master, nsubs);
+
+                          if(p.index < 0)
+                            Terminate("Bummer\n");
+
+                          p = subfind_distlinklist_get_next(p);
+                        }
+
+                      /* now tell the others to stop polling */
+                      for(int i = 0; i < SubNTask; i++)
+                        if(i != SubThisTask)
+                          MPI_Send(&i, 1, MPI_INT, i, TAG_POLLING_DONE, SubComm);
+                    }
+
+                  MPI_Barrier(SubComm);
+                }
+
+              nsubs++;
+            }
+        }
+
+      if(mode == COLL_SUBFIND)
+        {
+          /* this will make sure that the particles are grouped by submark on the target task */
+          for(int i = 0; i < Tp->NumPart; i++)
+            Tp->PS[i].TargetIndex = Tp->PS[i].submark;
+
+          /* assemble the particles on individual processors (note: IndexList[] becomes temporarily meaningless)  */
+          subfind_distribute_particles(SubComm);
+
+          PPS = (PPS_data *)Mem.mymalloc("PPS", Tp->NumPart * sizeof(PPS_data));
+
+          for(int i = 0; i < Tp->NumPart; i++)
+            PPS[i].index = i;
+        }
+      else
+        {
+          PPS = (PPS_data *)Mem.mymalloc("PPS", Tp->NumPart * sizeof(PPS_data));
+
+          for(int i = 0; i < Tp->NumPart; i++)
+            {
+              PPS[i].submark = Tp->PS[i].submark;
+              PPS[i].index   = i;
+            }
+
+          mycxxsort(PPS, PPS + Tp->NumPart, subfind_compare_PPS);
+        }
+
+      MPI_Barrier(SubComm);
+      double ta = Logs.second();
+
+      subfind_unbind_independent_ones(SingleDomain, count_cand);
+
+      MPI_Barrier(SubComm);
+      double tb = Logs.second();
+
+      Mem.myfree(PPS);
+
+      subfind_collective_printf("SUBFIND: root-task=%d: unbinding of independent ones took %g sec\n", ThisTask, Logs.timediff(ta, tb));
+
+      if(mode == COLL_SUBFIND)
+        {
+          for(int i = 0; i < Tp->NumPart; i++)
+            {
+              Tp->PS[i].u.s.origintask &= (HIGHBIT - 1); /* clear high bit if set */
+              Tp->PS[i].TargetTask  = Tp->PS[i].u.s.origintask;
+              Tp->PS[i].TargetIndex = Tp->PS[i].u.s.originindex;
+            }
+
+          t0 = Logs.second();
+          subfind_distribute_particles(SubComm); /* bring them back to their original processor */
+          t1 = Logs.second();
+
+          subfind_collective_printf("SUBFIND: root-task=%d: bringing the independent ones back took %g sec\n", ThisTask,
+                                    Logs.timediff(t0, t1));
+
+          /* Since we reestablished the original order, we can use IndexList[] again */
+        }
+
+      /* now mark the bound particles */
+      for(int i = 0; i < NumPartGroup; i++)
+        if(Tp->PS[IndexList[i]].submark >= 0 && Tp->PS[IndexList[i]].submark < nsubs)
+          SFTail[i].index = Tp->PS[IndexList[i]].submark; /* we use this to flag bound parts of substructures */
+
+      for(int i = 0; i < count_cand; i++)
+        if(coll_candidates[i].parent == 0)
+          coll_candidates[i].parent = -1;
+    }
+  while(count_leaves > 0);
+
+  /*
+   * Now we do the unbinding of the subhalo candidates that contain other subhalo candidates.
+   * This will be done with several CPUs if needed.
+   */
+
+  double t0 = Logs.second();
+
+  for(int master = 0, nr = 0; master < SubNTask; master++)
+    {
+      int ncand = count_cand;
+      MPI_Bcast(&ncand, sizeof(ncand), MPI_BYTE, master, SubComm);
+
+      for(int k = 0; k < ncand; k++)
+        {
+          MyLenType len;
+          int parent, nsubs;
+          if(SubThisTask == master)
+            {
+              len    = coll_candidates[k].len;
+              nsubs  = coll_candidates[k].nsub;
+              parent = coll_candidates[k].parent; /* at this point, this actually holds the count of embedded (daughter) candidates */
+            }
+
+          MPI_Bcast(&parent, sizeof(parent), MPI_BYTE, master, SubComm);
+          MPI_Barrier(SubComm);
+
+          if(parent >= 0)
+            {
+              MPI_Bcast(&len, sizeof(len), MPI_BYTE, master, SubComm);
+              MPI_Bcast(&nsubs, sizeof(nsubs), MPI_BYTE, master, SubComm);
+
+              subfind_collective_printf("SUBFIND: root-task=%d: collective unbinding of nr=%d (%lld) of length=%d\n", ThisTask, nr,
+                                        nremaining, (int)len);
+
+              nr++;
+
+              LocalLen = 0;
+
+              double tt0 = Logs.second();
+
+              unbind_list = (int *)Mem.mymalloc_movable(&unbind_list, "unbind_list", NumPartGroup * sizeof(int));
+
+              if(SubThisTask != master)
+                subfind_poll_for_requests();
+              else
+                {
+                  location p = coll_candidates[k].head;
+                  for(int i = 0; i < coll_candidates[k].len; i++)
+                    {
+                      if(p.index < 0)
+                        Terminate("Bummer i=%d \n", i);
+
+                      subfind_distlinklist_add_particle(p);
+
+                      p = subfind_distlinklist_get_next(p);
+                    }
+
+                  /* now tell the others to stop polling */
+                  for(int i = 0; i < SubNTask; i++)
+                    if(i != SubThisTask)
+                      MPI_Send(&i, 1, MPI_INT, i, TAG_POLLING_DONE, SubComm);
+                }
+
+              if(LocalLen > NumPartGroup)
+                Terminate("LocalLen=%d  > NumPartGroup=%d", LocalLen, NumPartGroup);
+
+              /* rewrite list of group indices to particle indices */
+              for(int i = 0; i < LocalLen; i++)
+                {
+                  unbind_list[i] = IndexList[unbind_list[i]];
+                  if(unbind_list[i] < 0 || unbind_list[i] >= Tp->NumPart)
+                    Terminate("bad!  unbind_list[i]=%d\n", unbind_list[i]);
+                }
+
+              /* mark the ones to be unbound in PS[] */
+              for(int i = 0; i < Tp->NumPart; i++)
+                {
+                  Tp->PS[i].u.s.origintask  = SubThisTask;
+                  Tp->PS[i].u.s.originindex = i;
+                  Tp->PS[i].DomainFlag      = 0;
+                }
+
+              for(int i = 0; i < LocalLen; i++)
+                Tp->PS[unbind_list[i]].DomainFlag = 1;
+
+              Mem.myfree(unbind_list);
+
+              domain<partset> SubUnbindDomain{SubComm, Tp};
+
+              if(SubUnbindDomain.NumNodes != 0)
+                Terminate("SubDomain.NumNodes=%d\n", SubUnbindDomain.NumNodes);
+
+              SubUnbindDomain.domain_decomposition(mode);
+
+              if(mode == COLL_SUBFIND)
+                subfind_distribute_particles(SubComm);
+
+              /* count the number of particles that will go into unbind_list */
+              LocalLen = 0;
+              for(int i = 0; i < Tp->NumPart; i++)
+                if(Tp->PS[i].DomainFlag)
+                  LocalLen++;
+
+              unbind_list = (int *)Mem.mymalloc_movable(&unbind_list, "unbind_list", LocalLen * sizeof(int));
+
+              /* refill unbind_list */
+              LocalLen = 0;
+              for(int i = 0; i < Tp->NumPart; i++)
+                if(Tp->PS[i].DomainFlag)
+                  unbind_list[LocalLen++] = i;
+
+              LocalLen = subfind_unbind(&SubUnbindDomain, SubComm, unbind_list, LocalLen);
+
+              for(int i = 0; i < Tp->NumPart; i++)
+                {
+                  Tp->PS[i].DomainFlag  = 0;
+                  Tp->PS[i].TargetTask  = Tp->PS[i].u.s.origintask;
+                  Tp->PS[i].TargetIndex = Tp->PS[i].u.s.originindex;
+                }
+
+              for(int i = 0; i < LocalLen; i++)
+                Tp->PS[unbind_list[i]].DomainFlag = 1;
+
+              Mem.myfree(unbind_list);
+
+              SubUnbindDomain.domain_free();
+
+              double ta = Logs.second();
+              subfind_distribute_particles(SubComm); /* bring them back to their original processor */
+              double tb = Logs.second();
+
+              unbind_list = (int *)Mem.mymalloc_movable(&unbind_list, "unbind_list", NumPartGroup * sizeof(int));
+
+              /* refill unbind_list */
+              LocalLen = 0;
+              for(int i = 0; i < Tp->NumPart; i++)
+                if(Tp->PS[i].DomainFlag)
+                  {
+                    if(LocalLen >= NumPartGroup)
+                      Terminate("LocalLen=%d >= NumPartGroup=%d", LocalLen, NumPartGroup);
+                    unbind_list[LocalLen++] = i;
+                  }
+
+              subfind_collective_printf("SUBFIND: root-task=%d: bringing the collective ones back took %g sec\n", ThisTask,
+                                        Logs.timediff(ta, tb));
+
+              /* go from particle indices back to group indices */
+              for(int i = 0; i < LocalLen; i++)
+                {
+                  unbind_list[i] = Tp->PS[unbind_list[i]].InvIndex;
+                  if(unbind_list[i] < 0 || unbind_list[i] >= NumPartGroup)
+                    Terminate("ups, bad!  unbind_list[i]=%d   NumPartGroup=%d\n", unbind_list[i], NumPartGroup);
+                }
+
+              double tt1 = Logs.second();
+
+              int oldlen = len;
+
+              MPI_Allreduce(&LocalLen, &len, 1, MPI_INT, MPI_SUM, SubComm);
+
+              subfind_collective_printf(
+                  "SUBFIND: root-task=%d: collective unbinding of nr=%d (%lld) of length=%d, bound length=%d    took %g sec\n",
+                  ThisTask, nr - 1, nremaining, oldlen, (int)len, Logs.timediff(tt0, tt1));
+
+              if(len >= All.DesLinkNgb)
+                {
+                  /* ok, we found a substructure */
+                  for(int i = 0; i < LocalLen; i++)
+                    SFTail[unbind_list[i]].index = nsubs; /* we use this to flag the substructures */
+
+                  if(SubThisTask == master)
+                    coll_candidates[k].bound_length = len;
+                }
+              else
+                {
+                  /* bound particle count too low or zero */
+                  if(SubThisTask == master)
+                    coll_candidates[k].bound_length = 0;
+                }
+
+              Mem.myfree(unbind_list);
+            }
+        }
+    }
+  double t1 = Logs.second();
+
+  subfind_collective_printf("SUBFIND: root-task=%d: the collective unbinding of remaining halos took %g sec\n", ThisTask,
+                            Logs.timediff(t0, t1));
+
+  /* get the total substructure count */
+  int countall = 0;
+  for(int k = 0; k < count_cand; k++)
+    if(coll_candidates[k].bound_length >= All.DesLinkNgb)
+      {
+        if(coll_candidates[k].len < All.DesLinkNgb)
+          Terminate("coll_candidates[k=%d].len=%lld bound=%lld\n", k, (long long)coll_candidates[k].len,
+                    (long long)coll_candidates[k].bound_length);
+
+        countall++;
+      }
+
+  MPI_Allreduce(MPI_IN_PLACE, &countall, 1, MPI_INT, MPI_SUM, SubComm);
+
+  subfind_collective_printf("SUBFIND: root-task=%d: found %d bound substructures in FoF group of length %lld\n", ThisTask, countall,
+                            totgrouplen);
+
+  /* now determine the parent subhalo for each candidate */
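+  /* The candidates are first sorted by their bound length, gathered on the root of SubComm, and
+   * assigned a running subhalo number. After re-sorting them by the rank of their head, the nesting
+   * can be read off from the rank intervals: a candidate whose interval [rank, rank + len) is fully
+   * contained in that of a bound candidate gets the latter recorded as its parent, while partially
+   * overlapping intervals indicate an inconsistency and abort the run. */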
+  t0 = Logs.second();
+  mycxxsort_parallel(coll_candidates, coll_candidates + count_cand, subfind_compare_coll_candidates_boundlength, SubComm);
+
+  coll_cand_dat *tmp_coll_candidates = 0;
+
+  if(SubThisTask == 0)
+    tmp_coll_candidates = (coll_cand_dat *)Mem.mymalloc("tmp_coll_candidates", totcand * sizeof(coll_cand_dat));
+
+  int *countlist = (int *)Mem.mymalloc("countlist", SubNTask * sizeof(int));
+  int *offset    = (int *)Mem.mymalloc("offset", SubNTask * sizeof(int));
+
+  int count_size = count_cand * sizeof(coll_cand_dat);
+  MPI_Allgather(&count_size, 1, MPI_INT, countlist, 1, MPI_INT, SubComm);
+
+  offset[0] = 0;
+  for(int i = 1; i < SubNTask; i++)
+    offset[i] = offset[i - 1] + countlist[i - 1];
+
+  MPI_Gatherv(coll_candidates, countlist[SubThisTask], MPI_BYTE, tmp_coll_candidates, countlist, offset, MPI_BYTE, 0, SubComm);
+
+  if(SubThisTask == 0)
+    {
+      for(int k = 0; k < totcand; k++)
+        {
+          tmp_coll_candidates[k].subnr  = k;
+          tmp_coll_candidates[k].parent = 0;
+        }
+
+      mycxxsort(tmp_coll_candidates, tmp_coll_candidates + totcand, subfind_compare_coll_candidates_rank);
+
+      for(int k = 0; k < totcand; k++)
+        {
+          for(int j = k + 1; j < totcand; j++)
+            {
+              if(tmp_coll_candidates[j].rank > tmp_coll_candidates[k].rank + tmp_coll_candidates[k].len)
+                break;
+
+              if(tmp_coll_candidates[k].rank + tmp_coll_candidates[k].len >= tmp_coll_candidates[j].rank + tmp_coll_candidates[j].len)
+                {
+                  if(tmp_coll_candidates[k].bound_length >= All.DesLinkNgb)
+                    tmp_coll_candidates[j].parent = tmp_coll_candidates[k].subnr;
+                }
+              else
+                {
+                  Terminate("k=%d|%d has rank=%d, len=%d, bound=%d.  j has rank=%d, len=%d, bound=%d\n", k, countall,
+                            (int)tmp_coll_candidates[k].rank, (int)tmp_coll_candidates[k].len,
+                            (int)tmp_coll_candidates[k].bound_length, (int)tmp_coll_candidates[j].rank,
+                            (int)tmp_coll_candidates[j].len, (int)tmp_coll_candidates[j].bound_length);
+                }
+            }
+        }
+
+      mycxxsort(tmp_coll_candidates, tmp_coll_candidates + totcand, subfind_compare_coll_candidates_subnr);
+    }
+
+  MPI_Scatterv(tmp_coll_candidates, countlist, offset, MPI_BYTE, coll_candidates, countlist[SubThisTask], MPI_BYTE, 0, SubComm);
+
+  Mem.myfree(offset);
+  Mem.myfree(countlist);
+
+  if(SubThisTask == 0)
+    Mem.myfree(tmp_coll_candidates);
+
+  t1 = Logs.second();
+
+  subfind_collective_printf("SUBFIND: root-task=%d: determination of parent subhalo took %g sec (presently allocated %g MB)\n",
+                            ThisTask, Logs.timediff(t0, t1), Mem.getAllocatedBytesInMB());
+
+  /* Now let's save some properties of the substructures */
+  if(SubThisTask == 0)
+    Group[gr].Nsubs = countall;
+
+  t0 = Logs.second();
+
+  unbind_list = (int *)Mem.mymalloc_movable(&unbind_list, "unbind_list", NumPartGroup * sizeof(int));
+
+  for(int master = 0, subnr = 0; master < SubNTask; master++)
+    {
+      int ncand = count_cand;
+      MPI_Bcast(&ncand, sizeof(ncand), MPI_BYTE, master, SubComm);
+
+      for(int k = 0; k < ncand; k++)
+        {
+          MyLenType len;
+          int parent, nsubs;
+          if(SubThisTask == master)
+            {
+              len    = coll_candidates[k].bound_length;
+              nsubs  = coll_candidates[k].nsub;
+              parent = coll_candidates[k].parent;
+            }
+
+          MPI_Bcast(&len, sizeof(len), MPI_BYTE, master, SubComm);
+          MPI_Barrier(SubComm);
+
+          if(len > 0)
+            {
+              MPI_Bcast(&nsubs, sizeof(nsubs), MPI_BYTE, master, SubComm);
+              MPI_Bcast(&parent, sizeof(parent), MPI_BYTE, master, SubComm);
+
+              LocalLen = 0;
+
+              if(SubThisTask != master)
+                subfind_poll_for_requests();
+              else
+                {
+                  location p = coll_candidates[k].head;
+                  for(MyLenType i = 0; i < coll_candidates[k].len; i++)
+                    {
+                      subfind_distlinklist_add_bound_particles(p, nsubs);
+                      p = subfind_distlinklist_get_next(p);
+                    }
+
+                  /* now tell the others to stop polling */
+                  for(int i = 0; i < SubNTask; i++)
+                    if(i != SubThisTask)
+                      MPI_Send(&i, 1, MPI_INT, i, TAG_POLLING_DONE, SubComm);
+                }
+
+              int max_nsubhalos;
+              MPI_Allreduce(&Nsubhalos, &max_nsubhalos, 1, MPI_INT, MPI_MAX, SubComm);
+
+              if(max_nsubhalos >= MaxNsubhalos)
+                {
+                  if(ThisTask == 0)
+                    warn("Nsubhalos=%d >= MaxNsubhalos=%d", max_nsubhalos, MaxNsubhalos);
+
+                  MaxNsubhalos = 1.25 * max_nsubhalos;
+
+                  Subhalo = (subhalo_properties *)Mem.myrealloc_movable(Subhalo, MaxNsubhalos * sizeof(subhalo_properties));
+                }
+
+              for(int i = 0; i < LocalLen; i++)
+                {
+                  unbind_list[i] = IndexList[unbind_list[i]]; /* move to particle index list */
+
+                  if(unbind_list[i] < 0 || unbind_list[i] >= Tp->NumPart)
+                    Terminate("particle index out of range: unbind_list[i]=%d  NumPart=%d", unbind_list[i], Tp->NumPart);
+                }
+
+              int marked = subfind_determine_sub_halo_properties(unbind_list, LocalLen, &Subhalo[Nsubhalos], SubComm);
+
+              for(int i = 0; i < LocalLen; i++)
+                {
+                  unbind_list[i] = Tp->PS[unbind_list[i]].InvIndex; /* move back to group index list */
+
+                  if(unbind_list[i] < 0 || unbind_list[i] >= NumPartGroup)
+                    Terminate("group index out of range: unbind_list[i]=%d  NumPartGroup=%d", unbind_list[i], NumPartGroup);
+                }
+
+              MPI_Allreduce(MPI_IN_PLACE, &marked, 1, MPI_INT, MPI_SUM, SubComm);
+
+              if(SubThisTask == 0)
+                {
+                  if(subnr == 0)
+                    {
+                      for(int j = 0; j < 3; j++)
+                        {
+                          Group[gr].Pos[j]    = Subhalo[Nsubhalos].Pos[j];
+                          Group[gr].IntPos[j] = Subhalo[Nsubhalos].IntPos[j];
+                        }
+                    }
+
+#if defined(SUBFIND_ORPHAN_TREATMENT)
+                  Group[gr].LenPrevMostBnd += marked;
+#endif
+
+                  Subhalo[Nsubhalos].GroupNr       = GroupNr;
+                  Subhalo[Nsubhalos].SubRankInGr   = subnr;
+                  Subhalo[Nsubhalos].SubParentRank = parent;
+
+                  Nsubhalos++;
+                }
+
+              /* Let's now assign the subhalo number within the group */
+              for(int i = 0; i < LocalLen; i++)
+                {
+                  Tp->PS[IndexList[unbind_list[i]]].SubRankInGr = subnr;
+#if defined(MERGERTREE)
+                  Tp->PS[IndexList[unbind_list[i]]].SizeOfSubhalo.set(len);
+#endif
+                }
+
+              subnr++;
+            }
+        }
+    }
+
+  subfind_collective_printf("SUBFIND: root-task=%d: determining substructure properties done\n", ThisTask);
+
+  Mem.myfree(unbind_list);
+  Mem.myfree(coll_candidates);
+  Mem.myfree(SFPrevLen);
+  Mem.myfree(SFLen);
+  Mem.myfree(SFTail);
+  Mem.myfree(SFNext);
+  Mem.myfree(SFHead);
+  Mem.myfree(sd);
+}
+
+/* This function finds the subhalo candidates (i.e. locally overdense structures bounded by a saddle point).
+ * They can be nested inside each other, and will later be subjected to an unbinding procedure.
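+ * The particles of the group are processed in order of decreasing density (the sd[] ordering): a
+ * particle either is a lonely density maximum and starts a new structure, gets attached to exactly
+ * one existing structure, or merges two structures at a saddle point. In the latter case, provided
+ * both structures are large enough, the one that is not chosen as the main trunk is registered as a
+ * subhalo candidate before the two are joined.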
+ */
+template <typename partset>
+void fof<partset>::subfind_col_find_coll_candidates(long long totgrouplen)
+{
+  subfind_collective_printf("SUBFIND: root-task=%d: building distributed linked list. (presently allocated %g MB)\n", ThisTask,
+                            Mem.getAllocatedBytesInMB());
+
+  double t0 = Logs.second();
+
+  /* Now find the subhalo candidates by building up-link lists for them.
+   * We go through the processors in the group in sequence, starting with the one holding the densest particles according to the
+   * sd[] array. In each iteration, the processor we currently deal with is called the 'master', while the others listen for
+   * incoming requests for information.
+   */
+  for(int master = 0; master < SubNTask; master++)
+    {
+      double tt0 = Logs.second();
+      if(SubThisTask != master)
+        subfind_poll_for_requests(); /* if we are not the master task, we react to incoming requests for information */
+      else
+        {
+          /* we go through the sd[] indices stored on the master task, which means we start with the densest particle */
+          for(int k = 0; k < NumPartGroup; k++)
+            {
+              int ngbcount        = sd[k].ngbcount;
+              location ngb_index1 = sd[k].ngb_index1;
+              location ngb_index2 = sd[k].ngb_index2;
+
+              switch(ngbcount)
+                /* treat the different possible cases */
+                {
+                  case 0: /* this appears to be a lonely maximum -> new group */
+                    subfind_distlinklist_set_all(sd[k].index, sd[k].index, sd[k].index, 1, {-1, -1}, sd[k].PrevSizeOfSubhalo);
+                    break;
+
+                  case 1: /* the particle is attached to exactly one group */
+                    {
+                      if(ngb_index1.task < 0 || ngb_index1.task >= SubNTask)
+                        Terminate("ngb_index1.task=%d  SubNTask=%d", ngb_index1.task, SubNTask);
+
+                      location head = subfind_distlinklist_get_head(ngb_index1);
+
+                      if(head.index == -1)
+                        Terminate("We have a problem!  head=%d for k=%d on task=%d\n", head.index, k, SubThisTask);
+
+                      location tail;
+                      int retcode =
+                          subfind_distlinklist_get_tail_set_tail_increaselen(head, tail, sd[k].index, sd[k].PrevSizeOfSubhalo);
+
+                      if(!(retcode & 1))
+                        subfind_distlinklist_set_headandnext(sd[k].index, head, {-1, -1});
+                      if(!(retcode & 2))
+                        subfind_distlinklist_set_next(tail, sd[k].index);
+                    }
+                    break;
+
+                  case 2: /* the particle merges two groups together */
+                    {
+                      location head, head_attach;
+
+                      if(ngb_index1.task < 0 || ngb_index1.task >= SubNTask)
+                        Terminate("ngb_index1.task=%d  SubNTask=%d", ngb_index1.task, SubNTask);
+
+                      if(ngb_index2.task < 0 || ngb_index2.task >= SubNTask)
+                        Terminate("ngb_index2.task=%d  SubNTask=%d", ngb_index2.task, SubNTask);
+
+                      if(ngb_index1.task == ngb_index2.task)
+                        {
+                          subfind_distlinklist_get_two_heads(ngb_index1, ngb_index2, head, head_attach);
+                        }
+                      else
+                        {
+                          head        = subfind_distlinklist_get_head(ngb_index1);
+                          head_attach = subfind_distlinklist_get_head(ngb_index2);
+                        }
+
+                      if(head.index == -1 || head_attach.index == -1)
+                        Terminate("We have a problem!  head=%d/%d head_attach=%d/%d for k=%d on task=%d\n", head.task, head.index,
+                                  head_attach.task, head_attach.index, k, SubThisTask);
+
+                      if(head != head_attach)
+                        {
+                          location tail, tail_attach;
+                          MyLenType len, len_attach;
+                          double prevlen, prevlen_attach;
+
+                          subfind_distlinklist_get_tailandlen(head, tail, len, prevlen);
+                          subfind_distlinklist_get_tailandlen(head_attach, tail_attach, len_attach, prevlen_attach);
+
+                          bool swap_len     = false;
+                          bool swap_prevlen = false;
+
+                          if(len_attach > len || (len_attach == len && head_attach < head))
+                            swap_len = true;
+
+                          if(prevlen > 0 && prevlen_attach > 0 && len >= All.DesLinkNgb && len_attach >= All.DesLinkNgb)
+                            {
+                              if(prevlen_attach > prevlen || (prevlen_attach == prevlen && swap_len == true))
+                                swap_prevlen = true;
+                            }
+                          else
+                            swap_prevlen = swap_len;
+
+                          /* if other group is longer, swap */
+                          if(swap_prevlen)
+                            {
+                              location tmp = head;
+                              head         = head_attach;
+                              head_attach  = tmp;
+
+                              tmp         = tail;
+                              tail        = tail_attach;
+                              tail_attach = tmp;
+
+                              MyLenType tmplen = len;
+                              len              = len_attach;
+                              len_attach       = tmplen;
+
+                              double tmpprevlen = prevlen;
+                              prevlen           = prevlen_attach;
+                              prevlen_attach    = tmpprevlen;
+                            }
+
+                          /* only if the attached group is long enough do we bother to register it
+                             as a subhalo candidate */
+
+                          if(len_attach >= All.DesLinkNgb && len >= All.DesLinkNgb)
+                            {
+                              count_decisions++;
+
+                              if(swap_prevlen != swap_len)
+                                {
+                                  printf(
+                                      "SUBFIND: TASK=%d:  made a different main trunk decision due to previous length: prevlen/len=%g "
+                                      "prevlen_attach/len_attach=%g   len=%g len_attach=%g\n",
+                                      ThisTask, prevlen / len, prevlen_attach / len_attach, (double)len, (double)len_attach);
+                                  fflush(stdout);
+                                  count_different_decisions++;
+                                }
+
+                              if(count_cand < max_coll_candidates)
+                                {
+                                  coll_candidates[count_cand].len  = len_attach;
+                                  coll_candidates[count_cand].head = head_attach;
+                                  count_cand++;
+                                }
+                              else
+                                Terminate("Task %d: count=%d, max=%d, npartgroup=%d\n", SubThisTask, count_cand, max_coll_candidates,
+                                          NumPartGroup);
+                            }
+
+                          /* now join the two groups */
+                          subfind_distlinklist_set_tailandlen(head, tail_attach, len + len_attach, prevlen + prevlen_attach);
+                          subfind_distlinklist_set_next(tail, head_attach);
+
+                          location ss = head_attach;
+                          do
+                            {
+                              ss = subfind_distlinklist_set_head_get_next(ss, head);
+                            }
+                          while(ss.index >= 0);
+                        }
+
+                      /* finally, attach the particle to 'head' */
+                      location tail;
+                      int retcode =
+                          subfind_distlinklist_get_tail_set_tail_increaselen(head, tail, sd[k].index, sd[k].PrevSizeOfSubhalo);
+
+                      if(!(retcode & 1))
+                        subfind_distlinklist_set_headandnext(sd[k].index, head, {-1, -1});
+                      if(!(retcode & 2))
+                        subfind_distlinklist_set_next(tail, sd[k].index);
+                    }
+                    break;
+                }
+            }
+
+          myflush(stdout);
+
+          /* now tell the others to stop polling */
+          for(int k = 0; k < SubNTask; k++)
+            if(k != SubThisTask)
+              MPI_Send(&k, 1, MPI_INT, k, TAG_POLLING_DONE, SubComm);
+        }
+
+      MPI_Barrier(SubComm);
+      double tt1 = Logs.second();
+
+      subfind_collective_printf("SUBFIND: root-task=%d: master=%d/%d took %g sec\n", ThisTask, master, SubNTask, Logs.timediff(tt0, tt1));
+    }
+  double t1 = Logs.second();
+
+  subfind_collective_printf("SUBFIND: root-task=%d: identification of primary coll_candidates took %g sec\n", ThisTask,
+                            Logs.timediff(t0, t1));
+
+  /* Add the full group as the final subhalo candidate.
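+   * To this end, the disjoint chains built above are concatenated into a single list spanning the
+   * whole group: every chain whose tail has no successor is appended at the tail of the previously
+   * processed chain. The head of the resulting combined chain is broadcast to all tasks, and the
+   * last task registers the full group (the 'background' candidate) with length totgrouplen.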
+   */
+  location head = {-1, -1};
+  location prev = {-1, -1};
+  for(int master = 0; master < SubNTask; master++)
+    {
+      if(SubThisTask != master)
+        subfind_poll_for_requests();
+      else
+        {
+          for(int i = 0; i < NumPartGroup; i++)
+            {
+              location index = {SubThisTask, i};
+
+              if(SFHead[i] == index)
+                {
+                  location tail;
+                  MyLenType len;
+                  double prevlen;
+                  subfind_distlinklist_get_tailandlen(SFHead[i], tail, len, prevlen);
+                  location next = subfind_distlinklist_get_next(tail);
+
+                  if(next.index == -1)
+                    {
+                      if(prev.index < 0)
+                        head = index;
+
+                      if(prev.index >= 0)
+                        subfind_distlinklist_set_next(prev, index);
+
+                      prev = tail;
+                    }
+                }
+            }
+
+          /* now tell the others to stop polling */
+          for(int k = 0; k < SubNTask; k++)
+            if(k != SubThisTask)
+              MPI_Send(&k, 1, MPI_INT, k, TAG_POLLING_DONE, SubComm);
+        }
+
+      MPI_Barrier(SubComm);
+      MPI_Bcast(&head, sizeof(head), MPI_BYTE, master, SubComm);
+      MPI_Bcast(&prev, sizeof(prev), MPI_BYTE, master, SubComm);
+    }
+
+  if(SubThisTask == SubNTask - 1)
+    {
+      if(count_cand < max_coll_candidates)
+        {
+          coll_candidates[count_cand].len  = totgrouplen;
+          coll_candidates[count_cand].head = head;
+          count_cand++;
+        }
+      else
+        Terminate("count_cand=%d >= max_coll_candidates=%d", count_cand, max_coll_candidates);
+    }
+
+  subfind_collective_printf("SUBFIND: root-task=%d: adding background as candidate\n", ThisTask);
+
+  /* go through the whole chain once to establish a rank order. For the rank we use SFLen[]
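+   * The rank is simply the position of a particle along the combined chain. Since each candidate
+   * occupies a contiguous section of this chain, the rank of its head together with its length
+   * defines the interval that is used later on to work out the parent-child nesting of the
+   * candidates.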
+   */
+  double ta = Logs.second();
+
+  int master = head.task;
+
+  if(master < 0 || master >= SubNTask)
+    Terminate("master=%d  SubNTask=%d\n", master, SubNTask);
+
+  if(SubThisTask != master)
+    subfind_poll_for_requests();
+  else
+    {
+      location p     = head;
+      MyLenType rank = 0;
+
+      while(p.index >= 0)
+        {
+          p = subfind_distlinklist_setrank_and_get_next(p, rank);
+        }
+
+      /* now tell the others to stop polling */
+      for(int i = 0; i < SubNTask; i++)
+        if(i != master)
+          MPI_Send(&i, 1, MPI_INT, i, TAG_POLLING_DONE, SubComm);
+    }
+
+  MPI_Barrier(SubComm);
+
+  /* for each candidate, we now pull out the rank of its head */
+  for(int master = 0; master < SubNTask; master++)
+    {
+      if(SubThisTask != master)
+        subfind_poll_for_requests();
+      else
+        {
+          for(int k = 0; k < count_cand; k++)
+            coll_candidates[k].rank = subfind_distlinklist_get_rank(coll_candidates[k].head);
+
+          /* now tell the others to stop polling */
+          for(int i = 0; i < SubNTask; i++)
+            if(i != SubThisTask)
+              MPI_Send(&i, 1, MPI_INT, i, TAG_POLLING_DONE, SubComm);
+        }
+    }
+  MPI_Barrier(SubComm);
+
+  double tb = Logs.second();
+
+  subfind_collective_printf(
+      "SUBFIND: root-task=%d: establishing of rank order took %g sec (grouplen=%lld) presently allocated %g MB\n", ThisTask,
+      Logs.timediff(ta, tb), totgrouplen, Mem.getAllocatedBytesInMB());
+}
+
+template <typename partset>
+void fof<partset>::subfind_unbind_independent_ones(domain<partset> *SingleDomain, int count_cand_l)
+{
+  unbind_list = (int *)Mem.mymalloc("unbind_list", Tp->NumPart * sizeof(int));
+
+  mycxxsort(coll_candidates, coll_candidates + count_cand_l, subfind_compare_coll_candidates_nsubs);
+
+  for(int k = 0, ii = 0; k < count_cand_l; k++)
+    if(coll_candidates[k].parent == 0)
+      {
+        int i = PPS[ii].index;
+
+        while(Tp->PS[i].submark < coll_candidates[k].nsub)
+          {
+            ii++;
+            i = PPS[ii].index;
+
+            if(i >= Tp->NumPart)
+              Terminate("i >= NumPart");
+          }
+
+        if(Tp->PS[i].submark >= 0 && Tp->PS[i].submark < HIGHBIT)
+          {
+            int len   = 0;
+            int nsubs = Tp->PS[i].submark;
+
+            if(nsubs != coll_candidates[k].nsub)
+              Terminate("TASK=%d i=%d k=%d nsubs=%d coll_candidates[k].nsub=%d\n", SubThisTask, i, k, nsubs, coll_candidates[k].nsub);
+
+            while(i < Tp->NumPart)
+              {
+                if(Tp->PS[i].submark == nsubs)
+                  {
+                    Tp->PS[i].submark = HIGHBIT;
+                    if((Tp->PS[i].u.s.origintask & HIGHBIT) == 0)
+                      {
+                        unbind_list[len] = i;
+                        len++;
+                      }
+                    ii++;
+                    i = PPS[ii].index;
+                  }
+                else
+                  break;
+              }
+
+            /* call the serial unbind function */
+            len = subfind_unbind(SingleDomain, SingleDomain->Communicator, unbind_list, len);
+
+            if(len >= All.DesLinkNgb)
+              {
+                /* ok, we found a substructure */
+                coll_candidates[k].bound_length = len;
+
+                for(int j = 0; j < len; j++)
+                  Tp->PS[unbind_list[j]].submark = nsubs; /* we use this to flag the substructures */
+              }
+            else
+              coll_candidates[k].bound_length = 0;
+          }
+      }
+
+  Mem.myfree(unbind_list);
+}
+
+struct loc_compound0
+{
+  int index;
+  location loc;
+  approxlen prevlen;
+};
+
+struct loc_compound1
+{
+  int index;
+  location loc;
+};
+
+struct loc_compound2
+{
+  location loc;
+  MyLenType len;
+  double prevlen;
+};
+
+struct loc_compound3
+{
+  int index;
+  MyLenType len;
+  location tail;
+  double prevlen;
+};
+
+struct loc_compound4
+{
+  int index;
+  location head;
+  location next;
+};
+
+struct loc_compound5
+{
+  int index;
+  MyLenType len;
+  location head;
+  location tail;
+  location next;
+  approxlen prevlen;
+};
+
+struct loc_compound6
+{
+  location loc;
+  MyLenType len;
+};
+
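+/* The routines below implement a simple request/response protocol for the distributed linked list
+ * (SFHead, SFNext, SFTail, SFLen, SFPrevLen): while one task walks or modifies the list, all other
+ * tasks sit in subfind_poll_for_requests() and service incoming point-to-point messages, dispatched
+ * by their MPI tag, until they receive TAG_POLLING_DONE. The loc_compound* structures above are the
+ * message payloads of the individual request types. */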
+template <typename partset>
+void fof<partset>::subfind_poll_for_requests(void)
+{
+  int tag;
+  do
+    {
+      MPI_Status status;
+      MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, SubComm, &status);
+
+      int source = status.MPI_SOURCE;
+      tag        = status.MPI_TAG;
+
+      /* MPI_Get_count(&status, MPI_BYTE, &count); */
+      switch(tag)
+        {
+          case TAG_GET_TWOHEADS:
+            {
+              int ibuf[2];
+              MPI_Recv(ibuf, 2, MPI_INT, source, TAG_GET_TWOHEADS, SubComm, MPI_STATUS_IGNORE);
+              location buf[2];
+              buf[0] = SFHead[ibuf[0]];
+              buf[1] = SFHead[ibuf[1]];
+              MPI_Send(buf, 2 * sizeof(location), MPI_BYTE, source, TAG_GET_TWOHEADS_DATA, SubComm);
+            }
+            break;
+
+          case TAG_SET_NEWTAIL:
+            {
+              loc_compound0 data;
+              MPI_Recv(&data, sizeof(data), MPI_BYTE, source, TAG_SET_NEWTAIL, SubComm, MPI_STATUS_IGNORE);
+
+              int index        = data.index;
+              location newtail = data.loc;
+              location oldtail = SFTail[index]; /* return old tail */
+              SFTail[index]    = newtail;
+              SFLen[index]++;
+              SFPrevLen[index] += data.prevlen.get();
+
+              if(newtail.task == SubThisTask)
+                {
+                  SFHead[newtail.index] = {SubThisTask, index};
+                  SFNext[newtail.index] = {-1, -1};
+                }
+
+              if(oldtail.task == SubThisTask)
+                {
+                  SFNext[oldtail.index] = newtail;
+                }
+
+              MPI_Send(&oldtail, sizeof(location), MPI_BYTE, source, TAG_GET_OLDTAIL, SubComm);
+            }
+            break;
+
+          case TAG_SET_ALL:
+            {
+              loc_compound5 data;
+              MPI_Recv(&data, sizeof(data), MPI_BYTE, source, TAG_SET_ALL, SubComm, MPI_STATUS_IGNORE);
+              int index        = data.index;
+              SFLen[index]     = data.len;
+              SFHead[index]    = data.head;
+              SFTail[index]    = data.tail;
+              SFNext[index]    = data.next;
+              SFPrevLen[index] = data.prevlen.get();
+            }
+            break;
+
+          case TAG_GET_TAILANDLEN:
+            {
+              int index;
+              MPI_Recv(&index, 1, MPI_INT, source, tag, SubComm, &status);
+              loc_compound2 data = {SFTail[index], SFLen[index], SFPrevLen[index]};
+              MPI_Send(&data, sizeof(data), MPI_BYTE, source, TAG_GET_TAILANDLEN_DATA, SubComm);
+            }
+            break;
+
+          case TAG_SET_TAILANDLEN:
+            {
+              loc_compound3 data;
+              MPI_Recv(&data, sizeof(data), MPI_BYTE, source, TAG_SET_TAILANDLEN, SubComm, MPI_STATUS_IGNORE);
+              int index        = data.index;
+              SFTail[index]    = data.tail;
+              SFLen[index]     = data.len;
+              SFPrevLen[index] = data.prevlen;
+            }
+            break;
+
+          case TAG_SET_HEADANDNEXT:
+            {
+              loc_compound4 data;
+              MPI_Recv(&data, sizeof(data), MPI_BYTE, source, TAG_SET_HEADANDNEXT, SubComm, MPI_STATUS_IGNORE);
+              int index     = data.index;
+              SFHead[index] = data.head;
+              SFNext[index] = data.next;
+            }
+            break;
+
+          case TAG_SET_NEXT:
+            {
+              loc_compound1 data;
+              MPI_Recv(&data, sizeof(data), MPI_BYTE, source, TAG_SET_NEXT, SubComm, MPI_STATUS_IGNORE);
+              int index     = data.index;
+              SFNext[index] = data.loc;
+            }
+            break;
+
+          case TAG_SETHEADGETNEXT:
+            {
+              loc_compound1 data;
+              MPI_Recv(&data, sizeof(data), MPI_BYTE, source, TAG_SETHEADGETNEXT, SubComm, MPI_STATUS_IGNORE);
+              int index     = data.index;
+              location head = data.loc;
+              location next;
+              int task;
+              do
+                {
+                  SFHead[index] = head;
+                  next          = SFNext[index];
+                  task          = next.task;
+                  index         = next.index;
+                }
+              while(next.index >= 0 && task == SubThisTask);
+              MPI_Send(&next, sizeof(location), MPI_BYTE, source, TAG_SETHEADGETNEXT_DATA, SubComm);
+            }
+            break;
+
+          case TAG_GET_NEXT:
+            {
+              int index;
+              MPI_Recv(&index, 1, MPI_INT, source, tag, SubComm, &status);
+              MPI_Send(&SFNext[index], sizeof(location), MPI_BYTE, source, TAG_GET_NEXT_DATA, SubComm);
+            }
+            break;
+
+          case TAG_GET_HEAD:
+            {
+              int index;
+              MPI_Recv(&index, 1, MPI_INT, source, tag, SubComm, &status);
+              MPI_Send(&SFHead[index], sizeof(location), MPI_BYTE, source, TAG_GET_HEAD_DATA, SubComm);
+            }
+            break;
+
+          case TAG_ADD_PARTICLE:
+            {
+              int index;
+              MPI_Recv(&index, 1, MPI_INT, source, tag, SubComm, &status);
+              if(SFTail[index].index < 0) /* consider only particles not already in substructures */
+                {
+                  unbind_list[LocalLen] = index;
+                  if(index >= NumPartGroup)
+                    Terminate("index out of range: index=%d NumPartGroup=%d\n", index, NumPartGroup);
+                  LocalLen++;
+                }
+            }
+            break;
+
+          case TAG_MARK_PARTICLE:
+            {
+              int ibuf[3];
+              MPI_Recv(ibuf, 3, MPI_INT, source, TAG_MARK_PARTICLE, SubComm, MPI_STATUS_IGNORE);
+              int index   = ibuf[0];
+              int target  = ibuf[1];
+              int submark = ibuf[2];
+
+              if(Tp->PS[IndexList[index]].submark != HIGHBIT)
+                Terminate("Task=%d i=%d PS[i].submark=%d?\n", SubThisTask, IndexList[index], Tp->PS[IndexList[index]].submark);
+
+              Tp->PS[IndexList[index]].TargetTask = target;
+              Tp->PS[IndexList[index]].submark    = submark;
+            }
+            break;
+
+          case TAG_ADDBOUND:
+            {
+              int ibuf[2];
+              MPI_Recv(ibuf, 2, MPI_INT, source, TAG_ADDBOUND, SubComm, &status);
+              int index = ibuf[0];
+              int nsub  = ibuf[1];
+              if(SFTail[index].index == nsub) /* consider only particles in this substructure */
+                {
+                  unbind_list[LocalLen] = index;
+                  LocalLen++;
+                }
+            }
+            break;
+
+          case TAG_SETRANK:
+            {
+              loc_compound6 data;
+              MPI_Recv(&data, sizeof(data), MPI_BYTE, source, TAG_SETRANK, SubComm, MPI_STATUS_IGNORE);
+              int index      = data.loc.index;
+              MyLenType rank = data.len;
+              location next;
+              do
+                {
+                  SFLen[index] = rank++;
+                  next         = SFNext[index];
+                  if(next.index < 0)
+                    break;
+                  index = next.index;
+                }
+              while(next.task == SubThisTask);
+              data.loc = next;
+              data.len = rank;
+              MPI_Send(&data, sizeof(data), MPI_BYTE, source, TAG_SETRANK_OUT, SubComm);
+            }
+            break;
+
+          case TAG_GET_RANK:
+            {
+              int index;
+              MPI_Recv(&index, 1, MPI_INT, source, tag, SubComm, &status);
+              MyLenType rank = SFLen[index];
+              MPI_Send(&rank, sizeof(MyLenType), MPI_BYTE, source, TAG_GET_RANK_DATA, SubComm);
+            }
+            break;
+
+          case TAG_POLLING_DONE:
+            {
+              int index;
+              MPI_Recv(&index, 1, MPI_INT, source, tag, SubComm, &status);
+            }
+            break;
+
+          default:
+            Terminate("tag not present in the switch");
+            break;
+        }
+    }
+  while(tag != TAG_POLLING_DONE);
+}
+
+template <typename partset>
+location fof<partset>::subfind_distlinklist_setrank_and_get_next(location loc, MyLenType &rank)
+{
+  int task = loc.task;
+  int i    = loc.index;
+
+  location next;
+
+  if(SubThisTask == task)
+    {
+      SFLen[i] = rank++;
+      next     = SFNext[i];
+    }
+  else
+    {
+      loc_compound6 data = {loc, rank};
+      MPI_Send(&data, sizeof(data), MPI_BYTE, task, TAG_SETRANK, SubComm);
+      MPI_Recv(&data, sizeof(data), MPI_BYTE, task, TAG_SETRANK_OUT, SubComm, MPI_STATUS_IGNORE);
+      next = data.loc;
+      rank = data.len;
+    }
+  return next;
+}
+
+template <typename partset>
+location fof<partset>::subfind_distlinklist_set_head_get_next(location loc, location head)
+{
+  int task = loc.task;
+  int i    = loc.index;
+
+  location next;
+
+  if(SubThisTask == task)
+    {
+      SFHead[i] = head;
+      next      = SFNext[i];
+    }
+  else
+    {
+      loc_compound1 data = {i, head};
+      MPI_Send(&data, sizeof(data), MPI_BYTE, task, TAG_SETHEADGETNEXT, SubComm);
+      MPI_Recv(&next, sizeof(location), MPI_BYTE, task, TAG_SETHEADGETNEXT_DATA, SubComm, MPI_STATUS_IGNORE);
+    }
+
+  return next;
+}
+
+template <typename partset>
+void fof<partset>::subfind_distlinklist_set_next(location loc, location next)
+{
+  int task = loc.task;
+  int i    = loc.index;
+
+  if(SubThisTask == task)
+    {
+      SFNext[i] = next;
+    }
+  else
+    {
+      loc_compound1 data = {i, next};
+      MPI_Send(&data, sizeof(data), MPI_BYTE, task, TAG_SET_NEXT, SubComm);
+    }
+}
+
+template <typename partset>
+void fof<partset>::subfind_distlinklist_add_particle(location loc)
+{
+  int task = loc.task;
+  int i    = loc.index;
+
+  if(SubThisTask == task)
+    {
+      if(SFTail[i].index < 0) /* consider only particles not already in substructures */
+        {
+          if(i >= NumPartGroup)
+            Terminate("index out of range: index=%d NumPartGroup=%d\n", i, NumPartGroup);
+
+          unbind_list[LocalLen] = i;
+          LocalLen++;
+        }
+    }
+  else
+    {
+      MPI_Send(&i, 1, MPI_INT, task, TAG_ADD_PARTICLE, SubComm);
+    }
+}
+
+template <typename partset>
+void fof<partset>::subfind_distlinklist_mark_particle(location loc, int target, int submark)
+{
+  int task = loc.task;
+  int i    = loc.index;
+
+  if(SubThisTask == task)
+    {
+      if(Tp->PS[IndexList[i]].submark != HIGHBIT)
+        Terminate("Task=%d i=%d PS[i].submark=%d?\n", SubThisTask, i, Tp->PS[IndexList[i]].submark);
+
+      Tp->PS[IndexList[i]].TargetTask = target;
+      Tp->PS[IndexList[i]].submark    = submark;
+    }
+  else
+    {
+      int ibuf[3] = {i, target, submark};
+      MPI_Send(ibuf, 3, MPI_INT, task, TAG_MARK_PARTICLE, SubComm);
+    }
+}
+
+template <typename partset>
+void fof<partset>::subfind_distlinklist_add_bound_particles(location loc, int nsub)
+{
+  int task = loc.task;
+  int i    = loc.index;
+
+  if(SubThisTask == task)
+    {
+      if(SFTail[i].index == nsub) /* consider only particles belonging to this substructure */
+        {
+          unbind_list[LocalLen] = i;
+          LocalLen++;
+        }
+    }
+  else
+    {
+      int ibuf[2] = {i, nsub};
+      MPI_Send(ibuf, 2, MPI_INT, task, TAG_ADDBOUND, SubComm);
+    }
+}
+
+template <typename partset>
+location fof<partset>::subfind_distlinklist_get_next(location loc)
+{
+  int task = loc.task;
+  int i    = loc.index;
+
+  location next;
+
+  if(SubThisTask == task)
+    {
+      next = SFNext[i];
+    }
+  else
+    {
+      MPI_Send(&i, 1, MPI_INT, task, TAG_GET_NEXT, SubComm);
+      MPI_Recv(&next, sizeof(location), MPI_BYTE, task, TAG_GET_NEXT_DATA, SubComm, MPI_STATUS_IGNORE);
+    }
+
+  return next;
+}
+
+template <typename partset>
+MyLenType fof<partset>::subfind_distlinklist_get_rank(location loc)
+{
+  int task = loc.task;
+  int i    = loc.index;
+
+  MyLenType rank;
+
+  if(SubThisTask == task)
+    {
+      rank = SFLen[i];
+    }
+  else
+    {
+      MPI_Send(&i, 1, MPI_INT, task, TAG_GET_RANK, SubComm);
+      MPI_Recv(&rank, sizeof(MyLenType), MPI_BYTE, task, TAG_GET_RANK_DATA, SubComm, MPI_STATUS_IGNORE);
+    }
+
+  return rank;
+}
+
+template <typename partset>
+location fof<partset>::subfind_distlinklist_get_head(location loc)
+{
+  int task = loc.task;
+  int i    = loc.index;
+
+  location head;
+
+  if(SubThisTask == task)
+    {
+      head = SFHead[i];
+    }
+  else
+    {
+      MPI_Send(&i, 1, MPI_INT, task, TAG_GET_HEAD, SubComm);
+      MPI_Recv(&head, sizeof(location), MPI_BYTE, task, TAG_GET_HEAD_DATA, SubComm, MPI_STATUS_IGNORE);
+    }
+
+  return head;
+}
+
+template <typename partset>
+void fof<partset>::subfind_distlinklist_get_two_heads(location ngb_index1, location ngb_index2, location &head, location &head_attach)
+{
+  if(ngb_index1.task != ngb_index2.task)
+    Terminate("ngb_index1.task != ngb_index2.task");
+
+  int task = ngb_index1.task;
+  int i1   = ngb_index1.index;
+  int i2   = ngb_index2.index;
+
+  if(SubThisTask == task)
+    {
+      head        = SFHead[i1];
+      head_attach = SFHead[i2];
+    }
+  else
+    {
+      int ibuf[2] = {i1, i2};
+      MPI_Send(ibuf, 2, MPI_INT, task, TAG_GET_TWOHEADS, SubComm);
+      location buf[2];
+      MPI_Recv(buf, 2 * sizeof(location), MPI_BYTE, task, TAG_GET_TWOHEADS_DATA, SubComm, MPI_STATUS_IGNORE);
+      head        = buf[0];
+      head_attach = buf[1];
+    }
+}
+
+template <typename partset>
+void fof<partset>::subfind_distlinklist_set_headandnext(location loc, location head, location next)
+{
+  int task = loc.task;
+  int i    = loc.index;
+
+  if(SubThisTask == task)
+    {
+      SFHead[i] = head;
+      SFNext[i] = next;
+    }
+  else
+    {
+      loc_compound4 data = {i, head, next};
+      MPI_Send(&data, sizeof(data), MPI_BYTE, task, TAG_SET_HEADANDNEXT, SubComm);
+    }
+}
+
+template <typename partset>
+int fof<partset>::subfind_distlinklist_get_tail_set_tail_increaselen(location loc, location &tail, location newtail, approxlen prevlen)
+{
+  int task = loc.task;
+  int i    = loc.index;
+
+  int retcode = 0;
+
+  if(SubThisTask == task)
+    {
+      location oldtail = SFTail[i];
+      SFTail[i]        = newtail;
+      SFLen[i]++;
+      SFPrevLen[i] += prevlen.get();
+      tail = oldtail;
+
+      if(newtail.task == SubThisTask)
+        {
+          SFHead[newtail.index] = loc;
+          SFNext[newtail.index] = {-1, -1};
+          retcode |= 1;
+        }
+
+      if(oldtail.task == SubThisTask)
+        {
+          SFNext[oldtail.index] = newtail;
+          retcode |= 2;
+        }
+    }
+  else
+    {
+      loc_compound0 data = {i, newtail, prevlen};
+      MPI_Send(&data, sizeof(data), MPI_BYTE, task, TAG_SET_NEWTAIL, SubComm);
+      location oldtail;
+      MPI_Recv(&oldtail, sizeof(location), MPI_BYTE, task, TAG_GET_OLDTAIL, SubComm, MPI_STATUS_IGNORE);
+      tail = oldtail;
+
+      if(newtail.task == task)
+        retcode |= 1;
+      if(oldtail.task == task)
+        retcode |= 2;
+    }
+
+  return retcode;
+}
+
+template <typename partset>
+void fof<partset>::subfind_distlinklist_set_tailandlen(location loc, location tail, MyLenType len, double prevlen)
+{
+  int task = loc.task;
+  int i    = loc.index;
+
+  if(SubThisTask == task)
+    {
+      SFTail[i]    = tail;
+      SFLen[i]     = len;
+      SFPrevLen[i] = prevlen;
+    }
+  else
+    {
+      loc_compound3 data = {i, len, tail, prevlen};
+      MPI_Send(&data, sizeof(data), MPI_BYTE, task, TAG_SET_TAILANDLEN, SubComm);
+    }
+}
+
+template <typename partset>
+void fof<partset>::subfind_distlinklist_get_tailandlen(location loc, location &tail, MyLenType &len, double &prevlen)
+{
+  int task = loc.task;
+  int i    = loc.index;
+
+  if(SubThisTask == task)
+    {
+      tail    = SFTail[i];
+      len     = SFLen[i];
+      prevlen = SFPrevLen[i];
+    }
+  else
+    {
+      MPI_Send(&i, 1, MPI_INT, task, TAG_GET_TAILANDLEN, SubComm);
+
+      loc_compound2 data;
+      MPI_Recv(&data, sizeof(data), MPI_BYTE, task, TAG_GET_TAILANDLEN_DATA, SubComm, MPI_STATUS_IGNORE);
+      tail    = data.loc;
+      len     = data.len;
+      prevlen = data.prevlen;
+    }
+}
+
+template <typename partset>
+void fof<partset>::subfind_distlinklist_set_all(location loc, location head, location tail, MyLenType len, location next,
+                                                approxlen prevlen)
+{
+  int task = loc.task;
+  int i    = loc.index;
+
+  if(SubThisTask == task)
+    {
+      SFHead[i]    = head;
+      SFTail[i]    = tail;
+      SFNext[i]    = next;
+      SFLen[i]     = len;
+      SFPrevLen[i] = prevlen.get();
+    }
+  else
+    {
+      loc_compound5 data = {i, len, head, tail, next, prevlen};
+      MPI_Send(&data, sizeof(data), MPI_BYTE, task, TAG_SET_ALL, SubComm);
+    }
+}
+
+/* now make sure that the following classes are really instantiated, otherwise we may get a linking problem */
+#include "../data/simparticles.h"
+template class fof<simparticles>;
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES_GROUPS)
+#include "../data/lcparticles.h"
+template class fof<lcparticles>;
+#endif
+
+#endif
+#endif
diff --git a/src/subfind/subfind_findlinkngb.cc b/src/subfind/subfind_findlinkngb.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6e0edb0e8f18fe0b9486347ba89ff9d2df7dccb2
--- /dev/null
+++ b/src/subfind/subfind_findlinkngb.cc
@@ -0,0 +1,361 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  subfind_findlinkngb.cc
+ *
+ *  \brief find the nearest linking neighbors used in looking for saddle points
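+ *
+ *  For every particle of a group, the search radius DM_Hsml is iteratively adjusted (with a
+ *  bisection-like bracketing stored in Left[]/Right[]) until it encloses, as closely as possible,
+ *  All.DesLinkNgb neighbours. These neighbours are subsequently used when searching for the saddle
+ *  points between overdense regions.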
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef SUBFIND
+#ifndef SUBFIND_HBT
+
+#include <gsl/gsl_math.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdlib.h>
+#include <string.h>
+#include <algorithm>
+#include <cstdio>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../fof/fof.h"
+#include "../gravtree/gravtree.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/generic_comm.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/cxxsort.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+
+struct r2type
+{
+  MyFloat r2;
+  int index;
+};
+
+static bool subfind_ngb_compare_dist(const r2type &a, const r2type &b) { return a.r2 < b.r2; }
+
+static int *DM_NumNgb;
+static double *Dist2list;
+static MyFloat *Left, *Right;
+
+/* local data structure for collecting particle/cell data that is sent to other processors if needed */
+struct nearest_in : data_in_generic
+{
+  MyIntPosType IntPos[3];
+  MyFloat DM_Hsml;
+};
+
+struct nearest_out
+{
+  int Ngb;
+};
+
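+/* The communication is handled by the generic_comm pattern: nearest_in carries the position and
+ * current search radius of a particle to other tasks, evaluate() counts the neighbours found in the
+ * local tree, and nearest_out returns this count, which out2particle() accumulates in DM_NumNgb[]. */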
+template <typename T_tree, typename T_domain, typename T_partset>
+class nearest_comm : public generic_comm<nearest_in, nearest_out, T_tree, T_domain, T_partset>
+{
+ public:
+  typedef generic_comm<nearest_in, nearest_out, T_tree, T_domain, T_partset> gcomm;
+  using gcomm::D;
+  using gcomm::Thread;
+  using gcomm::Tp;  // This makes sure that we can access Tp from the base class without having to use "this->Tp"
+  using gcomm::Tree;
+
+  /* need to call the base class constructor explicitly */
+  nearest_comm(T_domain *dptr, T_tree *tptr, T_partset *pptr) : gcomm(dptr, tptr, pptr) {}
+
+  /* routine that fills the relevant particle/cell data into the input structure defined above */
+  void particle2in(nearest_in *in, int i)
+  {
+    in->IntPos[0] = Tp->P[i].IntPos[0];
+    in->IntPos[1] = Tp->P[i].IntPos[1];
+    in->IntPos[2] = Tp->P[i].IntPos[2];
+    in->DM_Hsml   = Tp->PS[i].v.DM_Hsml;
+  }
+
+  /* routine to store or combine result data */
+  void out2particle(nearest_out *out, int i, int mode)
+  {
+    if(mode == MODE_LOCAL_PARTICLES) /* initial store */
+      DM_NumNgb[i] = out->Ngb;
+    else /* combine */
+      DM_NumNgb[i] += out->Ngb;
+  }
+
+  /*! This function represents the core of the neighbour counting used for the
+   *  linking-length determination: it counts all particles within a distance
+   *  DM_Hsml of the target particle. The target particle may either be local,
+   *  or reside in the communication buffer.
+   */
+  int evaluate(int target, int mode, int thread_id, int action, nearest_in *in, int numnodes, node_info *firstnode, nearest_out &out)
+  {
+    MyIntPosType *intpos = in->IntPos;
+    double hsml          = in->DM_Hsml;
+    int numngb           = 0;
+    int exported         = 0;
+
+    for(int k = 0; k < numnodes; k++)
+      {
+        int no;
+        if(mode == MODE_LOCAL_PARTICLES)
+          {
+            no = Tree->MaxPart; /* root node */
+          }
+        else
+          {
+            no = firstnode[k].Node;
+            no = Tree->get_nodep(no)->nextnode; /* open it */
+          }
+
+        unsigned int shmrank = Tree->TreeSharedMem_ThisTask;
+
+        while(no >= 0)
+          {
+            if(no < Tree->MaxPart) /* single particle */
+              {
+                particle_data *P = Tree->get_Pp(no, shmrank);
+
+                no = Tree->get_nextnodep(shmrank)[no]; /* note: here shmrank cannot change */
+
+                double dxyz[3];
+                Tp->nearest_image_intpos_to_pos(P->IntPos, intpos, dxyz); /* converts the integer distance to floating point */
+
+                double h2 = hsml * hsml;
+
+                double r2 = dxyz[0] * dxyz[0];
+                if(r2 > h2)
+                  continue;
+
+                r2 += dxyz[1] * dxyz[1];
+                if(r2 > h2)
+                  continue;
+
+                r2 += dxyz[2] * dxyz[2];
+                if(r2 > h2)
+                  continue;
+
+                if(numngb >= Tp->NumPart)
+                  Terminate("numngb >= Tp->NumPart");
+
+                Dist2list[numngb++] = r2;
+              }
+            else if(no < Tree->MaxPart + Tree->MaxNodes) /* internal node */
+              {
+                if(mode == 1)
+                  {
+                    if(no < Tree->FirstNonTopLevelNode) /* we reached a top-level node again, which means that we are done with the
+                                                           branch */
+                      break;
+                  }
+
+                gravnode *current = Tree->get_nodep(no, shmrank);
+
+                no      = current->sibling; /* in case the node can be discarded */
+                shmrank = current->sibling_shmrank;
+
+                double dxyz[3];
+                Tp->nearest_image_intpos_to_pos(current->center.da, intpos,
+                                                dxyz); /* converts the integer distance to floating point */
+
+                double lenhalf = (((MyIntPosType)1) << (BITS_FOR_POSITIONS - 1 - current->level)) * Tp->FacIntToCoord;
+
+                double dist = hsml + lenhalf;
+
+                if(fabs(dxyz[0]) > dist)
+                  continue;
+                if(fabs(dxyz[1]) > dist)
+                  continue;
+                if(fabs(dxyz[2]) > dist)
+                  continue;
+
+                /* now test against the minimal sphere enclosing everything */
+                dist += FACT1 * 2.0 * lenhalf;
+                if(dxyz[0] * dxyz[0] + dxyz[1] * dxyz[1] + dxyz[2] * dxyz[2] > dist * dist)
+                  continue;
+
+                no      = current->nextnode; /* ok, we need to open the node */
+                shmrank = current->nextnode_shmrank;
+              }
+            else
+              {
+                /* pseudo particle */
+
+                if(mode == MODE_LOCAL_PARTICLES)
+                  if(target >= 0) /* if no target is given, export will not occur */
+                    {
+                      exported = 1;
+
+                      Tree->tree_export_node_threads(no, target, &Thread);
+                    }
+
+                no = Tree->Nextnode[no - Tree->MaxNodes];
+              }
+          }
+      }
+
+    if(mode == MODE_LOCAL_PARTICLES) /* local particle */
+      if(exported == 0)              /* completely local */
+        if(numngb >= All.DesLinkNgb)
+          {
+            r2type *R2list = (r2type *)Mem.mymalloc("R2list", sizeof(r2type) * numngb);
+            for(int i = 0; i < numngb; i++)
+              {
+                R2list[i].r2 = Dist2list[i];
+              }
+
+            mycxxsort(R2list, R2list + numngb, subfind_ngb_compare_dist);
+
+            Tp->PS[target].v.DM_Hsml = sqrt(R2list[All.DesLinkNgb - 1].r2);
+            numngb                   = All.DesLinkNgb;
+
+            for(int i = 0; i < numngb; i++)
+              {
+                Dist2list[i] = R2list[i].r2;
+              }
+
+            Mem.myfree(R2list);
+          }
+
+    out.Ngb = numngb;
+
+    return 0;
+  }
+};
+
+template <typename partset>
+void fof<partset>::subfind_find_linkngb(domain<partset> *SubDomain, int num, int *list)
+{
+  subfind_collective_printf("SUBFIND: root-task=%d: Start find_linkngb. (%d particles on root-task)\n", ThisTask, num);
+
+  Dist2list = (double *)Mem.mymalloc("Dist2list", Tp->NumPart * sizeof(double));
+  Left      = (MyFloat *)Mem.mymalloc("Left", sizeof(MyFloat) * Tp->NumPart);
+  Right     = (MyFloat *)Mem.mymalloc("Right", sizeof(MyFloat) * Tp->NumPart);
+  DM_NumNgb = (int *)Mem.mymalloc_movable(&DM_NumNgb, "DM_NumNgb", sizeof(int) * Tp->NumPart);
+
+  int *targetlist = (int *)Mem.mymalloc("targetlist", num * sizeof(int));
+
+  for(int idx = 0; idx < num; idx++)
+    {
+      targetlist[idx] = list[idx]; /* to preserve the input list, we make a copy */
+
+      int i   = list[idx];
+      Left[i] = Right[i] = 0;
+    }
+
+  nearest_comm<gravtree<partset>, domain<partset>, partset> commpattern{SubDomain, &FoFGravTree, Tp};
+
+  /* we will repeat the whole thing for those particles for which we did not yet find the desired number of neighbours */
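+  /* Each iteration performs one tree walk for all particles still on the targetlist. Afterwards,
+   * Left[] and Right[] bracket the search radius from below and above, and DM_Hsml is updated to
+   * the cube root of the mean of the bracket cubes (or rescaled by a constant factor while the
+   * bracket is still one-sided). Particles that have converged are dropped from the list. */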
+  long long ntot;
+  int iter = 0;
+
+  do
+    {
+      double t0 = Logs.second();
+
+      commpattern.execute(num, targetlist, MODE_DEFAULT);
+
+      /* do final operations on results */
+      int npleft = 0;
+      for(int idx = 0; idx < num; idx++)
+        {
+          int i = targetlist[idx];
+
+          /* now check whether we had enough neighbours */
+
+          if(DM_NumNgb[i] != All.DesLinkNgb && ((Right[i] - Left[i]) > 1.0e-6 * Left[i] || Left[i] == 0 || Right[i] == 0))
+            {
+              /* need to redo this particle */
+              targetlist[npleft++] = i;
+
+              if(DM_NumNgb[i] < All.DesLinkNgb)
+                Left[i] = std::max<double>(Tp->PS[i].v.DM_Hsml, Left[i]);
+              else
+                {
+                  if(Right[i] != 0)
+                    {
+                      if(Tp->PS[i].v.DM_Hsml < Right[i])
+                        Right[i] = Tp->PS[i].v.DM_Hsml;
+                    }
+                  else
+                    Right[i] = Tp->PS[i].v.DM_Hsml;
+                }
+
+              if(iter >= MAXITER - 10)
+                {
+                  double pos[3];
+                  Tp->intpos_to_pos(Tp->P[i].IntPos, pos);
+
+                  printf("i=%d task=%d ID=%d DM_Hsml=%g Left=%g Right=%g Right-Left=%g\n   pos=(%g|%g|%g)\n", i, ThisTask,
+                         (int)Tp->P[i].ID.get(), Tp->PS[i].v.DM_Hsml, Left[i], Right[i], (double)(Right[i] - Left[i]), pos[0], pos[1],
+                         pos[2]);
+                  myflush(stdout);
+                }
+
+              if(Right[i] > 0 && Left[i] > 0)
+                Tp->PS[i].v.DM_Hsml = pow(0.5 * (pow(Left[i], 3) + pow(Right[i], 3)), 1.0 / 3);
+              else
+                {
+                  if(Right[i] == 0 && Left[i] == 0)
+                    Terminate("can't occur");
+
+                  if(Right[i] == 0 && Left[i] > 0)
+                    Tp->PS[i].v.DM_Hsml *= 1.26;
+
+                  if(Right[i] > 0 && Left[i] == 0)
+                    Tp->PS[i].v.DM_Hsml /= 1.26;
+                }
+            }
+        }
+
+      num = npleft;
+
+      sumup_large_ints(1, &npleft, &ntot, SubComm);
+
+      double t1 = Logs.second();
+
+      if(ntot > 0)
+        {
+          iter++;
+
+          if(iter > 0)
+            subfind_collective_printf(
+                "SUBFIND: root-task=%d: find linkngb iteration %d, need to repeat for %lld particles. (took %g sec)\n", ThisTask, iter,
+                ntot, Logs.timediff(t0, t1));
+
+          if(iter > MAXITER)
+            Terminate("failed to converge in neighbour iteration in subfind_find_linkngb()\n");
+        }
+    }
+  while(ntot > 0);
+
+  Mem.myfree(targetlist);
+  Mem.myfree(DM_NumNgb);
+  Mem.myfree(Right);
+  Mem.myfree(Left);
+
+  Mem.myfree(Dist2list);
+
+  subfind_collective_printf("SUBFIND: root-task=%d: Done with find_linkngb\n", ThisTask);
+}
+
+/* now make sure that the following classes are really instantiated, otherwise we may get a linking problem */
+#include "../data/simparticles.h"
+template class fof<simparticles>;
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES_GROUPS)
+#include "../data/lcparticles.h"
+template class fof<lcparticles>;
+#endif
+
+#endif
+#endif
diff --git a/src/subfind/subfind_history.cc b/src/subfind/subfind_history.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2a384b52e36b7c11c9548b4a26d32756b794aab1
--- /dev/null
+++ b/src/subfind/subfind_history.cc
@@ -0,0 +1,784 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  subfind_history.cc
+ *
+ *  \brief this implements the SUBFIND_HBT algorithm for substructure finding
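+ *
+ *  In contrast to the density-based candidate search of classic SUBFIND, the HBT (hierarchical
+ *  bound tracing) approach seeds the subhalo candidates from the membership in the previous subhalo
+ *  catalogue: particles carry the subhalo number they belonged to at the previous output
+ *  (PrevSubhaloNr), and the sets of group particles sharing this number define the initial
+ *  candidates.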
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef SUBFIND
+#if defined(MERGERTREE) && defined(SUBFIND_HBT)
+
+#include <gsl/gsl_math.h>
+#include <mpi.h>
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../fof/fof.h"
+#include "../gravtree/gravtree.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/cxxsort.h"
+#include "../sort/parallel_sort.h"
+#include "../sort/peano.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+
+template <typename partset>
+void fof<partset>::subfind_hbt_single_group(domain<partset> *SubDomain, domain<partset> *SingleDomain, domain_options mode, int gr)
+{
+  /* get total group length */
+  long long totgrouplen;
+  sumup_large_ints(1, &NumPartGroup, &totgrouplen, SubComm);
+
+  /******************* determine subhalo candidates based on previous subhalo catalogue ***************/
+
+  hbt_pcand_t *pcand = (hbt_pcand_t *)Mem.mymalloc_movable(&pcand, "pcand", (NumPartGroup + 1) * sizeof(hbt_pcand_t));
+
+  for(int k = 0; k < NumPartGroup; k++)
+    {
+      /* provisionally assign a subhalo number based on the previous subhalo catalogue - this will be modified in the following */
+      Tp->PS[IndexList[k]].SubhaloNr = Tp->P[IndexList[k]].PrevSubhaloNr;
+
+      pcand[k].SubhaloNr         = Tp->PS[IndexList[k]].SubhaloNr;
+      pcand[k].PrevSizeOfSubhalo = Tp->P[IndexList[k]].PrevSizeOfSubhalo;
+      pcand[k].index = -1;  // just put here to signal that this is invalid at this stage (note that we'll do a parallel sort)
+    }
+
+  /* sort according to subhalonr  */
+  mycxxsort_parallel(pcand, pcand + NumPartGroup, subfind_hbt_compare_pcand_subhalonr, SubComm);
+
+  int *NumPartGroup_list = (int *)Mem.mymalloc_movable(&NumPartGroup_list, "NumPartGroup_list", SubNTask * sizeof(int));
+  MPI_Allgather(&NumPartGroup, sizeof(int), MPI_BYTE, NumPartGroup_list, sizeof(int), MPI_BYTE, SubComm);
+
+  /* get the last element of each task */
+  hbt_pcand_t *elem_last = (hbt_pcand_t *)Mem.mymalloc_movable(&elem_last, "elem_last", SubNTask * sizeof(hbt_pcand_t));
+
+  /* note: the 0th element is guaranteed to be allocated even on ranks with zero NumPartGroup */
+  MPI_Allgather(&pcand[NumPartGroup > 0 ? NumPartGroup - 1 : 0], sizeof(hbt_pcand_t), MPI_BYTE, elem_last, sizeof(hbt_pcand_t),
+                MPI_BYTE, SubComm);
+
+  /* if a new section begins on the current processor, we register it on this processor as a candidate */
+  /* initialize the number of current candidates */
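+  /* After the parallel sort, particles with the same previous subhalo number form contiguous
+   * sections that may extend across task boundaries. A section is counted (and later stored) on
+   * the task on which it begins, which is detected by comparing every entry with its predecessor;
+   * for the first local entry, the predecessor is the last element of the nearest preceding task
+   * that holds any group particles. */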
+  bool element_before_present = false;
+  hbt_pcand_t element_before{};
+
+  for(int task = SubThisTask - 1; task >= 0; task--)
+    {
+      if(NumPartGroup_list[task] > 0)
+        {
+          element_before_present = true;
+          element_before         = elem_last[task];
+          break;
+        }
+    }
+
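+  /* Candidates are detected as the places where the (globally sorted) previous subhalo number changes.
+   * Since the sorted list is distributed, the first entry on a task must be compared against the last
+   * entry of the nearest preceding task with a non-empty list (element_before), so that a candidate
+   * whose particles straddle a task boundary is registered exactly once, on the task where it begins.
+   * For illustration: if the local previous subhalo numbers are {5, 5, 7, 9} and element_before is 5,
+   * new candidates are opened only at the entries with numbers 7 and 9. */
+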
+  int marked = 0;
+  count_cand = 0;
+
+  for(int i = 0; i < NumPartGroup; i++)
+    {
+      if(i == 0 && !element_before_present)
+        count_cand++;
+      else
+        {
+          MyHaloNrType prevnr;
+
+          if(i == 0 && element_before_present)
+            prevnr = element_before.SubhaloNr;
+          else
+            prevnr = pcand[i - 1].SubhaloNr;
+
+          if(pcand[i].SubhaloNr != prevnr)
+            count_cand++;
+        }
+    }
+
+  /* allocate a list to store local subhalo candidates for this group */
+  hbt_subcand_t *loc_candidates =
+      (hbt_subcand_t *)Mem.mymalloc_movable(&loc_candidates, "loc_candidates", count_cand * sizeof(hbt_subcand_t));
+
+  count_cand = 0;
+
+  for(int i = 0; i < NumPartGroup; i++)
+    {
+      if(i == 0 && !element_before_present)
+        {
+          loc_candidates[count_cand].SubhaloNr = pcand[i].SubhaloNr;
+          count_cand++;
+        }
+      else
+        {
+          MyHaloNrType prevnr;
+
+          if(i == 0 && element_before_present)
+            prevnr = element_before.SubhaloNr;
+          else
+            prevnr = pcand[i - 1].SubhaloNr;
+
+          if(pcand[i].SubhaloNr != prevnr)
+            {
+              loc_candidates[count_cand].SubhaloNr = pcand[i].SubhaloNr;
+              count_cand++;
+            }
+        }
+    }
+
+  /* establish total number of candidates */
+  long long totcand;
+  sumup_large_ints(1, &count_cand, &totcand, SubComm);
+
+  int nsubhalos_old = Nsubhalos;
+
+  if(Nsubhalos + totcand + 1 > MaxNsubhalos)
+    {
+      // warn("Nsubhalos=%d  + totcand=%lld >= MaxNsubhalos=%d", Nsubhalos, totcand, MaxNsubhalos);
+
+      MaxNsubhalos = 1.25 * (Nsubhalos + totcand + 1);
+
+      Subhalo = (subhalo_properties *)Mem.myrealloc_movable(Subhalo, MaxNsubhalos * sizeof(subhalo_properties));
+    }
+
+  /* assemble a list of the candidates on all tasks */
+  hbt_subcand_t *all_candidates =
+      (hbt_subcand_t *)Mem.mymalloc_movable(&all_candidates, "all_candidates", (totcand + 1) * sizeof(hbt_subcand_t));
+
+  int *countlist = (int *)Mem.mymalloc_movable(&countlist, "countlist", SubNTask * sizeof(int));
+  int *offset    = (int *)Mem.mymalloc_movable(&offset, "offset", SubNTask * sizeof(int));
+
+  int count = count_cand * sizeof(hbt_subcand_t); /* length in bytes */
+  MPI_Allgather(&count, 1, MPI_INT, countlist, 1, MPI_INT, SubComm);
+
+  offset[0] = 0;
+  for(int i = 1; i < SubNTask; i++)
+    offset[i] = offset[i - 1] + countlist[i - 1];
+
+  MPI_Allgatherv(loc_candidates, count, MPI_BYTE, all_candidates, countlist, offset, MPI_BYTE, SubComm);
+
+  /* sort the candidates by subhalonr */
+  mycxxsort(all_candidates, all_candidates + totcand, subfind_hbt_compare_subcand_subhalonr);
+
+  /* now determine the size of the candidates */
+  long long *size_list            = (long long *)Mem.mymalloc_clear("size_list", totcand * sizeof(long long));
+  long long *summed_prevsize_list = (long long *)Mem.mymalloc_clear("summed_prevsize_list", totcand * sizeof(long long));
+
+  int j = 0;
+  for(int k = 0; k < NumPartGroup; k++)
+    {
+      if(j >= totcand)
+        Terminate("can't be: k=%d   j=%d  NumPartGroup=%d totcand=%lld\n", k, j, NumPartGroup, totcand);
+
+      while(j < totcand && pcand[k].SubhaloNr > all_candidates[j].SubhaloNr)
+        j++;
+
+      if(pcand[k].SubhaloNr != all_candidates[j].SubhaloNr)
+        Terminate("can't be:  k=%d NumPartGroup=%d   pcand[k].SubhaloNr=%lld    j=%d  all_candidates[j].SubhaloNr=%lld\n", k,
+                  NumPartGroup, (long long)pcand[k].SubhaloNr.get(), j, (long long)all_candidates[j].SubhaloNr.get());
+
+      size_list[j]++;
+      summed_prevsize_list[j] += pcand[k].PrevSizeOfSubhalo.get();
+    }
+
+  MPI_Allreduce(MPI_IN_PLACE, size_list, totcand, MPI_LONG_LONG, MPI_SUM, SubComm);
+  MPI_Allreduce(MPI_IN_PLACE, summed_prevsize_list, totcand, MPI_LONG_LONG, MPI_SUM, SubComm);
+
+  for(int i = 0; i < totcand; i++)
+    {
+      all_candidates[i].len           = size_list[i];
+      all_candidates[i].summedprevlen = summed_prevsize_list[i];
+    }
+
+  Mem.myfree(summed_prevsize_list);
+  Mem.myfree(size_list);
+
+  /* do a sanity test */
+  long long lensum = 0;
+  for(int i = 0; i < totcand; i++)
+    lensum += all_candidates[i].len;
+
+  if(lensum != totgrouplen)
+    Terminate("lensum=%lld != totgrouplen=%lld\n", lensum, totgrouplen);
+
+  /*******************************************/
+
+  /* find the group of previously unbound ones, and if this candidate exists, eliminate it */
+  for(int i = 0; i < totcand; i++)
+    if(all_candidates[i].SubhaloNr.get() == HALONR_MAX)
+      {
+        all_candidates[i] = all_candidates[totcand - 1];
+        totcand--;
+        break;
+      }
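+  /* These are the particles that were not bound to any subhalo in the previous catalogue
+   * (PrevSubhaloNr == HALONR_MAX). They do not form a real candidate and will instead be picked up by
+   * the fiducial background candidate that is appended before the collective unbinding further below. */
+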
+
+  /* let's now eliminate small groups and flag the corresponding particles as unbound */
+  {
+    /* sort the candidates according to previous subhalonr */
+    mycxxsort(all_candidates, all_candidates + totcand, subfind_hbt_compare_subcand_subhalonr);
+
+    /* reestablish a locally sorted pcand */
+    for(int k = 0; k < NumPartGroup; k++)
+      {
+        pcand[k].SubhaloNr = Tp->PS[IndexList[k]].SubhaloNr;
+        pcand[k].index     = IndexList[k];
+      }
+    mycxxsort(pcand, pcand + NumPartGroup, subfind_hbt_compare_pcand_subhalonr);
+
+    long long sum = 0;
+
+    int p = 0;
+
+    for(int i = 0; i < totcand; i++)
+      if(all_candidates[i].len < All.DesLinkNgb)
+        {
+          while(p < NumPartGroup && pcand[p].SubhaloNr < all_candidates[i].SubhaloNr)
+            p++;
+
+          while(p < NumPartGroup && pcand[p].SubhaloNr == all_candidates[i].SubhaloNr)
+            {
+              if(Tp->PS[pcand[p].index].SubhaloNr != all_candidates[i].SubhaloNr)
+                Terminate(
+                    "we have an issue! p=%d  NumPartGroup=%d  pcand[p].index=%d  pcand[p].SubhaloNr=%lld  "
+                    "all_candidates[i].SubhaloNr=%lld  Tp->P[pcand[p].index].SubhaloNr=%lld",
+                    p, NumPartGroup, pcand[p].index, (long long)pcand[p].SubhaloNr.get(), (long long)all_candidates[i].SubhaloNr.get(),
+                    (long long)Tp->PS[pcand[p].index].SubhaloNr.get());
+
+              pcand[p].SubhaloNr.set(HALONR_MAX);
+              Tp->PS[pcand[p].index].SubhaloNr.set(HALONR_MAX);
+              p++;
+              sum++;
+            }
+        }
+
+    MPI_Allreduce(MPI_IN_PLACE, &sum, 1, MPI_LONG_LONG, MPI_SUM, SubComm);
+
+    long long sum2 = 0;
+
+    for(int i = 0; i < totcand; i++)
+      if(all_candidates[i].len < All.DesLinkNgb)
+        {
+          sum2 += all_candidates[i].len;
+          all_candidates[i] = all_candidates[totcand - 1];
+          totcand--;
+          i--;
+        }
+
+    if(sum != sum2)
+      Terminate("consistency check failed sum = %lld  sum2 = %lld\n", sum, sum2);
+
+    /* sort according to subhalonr  */
+    mycxxsort_parallel(pcand, pcand + NumPartGroup, subfind_hbt_compare_pcand_subhalonr, SubComm);
+  }
+
+  /* if a largest one exists, eliminate it, because this is the one we will treat as the background halo */
+  if(totcand > 0)
+    {
+      /* sort the candidates by size */
+      mycxxsort(all_candidates, all_candidates + totcand, subfind_hbt_compare_subcand_len);
+
+      hbt_subcand_t maxlen_candidate = all_candidates[totcand - 1];
+
+      /* sort the candidates by summed previous length, as this is arguably a more robust criterion for deciding which one
+       * should be treated as the largest */
+      mycxxsort(all_candidates, all_candidates + totcand, subfind_hbt_compare_subcand_summedprevlen);
+
+      if(maxlen_candidate.SubhaloNr != all_candidates[totcand - 1].SubhaloNr)
+        {
+          /*
+          printf(
+              "SUBFIND_HBT:  Made a different decision on largest on task=%d:  select one with len=%lld over len=%lld because "
+              "sumprevlen=%lld is bigger than %lld\n",
+              ThisTask, (long long)all_candidates[totcand - 1].len, (long long)maxlen_candidate.len,
+              (long long)all_candidates[totcand - 1].summedprevlen, (long long)maxlen_candidate.summedprevlen);
+              */
+        }
+
+      totcand--;
+      for(int k = 0; k < NumPartGroup; k++)
+        if(Tp->PS[IndexList[k]].SubhaloNr == all_candidates[totcand].SubhaloNr)
+          Tp->PS[IndexList[k]].SubhaloNr.set(HALONR_MAX);
+    }
+
+  /* sort the candidates according to previous subhalonr */
+  mycxxsort(all_candidates, all_candidates + totcand, subfind_hbt_compare_subcand_subhalonr);
+
+  subfind_collective_printf("SUBFIND: root-task=%d: total number of subhalo coll_candidates=%lld\n", ThisTask, totcand);
+
+  /*******************************************/
+  /* Let's now see which candidates can be processed serially on single CPUs, which is more efficient than doing them all
+   * collectively. We identify them as those candidates that are sufficiently small, which should be most of them by number.
+   */
+
+  int n_small_cand = 0;
+  int max_length   = 0;
+
+  int task         = 0;
+  int task_scatter = 0;
+  for(int i = 0; i < totcand; i++)
+    {
+      if(all_candidates[i].len < 0.20 * Tp->TotNumPart / NTask)  // small enough
+        {
+          all_candidates[i].DoIt        = true;
+          all_candidates[i].TargetTask  = task++;
+          all_candidates[i].TargetIndex = n_small_cand;
+
+          if(task >= SubNTask)
+            task = 0;
+
+          if(all_candidates[i].len > max_length)
+            max_length = all_candidates[i].len;
+
+          n_small_cand++;
+        }
+      else
+        {
+          all_candidates[i].DoIt        = false;
+          all_candidates[i].TargetTask  = task_scatter++;
+          all_candidates[i].TargetIndex = INT_MAX;
+          if(task_scatter >= SubNTask)
+            task_scatter = 0;
+        }
+    }
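+  /* Small candidates have been distributed round-robin over the ranks of SubComm and flagged with DoIt=true
+   * for serial unbinding; TargetIndex numbers them consecutively so that their particles end up stored
+   * contiguously on their target task after the exchange below. Candidates above the size threshold keep
+   * DoIt=false and are unbound collectively by all ranks later on. */
+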
+
+  subfind_collective_printf(
+      "SUBFIND: root-task=%d: number of subhalo candidates small enough to be done with one cpu: %d. (Largest size %d)\n", ThisTask,
+      n_small_cand, max_length);
+
+  /* now get target information to particles */
+  for(int k = 0; k < NumPartGroup; k++)
+    {
+      pcand[k].SubhaloNr = Tp->PS[IndexList[k]].SubhaloNr;
+      pcand[k].index     = IndexList[k];
+    }
+
+  // note: local serial sort sufficient here
+  mycxxsort(pcand, pcand + NumPartGroup, subfind_hbt_compare_pcand_subhalonr);
+
+  if(SubNTask > 1)
+    {
+      /* we only need to redistribute the particles if we are processing groups collectively */
+      /* Note: the setting of TargetIndex makes sure that the particles are grouped together on the target task */
+
+      /* set default values for current particle distribution */
+      for(int i = 0; i < Tp->NumPart; i++)
+        {
+          Tp->PS[i].u.s.origintask  = SubThisTask;
+          Tp->PS[i].u.s.originindex = i;
+
+          Tp->PS[i].TargetTask  = SubThisTask;
+          Tp->PS[i].TargetIndex = INT_MAX;
+        }
+
+      int i = 0;
+      for(int k = 0; k < totcand; k++)
+        if(all_candidates[k].DoIt)
+          {
+            while(i < NumPartGroup && pcand[i].SubhaloNr < all_candidates[k].SubhaloNr)
+              i++;
+
+            while(i < NumPartGroup && pcand[i].SubhaloNr == all_candidates[k].SubhaloNr)
+              {
+                Tp->PS[pcand[i].index].TargetTask  = all_candidates[k].TargetTask;
+                Tp->PS[pcand[i].index].TargetIndex = all_candidates[k].TargetIndex;
+                i++;
+              }
+          }
+
+      /* assemble the particles on individual processors (note: IndexList[] becomes temporarily meaningless)  */
+      subfind_distribute_particles(SubComm);
+    }
+
+  /* now do the serial unbinding */
+  /*----------------------------------------------------*/
+
+  {
+    unbind_list = (int *)Mem.mymalloc("unbind_list", Tp->NumPart * sizeof(int));
+
+    int i = 0;  // particle index
+
+    for(int k = 0; k < totcand; k++)
+      if(all_candidates[k].DoIt)
+        if(all_candidates[k].TargetTask == SubThisTask)
+          {
+            int len = 0;
+
+            if(SubNTask > 1)
+              {
+                while(i < Tp->NumPart && Tp->PS[i].SubhaloNr < all_candidates[k].SubhaloNr)
+                  i++;
+
+                while(i < Tp->NumPart && Tp->PS[i].SubhaloNr == all_candidates[k].SubhaloNr && Tp->PS[i].GroupNr.get() == GroupNr)
+                  {
+                    unbind_list[len] = i;
+                    len++;
+                    i++;
+                  }
+              }
+            else
+              {
+                while(i < NumPartGroup && Tp->PS[pcand[i].index].SubhaloNr < all_candidates[k].SubhaloNr)
+                  i++;
+
+                while(i < NumPartGroup && Tp->PS[pcand[i].index].SubhaloNr == all_candidates[k].SubhaloNr &&
+                      Tp->PS[pcand[i].index].GroupNr.get() == GroupNr)
+                  {
+                    unbind_list[len] = pcand[i].index;
+                    len++;
+                    i++;
+                  }
+              }
+
+            if(len != all_candidates[k].len)
+              Terminate("this is unexpected: k=%d   len=%lld != all_candidates[k].len=%lld) \n", k, (long long)len,
+                        (long long)all_candidates[k].len);
+
+            /* default is that all particles end up unbound */
+            for(int n = 0; n < len; n++)
+              Tp->PS[unbind_list[n]].SubhaloNr.set(HALONR_MAX);
+
+            /* call the serial unbind function */
+            len = subfind_unbind(SingleDomain, SingleDomain->Communicator, unbind_list, len);
+
+            if(len >= All.DesLinkNgb)
+              {
+                /* set as provisional group number the previous group number */
+                for(int n = 0; n < len; n++)
+                  {
+                    Tp->PS[unbind_list[n]].SubhaloNr = all_candidates[k].SubhaloNr;
+                    Tp->PS[unbind_list[n]].SizeOfSubhalo.set(len);
+                  }
+
+                if(Nsubhalos >= MaxNsubhalos)
+                  Terminate("no storage: Nsubhalos=%d  MaxNsubhalos=%d  nsubhalos_old=%d totcand=%lld\n", Nsubhalos, MaxNsubhalos,
+                            nsubhalos_old, totcand);
+
+                /* ok, we found a substructure */
+                marked += subfind_determine_sub_halo_properties(unbind_list, len, &Subhalo[Nsubhalos], SingleDomain->Communicator);
+
+                Subhalo[Nsubhalos].GroupNr       = GroupNr;
+                Subhalo[Nsubhalos].SubParentRank = 0;
+                Subhalo[Nsubhalos].SubhaloNr     = all_candidates[k].SubhaloNr.get();
+
+                Nsubhalos++;
+              }
+          }
+
+    Mem.myfree(unbind_list);
+  }
+
+  if(SubNTask > 1)
+    {
+      /* bring them back to their original processor */
+      for(int i = 0; i < Tp->NumPart; i++)
+        {
+          Tp->PS[i].TargetTask  = Tp->PS[i].u.s.origintask;
+          Tp->PS[i].TargetIndex = Tp->PS[i].u.s.originindex;
+        }
+
+      subfind_distribute_particles(SubComm);
+    }
+
+  double t0 = Logs.second();
+
+  /**************************************************/
+  /**************************************************/
+  /*******  now do remaining ones collectively  *****/
+
+  /* first, add a fiducial candidate which will be our background halo, swallowing all unbound particles */
+
+  all_candidates[totcand].DoIt = false; /* marks collective ones */
+  all_candidates[totcand].SubhaloNr.set(HALONR_MAX);
+  totcand++;
+
+  for(int k = 0; k < totcand; k++)
+    if(all_candidates[k].DoIt == false)
+      {
+        domain<partset> SubUnbindDomain{SubComm, Tp};
+
+        int *unbind_list;
+
+        if(mode == COLL_SUBFIND)
+          {
+            for(int i = 0; i < Tp->NumPart; i++)
+              {
+                Tp->PS[i].u.s.origintask  = SubThisTask;
+                Tp->PS[i].u.s.originindex = i;
+                Tp->PS[i].DomainFlag      = 0;
+              }
+
+            /* mark the one to be unbound in PS[] */
+            for(int i = 0; i < NumPartGroup; i++)
+              if(Tp->PS[IndexList[i]].SubhaloNr == all_candidates[k].SubhaloNr)
+                Tp->PS[IndexList[i]].DomainFlag = 1;
+
+            SubUnbindDomain.domain_decomposition(mode);
+            subfind_distribute_particles(SubComm);
+
+            LocalLen = 0;
+            for(int i = 0; i < Tp->NumPart; i++)
+              if(Tp->PS[i].DomainFlag)
+                LocalLen++;
+
+            unbind_list = (int *)Mem.mymalloc_movable(&unbind_list, "unbind_list", LocalLen * sizeof(int));
+
+            /* refill unbind_list */
+            LocalLen = 0;
+            for(int i = 0; i < Tp->NumPart; i++)
+              if(Tp->PS[i].DomainFlag)
+                unbind_list[LocalLen++] = i;
+          }
+
+        else
+          {
+            unbind_list = (int *)Mem.mymalloc_movable(&unbind_list, "unbind_list", NumPartGroup * sizeof(int));
+
+            LocalLen = 0;
+            for(int i = 0; i < NumPartGroup; i++)
+              if(Tp->PS[IndexList[i]].SubhaloNr == all_candidates[k].SubhaloNr)
+                unbind_list[LocalLen++] = IndexList[i];
+          }
+
+        /* default is that all particles end up unbound */
+        for(int n = 0; n < LocalLen; n++)
+          Tp->PS[unbind_list[n]].SubhaloNr.set(HALONR_MAX);
+
+        if(mode == COLL_SUBFIND)
+          LocalLen = subfind_unbind(&SubUnbindDomain, SubComm, unbind_list, LocalLen);
+        else
+          LocalLen = subfind_unbind(SingleDomain, SingleDomain->Communicator, unbind_list, LocalLen);
+
+        int FullLen;
+        MPI_Allreduce(&LocalLen, &FullLen, 1, MPI_INT, MPI_SUM, SubComm);
+
+        if(FullLen >= All.DesLinkNgb)
+          {
+            if(all_candidates[k].SubhaloNr.get() == HALONR_MAX)
+              all_candidates[k].SubhaloNr.set(HALONR_MAX - 1);
+
+            /* set as provisional group number the previous group number */
+            for(int n = 0; n < LocalLen; n++)
+              {
+                Tp->PS[unbind_list[n]].SubhaloNr = all_candidates[k].SubhaloNr;
+#if defined(MERGERTREE)
+                Tp->PS[unbind_list[n]].SizeOfSubhalo.set(FullLen);
+#endif
+              }
+
+            if(Nsubhalos >= MaxNsubhalos)
+              Terminate("no storage: Nsubhalos=%d  MaxNsubhalos=%d  nsubhalos_old=%d totcand=%lld\n", Nsubhalos, MaxNsubhalos,
+                        nsubhalos_old, totcand);
+
+            marked += subfind_determine_sub_halo_properties(unbind_list, LocalLen, &Subhalo[Nsubhalos], SubComm);
+
+            if(SubThisTask == 0)
+              {
+                Subhalo[Nsubhalos].GroupNr       = GroupNr;
+                Subhalo[Nsubhalos].SubParentRank = 0;
+                Subhalo[Nsubhalos].SubhaloNr     = all_candidates[k].SubhaloNr.get();
+
+                Nsubhalos++;
+              }
+          }
+
+        Mem.myfree(unbind_list);
+
+        if(mode == COLL_SUBFIND)
+          {
+            SubUnbindDomain.domain_free();
+
+            for(int i = 0; i < Tp->NumPart; i++)
+              {
+                Tp->PS[i].TargetTask  = Tp->PS[i].u.s.origintask;
+                Tp->PS[i].TargetIndex = Tp->PS[i].u.s.originindex;
+              }
+
+            double t0 = Logs.second();
+            subfind_distribute_particles(SubComm); /* bring them back to their original processor */
+            double t1 = Logs.second();
+
+            subfind_collective_printf("SUBFIND: root-task=%d: bringing the independent ones back took %g sec\n", ThisTask,
+                                      Logs.timediff(t0, t1));
+
+            /* Since we reestablished the original order, we can use IndexList[] again */
+          }
+      }
+
+  double t1 = Logs.second();
+  subfind_collective_printf("SUBFIND: root-task=%d: the collective unbinding of remaining halos took %g sec\n", ThisTask,
+                            Logs.timediff(t0, t1));
+
+  /* get the total substructure count */
+  int countloc = Nsubhalos - nsubhalos_old;
+  int countall;
+
+  MPI_Allreduce(&countloc, &countall, 1, MPI_INT, MPI_SUM, SubComm);
+
+  MPI_Allreduce(MPI_IN_PLACE, &marked, 1, MPI_INT, MPI_SUM, SubComm);
+
+  /* Now let's save some properties of the substructures */
+  if(SubThisTask == 0)
+    {
+      Group[gr].Nsubs = countall;
+
+#if defined(SUBFIND_ORPHAN_TREATMENT)
+      Group[gr].LenPrevMostBnd += marked;
+#endif
+    }
+
+  /* also need to set the field SubRankInGr in all found Subhalos */
+
+  hbt_subhalo_t *subhalo_list = (hbt_subhalo_t *)Mem.mymalloc("subhalo_list", countloc * sizeof(hbt_subhalo_t));
+
+  for(int n = 0; n < countloc; n++)
+    {
+      subhalo_list[n].Len       = Subhalo[n + nsubhalos_old].Len;
+      subhalo_list[n].ThisTask  = SubThisTask;
+      subhalo_list[n].ThisIndex = n;
+      subhalo_list[n].SubhaloNr = Subhalo[n + nsubhalos_old].SubhaloNr;
+    }
+
+  mycxxsort_parallel(subhalo_list, subhalo_list + countloc, subfind_hbt_compare_subhalolist_len, SubComm);
+
+  int *countloc_list = (int *)Mem.mymalloc("countloc_list", SubNTask * sizeof(int));
+  MPI_Allgather(&countloc, 1, MPI_INT, countloc_list, 1, MPI_INT, SubComm);
+  int npreviousranks = 0;
+  for(int i = 0; i < SubThisTask; i++)
+    npreviousranks += countloc_list[i];
+
+  for(int n = 0; n < countloc; n++)
+    subhalo_list[n].SubRankInGr = n + npreviousranks;
+
+  mycxxsort_parallel(subhalo_list, subhalo_list + countloc, subfind_hbt_compare_subhalolist_thistask_thisindex, SubComm);
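+  /* After the sort by length, each subhalo has received its global rank within the group in SubRankInGr,
+   * with rank 0 belonging to the main subhalo. The list has now been brought back into its original
+   * (task, index) order so that the ranks can be copied into Subhalo[] below; the group position is
+   * later taken from the rank-0 subhalo. */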
+
+  /* rank and index of main subhalo */
+  int taskmain  = 0;
+  int indexmain = 0;
+
+  for(int n = 0; n < countloc; n++)
+    {
+      Subhalo[n + nsubhalos_old].SubRankInGr = subhalo_list[n].SubRankInGr;
+
+      if(subhalo_list[n].SubRankInGr == 0)
+        {
+          /* here we have the main subhalo */
+          taskmain  = SubThisTask;
+          indexmain = n + nsubhalos_old;
+        }
+    }
+
+  /* now we need to fill SubRankInGr into Tp->PS[] so that the particles can be sorted in subhalo order;
+   * we use subhalo_list[] as a translation table
+   */
+
+  for(int k = 0; k < NumPartGroup; k++)
+    {
+      pcand[k].SubhaloNr = Tp->PS[IndexList[k]].SubhaloNr;
+      pcand[k].index     = IndexList[k];
+    }
+
+  /* sort locally  */
+  mycxxsort(pcand, pcand + NumPartGroup, subfind_hbt_compare_pcand_subhalonr);
+
+  int sizelocsubhalolist = countloc * sizeof(hbt_subhalo_t); /* length in bytes */
+  MPI_Allgather(&sizelocsubhalolist, 1, MPI_INT, countlist, 1, MPI_INT, SubComm);
+
+  offset[0] = 0;
+  for(int i = 1; i < SubNTask; i++)
+    offset[i] = offset[i - 1] + countlist[i - 1];
+
+  hbt_subhalo_t *all_subhalo_list = (hbt_subhalo_t *)Mem.mymalloc("all_subhalo_list", countall * sizeof(hbt_subhalo_t));
+
+  MPI_Allgatherv(subhalo_list, sizelocsubhalolist, MPI_BYTE, all_subhalo_list, countlist, offset, MPI_BYTE, SubComm);
+
+  /* sort locally  */
+  mycxxsort(all_subhalo_list, all_subhalo_list + countall, subfind_hbt_compare_subhalolist_prevsubhalonr);
+
+  int n = 0;
+
+  for(int k = 0; k < NumPartGroup; k++)
+    {
+      if(pcand[k].SubhaloNr.get() == HALONR_MAX)
+        Tp->PS[pcand[k].index].SubRankInGr = INT_MAX;
+      else
+        {
+          while(n < countall && all_subhalo_list[n].SubhaloNr < (long long)pcand[k].SubhaloNr.get())
+            n++;
+
+          if(n >= countall)
+            Terminate("unexpected: n=%d countall=%d", n, countall);
+
+          if(all_subhalo_list[n].SubhaloNr != (long long)pcand[k].SubhaloNr.get())
+            Terminate("also unexpected: k=%d NumPartGroup=%d  all_subhalo_list[n].SubhaloNr=%lld != pcand[k].SubhaloNr=%lld\n", k,
+                      NumPartGroup, (long long)all_subhalo_list[n].SubhaloNr, (long long)pcand[k].SubhaloNr.get());
+
+          Tp->PS[pcand[k].index].SubRankInGr = all_subhalo_list[n].SubRankInGr;
+        }
+    }
+
+  if(countall > 0)
+    {
+      MPI_Allreduce(MPI_IN_PLACE, &taskmain, 1, MPI_INT, MPI_MAX, SubComm);
+      MPI_Allreduce(MPI_IN_PLACE, &indexmain, 1, MPI_INT, MPI_MAX, SubComm);
+
+      subhalo_properties MainSubhalo;
+
+      if(taskmain == SubThisTask)
+        {
+          MainSubhalo = Subhalo[indexmain];
+
+          if(taskmain != 0)
+            MPI_Send(&MainSubhalo, sizeof(subhalo_properties), MPI_BYTE, 0, TAG_N, SubComm);
+        }
+
+      if(SubThisTask == 0)
+        {
+          if(taskmain != 0)
+            MPI_Recv(&MainSubhalo, sizeof(subhalo_properties), MPI_BYTE, taskmain, TAG_N, SubComm, MPI_STATUS_IGNORE);
+
+          for(int j = 0; j < 3; j++)
+            {
+              Group[gr].Pos[j]    = MainSubhalo.Pos[j];
+              Group[gr].IntPos[j] = MainSubhalo.IntPos[j];
+            }
+        }
+    }
+
+  Mem.myfree(all_subhalo_list);
+  Mem.myfree(countloc_list);
+  Mem.myfree(subhalo_list);
+  Mem.myfree(offset);
+  Mem.myfree(countlist);
+  Mem.myfree(all_candidates);
+  Mem.myfree(loc_candidates);
+  Mem.myfree(elem_last);
+  Mem.myfree(NumPartGroup_list);
+  Mem.myfree(pcand);
+
+  subfind_collective_printf("SUBFIND: root-task=%d: found %d bound substructures in FoF group of length %lld\n", ThisTask, countall,
+                            totgrouplen);
+}
+
+/* now make sure that the following classes are really instantiated, otherwise we may get a linking problem */
+#include "../data/simparticles.h"
+template class fof<simparticles>;
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES_GROUPS)
+#include "../data/lcparticles.h"
+template class fof<lcparticles>;
+#endif
+
+#endif
+#endif
diff --git a/src/subfind/subfind_nearesttwo.cc b/src/subfind/subfind_nearesttwo.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fb3cef0bcca198e5c5f0f96417c9d01d8a9439fc
--- /dev/null
+++ b/src/subfind/subfind_nearesttwo.cc
@@ -0,0 +1,337 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  subfind_nearesttwo.cc
+ *
+ *  \brief determine the nearest two denser neighbours for linking them in the excursion set formalism of SUBFIND
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef SUBFIND
+#ifndef SUBFIND_HBT
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../fof/fof.h"
+#include "../gravtree/gravtree.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/generic_comm.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+
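+/* For every particle handed to subfind_find_nearesttwo(), the tree walk below inspects all neighbours
+ * within the smoothing length DM_Hsml that have a higher density DM_Density, and keeps the (up to) two
+ * closest of them in PS[].nearest.index[], together with their distances in R2Loc[].dist[]. These two
+ * links to denser neighbours are what the excursion-set part of SUBFIND later uses to grow subhalo
+ * candidates in order of decreasing density. */
+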
+/*! Local data structure for collecting the particle/cell data that is sent to other processors during the
+ *  nearest-two-neighbour search, if needed.
+ */
+struct sngb_in : data_in_generic
+{
+  MyIntPosType IntPos[3];
+  MyIDType ID;
+  MyFloat Hsml;
+  MyFloat Density;
+  MyFloat Dist[2];
+  int Count;
+  location Index[2];
+};
+
+struct sngb_out
+{
+  MyFloat Dist[2];
+  location Index[2];
+  int Count;
+};
+
+template <typename T_tree, typename T_domain, typename T_partset>
+class sngb_comm : public generic_comm<sngb_in, sngb_out, T_tree, T_domain, T_partset>
+{
+ public:
+  typedef generic_comm<sngb_in, sngb_out, T_tree, T_domain, T_partset> gcomm;
+  using gcomm::D;
+  using gcomm::Thread;
+  using gcomm::Tp;  // This makes sure that we can access Tp from the base class without having to use "this->Tp"
+  using gcomm::Tree;
+
+  /* need to call the base class constructor explicitly */
+  sngb_comm(T_domain *dptr, T_tree *tptr, T_partset *pptr) : gcomm(dptr, tptr, pptr) {}
+
+  /* routine that fills the relevant particle/cell data into the input structure defined above */
+  void particle2in(sngb_in *in, int i) override
+  {
+    in->IntPos[0] = Tp->P[i].IntPos[0];
+    in->IntPos[1] = Tp->P[i].IntPos[1];
+    in->IntPos[2] = Tp->P[i].IntPos[2];
+
+    in->Hsml    = Tp->PS[i].v.DM_Hsml;
+    in->ID      = Tp->P[i].ID.get();
+    in->Density = Tp->PS[i].u.s.u.DM_Density;
+    in->Count   = Tp->PS[i].nearest.count;
+    for(int k = 0; k < Tp->PS[i].nearest.count; k++)
+      {
+        in->Dist[k]  = Tp->R2Loc[i].dist[k];
+        in->Index[k] = Tp->PS[i].nearest.index[k];
+      }
+  }
+
+  /* routine to store or combine result data */
+  void out2particle(sngb_out *out, int i, int mode) override
+  {
+    if(mode == MODE_LOCAL_PARTICLES) /* initial store */
+      {
+        Tp->PS[i].nearest.count = out->Count;
+
+        for(int k = 0; k < out->Count; k++)
+          {
+            Tp->R2Loc[i].dist[k]       = out->Dist[k];
+            Tp->PS[i].nearest.index[k] = out->Index[k];
+          }
+      }
+    else /* combine */
+      {
+        for(int k = 0; k < out->Count; k++)
+          {
+            if(Tp->PS[i].nearest.count >= 1)
+              if(Tp->PS[i].nearest.index[0] == out->Index[k])
+                continue;
+
+            if(Tp->PS[i].nearest.count == 2)
+              if(Tp->PS[i].nearest.index[1] == out->Index[k])
+                continue;
+
+            int l;
+
+            if(Tp->PS[i].nearest.count < 2)
+              {
+                l = Tp->PS[i].nearest.count;
+                Tp->PS[i].nearest.count++;
+              }
+            else
+              {
+                l = (Tp->R2Loc[i].dist[0] > Tp->R2Loc[i].dist[1]) ? 0 : 1;
+
+                if(out->Dist[k] >= Tp->R2Loc[i].dist[l])
+                  continue;
+              }
+
+            Tp->R2Loc[i].dist[l]       = out->Dist[k];
+            Tp->PS[i].nearest.index[l] = out->Index[k];
+
+            if(Tp->PS[i].nearest.count == 2)
+              if(Tp->PS[i].nearest.index[0] == Tp->PS[i].nearest.index[1])
+                Terminate("this is not supposed to happen");
+          }
+      }
+  }
+
+  /*! This function represents the core of the neighbor search. The target particle may either be local, or reside in the communication
+   *  buffer.
+   */
+  int evaluate(int target, int mode, int thread_id, int action, sngb_in *in, int numnodes, node_info *firstnode,
+               sngb_out &out) override
+  {
+    MyIntPosType *intpos = in->IntPos;
+    MyIDType ID          = in->ID;
+    double density       = in->Density;
+    double hsml          = in->Hsml;
+    int count            = in->Count;
+
+    location index[2];
+    double dist[2];
+    for(int k = 0; k < count; k++)
+      {
+        dist[k]  = in->Dist[k];
+        index[k] = in->Index[k];
+      }
+
+    if(count == 2)
+      if(index[0] == index[1])
+        {
+          Terminate("target=%d mode=%d\n", target, mode);
+        }
+
+    count = 0;
+
+    hsml *= 1.00001; /* slightly enlarge the search radius so that the most distant neighbour right at the edge of the
+                      * search region cannot be missed (needed for consistency with the serial algorithm)
+                      */
+
+    for(int k = 0; k < numnodes; k++)
+      {
+        int no;
+
+        if(mode == MODE_LOCAL_PARTICLES)
+          {
+            no = Tree->MaxPart; /* root node */
+          }
+        else
+          {
+            no = firstnode[k].Node;
+            no = Tree->get_nodep(no)->nextnode; /* open it */
+          }
+
+        int shmrank = Tree->TreeSharedMem_ThisTask;
+
+        while(no >= 0)
+          {
+            if(no < Tree->MaxPart) /* single particle */
+              {
+                particle_data *P = Tree->get_Pp(no, shmrank);
+                subfind_data *PS = Tree->get_PSp(no, shmrank);
+
+                no = Tree->get_nextnodep(shmrank)[no]; /* note: here shmrank cannot change */
+
+                if(P->ID.get() != ID) /* exclude the target particle itself */
+                  {
+                    if(PS->u.s.u.DM_Density > density) /* we only need to look at neighbors that are denser */
+                      {
+                        /* converts the integer distance to floating point */
+                        double dxyz[3];
+                        Tp->nearest_image_intpos_to_pos(P->IntPos, intpos, dxyz);
+
+                        double h2 = hsml * hsml;
+
+                        double r2 = dxyz[0] * dxyz[0];
+                        if(r2 > h2)
+                          continue;
+
+                        r2 += dxyz[1] * dxyz[1];
+                        if(r2 > h2)
+                          continue;
+
+                        r2 += dxyz[2] * dxyz[2];
+                        if(r2 > h2)
+                          continue;
+
+                        // ok, we found a particle. Only store up to two closest ones.
+
+                        int task = D->ThisTask + (shmrank - Tree->TreeSharedMem_ThisTask);
+
+                        if(task < 0 || task >= D->NTask)
+                          Terminate("illegal task=%d  D->NTask=%d", task, D->NTask);
+
+                        if(count < 2)
+                          {
+                            dist[count]  = r2;
+                            index[count] = {task, PS->InvIndex}; /* note: ThisTask refers here to the Subdomain */
+                            count++;
+                          }
+                        else
+                          {
+                            int k = (dist[0] > dist[1]) ? 0 : 1;
+
+                            if(r2 < dist[k])
+                              {
+                                dist[k]  = r2;
+                                index[k] = {task, PS->InvIndex}; /* note: ThisTask refers here to the Subdomain */
+                              }
+                          }
+                      }
+                  }
+              }
+            else if(no < Tree->MaxPart + Tree->MaxNodes) /* internal node */
+              {
+                if(mode == 1)
+                  {
+                    if(no < Tree->FirstNonTopLevelNode) /* we reached a top-level node again, which means that we are done with the
+                                                           branch */
+                      {
+                        break;
+                      }
+                  }
+
+                gravnode *current = Tree->get_nodep(no, shmrank);
+
+                no      = current->sibling; /* in case the node can be discarded */
+                shmrank = current->sibling_shmrank;
+
+                double dxyz[3];
+                Tp->nearest_image_intpos_to_pos(current->center.da, intpos,
+                                                dxyz); /* converts the integer distance to floating point */
+
+                double lenhalf = (((MyIntPosType)1) << (BITS_FOR_POSITIONS - 1 - current->level)) * Tp->FacIntToCoord;
+
+                double dist = hsml + lenhalf;
+
+                if(fabs(dxyz[0]) > dist)
+                  continue;
+                if(fabs(dxyz[1]) > dist)
+                  continue;
+                if(fabs(dxyz[2]) > dist)
+                  continue;
+
+                /* now test against the minimal sphere enclosing everything */
+                dist += FACT1 * 2.0 * lenhalf;
+                if(dxyz[0] * dxyz[0] + dxyz[1] * dxyz[1] + dxyz[2] * dxyz[2] > dist * dist)
+                  continue;
+
+                no      = current->nextnode; /* ok, we need to open the node */
+                shmrank = current->nextnode_shmrank;
+              }
+            else if(no >= Tree->ImportedNodeOffset) /* point from imported nodelist */
+              {
+                Terminate("do not expect imported points here");
+              }
+            else /* pseudo particle */
+              {
+                if(mode == MODE_LOCAL_PARTICLES)
+                  if(target >= 0) /* note: if no target is given, export will not occur */
+                    Tree->tree_export_node_threads(no, target, &Thread);
+
+                no = Tree->Nextnode[no - Tree->MaxNodes];
+              }
+          }
+      }
+
+    out.Count = count;
+
+    for(int k = 0; k < count; k++)
+      {
+        out.Dist[k]  = dist[k];
+        out.Index[k] = index[k];
+      }
+
+    return 0;
+  }
+};
+
+template <typename partset>
+void fof<partset>::subfind_find_nearesttwo(domain<partset> *SubDomain, int num, int *list)
+{
+  subfind_collective_printf("SUBFIND: root-task=%d: Start finding nearest two.\n", ThisTask);
+
+  for(int i = 0; i < num; i++)
+    Tp->PS[list[i]].nearest.count = 0;
+
+  /* create an object for handling the communication */
+  sngb_comm<gravtree<partset>, domain<partset>, partset> commpattern{SubDomain, &FoFGravTree, Tp};
+
+  commpattern.execute(num, list, MODE_DEFAULT);
+
+  subfind_collective_printf("SUBFIND: root-task=%d: Done with nearest two.\n", ThisTask);
+}
+
+/* now make sure that the following classes are really instantiated, otherwise we may get a linking problem */
+#include "../data/simparticles.h"
+template class fof<simparticles>;
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES_GROUPS)
+#include "../data/lcparticles.h"
+template class fof<lcparticles>;
+#endif
+
+#endif
+#endif
diff --git a/src/subfind/subfind_orphanids.cc b/src/subfind/subfind_orphanids.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3ae109ccf516d89e915346a23566686b9111fe0b
--- /dev/null
+++ b/src/subfind/subfind_orphanids.cc
@@ -0,0 +1,165 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  subfind_orphanids.cc
+ *
+ *  \brief determine orphan IDs so that they can be flagged
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef SUBFIND_ORPHAN_TREATMENT
+
+#include <errno.h>
+#include <hdf5.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+
+#include "../cooling_sfr/cooling.h"
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../fof/fof.h"
+#include "../io/hdf5_util.h"
+#include "../io/io.h"
+#include "../logs/timer.h"
+#include "../main/main.h"
+#include "../main/simulation.h"
+#include "../mergertree/io_readsnap.h"
+#include "../mergertree/mergertree.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/parallel_sort.h"
+#include "../system/system.h"
+
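+/* The IDs of the previously most-bound subhalo particles have been read from the previous snapshot into
+ * Sp->IdStore. The routine below matches them against the IDs of the current particle data and marks
+ * every matching particle that is not yet flagged, so that particles that once were the most bound
+ * particle of a subhalo stay flagged in the current snapshot. */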
+template <>
+void fof<simparticles>::subfind_match_ids_of_previously_most_bound_ids(simparticles *Sp)
+{
+  int *Send_count  = (int *)Mem.mymalloc("Send_count", sizeof(int) * NTask);
+  int *Send_offset = (int *)Mem.mymalloc("Send_offset", sizeof(int) * NTask);
+  int *Recv_count  = (int *)Mem.mymalloc("Recv_count", sizeof(int) * NTask);
+  int *Recv_offset = (int *)Mem.mymalloc("Recv_offset", sizeof(int) * NTask);
+
+  mycxxsort_parallel(Sp->IdStore.ID, Sp->IdStore.ID + Sp->IdStore.NumPart, Sp->compare_IDs, Communicator);
+  mycxxsort_parallel(Sp->P, Sp->P + Sp->NumPart, Sp->compare_SpP_ID, Communicator);
+
+  MyIDType *list_min_id = (MyIDType *)Mem.mymalloc("list_min_id", NTask * sizeof(MyIDType));
+  MyIDType *list_max_id = (MyIDType *)Mem.mymalloc("list_max_id", NTask * sizeof(MyIDType));
+
+  MyIDType idmin = Sp->P[0].ID.get();
+  MyIDType idmax = Sp->P[Sp->NumPart - 1].ID.get();
+
+  MPI_Allgather(&idmin, sizeof(MyIDType), MPI_BYTE, list_min_id, sizeof(MyIDType), MPI_BYTE, Communicator);
+  MPI_Allgather(&idmax, sizeof(MyIDType), MPI_BYTE, list_max_id, sizeof(MyIDType), MPI_BYTE, Communicator);
+
+  int *num_list = (int *)Mem.mymalloc("num_list", NTask * sizeof(int));
+  MPI_Allgather(&Sp->NumPart, 1, MPI_INT, num_list, 1, MPI_INT, Communicator);
+
+  int nexport = 0, nimport = 0;
+  MyIDType *import_data = NULL, *export_data = NULL;
+
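+  /* Two passes over the stored IDs: in mode 0 we only count how many of them fall into the ID range of
+   * each target task (the tasks hold disjoint, sorted ID ranges after the parallel sorts above); the
+   * counts are then exchanged and turned into offsets, and in mode 1 the export buffer is actually filled. */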
+  for(int mode = 0; mode < 2; mode++)
+    {
+      for(int i = 0; i < NTask; i++)
+        Send_count[i] = 0;
+
+      int target = 0;
+
+      for(int i = 0; i < Sp->IdStore.NumPart; i++)
+        {
+          while(target < NTask - 1 && (num_list[target] == 0 || Sp->IdStore.ID[i] > list_max_id[target]))
+            target++;
+
+          if(num_list[target] == 0)
+            Terminate("How can this be? target=%d", target);
+
+          if(Sp->IdStore.ID[i] >= list_min_id[target] && Sp->IdStore.ID[i] <= list_max_id[target])
+            {
+              if(mode == 0)
+                Send_count[target]++;
+              else
+                {
+                  int off          = Send_offset[target] + Send_count[target]++;
+                  export_data[off] = Sp->IdStore.ID[i];
+                }
+            }
+        }
+
+      if(mode == 0)
+        {
+          MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
+          Recv_offset[0] = Send_offset[0] = 0;
+          for(int j = 0; j < NTask; j++)
+            {
+              nimport += Recv_count[j];
+              nexport += Send_count[j];
+              if(j > 0)
+                {
+                  Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
+                  Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
+                }
+            }
+
+          export_data = (MyIDType *)Mem.mymalloc("export_data", nexport * sizeof(MyIDType));
+          import_data = (MyIDType *)Mem.mymalloc("import_data", nimport * sizeof(MyIDType));
+        }
+    }
+
+  for(int ngrp = 0; ngrp < (1 << PTask); ngrp++) /* note: here we also have a transfer from each task to itself (for ngrp=0) */
+    {
+      int recvTask = ThisTask ^ ngrp;
+      if(recvTask < NTask)
+        if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+          MPI_Sendrecv(&export_data[Send_offset[recvTask]], Send_count[recvTask] * sizeof(MyIDType), MPI_BYTE, recvTask, TAG_DENS_B,
+                       &import_data[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(MyIDType), MPI_BYTE, recvTask, TAG_DENS_B,
+                       Communicator, MPI_STATUS_IGNORE);
+    }
+
+  /* incoming data should already be sorted, so now do the match */
+
+  int nmarked = 0;
+  for(int i = 0, j = 0; i < Sp->NumPart && j < nimport;)
+    {
+      if(Sp->P[i].ID.get() < import_data[j])
+        i++;
+      else if(Sp->P[i].ID.get() > import_data[j])
+        j++;
+      else
+        {
+          if(!Sp->P[i].ID.is_previously_most_bound())
+            {
+              Sp->P[i].ID.mark_as_formerly_most_bound();
+              nmarked++;
+            }
+          i++;
+          j++;
+        }
+    }
+
+  Mem.myfree(import_data);
+  Mem.myfree(export_data);
+
+  Mem.myfree(num_list);
+  Mem.myfree(list_max_id);
+  Mem.myfree(list_min_id);
+
+  Mem.myfree(Recv_offset);
+  Mem.myfree(Recv_count);
+  Mem.myfree(Send_offset);
+  Mem.myfree(Send_count);
+
+  long long tot_ncheck, tot_nmarked;
+  sumup_large_ints(1, &Sp->IdStore.NumPart, &tot_ncheck, Communicator);
+  sumup_large_ints(1, &nmarked, &tot_nmarked, Communicator);
+
+  mpi_printf("SUBFIND_ORPHAN_TREATMENT: Got %lld particles from previous snapshot, led to %lld additionally marked particles\n",
+             tot_ncheck, tot_nmarked);
+}
+
+#endif
diff --git a/src/subfind/subfind_processing.cc b/src/subfind/subfind_processing.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4c1f1da7978d0f03eee4d4d2d4430908aaf7d239
--- /dev/null
+++ b/src/subfind/subfind_processing.cc
@@ -0,0 +1,209 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  subfind_processing.cc
+ *
+ *  \brief main routines for processing a halo with the Subfind algorithm
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef SUBFIND
+
+#include <gsl/gsl_math.h>
+#include <mpi.h>
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../fof/fof.h"
+#include "../gravtree/gravtree.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/cxxsort.h"
+#include "../sort/parallel_sort.h"
+#include "../sort/peano.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+
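+/* subfind_processing() drives the Subfind analysis for one processor subset: it sets up a domain
+ * decomposition on the sub-communicator (weighting only the group's particles in the collective case),
+ * sorts the particles by group membership, builds an auxiliary single-rank domain for serially treated
+ * candidates, and then calls the per-group routine (subfind_hbt_single_group() or
+ * subfind_process_single_group(), depending on SUBFIND_HBT) either once for the collectively processed
+ * halo or in a loop over all locally stored groups. */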
+template <typename partset>
+void fof<partset>::subfind_processing(domain<partset> *SubDomain, domain_options mode)
+{
+  double t0 = Logs.second();
+
+  if(mode == COLL_SUBFIND)
+    {
+      /* sanity check: when we process a group collectively, there should be exactly one group, and it should be stored on the
+       * root of the processor subset */
+      if(SubThisTask == 0 && Ngroups != 1)
+        Terminate("Ngroups=%d != 1  SubNTask=%d SubThisTask=%d", Ngroups, SubNTask, SubThisTask);
+
+      if(SubThisTask != 0 && Ngroups != 0)
+        Terminate("Ngroups=%d != 0  SubNTask=%d SubThisTask=%d", Ngroups, SubNTask, SubThisTask);
+
+      subfind_collective_printf("SUBFIND: root-task=%d: Collectively doing halo %lld of length  %lld  on  %d  processors.\n", ThisTask,
+                                Group[0].GroupNr, (long long)Group[0].Len, SubNTask);
+
+      if(SubThisTask == 0)
+        {
+          GroupNr = Group[0].GroupNr;
+          Ascale  = Group[0].Ascale;
+        }
+
+      for(int i = 0; i < Tp->NumPart; i++)
+        if(Tp->PS[i].GroupNr.get() == GroupNr)
+          Tp->PS[i].DomainFlag = 1;
+        else
+          Tp->PS[i].DomainFlag = 0;
+
+      /* tell everybody in the set the group number */
+      MPI_Bcast(&GroupNr, 1, MPI_LONG_LONG, 0, SubComm);
+
+      /* tell everybody in the set the group's scale factor */
+      MPI_Bcast(&Ascale, 1, MPI_DOUBLE, 0, SubComm);
+    }
+  else
+    {
+      for(int i = 0; i < Tp->NumPart; i++)
+        Tp->PS[i].DomainFlag = 1;
+
+      if(SubNTask != 1)
+        Terminate("Strange: SubNTask=%d  Ngroups=%d  SubThisTask=%d (expect to be a single processor here)", SubNTask, Ngroups,
+                  SubThisTask);
+    }
+
+  /* Create a domain decomposition for the sub-communicator and the particles in it.
+   * For the serial algorithm, this will be trivial; for collectively treated groups, only particles in the group get a gravity weight.
+   * The outline of the toplevel tree nodes resulting from the domain decomposition can be used to create a gravity tree.
+   */
+
+  if(SubDomain->NumNodes != 0)
+    Terminate("SubDomain.NumNodes=%d\n", SubDomain->NumNodes);
+
+  double ta = Logs.second();
+  SubDomain->domain_decomposition(mode);
+  double tb = Logs.second();
+
+  mpi_printf("SUBFIND: subdomain decomposition took %g sec\n", Logs.timediff(ta, tb));
+
+  if(mode == COLL_SUBFIND)
+    SubDomain->particle_exchange_based_on_PS(SubComm);
+
+  for(int i = 0; i < Tp->NumPart; i++)
+    Tp->PS[i].SubRankInGr = INT_MAX; /* set a default that is larger than any reasonable subhalo rank within a group */
+
+  /* now let us sort according to GroupNr and Density. This step will temporarily break the association with SphP[] and other arrays!
+   */
+  submp = (submp_data *)Mem.mymalloc_movable(&submp, "submp", sizeof(submp_data) * Tp->NumPart);
+  for(int i = 0; i < Tp->NumPart; i++)
+    {
+      submp[i].index   = i;
+      submp[i].GroupNr = Tp->PS[i].GroupNr.get();
+#ifndef SUBFIND_HBT
+      submp[i].DM_Density = Tp->PS[i].u.s.u.DM_Density;
+#endif
+    }
+  mycxxsort(submp, submp + Tp->NumPart, subfind_compare_submp_GroupNr_DM_Density);
+
+  /* In this index list, we store the indices of the local particles in the group that we currently need to process (its length
+   * is at most NumPart) */
+  IndexList = (int *)Mem.mymalloc_movable(&IndexList, "IndexList", Tp->NumPart * sizeof(int));
+
+  MPI_Comm SingleComm;
+  int thistask;
+  MPI_Comm_rank(SubDomain->Communicator, &thistask);
+  MPI_Comm_split(SubDomain->Communicator, thistask, thistask, &SingleComm);  // create a communicator for single ranks
+
+  /* prepare a domain decomposition for unbinding locally independent ones */
+  domain<partset> SingleDomain{SingleComm, Tp};
+
+  if(SingleDomain.NumNodes != 0)
+    Terminate("SubDomain.NumNodes=%d\n", SingleDomain.NumNodes);
+
+  double taa = Logs.second();
+  SingleDomain.domain_decomposition(SERIAL_SUBFIND);
+  double tbb = Logs.second();
+
+  mpi_printf("SUBFIND: serial subfind subdomain decomposition took %g sec\n", Logs.timediff(taa, tbb));
+
+  double ta0 = Logs.second();
+  if(mode == COLL_SUBFIND)
+    {
+      /* determine the number of local group particles, and fill the list of the indices */
+      NumPartGroup = 0;
+      for(int i = 0; i < Tp->NumPart; i++)
+        if(Tp->PS[i].GroupNr.get() == GroupNr)
+          IndexList[NumPartGroup++] = i;
+
+          /* call the processing of the group */
+#ifdef SUBFIND_HBT
+      subfind_hbt_single_group(SubDomain, &SingleDomain, mode, 0);
+#else
+      subfind_process_single_group(SubDomain, &SingleDomain, mode, 0);
+#endif
+    }
+  else
+    {
+      int i = 0;
+      for(int gr = 0; gr < Ngroups; gr++) /* process all local groups */
+        {
+          GroupNr = Group[gr].GroupNr;
+          Ascale  = Group[gr].Ascale;
+
+          /* determine the number of local group particles, and set up the list of the indices */
+          NumPartGroup = 0;
+          for(; i < Tp->NumPart; i++)
+            if(Tp->PS[submp[i].index].GroupNr.get() == GroupNr)
+              IndexList[NumPartGroup++] = submp[i].index;
+            else
+              break;
+
+              /* do local group with Group[] index 'gr' */
+#ifdef SUBFIND_HBT
+          subfind_hbt_single_group(SubDomain, &SingleDomain, mode, gr);
+#else
+          subfind_process_single_group(SubDomain, &SingleDomain, mode, gr);
+#endif
+        }
+    }
+  double tb0 = Logs.second();
+  mpi_printf("SUBFIND: subfind_hbt_single_group() processing for Ngroups=%d took %g sec\n", Ngroups, Logs.timediff(ta0, tb0));
+
+  SingleDomain.domain_free();
+  MPI_Comm_free(&SingleComm);
+
+  Mem.myfree(IndexList);
+  Mem.myfree(submp);
+
+  SubDomain->domain_free();
+
+  double t1 = Logs.second();
+
+  subfind_collective_printf("SUBFIND: root-task=%d: Collective processing of halo %d took %g\n", ThisTask, Group[0].GroupNr,
+                            Logs.timediff(t0, t1));
+
+  if(!(mode == COLL_SUBFIND) && ThisTask == 0)
+    mpi_printf("SUBFIND: root-task=%d: Serial processing of halo %d took %g\n", ThisTask, Group[0].GroupNr, Logs.timediff(t0, t1));
+}
+
+/* now make sure that the following classes are really instantiated, otherwise we may get a linking problem */
+#include "../data/simparticles.h"
+template class fof<simparticles>;
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES_GROUPS)
+#include "../data/lcparticles.h"
+template class fof<lcparticles>;
+#endif
+
+#endif
diff --git a/src/subfind/subfind_properties.cc b/src/subfind/subfind_properties.cc
new file mode 100644
index 0000000000000000000000000000000000000000..04eca09842ec0d9e8a8b8dd3aaa0b6cdb78451fa
--- /dev/null
+++ b/src/subfind/subfind_properties.cc
@@ -0,0 +1,594 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  subfind_properties.cc
+ *
+ *  \brief determination of subhalo properties
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef SUBFIND
+
+#include <mpi.h>
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../fof/fof.h"
+#include "../gravtree/gravtree.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/cxxsort.h"
+#include "../sort/parallel_sort.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+#include "../time_integration/driftfac.h"
+
+template <>
+void fof<simparticles>::subfind_get_factors(double &fac_vel_to_phys, double &fac_hubbleflow, double &fac_comov_to_phys)
+{
+  if(All.ComovingIntegrationOn)
+    {
+      fac_vel_to_phys   = 1.0 / All.Time;  // converts to physical velocity
+      fac_hubbleflow    = Driftfac.hubble_function(All.Time);
+      fac_comov_to_phys = All.Time;  // converts comoving distance to physical distance
+    }
+  else
+    {  // non-comoving integration
+      fac_vel_to_phys   = 1;
+      fac_hubbleflow    = 0;
+      fac_comov_to_phys = 1;
+    }
+}
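+
+/* The three factors are used further below: fac_vel_to_phys converts the stored velocities into physical
+ * peculiar velocities, fac_hubbleflow * dx adds the Hubble-flow contribution to relative velocities, and
+ * fac_comov_to_phys converts comoving separations into physical ones, so that the velocity dispersion,
+ * angular momentum and radii of a subhalo come out in physical units. */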
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES_GROUPS)
+template <>
+void fof<lcparticles>::subfind_get_factors(double &fac_vel_to_phys, double &fac_hubbleflow, double &fac_comov_to_phys)
+{
+  fac_vel_to_phys   = 1.0;  // the velocities on the lightcone are already peculiar velocities
+  fac_hubbleflow    = Driftfac.hubble_function(Ascale);
+  fac_comov_to_phys = Ascale;
+}
+#endif
+
+template <typename partset>
+int fof<partset>::subfind_determine_sub_halo_properties(int *d, int num, subhalo_properties *subhalo, MPI_Comm Communicator)
+{
+  /* get the local communication context */
+  int commNTask, commThisTask;
+  MPI_Comm_size(Communicator, &commNTask);
+  MPI_Comm_rank(Communicator, &commThisTask);
+
+  double fac_vel_to_phys, fac_hubbleflow, fac_comov_to_phys;
+  subfind_get_factors(fac_vel_to_phys, fac_hubbleflow, fac_comov_to_phys);
+
+  typename partset::pdata *P = Tp->P;
+  subfind_data *PS           = Tp->PS;
+
+  double mass_tab[NTYPES], halfmassradtype[NTYPES];
+  long long len_type[NTYPES];
+  for(int j = 0; j < NTYPES; j++)
+    {
+      mass_tab[j]        = 0;
+      len_type[j]        = 0;
+      halfmassradtype[j] = 0;
+    }
+
+#ifdef STARFORMATION
+  double sfr = 0;
+#endif
+
+  int minindex  = -1;
+  double minpot = MAX_DOUBLE_NUMBER;
+  for(int i = 0; i < num; i++)
+    {
+      int p = d[i];
+      if(PS[p].u.s.u.DM_Potential < minpot || minindex == -1)
+        {
+          minpot   = PS[p].u.s.u.DM_Potential;
+          minindex = p;
+        }
+      len_type[P[p].getType()]++;
+
+#ifdef STARFORMATION
+      if(P[p].getType() == 0)
+        sfr += Tp->SphP[p].Sfr;
+#endif
+    }
+
+  MPI_Allreduce(MPI_IN_PLACE, len_type, NTYPES, MPI_LONG_LONG, MPI_SUM, Communicator);
+
+  double *minpotlist = (double *)Mem.mymalloc("minpotlist", commNTask * sizeof(double));
+  MPI_Allgather(&minpot, 1, MPI_DOUBLE, minpotlist, 1, MPI_DOUBLE, Communicator);
+
+  int mincpu = -1;
+  minpot     = MAX_DOUBLE_NUMBER;
+  for(int i = 0; i < commNTask; i++)
+    if(minpotlist[i] < minpot)
+      {
+        mincpu = i;
+        minpot = minpotlist[mincpu];
+      }
+
+  Mem.myfree(minpotlist);
+
+  if(mincpu < 0)
+    Terminate("mincpu < 0");
+
+  MyIntPosType intpos[3];
+
+  if(commThisTask == mincpu)
+    for(int j = 0; j < 3; j++)
+      intpos[j] = P[minindex].IntPos[j];
+
+  MPI_Bcast(intpos, 3 * sizeof(MyIntPosType), MPI_BYTE, mincpu, Communicator);
+
+#ifdef STARFORMATION
+  MPI_Allreduce(MPI_IN_PLACE, &sfr, 1, MPI_DOUBLE, MPI_SUM, Communicator);
+#endif
+
+  /* intpos[] now holds the integer position of the minimum of the potential */
+  /* we'll take that as the center */
+
+  /* determine the particle ID with the smallest binding energy */
+  minindex = -1;
+  minpot   = MAX_DOUBLE_NUMBER;
+  for(int i = 0; i < num; i++)
+    {
+      int p = d[i];
+      if(PS[p].v.DM_BindingEnergy < minpot || minindex == -1)
+        {
+          minpot   = PS[p].v.DM_BindingEnergy;
+          minindex = p;
+        }
+    }
+
+  MyIDType mostboundid;
+
+  minpotlist = (double *)Mem.mymalloc("minpotlist", commNTask * sizeof(double));
+  MPI_Allgather(&minpot, 1, MPI_DOUBLE, minpotlist, 1, MPI_DOUBLE, Communicator);
+
+  mincpu = -1;
+  minpot = MAX_DOUBLE_NUMBER;
+  for(int i = 0; i < commNTask; i++)
+    if(minpotlist[i] < minpot)
+      {
+        mincpu = i;
+        minpot = minpotlist[mincpu];
+      }
+
+  Mem.myfree(minpotlist);
+
+  if(mincpu < 0)
+    Terminate("mincpu < 0");
+
+  int marked = 0;
+
+  if(commThisTask == mincpu)
+    {
+      mostboundid = P[minindex].ID.get();
+#if defined(SUBFIND_ORPHAN_TREATMENT)
+      if(!P[minindex].ID.is_previously_most_bound())
+        {
+          P[minindex].ID.mark_as_formerly_most_bound();
+          marked++;
+        }
+#endif
+    }
+
+  MPI_Bcast(&mostboundid, sizeof(mostboundid), MPI_BYTE, mincpu, Communicator);
+
+  /* let's get bulk velocity and the center-of-mass */
+  /* here we still take all particles */
+
+  double s[3] = {0, 0, 0}, v[3] = {0, 0, 0};
+  double mass = 0;
+
+#if defined(SUBFIND_ORPHAN_TREATMENT)
+  int totprevmostboundlen = 0;
+#endif
+
+  for(int i = 0; i < num; i++)
+    {
+      int p = d[i];
+
+      double dxyz[3];
+      Tp->nearest_image_intpos_to_pos(P[p].IntPos, intpos, dxyz);
+
+      s[0] += Tp->P[p].getMass() * dxyz[0];
+      s[1] += Tp->P[p].getMass() * dxyz[1];
+      s[2] += Tp->P[p].getMass() * dxyz[2];
+
+      for(int j = 0; j < 3; j++)
+        v[j] += Tp->P[p].getMass() * P[p].Vel[j];
+
+      mass += Tp->P[p].getMass();
+
+      int ptype = P[p].getType();
+      mass_tab[ptype] += Tp->P[p].getMass();
+#if defined(SUBFIND_ORPHAN_TREATMENT)
+      if(P[p].ID.is_previously_most_bound())
+        totprevmostboundlen++;
+#endif
+    }
+
+  double stot[3], vtot[3], masstot, mass_tabtot[NTYPES];
+
+  MPI_Allreduce(s, stot, 3, MPI_DOUBLE, MPI_SUM, Communicator);
+  MPI_Allreduce(&mass, &masstot, 1, MPI_DOUBLE, MPI_SUM, Communicator);
+  MPI_Allreduce(v, vtot, 3, MPI_DOUBLE, MPI_SUM, Communicator);
+  MPI_Allreduce(mass_tab, mass_tabtot, NTYPES, MPI_DOUBLE, MPI_SUM, Communicator);
+
+#if defined(SUBFIND_ORPHAN_TREATMENT)
+  MPI_Allreduce(MPI_IN_PLACE, &totprevmostboundlen, 1, MPI_INT, MPI_SUM, Communicator);
+#endif
+
+  mass = masstot;
+  for(int j = 0; j < 3; j++)
+    {
+      s[j] = stot[j];
+      v[j] = vtot[j];
+    }
+
+  for(int j = 0; j < NTYPES; j++)
+    mass_tab[j] = mass_tabtot[j];
+
+  double vel[3];
+  for(int j = 0; j < 3; j++)
+    {
+      s[j] /= mass; /* center of mass */
+      v[j] /= mass;
+      vel[j] = fac_vel_to_phys * v[j];
+    }
+
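+  /* to remain safe under periodic wrapping, the center of mass was accumulated as an offset relative
+   * to the potential minimum; convert this offset back to an absolute integer position before
+   * translating it into a floating point coordinate */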
+  MySignedIntPosType off[3];
+  Tp->pos_to_signedintpos(s, off);
+
+  MyIntPosType int_cm[3];
+
+  int_cm[0] = off[0] + intpos[0];
+  int_cm[1] = off[1] + intpos[1];
+  int_cm[2] = off[2] + intpos[2];
+
+  double cm[3];
+  Tp->intpos_to_pos(int_cm, cm);
+
+  double disp = 0, lx = 0, ly = 0, lz = 0;
+
+  sort_r2list *rr_list = (sort_r2list *)Mem.mymalloc("rr_list", sizeof(sort_r2list) * (num + 1));
+
+  for(int i = 0; i < num; i++)
+    {
+      int p = d[i];
+
+      double dx[3], dv[3];
+
+      /* relative to center of mass */
+
+      Tp->nearest_image_intpos_to_pos(P[p].IntPos, int_cm, dx);
+
+      dx[0] *= fac_comov_to_phys;
+      dx[1] *= fac_comov_to_phys;
+      dx[2] *= fac_comov_to_phys;
+
+      for(int j = 0; j < 3; j++)
+        {
+          dv[j] = fac_vel_to_phys * (P[p].Vel[j] - v[j]);
+          dv[j] += fac_hubbleflow * dx[j];
+
+          disp += Tp->P[p].getMass() * dv[j] * dv[j];
+        }
+
+      lx += Tp->P[p].getMass() * (dx[1] * dv[2] - dx[2] * dv[1]);
+      ly += Tp->P[p].getMass() * (dx[2] * dv[0] - dx[0] * dv[2]);
+      lz += Tp->P[p].getMass() * (dx[0] * dv[1] - dx[1] * dv[0]);
+
+      /* for rotation curve computation, take minimum of potential as center */
+
+      double dxyz[3];
+      Tp->nearest_image_intpos_to_pos(P[p].IntPos, intpos, dxyz);
+
+      for(int j = 0; j < 3; j++)
+        dxyz[j] *= fac_comov_to_phys;
+
+      double r2 = dxyz[0] * dxyz[0] + dxyz[1] * dxyz[1] + dxyz[2] * dxyz[2];
+
+      double r = sqrt(r2);
+
+      rr_list[i].mass = Tp->P[p].getMass();
+      rr_list[i].r    = r;
+    }
+
+  double spintot[3], disploc = disp;
+  double spinloc[3] = {lx, ly, lz};
+  MPI_Allreduce(spinloc, spintot, 3, MPI_DOUBLE, MPI_SUM, Communicator);
+  MPI_Allreduce(&disploc, &disp, 1, MPI_DOUBLE, MPI_SUM, Communicator);
+  lx = spintot[0];
+  ly = spintot[1];
+  lz = spintot[2];
+
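+  /* the stored spin is the specific angular momentum (angular momentum per unit mass) in physical units */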
+  double spin[3] = {lx / mass, ly / mass, lz / mass};
+
+  double veldisp = sqrt(disp / (3 * mass)); /* convert to 1d velocity dispersion */
+
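+  /* Next, build the cumulative mass profile M(<r) around the potential minimum by sorting all particles
+   * of the candidate by radius across the tasks. From this profile we obtain the half-mass radius and
+   * the rotation curve maximum, Vmax = sqrt(G * max_r[ M(<r)/r ]), where the innermost few particles
+   * are excluded from the maximum search. */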
+  mycxxsort_parallel(rr_list, rr_list + num, subfind_compare_dist_rotcurve, Communicator);
+
+  /* calculate cumulative mass */
+  for(int i = 1; i < num; i++)
+    rr_list[i].mass = rr_list[i - 1].mass + rr_list[i].mass;
+
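+  /* the parallel sort leaves each task with a contiguous radial segment of the profile, so every task
+   * must add the total mass residing on all lower-ranked tasks to make its cumulative masses globally valid */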
+  double halfmassrad = 0;
+  double max = 0, maxrad = 0;
+
+  double mass_part = 0;
+  if(num)
+    mass_part = rr_list[num - 1].mass;
+  double *masslist = (double *)Mem.mymalloc("masslist", commNTask * sizeof(double));
+  MPI_Allgather(&mass_part, 1, MPI_DOUBLE, masslist, 1, MPI_DOUBLE, Communicator);
+
+  double massbefore = 0;
+  for(int i = 0; i < commThisTask; i++)
+    massbefore += masslist[i];
+
+  for(int i = 0; i < num; i++)
+    rr_list[i].mass += massbefore;
+
+  Mem.myfree(masslist);
+
+  /* now calculate rotation curve maximum and half mass radius */
+
+  double halfmassrad_loc  = 0;
+  sort_r2list *rr_lowlist = (sort_r2list *)Mem.mymalloc("rr_lowlist", commNTask * sizeof(sort_r2list));
+  sort_r2list low_element;
+  if(num > 0)
+    low_element = rr_list[0];
+  else
+    {
+      low_element.mass = 0;
+      low_element.r    = 0;
+    }
+  MPI_Allgather(&low_element, sizeof(sort_r2list), MPI_BYTE, rr_lowlist, sizeof(sort_r2list), MPI_BYTE, Communicator);
+
+  rr_list[num].mass = 0;
+  rr_list[num].r    = 0;
+
+  for(int j = commThisTask + 1; j < commNTask; j++)
+    if(rr_lowlist[j].mass > 0)
+      {
+        rr_list[num] = rr_lowlist[j];
+        break;
+      }
+
+  Mem.myfree(rr_lowlist);
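+  /* rr_list[num] now holds the first element of the next non-empty task (or zero if there is none); it
+   * acts as a sentinel so that the half-mass interpolation below also works across task boundaries */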
+
+  int *numlist = (int *)Mem.mymalloc("numlist", commNTask * sizeof(int));
+  MPI_Allgather(&num, 1, MPI_INT, numlist, 1, MPI_INT, Communicator);
+
+  int nbefore = 0;
+  for(int i = 0; i < commThisTask; i++)
+    nbefore += numlist[i];
+
+  for(int i = num - 1; i >= 0; i--)
+    {
+      if((i + nbefore) > 5 && rr_list[i].mass > max * rr_list[i].r)
+        {
+          max    = rr_list[i].mass / rr_list[i].r;
+          maxrad = rr_list[i].r;
+        }
+
+      if(rr_list[i].mass < 0.5 * mass && rr_list[i + 1].mass >= 0.5 * mass)
+        halfmassrad_loc = 0.5 * (rr_list[i].r + rr_list[i + 1].r);
+    }
+
+  Mem.myfree(numlist);
+
+  MPI_Allreduce(&halfmassrad_loc, &halfmassrad, 1, MPI_DOUBLE, MPI_MAX, Communicator);
+  double *maxlist    = (double *)Mem.mymalloc("maxlist", commNTask * sizeof(double));
+  double *maxradlist = (double *)Mem.mymalloc("maxradlist", commNTask * sizeof(double));
+  MPI_Allgather(&max, 1, MPI_DOUBLE, maxlist, 1, MPI_DOUBLE, Communicator);
+  MPI_Allgather(&maxrad, 1, MPI_DOUBLE, maxradlist, 1, MPI_DOUBLE, Communicator);
+
+  max = maxrad = 0;
+  for(int i = 0; i < commNTask; i++)
+    {
+      if(maxlist[i] > max)
+        {
+          max    = maxlist[i];
+          maxrad = maxradlist[i];
+        }
+    }
+  Mem.myfree(maxradlist);
+  Mem.myfree(maxlist);
+
+  double vmax    = sqrt(All.G * max);
+  double vmaxrad = maxrad;
+
+  Mem.myfree(rr_list);
+
+  /* half mass radii for different types */
+
+  int len_type_loc[NTYPES];
+  for(int j = 0; j < NTYPES; j++)
+    len_type_loc[j] = 0;
+
+  for(int i = 0; i < num; i++)
+    {
+      int p     = d[i];
+      int ptype = P[p].getType();
+      len_type_loc[ptype]++;
+    }
+
+  for(int type = 0; type < NTYPES; type++)
+    {
+      sort_r2list *rr_list = (sort_r2list *)Mem.mymalloc("rr_list", sizeof(sort_r2list) * (len_type_loc[type] + 1));
+      int itmp             = 0;
+      for(int i = 0; i < num; i++)
+        {
+          int p = d[i];
+
+          int ptype = P[p].getType();
+
+          if(ptype == type)
+            {
+              double dxyz[3];
+              Tp->nearest_image_intpos_to_pos(P[p].IntPos, intpos, dxyz);
+
+              for(int j = 0; j < 3; j++)
+                dxyz[j] *= fac_comov_to_phys;
+
+              double r2 = dxyz[0] * dxyz[0] + dxyz[1] * dxyz[1] + dxyz[2] * dxyz[2];
+              double r  = sqrt(r2);
+
+              rr_list[itmp].mass = Tp->P[p].getMass();
+              rr_list[itmp].r    = r;
+              itmp++;
+            }
+        }
+
+      if(itmp != len_type_loc[type])
+        Terminate("should not occur: %d %d", itmp, len_type_loc[type]);
+
+      mycxxsort_parallel(rr_list, rr_list + len_type_loc[type], subfind_compare_dist_rotcurve, Communicator);
+
+      /* calculate cumulative mass */
+      for(int i = 1; i < len_type_loc[type]; i++)
+        rr_list[i].mass = rr_list[i - 1].mass + rr_list[i].mass;
+
+      double mass_part = 0;
+      if(len_type_loc[type])
+        mass_part = rr_list[len_type_loc[type] - 1].mass;
+      double *masslist = (double *)Mem.mymalloc("masslist", commNTask * sizeof(double));
+      MPI_Allgather(&mass_part, 1, MPI_DOUBLE, masslist, 1, MPI_DOUBLE, Communicator);
+
+      double massbefore = 0;
+      for(int i = 0; i < commThisTask; i++)
+        massbefore += masslist[i];
+
+      for(int i = 0; i < len_type_loc[type]; i++)
+        rr_list[i].mass += massbefore;
+
+      Mem.myfree(masslist);
+
+      /* now calculate half mass radii */
+      double halfmassrad_loc  = 0;
+      sort_r2list *rr_lowlist = (sort_r2list *)Mem.mymalloc("rr_lowlist", commNTask * sizeof(sort_r2list));
+      sort_r2list low_element;
+      if(len_type_loc[type] > 0)
+        low_element = rr_list[0];
+      else
+        {
+          low_element.mass = 0;
+          low_element.r    = 0;
+        }
+
+      MPI_Allgather(&low_element, sizeof(sort_r2list), MPI_BYTE, rr_lowlist, sizeof(sort_r2list), MPI_BYTE, Communicator);
+
+      rr_list[len_type_loc[type]].mass = 0;
+      rr_list[len_type_loc[type]].r    = 0;
+      for(int j = commThisTask + 1; j < commNTask; j++)
+        if(rr_lowlist[j].mass > 0)
+          {
+            rr_list[len_type_loc[type]] = rr_lowlist[j];
+            break;
+          }
+
+      Mem.myfree(rr_lowlist);
+
+      for(int i = len_type_loc[type] - 1; i >= 0; i--)
+        {
+          if(rr_list[i].mass < 0.5 * mass_tab[type] && rr_list[i + 1].mass >= 0.5 * mass_tab[type])
+            halfmassrad_loc = 0.5 * (rr_list[i].r + rr_list[i + 1].r);
+        }
+
+      MPI_Allreduce(&halfmassrad_loc, &halfmassradtype[type], 1, MPI_DOUBLE, MPI_MAX, Communicator);
+
+      Mem.myfree(rr_list);
+    }
+
+    /* properties of star forming gas */
+#ifdef STARFORMATION
+  double gasMassSfr = 0;
+  for(int i = 0; i < num; i++)
+    {
+      int p = d[i];
+
+      if(P[p].getType() == 0)
+        if(Tp->SphP[p].Sfr > 0)
+          gasMassSfr += P[p].getMass();
+    }
+#endif
+
+#ifdef STARFORMATION
+  double gasMassSfrtot;
+  MPI_Allreduce(&gasMassSfr, &gasMassSfrtot, 1, MPI_DOUBLE, MPI_SUM, Communicator);
+  gasMassSfr = gasMassSfrtot;
+#endif
+
+  long long totlen;
+  sumup_large_ints(1, &num, &totlen, Communicator);
+
+  /* now store the calculated properties in the subhalo structure */
+  if(commThisTask == 0)
+    {
+      subhalo->Len  = totlen;
+      subhalo->Mass = mass;
+
+      for(int j = 0; j < NTYPES; j++)
+        {
+          subhalo->MassType[j]           = mass_tab[j];
+          subhalo->LenType[j]            = len_type[j];
+          subhalo->SubHalfMassRadType[j] = halfmassradtype[j];
+        }
+
+      double pos[3];
+      fof_get_halo_position(intpos, pos);
+
+      for(int j = 0; j < 3; j++)
+        {
+          subhalo->IntPos[j] = intpos[j];
+          subhalo->Pos[j]    = pos[j];
+          subhalo->Vel[j]    = vel[j];
+          subhalo->CM[j]     = cm[j];
+          subhalo->Spin[j]   = spin[j];
+        }
+
+      subhalo->SubMostBoundID = mostboundid;
+      subhalo->SubVelDisp     = veldisp;
+      subhalo->SubVmax        = vmax;
+      subhalo->SubVmaxRad     = vmaxrad;
+      subhalo->SubHalfMassRad = halfmassrad;
+
+#if defined(SUBFIND_ORPHAN_TREATMENT)
+      subhalo->SubhaloLenPrevMostBnd = totprevmostboundlen;
+#endif
+#ifdef STARFORMATION
+      subhalo->Sfr        = sfr;
+      subhalo->GasMassSfr = gasMassSfr;
+#endif
+    }
+
+  return marked;
+}
+
+/* now make sure that the following classes are really instantiated, otherwise we may get a linking problem */
+#include "../data/simparticles.h"
+template class fof<simparticles>;
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES_GROUPS)
+#include "../data/lcparticles.h"
+template class fof<lcparticles>;
+#endif
+
+#endif
diff --git a/src/subfind/subfind_readid_io.cc b/src/subfind/subfind_readid_io.cc
new file mode 100644
index 0000000000000000000000000000000000000000..239e4eaa387b9b20723f98f82071db2ad943c1cd
--- /dev/null
+++ b/src/subfind/subfind_readid_io.cc
@@ -0,0 +1,257 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  subfind_readid_io.cc
+ *
+ *  \brief routines to read particle IDs of group catalogues
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef SUBFIND_ORPHAN_TREATMENT
+
+#include <errno.h>
+#include <hdf5.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+
+#include "../cooling_sfr/cooling.h"
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../data/particle_data.h"
+#include "../fof/fof.h"
+#include "../io/hdf5_util.h"
+#include "../io/io.h"
+#include "../logs/timer.h"
+#include "../main/main.h"
+#include "../main/simulation.h"
+#include "../mergertree/io_readsnap.h"
+#include "../mergertree/mergertree.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../subfind/subfind_readid_io.h"
+#include "../system/system.h"
+
+subreadid_io::subreadid_io(idstoredata *IdStore_ptr, MPI_Comm comm, int format) : IO_Def(comm, format)
+{
+  IdStore = IdStore_ptr;
+
+  this->N_IO_Fields  = 0;
+  this->N_DataGroups = NTYPES;
+  this->header_size  = sizeof(header);
+  this->header_buf   = &header;
+  this->type_of_file = FILE_IS_SNAPSHOT;
+  sprintf(this->info, "READSNAPID: reading snapshot IDs");
+
+  init_field("ID  ", "ParticleIDs", MEM_MY_ID_TYPE, FILE_MY_ID_TYPE, READ_IF_PRESENT, 1, A_IDS, IdStore->ID, NULL, ALL_TYPES, 0, 0, 0,
+             0, 0, 0, 0);
+}
+
+/*! \brief This function reads the IDs of the previously most-bound particles from the
+ * corresponding snapshot file set.
+ *
+ * The files are processed in two passes: in the first pass only the particle numbers ending up
+ * on each processor are determined and the ID storage is allocated, in the second pass the IDs
+ * themselves are read in.
+ *
+ * \param num number of the snapshot to be read
+ */
+void subreadid_io::previously_bound_read_snap_ids(int num)
+{
+  if(All.ICFormat < 1 || All.ICFormat > 4)
+    Terminate("ICFormat=%d not supported.\n", All.ICFormat);
+
+  char fname[MAXLEN_PATH_EXTRA], fname_multiple[MAXLEN_PATH_EXTRA];
+  sprintf(fname_multiple, "%s/snapdir_%03d/%s-prevmostboundonly_%03d", All.OutputDir, num, All.SnapshotFileBase, num);
+  sprintf(fname, "%s%s_%03d", All.OutputDir, All.SnapshotFileBase, num);
+
+  TIMER_START(CPU_SNAPSHOT);
+
+  int num_files = find_files(fname, fname_multiple);
+
+  if(num_files > 1)
+    strcpy(fname, fname_multiple);
+
+  /* we repeat reading the headers of the files two times. In the first iteration, only the
+   * particle numbers ending up on each processor are assembled, followed by memory allocation.
+   * In the second iteration, the data is actually read in.
+   */
+  for(int rep = 0; rep < 2; rep++)
+    {
+      IdStore->NumPart = 0;
+
+      read_files_driver(fname, rep, num_files);
+
+      /* now do the memory allocation */
+      if(rep == 0)
+        {
+          IdStore->ID = (MyIDType *)Mem.mymalloc_movable_clear(&IdStore->ID, "IdStore->ID", IdStore->NumPart * sizeof(MyIDType));
+        }
+    }
+
+  MPI_Barrier(Communicator);
+
+  mpi_printf("READSNAPID: reading done.\n");
+
+  TIMER_STOP(CPU_SNAPSHOT);
+}
+
+void subreadid_io::fill_file_header(int writeTask, int lastTask, long long *n_type, long long *ntot_type)
+{ /* empty */
+}
+
+void subreadid_io::read_file_header(const char *fname, int filenr, int readTask, int lastTask, long long *n_type, long long *ntot_type,
+                                    int *nstart)
+{
+  if(ThisTask == readTask)
+    {
+      if(filenr == 0 && nstart == NULL)
+        {
+          mpi_printf(
+              "\nREADSNAPID: filenr=%d, '%s' contains:\n"
+              "READSNAPID: Type 0 (gas):   %8lld  (tot=%15lld) masstab= %g\n",
+              filenr, fname, (long long)header.npart[0], (long long)header.npartTotal[0], All.MassTable[0]);
+
+          for(int type = 1; type < NTYPES; type++)
+            {
+              mpi_printf("READSNAPID: Type %d:         %8lld  (tot=%15lld) masstab= %g\n", type, (long long)header.npart[type],
+                         (long long)header.npartTotal[type], All.MassTable[type]);
+            }
+          mpi_printf("\n");
+        }
+    }
+
+  /* to collect the particles of this file at the beginning (in case several
+     snapshot files are read on the current CPU) we move the previously read
+     IDs such that a gap of the right size is created */
+
+  long long nall = 0;
+  for(int type = 0; type < NTYPES; type++)
+    {
+      ntot_type[type] = header.npart[type];
+
+      long long n_in_file = header.npart[type];
+      int ntask           = lastTask - readTask + 1;
+      int n_for_this_task = n_in_file / ntask;
+      if((ThisTask - readTask) < (n_in_file % ntask))
+        n_for_this_task++;
+
+      n_type[type] = n_for_this_task;
+
+      nall += n_for_this_task;
+    }
+
+  if(nstart)
+    {
+      memmove(&IdStore->ID[nall], &IdStore->ID[0], IdStore->NumPart * sizeof(MyIDType));
+
+      *nstart = 0;
+    }
+}
+
+void subreadid_io::write_header_fields(hid_t handle)
+{ /* empty */
+}
+
+/*! \brief This function reads the snapshot header in case of hdf5 files (i.e. format 3)
+ *
+ * \param fname file name of the snapshot as given in the parameter file
+ */
+void subreadid_io::read_header_fields(const char *fname)
+{
+  for(int i = 0; i < NTYPES; i++)
+    {
+      header.npart[i]      = 0;
+      header.npartTotal[i] = 0;
+      header.mass[i]       = 0;
+    }
+
+  hsize_t ntypes = NTYPES;
+
+  hid_t hdf5_file = my_H5Fopen(fname, H5F_ACC_RDONLY, H5P_DEFAULT);
+  hid_t handle    = my_H5Gopen(hdf5_file, "/Header");
+
+  /* check if the file in question actually has this number of types */
+  hid_t hdf5_attribute = my_H5Aopen_name(handle, "NumPart_ThisFile");
+  hid_t space          = H5Aget_space(hdf5_attribute);
+  hsize_t dims, len;
+  H5Sget_simple_extent_dims(space, &dims, &len);
+  H5Sclose(space);
+  if(len != ntypes)
+    Terminate("Length of NumPart_ThisFile attribute (%d) does not match NTYPES(ICS) (%d)", (int)len, (int)ntypes);
+  my_H5Aclose(hdf5_attribute, "NumPart_ThisFile");
+
+  /* now read the header fields */
+
+#ifdef GADGET2_HEADER
+  read_vector_attribute(handle, "NumPart_ThisFile", header.npart, H5T_NATIVE_UINT, ntypes);
+  read_vector_attribute(handle, "NumPart_Total", header.npartTotal, H5T_NATIVE_UINT, ntypes);
+  read_scalar_attribute(handle, "Omega0", &header.Omega0, H5T_NATIVE_DOUBLE);
+  read_scalar_attribute(handle, "OmegaLambda", &header.OmegaLambda, H5T_NATIVE_DOUBLE);
+  read_scalar_attribute(handle, "HubbleParam", &header.HubbleParam, H5T_NATIVE_DOUBLE);
+  read_scalar_attribute(handle, "Hubble", &header.Hubble, H5T_NATIVE_DOUBLE);
+  read_scalar_attribute(handle, "Flag_Sfr", &header.flag_sfr, H5T_NATIVE_INT);
+  read_scalar_attribute(handle, "Flag_Cooling", &header.flag_cooling, H5T_NATIVE_INT);
+  read_scalar_attribute(handle, "Flag_Feedback", &header.flag_feedback, H5T_NATIVE_INT);
+  read_scalar_attribute(handle, "Flag_DoublePrecision", &header.flag_doubleprecision, H5T_NATIVE_INT);
+#else
+  read_vector_attribute(handle, "NumPart_ThisFile", header.npart, H5T_NATIVE_UINT64, ntypes);
+  read_vector_attribute(handle, "NumPart_Total", header.npartTotal, H5T_NATIVE_UINT64, ntypes);
+#endif
+
+  read_scalar_attribute(handle, "BoxSize", &header.BoxSize, H5T_NATIVE_DOUBLE);
+  read_vector_attribute(handle, "MassTable", header.mass, H5T_NATIVE_DOUBLE, ntypes);
+  read_scalar_attribute(handle, "Time", &header.time, H5T_NATIVE_DOUBLE);
+  read_scalar_attribute(handle, "Redshift", &header.redshift, H5T_NATIVE_DOUBLE);
+  read_scalar_attribute(handle, "NumFilesPerSnapshot", &header.num_files, H5T_NATIVE_INT);
+
+  my_H5Gclose(handle, "/Header");
+  my_H5Fclose(hdf5_file, fname);
+}
+
+int subreadid_io::get_filenr_from_header(void) { return header.num_files; }
+
+void subreadid_io::set_filenr_in_header(int numfiles) { header.num_files = numfiles; }
+
+void subreadid_io::read_increase_numbers(int type, int n_for_this_task) { IdStore->NumPart += n_for_this_task; }
+
+void subreadid_io::get_datagroup_name(int type, char *buf) { sprintf(buf, "/PartType%d", type); }
+
+int subreadid_io::get_type_of_element(int index) { return 0; /* empty */ }
+
+void subreadid_io::set_type_of_element(int index, int type)
+{ /* empty */
+}
+
+void *subreadid_io::get_base_address_of_structure(enum arrays array, int index)
+{
+  switch(array)
+    {
+      case A_IDS:
+        return (void *)(IdStore->ID + index);
+      default:
+        Terminate("we don't expect to get here");
+    }
+
+  return NULL;
+}
+#endif
diff --git a/src/subfind/subfind_readid_io.h b/src/subfind/subfind_readid_io.h
new file mode 100644
index 0000000000000000000000000000000000000000..e8330eea31dd2d92b330400808f5c1575a2d7682
--- /dev/null
+++ b/src/subfind/subfind_readid_io.h
@@ -0,0 +1,115 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  subfind_readid_io.h
+ *
+ *  \brief declaration of a class for reading particle IDs of group catalogues
+ */
+
+#ifndef SUBREADID_IO_H
+#define SUBREADID_IO_H
+
+#include "gadgetconfig.h"
+
+#ifdef SUBFIND_ORPHAN_TREATMENT
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../data/particle_data.h"
+#include "../fof/fof.h"
+#include "../io/hdf5_util.h"
+#include "../io/io.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mergertree/mergertree.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/parallel_sort.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+
+class subreadid_io : public IO_Def
+{
+ private:
+  idstoredata *IdStore;
+
+ public:
+  subreadid_io(idstoredata *IdStore_ptr, MPI_Comm comm, int format);
+
+  void previously_bound_read_snap_ids(int num);
+
+  /* supplied virtual functions */
+  void fill_file_header(int writeTask, int lastTask, long long *nloc_part, long long *npart);
+  void read_file_header(const char *fname, int filenr, int readTask, int lastTask, long long *nloc_part, long long *npart,
+                        int *nstart);
+  void get_datagroup_name(int grnr, char *gname);
+  void write_header_fields(hid_t);
+  void read_header_fields(const char *fname);
+  void read_increase_numbers(int type, int n_for_this_task);
+  int get_filenr_from_header(void);
+  void set_filenr_in_header(int);
+  void *get_base_address_of_structure(enum arrays array, int index);
+  int get_type_of_element(int index);
+  void set_type_of_element(int index, int type);
+
+  /** Header for the standard file format.
+   */
+#ifdef GADGET2_HEADER
+  struct io_header
+  {
+    int npart[NTYPES_HEADER];                       /**< number of particles of each type in this file */
+    double mass[NTYPES_HEADER];                     /**< mass of particles of each type. If 0, then the masses are explicitly
+                                                           stored in the mass-block of the snapshot file, otherwise they are omitted */
+    double time;                                    /**< time of snapshot file */
+    double redshift;                                /**< redshift of snapshot file */
+    int flag_sfr;                                   /**< flags whether the simulation was including star formation */
+    int flag_feedback;                              /**< flags whether feedback was included (obsolete) */
+    unsigned int npartTotal[NTYPES_HEADER];         /**< total number of particles of each type in this snapshot. This can be
+                                               different from npart if one is dealing with a multi-file snapshot. */
+    int flag_cooling;                               /**< flags whether cooling was included  */
+    int num_files;                                  /**< number of files in multi-file snapshot */
+    double BoxSize;                                 /**< box-size of simulation in case periodic boundaries were used */
+    double Omega0;                                  /**< matter density in units of critical density */
+    double OmegaLambda;                             /**< cosmological constant parameter */
+    double HubbleParam;                             /**< Hubble parameter in units of 100 km/sec/Mpc */
+    double Hubble;                                  /**< Hubble constant in internal units */
+    unsigned int npartTotalHighWord[NTYPES_HEADER]; /**< High word of the total number of particles of each type */
+    int flag_entropy_instead_u;                     /**< flags that IC-file contains entropy instead of u */
+    int flag_doubleprecision;                       /**< flags that snapshot contains double-precision instead of single precision */
+    int flag_ic_info;        /*!< flag to inform whether IC files are generated with ordinary Zeldovich approximation,
+                                    or whether they contain 2nd order Lagrangian perturbation theory initial conditions.
+                                    For snapshot files, the value informs whether the simulation was evolved from
+                                    Zeldovich or 2lpt ICs. Encoding is as follows:
+                                      FLAG_ZELDOVICH_ICS     (1)   - IC file based on Zeldovich
+                                      FLAG_SECOND_ORDER_ICS  (2)   - Special IC-file containing 2lpt masses
+                                     All other values, including 0, are interpreted as "don't know" for backwards compatibility.
+                                */
+    float lpt_scalingfactor; /*!< scaling factor for 2lpt initial conditions */
+    char fill[48];           /**< fills to 256 Bytes, for compatibility with Gadget2/3 */
+  };
+  io_header header; /**< holds header for snapshot files */
+#else
+
+  /* new simplified header */
+  struct io_header
+  {
+    long long npart[NTYPES_HEADER]; /**< number of particles of each type in this file */
+    long long npartTotal[NTYPES_HEADER];
+    double mass[NTYPES_HEADER]; /**< mass of particles of each type. If 0, then the masses are explicitly
+                                       stored in the mass-block of the snapshot file, otherwise they are omitted */
+    double time;                /**< time of snapshot file */
+    double redshift;            /**< redshift of snapshot file */
+    double BoxSize;             /**< box-size of simulation in case periodic boundaries were used */
+    int num_files;              /**< number of files in multi-file snapshot */
+  };
+  io_header header; /**< holds header for snapshot files */
+
+#endif
+};
+
+#endif
+
+#endif /* SUBREADID_IO_H */
diff --git a/src/subfind/subfind_so.cc b/src/subfind/subfind_so.cc
new file mode 100644
index 0000000000000000000000000000000000000000..71da26b6e62d7a386972326a3a26a4ed390e0935
--- /dev/null
+++ b/src/subfind/subfind_so.cc
@@ -0,0 +1,426 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  subfind_so.cc
+ *
+ *  \brief spherical overdensity virial radius determination and property calculation
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef SUBFIND
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../fof/fof.h"
+#include "../gravtree/gravtree.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/generic_comm.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+
+static double *R200, *M200;
+
+/* local data structure for collecting particle/cell data that is sent to other processors if needed */
+struct sodata_in : data_in_generic
+{
+  MyIntPosType IntPos[3];
+  double R200;
+};
+
+struct sodata_out
+{
+  double Mass;
+};
+
+template <typename T_tree, typename T_domain, typename T_partset>
+class sodata_comm : public generic_comm<sodata_in, sodata_out, T_tree, T_domain, T_partset>
+{
+ public:
+  typedef generic_comm<sodata_in, sodata_out, T_tree, T_domain, T_partset> gcomm;
+  using gcomm::D;
+  using gcomm::Thread;
+  using gcomm::Tp;  // This makes sure that we can access Tp from the base class without having to use "this->Tp"
+  using gcomm::Tree;
+
+  typename fof<T_partset>::group_properties *Group;
+
+  /* need to call the base class constructor explicitly */
+  sodata_comm(T_domain *dptr, T_tree *tptr, T_partset *pptr, typename fof<T_partset>::group_properties *grpptr)
+      : gcomm(dptr, tptr, pptr)
+  {
+    Group = grpptr;
+  }
+
+  /* routine that fills the relevant particle/cell data into the input structure defined above */
+  void particle2in(sodata_in *in, int i) override
+  {
+    in->IntPos[0] = Group[i].IntPos[0];
+    in->IntPos[1] = Group[i].IntPos[1];
+    in->IntPos[2] = Group[i].IntPos[2];
+
+    in->R200 = R200[i];
+  }
+
+  /* routine to store or combine result data */
+  void out2particle(sodata_out *out, int i, int mode) override
+  {
+    if(mode == MODE_LOCAL_PARTICLES) /* initial store */
+      M200[i] = out->Mass;
+    else /* combine */
+      M200[i] += out->Mass;
+  }
+
+  /*! This function represents the core of the enclosed-mass computation: it walks the gravity
+   *  tree and sums up the mass of all particles within the trial radius R200 around the group
+   *  center. The target group may either be local, or reside in the communication buffer.
+   */
+  int evaluate(int target, int mode, int thread_id, int action, sodata_in *in, int numnodes, node_info *firstnode,
+               sodata_out &out) override
+  {
+    MyIntPosType *intpos = in->IntPos;
+    double hsml          = in->R200;
+
+    double mass = 0;
+
+    for(int k = 0; k < numnodes; k++)
+      {
+        int no;
+
+        if(mode == MODE_LOCAL_PARTICLES)
+          {
+            no = Tree->MaxPart; /* root node */
+          }
+        else
+          {
+            no = firstnode[k].Node;
+            no = Tree->get_nodep(no)->nextnode; /* open it */
+          }
+
+        unsigned int shmrank = Tree->TreeSharedMem_ThisTask;
+
+        while(no >= 0)
+          {
+            if(no < Tree->MaxPart) /* single particle */
+              {
+                auto P = Tree->get_Pp(no, shmrank);
+
+                no = Tree->get_nextnodep(shmrank)[no]; /* note: here shmrank cannot change */
+
+                double dxyz[3];
+                Tp->nearest_image_intpos_to_pos(P->IntPos, intpos, dxyz); /* converts the integer distance to floating point */
+
+                double h2 = hsml * hsml;
+
+                double r2 = dxyz[0] * dxyz[0];
+                if(r2 > h2)
+                  continue;
+
+                r2 += dxyz[1] * dxyz[1];
+                if(r2 > h2)
+                  continue;
+
+                r2 += dxyz[2] * dxyz[2];
+                if(r2 > h2)
+                  continue;
+
+                mass += P->getMass();
+              }
+            else if(no < Tree->MaxPart + Tree->MaxNodes) /* internal node */
+              {
+                if(mode == MODE_IMPORTED_PARTICLES)
+                  {
+                    /* we reached a top-level node again, which means that we are done with the branch */
+                    if(no < Tree->FirstNonTopLevelNode)
+                      break;
+                  }
+
+                gravnode *current = Tree->get_nodep(no, shmrank);
+
+                no      = current->sibling; /* in case the node can be discarded */
+                shmrank = current->sibling_shmrank;
+
+                /* converts the integer distance to floating point */
+                double dxyz[3];
+                Tp->nearest_image_intpos_to_pos(current->center.da, intpos, dxyz);
+
+                double lenhalf = (((MyIntPosType)1) << (BITS_FOR_POSITIONS - 1 - current->level)) * Tp->FacIntToCoord;
+
+                double dist = hsml + lenhalf;
+
+                if(fabs(dxyz[0]) > dist)
+                  continue;
+                if(fabs(dxyz[1]) > dist)
+                  continue;
+                if(fabs(dxyz[2]) > dist)
+                  continue;
+
+                /* now test against the minimal sphere enclosing everything */
+                dist += FACT1 * 2.0 * lenhalf;
+
+                double r2 = dxyz[0] * dxyz[0] + dxyz[1] * dxyz[1] + dxyz[2] * dxyz[2];
+
+                if(r2 > dist * dist)
+                  continue;
+
+                if(no >= Tree->FirstNonTopLevelNode) /* only do this for fully local nodes */
+                  {
+                    double lenhalf = (((MyIntPosType)1) << (BITS_FOR_POSITIONS - 1 - current->level)) * Tp->FacIntToCoord;
+
+                    /* test whether the node is contained within the sphere */
+                    dist = hsml - FACTSQRT3 * lenhalf;
+                    if(dist > 0)
+                      if(r2 < dist * dist)
+                        {
+                          mass += current->mass;
+                          continue;
+                        }
+                  }
+
+                no      = current->nextnode; /* ok, we need to open the node */
+                shmrank = current->nextnode_shmrank;
+              }
+            else if(no >= Tree->ImportedNodeOffset) /* point from imported nodelist */
+              {
+                int n = no - Tree->ImportedNodeOffset;
+                no    = Tree->Nextnode[no - Tree->MaxNodes];
+                /* note: here shmrank cannot change */
+
+                double dxyz[3];
+                Tp->nearest_image_intpos_to_pos(Tree->Points[n].IntPos, intpos,
+                                                dxyz); /* converts the integer distance to floating point */
+
+                double h2 = hsml * hsml;
+
+                double r2 = dxyz[0] * dxyz[0];
+                if(r2 > h2)
+                  continue;
+
+                r2 += dxyz[1] * dxyz[1];
+                if(r2 > h2)
+                  continue;
+
+                r2 += dxyz[2] * dxyz[2];
+                if(r2 > h2)
+                  continue;
+
+                mass += Tree->Points[n].Mass;
+              }
+            else /* pseudo particle */
+              {
+                if(mode == MODE_LOCAL_PARTICLES)
+                  Tree->tree_export_node_threads(no, target, &Thread);
+
+                no = Tree->Nextnode[no - Tree->MaxNodes];
+              }
+          }
+      }
+
+    out.Mass = mass;
+
+    return 0;
+  }
+};
+
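+/* Returns the target overdensity relative to the mean matter density for the four spherical-overdensity
+ * definitions used below: type 0 = 200 x mean, type 1 = generalized tophat value following the fitting
+ * formula of Bryan & Norman (1998), Delta_c = 18 pi^2 + 82 x - 39 x^2 with x = Omega(z) - 1 (defined with
+ * respect to the critical density, hence the division by Omega(z)), type 2 = 200 x critical, and
+ * type 3 = 500 x critical.
+ */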
+template <typename partset>
+double fof<partset>::subfind_get_overdensity_value(int type, double ascale)
+{
+  double z = 1 / ascale - 1;
+  double omegaz =
+      All.Omega0 * pow(1 + z, 3) / (All.Omega0 * pow(1 + z, 3) + (1 - All.Omega0 - All.OmegaLambda) * pow(1 + z, 2) + All.OmegaLambda);
+  double x = omegaz - 1;
+
+  if(type == 0)
+    {
+      return 200.0;  // Mean200
+    }
+  else if(type == 1)
+    {  // Generalized Tophat overdensity
+      return (18 * M_PI * M_PI + 82 * x - 39 * x * x) / omegaz;
+    }
+  else if(type == 2)
+    {
+      return 200.0 / omegaz;  // DeltaCrit200
+    }
+  else if(type == 3)
+    {
+      return 500.0 / omegaz;  // DeltaCrit500
+    }
+  else
+    Terminate("can't be");
+
+  return 0;
+}
+
+template <typename partset>
+double fof<partset>::subfind_overdensity(void)
+{
+  int *TargetList = (int *)Mem.mymalloc("TargetList", Ngroups * sizeof(int));
+
+  double *Left  = (double *)Mem.mymalloc("Left", sizeof(double) * Ngroups);
+  double *Right = (double *)Mem.mymalloc("Right", sizeof(double) * Ngroups);
+
+  R200 = (double *)Mem.mymalloc("R200", sizeof(double) * Ngroups);
+  M200 = (double *)Mem.mymalloc("M200", sizeof(double) * Ngroups);
+
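+  /* mean comoving matter density in internal units, rho_back = 3 Omega0 H0^2 / (8 pi G) */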
+  double rhoback = 3 * All.Omega0 * All.Hubble * All.Hubble / (8 * M_PI * All.G);
+
+  double tstart = Logs.second();
+
+  sodata_comm<gravtree<partset>, domain<partset>, partset> commpattern{FoFDomain, &FoFGravTree, Tp, Group};
+
+  for(int rep = 0; rep < 4; rep++) /* repeat for all four overdensity values */
+    {
+      int Nso = 0;
+
+      for(int i = 0; i < Ngroups; i++)
+        {
+          if(Group[i].Nsubs > 0)
+            {
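+              /* initial guess for the bracketing interval: rguess is roughly the radius at which the mean
+               * enclosed density equals 200 times the critical density, i.e. the solution of
+               * M = (4 pi / 3) r^3 * 200 * 3 H0^2 / (8 pi G) = 100 H0^2 r^3 / G */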
+              double rguess = pow(All.G * Group[i].Mass / (100 * All.Hubble * All.Hubble), 1.0 / 3);
+
+              TargetList[Nso++] = i;
+
+              Right[i] = 3 * rguess;
+              Left[i]  = 0;
+
+              R200[i] = 0.5 * (Left[i] + Right[i]);
+            }
+        }
+
+      int iter = 0;
+      long long ntot;
+
+      /* we will repeat the whole thing for those groups where we didn't converge to a SO radius yet */
+      do
+        {
+          double t0 = Logs.second();
+
+          commpattern.execute(Nso, TargetList, MODE_DEFAULT);
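+          /* each call returns in M200[i] the mass enclosed within the current trial radius R200[i];
+           * below, the interval [Left, Right] is bisected depending on whether the implied overdensity
+           * still exceeds the target value */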
+
+          /* do final operations on results */
+          int npleft = 0;
+          for(int n = 0; n < Nso; n++)
+            {
+              int i = TargetList[n];
+
+              double overdensity = M200[i] / (4.0 * M_PI / 3.0 * R200[i] * R200[i] * R200[i]) / rhoback;
+
+              if((Right[i] - Left[i]) > 1.0e-4 * Left[i])
+                {
+                  /* need to redo this group */
+                  TargetList[npleft++] = i;
+
+                  double delta = subfind_get_overdensity_value(rep, Group[i].Ascale);
+                  if(overdensity > delta)
+                    Left[i] = R200[i];
+                  else
+                    Right[i] = R200[i];
+
+                  R200[i] = 0.5 * (Left[i] + Right[i]);
+
+                  if(iter >= MAXITER - 10)
+                    {
+                      printf("gr=%d task=%d  R200=%g Left=%g Right=%g Menclosed=%g Right-Left=%g\n   pos=(%g|%g|%g)\n", i, ThisTask,
+                             R200[i], Left[i], Right[i], M200[i], Right[i] - Left[i], Group[i].Pos[0], Group[i].Pos[1],
+                             Group[i].Pos[2]);
+                      myflush(stdout);
+                    }
+                }
+            }
+
+          Nso = npleft;
+
+          sumup_large_ints(1, &npleft, &ntot, Communicator);
+
+          double t1 = Logs.second();
+
+          if(ntot > 0)
+            {
+              iter++;
+
+              if(iter > 0)
+                mpi_printf("SUBFIND: SO iteration %2d: need to repeat for %12lld halo centers. (took %g sec)\n", iter, ntot,
+                           Logs.timediff(t0, t1));
+
+              if(iter > MAXITER)
+                Terminate("failed to converge in SO iteration");
+            }
+        }
+      while(ntot > 0);
+
+      for(int i = 0; i < Ngroups; i++)
+        {
+          if(Group[i].Nsubs > 0)
+            {
+              double overdensity = M200[i] / (4.0 * M_PI / 3.0 * R200[i] * R200[i] * R200[i]) / rhoback;
+
+              double delta = subfind_get_overdensity_value(rep, Group[i].Ascale);
+
+              if((overdensity - delta) > 0.1 * delta)
+                {
+                  R200[i] = M200[i] = 0;
+                }
+              else if(M200[i] < 5 * Group[i].Mass / Group[i].Len)
+                {
+                  R200[i] = M200[i] = 0;
+                }
+            }
+          else
+            R200[i] = M200[i] = 0;
+
+          switch(rep)
+            {
+              case 0:
+                Group[i].M_Mean200 = M200[i];
+                Group[i].R_Mean200 = R200[i];
+                break;
+              case 1:
+                Group[i].M_TopHat200 = M200[i];
+                Group[i].R_TopHat200 = R200[i];
+                break;
+              case 2:
+                Group[i].M_Crit200 = M200[i];
+                Group[i].R_Crit200 = R200[i];
+                break;
+              case 3:
+                Group[i].M_Crit500 = M200[i];
+                Group[i].R_Crit500 = R200[i];
+                break;
+            }
+        }
+    }
+
+  Mem.myfree(M200);
+  Mem.myfree(R200);
+  Mem.myfree(Right);
+  Mem.myfree(Left);
+  Mem.myfree(TargetList);
+
+  double tend = Logs.second();
+  return Logs.timediff(tstart, tend);
+}
+
+/* now make sure that the following classes are really instantiated, otherwise we may get a linking problem */
+#include "../data/simparticles.h"
+template class fof<simparticles>;
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES_GROUPS)
+#include "../data/lcparticles.h"
+template class fof<lcparticles>;
+#endif
+
+#endif
diff --git a/src/subfind/subfind_treepotential.cc b/src/subfind/subfind_treepotential.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dfe12cf432b4b3834fb07f0d84f061bda037aaf4
--- /dev/null
+++ b/src/subfind/subfind_treepotential.cc
@@ -0,0 +1,336 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  subfind_treepotential.cc
+ *
+ *  \brief routines to compute the gravitational potential of the particles making up a group
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef SUBFIND
+
+#include <mpi.h>
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../fof/fof.h"
+#include "../gravtree/gravtree.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/generic_comm.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/peano.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+
+/*! Structure for communication during the density computation. Holds data that is sent to other processors.
+ */
+struct potdata_in : data_in_generic
+{
+  MyIntPosType IntPos[3];
+
+  unsigned char Type;
+#if NSOFTCLASSES > 1
+  unsigned char SofteningClass;
+#endif
+};
+
+struct potdata_out
+{
+  MyFloat Potential;
+};
+
+template <typename T_tree, typename T_domain, typename T_partset>
+class potdata_comm : public generic_comm<potdata_in, potdata_out, T_tree, T_domain, T_partset>
+{
+ public:
+  typedef generic_comm<potdata_in, potdata_out, T_tree, T_domain, T_partset> gcomm;
+  typedef gravtree<T_partset> gtree;
+  using gcomm::D;
+  using gcomm::Thread;
+  using gcomm::Tp;  // This makes sure that we can access Tp from the base class without having to use "this->Tp"
+  using gcomm::Tree;
+
+  /* need to call the base class constructor explicitly */
+  potdata_comm(T_domain *dptr, T_tree *tptr, T_partset *pptr) : gcomm(dptr, tptr, pptr) {}
+
+  /* routine that fills the relevant particle/cell data into the input structure defined above */
+  void particle2in(potdata_in *in, int i) override
+  {
+    for(int k = 0; k < 3; k++)
+      in->IntPos[k] = Tp->P[i].IntPos[k];
+
+    in->Type = Tp->P[i].getType();
+#if NSOFTCLASSES > 1
+    in->SofteningClass = Tp->P[i].getSofteningClass();
+#endif
+  }
+
+  /* routine to store or combine result data */
+  void out2particle(potdata_out *out, int i, int mode) override
+  {
+    if(mode == MODE_LOCAL_PARTICLES) /* initial store */
+      Tp->PS[i].u.s.u.DM_Potential = out->Potential;
+    else /* combine */
+      Tp->PS[i].u.s.u.DM_Potential += out->Potential;
+  }
+
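+  /* Walks the gravity tree for the target particle (which may be local or imported) and accumulates its
+   * gravitational potential from all tree nodes that satisfy the Barnes-Hut opening criterion, using the
+   * monopole term (plus, if compiled in, the quadrupole correction). Returns the number of interactions. */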
+  int evaluate(int target, int mode, int thread_id, int action, potdata_in *in, int numnodes, node_info *firstnode,
+               potdata_out &out) override
+  {
+    gravnode *nop          = NULL;
+    double pot             = 0.0;
+    int ninteractions      = 0;
+    MyIntPosType intpos[3] = {in->IntPos[0], in->IntPos[1], in->IntPos[2]};
+
+    double theta2 = All.ErrTolTheta * All.ErrTolTheta;
+
+#if NSOFTCLASSES > 1
+    double hmax, h_i = All.ForceSoftening[in->SofteningClass];
+#else
+    double hmax = All.ForceSoftening[0];
+#endif
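+    /* the softening applied to each interaction is the larger of the softening of the target particle
+     * and that of the mass element it interacts with */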
+
+    for(int k = 0; k < numnodes; k++)
+      {
+        int no;
+
+        if(mode == 0)
+          no = Tree->MaxPart; /* root node */
+        else
+          {
+            no = firstnode[k].Node;
+
+            if(numnodes > Tree->MaxNodes)
+              Terminate("numnodes=%d Tree->MaxNodes=%d  Tree->NumNodes=%d no=%d", numnodes, Tree->MaxNodes, Tree->NumNodes, no);
+
+            no = Tree->get_nodep(no)->nextnode; /* open it */
+          }
+
+        unsigned int shmrank = Tree->TreeSharedMem_ThisTask;
+
+        while(no >= 0)
+          {
+            vector<double> dxyz;
+            double r2, mass;
+
+#if(MULTIPOLE_ORDER >= 3) || (MULTIPOLE_ORDER >= 2 && defined(EXTRAPOTTERM))
+            char flag_Q2Tensor = 0;
+#endif
+
+            if(no < Tree->MaxPart) /* single particle */
+              {
+                auto P = Tree->get_Pp(no, shmrank);
+
+                /* convert the integer distance to floating point */
+                Tp->nearest_image_intpos_to_pos(P->IntPos, intpos, dxyz.da);
+
+                r2 = dxyz.r2();
+
+                mass = P->getMass();
+
+#if NSOFTCLASSES > 1
+                /* note: use the shmrank-aware particle accessor here, consistent with the mass and position lookups above */
+                double h_j = All.ForceSoftening[P->getSofteningClass()];
+                hmax       = (h_j > h_i) ? h_j : h_i;
+#endif
+
+                no = Tree->get_nextnodep(shmrank)[no]; /* note: here shmrank cannot change */
+              }
+            else if(no < Tree->MaxPart + Tree->MaxNodes) /* we have an internal node */
+              {
+                if(mode == 1)
+                  {
+                    if(no < Tree->FirstNonTopLevelNode) /* we reached a top-level node again, which means that we are done with the
+                                                           branch */
+                      {
+                        no = -1;
+                        continue;
+                      }
+                  }
+
+                nop = Tree->get_nodep(no, shmrank);
+
+                if(nop->level == 0)
+                  {
+                    /* we always open the root node (its full node length couldn't be stored in the integer type) */
+                    no      = nop->nextnode;
+                    shmrank = nop->nextnode_shmrank;
+                    continue;
+                  }
+
+                MyIntPosType halflen = ((MyIntPosType)1) << ((BITS_FOR_POSITIONS - 1) - nop->level);
+                MyIntPosType intlen  = halflen << 1;
+
+                {
+                  /* check whether we lie very close to the cell, and if yes, open it */
+                  MyIntPosType dist[3] = {nop->center[0] - intpos[0], nop->center[1] - intpos[1], nop->center[2] - intpos[2]};
+
+                  dist[0] = (((MySignedIntPosType)dist[0]) >= 0) ? dist[0] : -dist[0];
+                  dist[1] = (((MySignedIntPosType)dist[1]) >= 0) ? dist[1] : -dist[1];
+                  dist[2] = (((MySignedIntPosType)dist[2]) >= 0) ? dist[2] : -dist[2];
+
+                  if(dist[0] < intlen && dist[1] < intlen && dist[2] < intlen)
+                    {
+                      /* open cell */
+                      no      = nop->nextnode;
+                      shmrank = nop->nextnode_shmrank;
+                      continue;
+                    }
+                }
+
+                /* converts the integer distance to floating point */
+                Tp->nearest_image_intpos_to_pos(nop->s.da, intpos, dxyz.da);
+
+                r2 = dxyz.r2();
+
+                mass = nop->mass;
+
+                double len  = intlen * Tp->FacIntToCoord;
+                double len2 = len * len;
+
+                /* check Barnes-Hut opening criterion */
+                if(len2 > r2 * theta2)
+                  {
+                    /* open cell */
+                    no      = nop->nextnode;
+                    shmrank = nop->nextnode_shmrank;
+                    continue;
+                  }
+
+#if NSOFTCLASSES > 1
+                double h_j = All.ForceSoftening[nop->maxsofttype];
+
+                if(h_j > h_i)
+                  {
+                    if(r2 < h_j * h_j)
+                      {
+                        if(All.ForceSoftening[nop->minsofttype] < All.ForceSoftening[nop->maxsofttype])
+                          {
+                            /* open cell */
+                            no      = nop->nextnode;
+                            shmrank = nop->nextnode_shmrank;
+                            continue;
+                          }
+                      }
+                    hmax = h_j;
+                  }
+                else
+                  hmax = h_i;
+#endif
+
+                  /* ok, node can be used */
+
+#if(MULTIPOLE_ORDER >= 3) || (MULTIPOLE_ORDER >= 2 && defined(EXTRAPOTTERM))
+                /* will need to account for quadrupole tensor of node */
+                flag_Q2Tensor = 1;
+#endif
+
+                no      = nop->sibling;
+                shmrank = nop->sibling_shmrank;
+              }
+            else if(no >= Tree->ImportedNodeOffset) /* point from imported nodelist */
+              {
+                Terminate("We don't expect TreePoints here");
+              }
+            else /* pseudo particle */
+              {
+                if(mode == MODE_LOCAL_PARTICLES)
+                  Tree->tree_export_node_threads(no, target, &Thread);
+
+                no = Tree->Nextnode[no - Tree->MaxNodes];
+                continue;
+              }
+
+            /* now evaluate the multipole moment */
+            if(mass)
+              {
+                double r = sqrt(r2);
+
+                double rinv = (r > 0) ? 1.0 / r : 0;
+
+                typename gtree::gfactors gfac;
+
+                Tree->get_gfactors_potential(gfac, r, hmax, rinv);
+
+                pot -= mass * gfac.fac0;
+
+                ninteractions++;
+
+#if(MULTIPOLE_ORDER >= 3) || (MULTIPOLE_ORDER >= 2 && defined(EXTRAPOTTERM))
+                if(flag_Q2Tensor)
+                  {
+                    double g1               = gfac.fac1 * rinv;
+                    double g2               = gfac.fac2 * rinv * rinv;
+                    vector<MyDouble> Q2dxyz = nop->Q2Tensor * dxyz;
+                    double Q2dxyz2          = Q2dxyz * dxyz;
+                    double Q2trace          = nop->Q2Tensor.trace();
+
+                    pot -= 0.5 * (g1 * Q2trace + g2 * Q2dxyz2);  //  quadrupole potential
+                  }
+#endif
+              }
+          }
+      }
+
+    /* now store the result */
+
+    out.Potential = pot;
+
+    return ninteractions;
+  }
+};
+
+template <typename partset>
+void fof<partset>::subfind_potential_compute(domain<partset> *SubDomain, int num, int *darg)
+{
+  /* create an object for handling the communication */
+  potdata_comm<gravtree<partset>, domain<partset>, partset> commpattern{SubDomain, &FoFGravTree, Tp};
+
+  //  double t0 = Logs.second();
+
+  commpattern.execute(num, darg, MODE_DEFAULT);
+
+  /*
+  double t1 = Logs.second();
+
+  double costtotal = 0, costsum, npart = num, npartsum;
+  for(int i = 0; i < NUM_THREADS; i++)
+    costtotal += commpattern.Thread[i].Interactions;
+
+  MPI_Reduce(&costtotal, &costsum, 1, MPI_DOUBLE, MPI_SUM, 0, SubDomain->Communicator);
+  MPI_Reduce(&npart, &npartsum, 1, MPI_DOUBLE, MPI_SUM, 0, SubDomain->Communicator);
+  if(SubDomain->ThisTask == 0)
+    {
+      int task;
+      MPI_Comm_rank(Communicator, &task);
+
+      double dt = Logs.timediff(t0, t1);
+
+      if(task == 0)
+        printf("SUBFIND_POTENTIAL_COMPUTE: ThisTask=%d  npartsum=%g  took=%g   ia/part=%g  part/sec=%g\n",
+               task, npartsum, dt, costsum/npartsum, npartsum/dt);
+    }
+    */
+}
+
+/* now make sure that the following classes are really instantiated, otherwise we may get a linking problem */
+#include "../data/simparticles.h"
+template class fof<simparticles>;
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES_GROUPS)
+#include "../data/lcparticles.h"
+template class fof<lcparticles>;
+#endif
+
+#endif
diff --git a/src/subfind/subfind_unbind.cc b/src/subfind/subfind_unbind.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9c3cf7d5a5826aac9d41885b9d1066e72f066fdb
--- /dev/null
+++ b/src/subfind/subfind_unbind.cc
@@ -0,0 +1,316 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  subfind_unbind.cc
+ *
+ *  \brief carries out the gravitational unbinding of a subhalo candidate
+ */
+
+#include "gadgetconfig.h"
+
+#ifdef SUBFIND
+
+#include <gsl/gsl_math.h>
+#include <mpi.h>
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../fof/fof.h"
+#include "../gravtree/gravtree.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/cxxsort.h"
+#include "../sort/parallel_sort.h"
+#include "../sort/peano.h"
+#include "../subfind/subfind.h"
+#include "../system/system.h"
+
+#define MAX_UNBOUND_FRAC_BEFORE_BULK_VELOCITY_UPDATE 0.02
+#define MAX_UNBOUND_FRAC_BEFORE_POTENTIAL_UPDATE 0.20
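+/* Fractions of the candidate's particles that may be unbound before the bulk velocity (2%) and, presumably,
+ * the potential (20%) are recomputed; removing only small batches at a time keeps the iterative unbinding
+ * stable without requiring a full potential recomputation after every removal. */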
+
+/* this function takes a list of particles given via their indices in d[] and subjects them
+ * to a gravitational unbinding procedure. The number of bound particles is returned,
+ * and the array d[] is updated accordingly.
+ */
+template <typename partset>
+int fof<partset>::subfind_unbind(domain<partset> *D, MPI_Comm Communicator, int *d, int num)
+{
+  double fac_vel_to_phys, fac_hubbleflow, fac_comov_to_phys;
+  subfind_get_factors(fac_vel_to_phys, fac_hubbleflow, fac_comov_to_phys);
+
+  /* get the local communication context */
+  int commNTask, commThisTask;
+  MPI_Comm_size(Communicator, &commNTask);
+  MPI_Comm_rank(Communicator, &commThisTask);
+
+  typename partset::pdata *P = Tp->P;
+  subfind_data *PS           = Tp->PS;
+
+  /* we will start out by recomputing the potential for all particles based on all particles */
+  int phaseflag = RECOMPUTE_ALL;
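+  /* in later rounds (phaseflag != RECOMPUTE_ALL), the tree is built only from the particles removed since
+   * the last full computation, and their contribution is subtracted from the stored potentials instead of
+   * recomputing everything from scratch */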
+
+  int iter = 0;
+
+  long long totnum = num;
+  MPI_Allreduce(MPI_IN_PLACE, &totnum, 1, MPI_LONG_LONG, MPI_SUM, Communicator);
+
+  int num_removed = 0;
+  long long totremoved;
+
+  int *dremoved  = (int *)Mem.mymalloc("dremoved", num * sizeof(int));
+  double *potold = (double *)Mem.mymalloc("potold", num * sizeof(double));
+
+  do
+    {
+      FoFGravTree.treeallocate(Tp->NumPart, Tp, D);
+
+      if(phaseflag == RECOMPUTE_ALL)
+        {
+          FoFGravTree.treebuild(num, d);
+        }
+      else
+        {
+          FoFGravTree.treebuild(num_removed, dremoved);
+
+          for(int i = 0; i < num; i++)
+            potold[i] = PS[d[i]].u.s.u.DM_Potential;
+        }
+
+      /* let's compute the potential energy */
+      subfind_potential_compute(D, num, d);
+
+      FoFGravTree.treefree();
+
+      if(phaseflag == RECOMPUTE_ALL)
+        {
+          /* subtract self-potential and convert to physical potential */
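+          /* (Explanatory note: with spline softening length h = ForceSoftening, the kernel potential of a
+           * particle at its own position is -m / (h/2.8), so adding m / (h/2.8) here removes this
+           * self-contribution before the overall factor G is applied below.) */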
+          for(int i = 0; i < num; i++)
+            {
+              int softtype = P[d[i]].getSofteningClass();
+
+              PS[d[i]].u.s.u.DM_Potential += Tp->P[d[i]].getMass() / (All.ForceSoftening[softtype] / 2.8);
+              PS[d[i]].u.s.u.DM_Potential *= All.G / fac_comov_to_phys;
+            }
+        }
+      else
+        {
+          /* do not correct for self-potential and instead use calculated potential as correction to previous potential */
+          for(int i = 0; i < num; i++)
+            {
+              PS[d[i]].u.s.u.DM_Potential *= All.G / fac_comov_to_phys;
+              PS[d[i]].u.s.u.DM_Potential = potold[i] - PS[d[i]].u.s.u.DM_Potential;
+            }
+        }
+
+      /* At this point, we have in num/d[] the list of still considered particles, and the potential is current */
+
+      /* We now unbind particles with positive total energy, iterating until none are left. Because the kinetic energy depends on
+       * the velocity frame, we recompute the bulk velocity and the center of mass after at most 2% of the particles have been removed
+       */
+
+      /* Determine in intpos[] the position of the potential minimum among the particles,
+       * which we take as the center of the halo
+       */
+      int minindex = -1;
+      struct
+      {
+        double pot;
+        int rank;
+      } local = {MAX_DOUBLE_NUMBER, commThisTask}, global;
+
+      for(int i = 0; i < num; i++)
+        if(PS[d[i]].u.s.u.DM_Potential < local.pot)
+          {
+            local.pot = PS[d[i]].u.s.u.DM_Potential;
+            minindex  = d[i];
+          }
+
+      MPI_Allreduce(&local, &global, 1, MPI_DOUBLE_INT, MPI_MINLOC, Communicator);
+
+      MyIntPosType intpos[3]; /* potential minimum */
+
+      if(commThisTask == global.rank)
+        for(int j = 0; j < 3; j++)
+          intpos[j] = P[minindex].IntPos[j];
+
+      MPI_Bcast(intpos, 3 * sizeof(MyIntPosType), MPI_BYTE, global.rank, Communicator);
+
+      /* we start with zero removed particles */
+      num_removed = 0;
+
+      long long totunbound;
+      do
+        {
+          /* let's get bulk velocity and the center-of-mass */
+          double massloc = 0, sloc[3] = {0, 0, 0}, vloc[3] = {0, 0, 0};
+
+          for(int i = 0; i < num; i++)
+            {
+              int part_index = d[i];
+
+              double dxyz[3];
+              Tp->nearest_image_intpos_to_pos(P[part_index].IntPos, intpos, dxyz);
+
+              for(int j = 0; j < 3; j++)
+                sloc[j] += Tp->P[part_index].getMass() * dxyz[j];
+
+              for(int j = 0; j < 3; j++)
+                vloc[j] += Tp->P[part_index].getMass() * P[part_index].Vel[j];
+
+              massloc += Tp->P[part_index].getMass();
+            }
+
+          double s[3], v[3], mass;
+          MPI_Allreduce(sloc, s, 3, MPI_DOUBLE, MPI_SUM, Communicator);
+          MPI_Allreduce(vloc, v, 3, MPI_DOUBLE, MPI_SUM, Communicator);
+          MPI_Allreduce(&massloc, &mass, 1, MPI_DOUBLE, MPI_SUM, Communicator);
+
+          for(int j = 0; j < 3; j++)
+            {
+              s[j] /= mass; /* center of mass offset relative to minimum potential */
+              v[j] /= mass; /* center of mass velocity */
+            }
+
+          MySignedIntPosType off[3];
+          Tp->pos_to_signedintpos(s, off);
+
+          /* get integer version of absolute center of mass position */
+          MyIntPosType int_cm[3];
+          int_cm[0] = off[0] + intpos[0];
+          int_cm[1] = off[1] + intpos[1];
+          int_cm[2] = off[2] + intpos[2];
+
+          double *bnd_energy = (double *)Mem.mymalloc("bnd_energy", num * sizeof(double));
+
+          /* calculate the binding energies */
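+          /* (Explanatory note: the quantity computed below is
+           *     E = DM_Potential + 0.5 * |dv|^2,
+           *  where dv = fac_vel_to_phys * (Vel - v) + fac_hubbleflow * fac_comov_to_phys * dx
+           *  is the physical velocity relative to the center of mass including the Hubble-flow term;
+           *  for gas particles (type 0) the thermal energy Utherm is added as well.) */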
+          for(int i = 0; i < num; i++)
+            {
+              int part_index = d[i];
+
+              /* distance to center of mass */
+              double dx[3];
+              Tp->nearest_image_intpos_to_pos(P[part_index].IntPos, int_cm, dx);
+
+              /* get physical velocity relative to center of mass */
+              double dv[3];
+              for(int j = 0; j < 3; j++)
+                {
+                  dv[j] = fac_vel_to_phys * (P[part_index].Vel[j] - v[j]);
+                  dv[j] += fac_hubbleflow * fac_comov_to_phys * dx[j];
+                }
+
+              PS[part_index].v.DM_BindingEnergy =
+                  PS[part_index].u.s.u.DM_Potential + 0.5 * (dv[0] * dv[0] + dv[1] * dv[1] + dv[2] * dv[2]);
+#ifndef LEAN
+              if(P[part_index].getType() == 0)
+                PS[part_index].v.DM_BindingEnergy += PS[part_index].Utherm;
+#endif
+              bnd_energy[i] = PS[part_index].v.DM_BindingEnergy;
+            }
+
+          /* sort by binding energy such that the highest energies (i.e. the most weakly bound particles) come first */
+          mycxxsort_parallel(bnd_energy, bnd_energy + num, subfind_compare_binding_energy, Communicator);
+
+          int *npart = (int *)Mem.mymalloc("npart", commNTask * sizeof(int));
+          MPI_Allgather(&num, 1, MPI_INT, npart, 1, MPI_INT, Communicator);
+
+          /* (global) index of the limiting energy value for the least tightly bound fraction - at most
+           * this many particles may be removed in one iteration */
+          long long j = std::max<long long>(5, (long long)(MAX_UNBOUND_FRAC_BEFORE_BULK_VELOCITY_UPDATE * totnum));
+
+          /* find the processor where this lies */
+          int task = 0;
+          while(j >= npart[task])
+            {
+              j -= npart[task];
+              task++;
+            }
+
+          double energy_limit = MAX_DOUBLE_NUMBER;
+
+          if(commThisTask == task)
+            energy_limit = bnd_energy[j];
+
+          MPI_Allreduce(MPI_IN_PLACE, &energy_limit, 1, MPI_DOUBLE, MPI_MIN, Communicator);
+
+          /* now unbind particles */
+          int num_unbound = 0;
+
+          for(int i = 0; i < num; i++)
+            {
+              int p = d[i];
+
+              if(PS[p].v.DM_BindingEnergy > 0 && PS[p].v.DM_BindingEnergy > energy_limit)
+                {
+                  num_unbound++;
+
+                  dremoved[num_removed++] = d[i];
+
+                  d[i] = d[num - 1];
+                  num--;
+                  i--;
+                }
+            }
+
+          Mem.myfree(npart);
+          Mem.myfree(bnd_energy);
+
+          totunbound = num_unbound;
+          totremoved = num_removed;
+          totnum     = num;
+
+          MPI_Allreduce(MPI_IN_PLACE, &totunbound, 1, MPI_LONG_LONG, MPI_SUM, Communicator);
+          MPI_Allreduce(MPI_IN_PLACE, &totremoved, 1, MPI_LONG_LONG, MPI_SUM, Communicator);
+          MPI_Allreduce(MPI_IN_PLACE, &totnum, 1, MPI_LONG_LONG, MPI_SUM, Communicator);
+        }
+      while(totunbound > 0 && totnum >= All.DesLinkNgb && totremoved < MAX_UNBOUND_FRAC_BEFORE_POTENTIAL_UPDATE * totnum);
+
+      iter++;
+
+      if(iter > MAX_ITER_UNBIND)
+        Terminate("too many iterations");
+
+      if(phaseflag == RECOMPUTE_ALL)
+        {
+          if(totremoved > 0)
+            phaseflag = UPDATE_ALL;
+        }
+      else
+        {
+          if(totremoved == 0)
+            {
+              phaseflag  = RECOMPUTE_ALL; /* this will make us repeat everything once more for all particles */
+              totremoved = 1;             /* to make the code check once more all particles */
+            }
+        }
+    }
+  while(totremoved > 0 && totnum >= All.DesLinkNgb);
+
+  Mem.myfree(potold);
+  Mem.myfree(dremoved);
+
+  return num;
+}
+
+/* now make sure that the following classes are really instantiated, otherwise we may get a linking problem */
+#include "../data/simparticles.h"
+template class fof<simparticles>;
+
+#if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES_GROUPS)
+#include "../data/lcparticles.h"
+template class fof<lcparticles>;
+#endif
+
+#endif
diff --git a/src/system/pinning.cc b/src/system/pinning.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f128a89e7025ab436c25a2a14120cf36f70e60f7
--- /dev/null
+++ b/src/system/pinning.cc
@@ -0,0 +1,195 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  pinning.cc
+ *
+ *  \brief routines to report and modify the pinning of processes/threads to CPU cores
+ */
+
+#include "gadgetconfig.h"
+
+#include <gsl/gsl_rng.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../system/pinning.h"
+
+#define MAX_CORES 4096
+
+void pinning::get_core_set(void)
+{
+#ifdef IMPOSE_PINNING
+  cpuset = hwloc_bitmap_alloc();
+  hwloc_get_proc_cpubind(topology, getpid(), cpuset, 0);
+#endif
+}
+
+void pinning::detect_topology(void)
+{
+#ifdef IMPOSE_PINNING
+  /* Allocate and initialize topology object. */
+  hwloc_topology_init(&topology);
+
+  /* Perform the topology detection. */
+  hwloc_topology_load(topology);
+
+  /* Get some additional topology information
+     in case we need the topology depth later. */
+  topodepth = hwloc_topology_get_depth(topology);
+
+  int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET);
+
+  if(depth == HWLOC_TYPE_DEPTH_UNKNOWN)
+    sockets = -1;
+  else
+    sockets = hwloc_get_nbobjs_by_depth(topology, depth);
+
+  depth = hwloc_get_type_depth(topology, HWLOC_OBJ_CORE);
+
+  if(depth == HWLOC_TYPE_DEPTH_UNKNOWN)
+    cores = -1;
+  else
+    cores = hwloc_get_nbobjs_by_depth(topology, depth);
+
+  depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PU);
+
+  if(depth == HWLOC_TYPE_DEPTH_UNKNOWN)
+    pus = -1;
+  else
+    pus = hwloc_get_nbobjs_by_depth(topology, depth);
+#endif
+}
+
+void pinning::pin_to_core_set(setcomm *sc)
+{
+#ifdef IMPOSE_PINNING
+  sc->mpi_printf("PINNING: We have %d sockets, %d physical cores and %d logical cores on the first MPI-task's node.\n", sockets, cores,
+                 pus);
+  if(cores <= 0 || sockets <= 0 || pus <= 0)
+    {
+      sc->mpi_printf("PINNING: The topology cannot be recognized. We refrain from any pinning attempt.\n");
+      flag_pinning_error = 1;
+      return;
+    }
+
+  hyperthreads_per_core = pus / cores;
+
+  if(hyperthreads_per_core < 1)
+    Terminate("Need at least one logical thread per physical core\n");
+
+  if(pus > cores)
+    sc->mpi_printf("PINNING: Looks like %d hyperthreads per physical core are in principle possible.\n", hyperthreads_per_core);
+
+  cpuset_after_MPI_init = hwloc_bitmap_alloc();
+  hwloc_get_proc_cpubind(topology, getpid(), cpuset_after_MPI_init, 0);
+
+  if(!hwloc_bitmap_isequal(cpuset, cpuset_after_MPI_init))
+    sc->mpi_printf("PINNING: Apparently, the MPI library set some pinning itself. We'll override this.\n");
+
+  int available_pus = 0;
+
+  for(int id = hwloc_bitmap_first(cpuset); id != -1; id = hwloc_bitmap_next(cpuset, id))
+    available_pus++;
+
+  sc->mpi_printf("PINNING: Looks like %d logical cores are available.\n", available_pus);
+
+  if(available_pus == pus)
+    {
+      sc->mpi_printf("PINNING: Looks like all available logical cores are at our disposal.\n");
+    }
+  else
+    {
+      if(available_pus >= 1)
+        {
+          sc->mpi_printf("PINNING: Looks like already before start of the code, a tight binding was imposed.\n");
+#ifdef IMPOSE_PINNING_OVERRIDE_MODE
+          for(int id = 0; id < pus; id++)
+            hwloc_bitmap_set(cpuset, id);
+          available_pus = pus;
+          sc->mpi_printf("PINNING: We are overriding this and make all %d available to us.\n", available_pus);
+#else
+          sc->mpi_printf(
+              "PINNING: We refrain from any pinning attempt ourselves. (This can be changed by setting the compile flag "
+              "IMPOSE_PINNING_OVERRIDE_MODE.)\n");
+          flag_pinning_error = 1;
+          return;
+#endif
+        }
+    }
+
+  char buf[MAX_CORES + 1];
+
+  for(int i = 0; i < pus && i < MAX_CORES; i++)
+    if(hwloc_bitmap_isset(cpuset, i))
+      buf[i] = '1';
+    else
+      buf[i] = '-';
+  buf[pus < MAX_CORES ? pus : MAX_CORES] = 0; /* guard against overrunning buf if pus exceeds MAX_CORES */
+
+  sc->mpi_printf("PINNING: Available logical cores on first node:  %s\n", buf);
+
+  int pus_per_task = available_pus / sc->TasksInThisNode;
+
+  sc->mpi_printf("PINNING: %d logical cores are available per MPI Task.\n", pus_per_task);
+
+  if(pus_per_task <= 0)
+    Terminate("Need at least one logical core per MPI task for pinning to make sense.\n");
+
+  /* go through all logical cores in sequence of proximity */
+  int depth        = hwloc_get_type_depth(topology, HWLOC_OBJ_PU);
+  int cores_before = 0;
+  int cid;
+
+  for(cid = 0; cores_before < sc->RankInThisNode * pus_per_task && cid < pus; cid++)
+    {
+      hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, depth, cid);
+
+      hwloc_cpuset_t cpuset_core = hwloc_bitmap_dup(obj->cpuset);
+      if(hwloc_bitmap_isincluded(cpuset_core, cpuset))
+        {
+          cores_before++;
+        }
+      hwloc_bitmap_free(cpuset_core);
+    }
+
+  /* cid should now be the logical index of the first PU for this MPI task */
+
+  hwloc_obj_t obj            = hwloc_get_obj_by_depth(topology, depth, cid);
+  hwloc_cpuset_t current_cpu = hwloc_bitmap_dup(obj->cpuset);
+
+  hwloc_set_proc_cpubind(topology, getpid(), current_cpu, HWLOC_CPUBIND_PROCESS);
+#endif
+}
+
+void pinning::report_pinning(setcomm *sc)
+{
+#ifdef IMPOSE_PINNING
+  if(flag_pinning_error)
+    return;
+
+  hwloc_get_cpubind(topology, cpuset, 0);
+
+  char buf[MAX_CORES + 1];
+
+  for(int i = 0; i < pus && i < MAX_CORES; i++)
+    if(hwloc_bitmap_isset(cpuset, i))
+      buf[i] = '1';
+    else
+      buf[i] = '-';
+  buf[pus < MAX_CORES ? pus : MAX_CORES] = 0; /* guard against overrunning buf if pus exceeds MAX_CORES */
+
+  for(int i = 0; i < sc->NTask; i++)
+    {
+      if(sc->ThisTask == i && sc->ThisNode == 0)
+        printf("PINNING: Node=%4d: Task=%04d:                   %s\n", sc->ThisNode, sc->ThisTask, buf);
+      fflush(stdout);
+      MPI_Barrier(sc->Communicator);
+    }
+#endif
+}
diff --git a/src/system/pinning.h b/src/system/pinning.h
new file mode 100644
index 0000000000000000000000000000000000000000..27cbce0fbd052534793cf906bb876ff945204299
--- /dev/null
+++ b/src/system/pinning.h
@@ -0,0 +1,59 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  pinning.h
+ *
+ *  \brief declares a class for pinning related work
+ */
+
+#ifndef PINNING_H
+#define PINNING_H
+
+#include <gsl/gsl_rng.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../main/main.h"
+#include "../mpi_utils/setcomm.h"
+#include "../system/system.h"
+#include "gadgetconfig.h"
+
+/* examines the CPU topology and binds processes/threads to cores */
+
+#ifdef IMPOSE_PINNING
+#include <hwloc.h>
+#endif
+
+class pinning
+{
+ private:
+#ifdef IMPOSE_PINNING
+  int flag_pinning_error = 0;
+
+  hwloc_cpuset_t cpuset, cpuset_after_MPI_init;
+  hwloc_topology_t topology;
+  int topodepth;
+  int sockets;
+  int cores;
+  int pus;
+  int hyperthreads_per_core;
+
+#endif
+ public:
+  void get_core_set(void);
+  void detect_topology(void);
+  void pin_to_core_set(setcomm *sc);
+  void report_pinning(setcomm *sc);
+};
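+
+/* Illustrative calling sequence (the actual call sites are in the main driver code and may differ):
+ *
+ *   pinning Pin;
+ *   Pin.detect_topology();      // query sockets / cores / PUs via hwloc
+ *   Pin.get_core_set();         // record the affinity mask we started with
+ *   Pin.pin_to_core_set(&sc);   // bind this MPI rank to its share of logical cores
+ *   Pin.report_pinning(&sc);    // print the resulting binding, one line per task
+ */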
+
+#endif
diff --git a/src/system/system.cc b/src/system/system.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8530d80952db8eca18e095deda04acef842b36f9
--- /dev/null
+++ b/src/system/system.cc
@@ -0,0 +1,348 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  system.cc
+ *
+ *  \brief various low level helper routines
+ */
+
+#include "gadgetconfig.h"
+
+#include <gsl/gsl_rng.h>
+#include <math.h>
+#include <mpi.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <sys/resource.h>
+#include <sys/statvfs.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/mymalloc.h"
+#include "../io/io.h"
+#include "../logs/logs.h"
+#include "../main/main.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../system/system.h"
+
+gsl_rng *random_generator; /*!< the random number generator used */
+
+void init_rng(int thistask)
+{
+  random_generator = gsl_rng_alloc(gsl_rng_ranlxd1);
+
+  gsl_rng_set(random_generator, 42 + thistask); /* start-up seed, chosen differently on each rank */
+}
+
+double get_random_number(void) { return gsl_rng_uniform(random_generator); }
+
+void subdivide_evenly(long long N, int pieces, int index_bin, long long *first, int *count)
+{
+  if(N / pieces > INT_MAX)
+    Terminate("overflow");
+
+  int nbase      = N / pieces;
+  int additional = N % pieces;
+  *first         = index_bin * ((long long)nbase) + ((index_bin < additional) ? index_bin : additional);
+  *count         = nbase + (index_bin < additional);
+}
+
+void subdivide_evenly(int N, int pieces, int index_bin, int *first, int *count)
+{
+  int nbase      = N / pieces;
+  int additional = N % pieces;
+  *first         = index_bin * nbase + ((index_bin < additional) ? index_bin : additional);
+  *count         = nbase + (index_bin < additional);
+}
+
+void subdivide_evenly_get_bin(int N, int pieces, int index, int *bin)
+{
+  int nbase      = N / pieces;
+  int additional = N % pieces;
+
+  if(index < additional * (nbase + 1))
+    *bin = index / (nbase + 1);
+  else
+    *bin = (index - additional) / nbase;
+}
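+
+/* Worked example (explanatory only): splitting N=10 items into pieces=3 bins gives
+ * (first, count) = (0, 4), (4, 3), (7, 3) for index_bin = 0, 1, 2, i.e. the remainder is
+ * distributed one item at a time to the lowest bins. Consistently,
+ * subdivide_evenly_get_bin(10, 3, 5, &bin) returns bin = 1, since item 5 falls into the
+ * range [4, 7) covered by bin 1. */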
+
+void permutate_chunks_in_list(int ncount, int *list)
+{
+#define WALK_N_PIECES 32 /*!< Number of sets, the chunks are divided into */
+#define WALK_N_SIZE 500  /*!< Number of particles per chunk */
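+
+  /* Explanatory note: the loop below does not randomly shuffle list[], it interleaves contiguous chunks of
+   * nchunksize entries so that consecutive output entries are not all drawn from one contiguous region of the
+   * input. With illustrative values nchunk=3, nchunksize=3 and ncount=12, the original indices would come out
+   * in the order 0,1,2, 9,10,11, 3,4,5, 6,7,8. */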
+
+  int nchunk     = 1;      /*!< Number of chunk sets used */
+  int nchunksize = ncount; /*!< Size of each chunk */
+
+  if(ncount > WALK_N_PIECES * WALK_N_SIZE)
+    {
+      nchunk     = WALK_N_PIECES;
+      nchunksize = WALK_N_SIZE;
+    }
+
+  int currentchunk = 0; /*!< Chunk set currently processed */
+
+  int *chunked_TargetList = (int *)Mem.mymalloc("chunked_TargetList", ncount * sizeof(int));
+  for(int n = 0, nextparticle = 0; n < ncount; n++)
+    {
+      int i = nextparticle;
+
+      chunked_TargetList[n] = list[i];
+      if(i < ncount)
+        {
+          nextparticle++;
+
+          if((nextparticle % nchunksize) == 0)
+            nextparticle += (nchunk - 1) * nchunksize;
+
+          if(nextparticle >= ncount)
+            {
+              currentchunk++;
+              if(currentchunk < nchunk)
+                nextparticle = currentchunk * nchunksize;
+            }
+        }
+    }
+
+  for(int n = 0; n < ncount; n++)
+    list[n] = chunked_TargetList[n];
+
+  Mem.myfree(chunked_TargetList);
+}
+
+void myflush(FILE *fstream)
+{
+#ifdef REDUCE_FLUSH
+  /* do nothing */
+  (void)fstream;
+#else
+  fflush(fstream);
+#endif
+}
+
+#ifdef DEBUG
+#include <fenv.h>
+void enable_core_dumps_and_fpu_exceptions(void)
+{
+#if defined(DEBUG_ENABLE_FPU_EXCEPTIONS) && defined(__linux__)
+  /* enable floating point exceptions */
+
+  extern int feenableexcept(int __excepts);
+  feenableexcept(FE_DIVBYZERO | FE_INVALID);
+
+  /* Note: FPU exceptions appear not to work properly
+   * when the Intel C-Compiler for Linux is used
+   */
+#endif
+
+  /* set core-dump size to infinity */
+  rlimit rlim;
+  getrlimit(RLIMIT_CORE, &rlim);
+  rlim.rlim_cur = RLIM_INFINITY;
+  setrlimit(RLIMIT_CORE, &rlim);
+
+  /* MPICH catches the signals SIGSEGV, SIGBUS, and SIGFPE.
+   * The following statements reset them to the default handlers,
+   * which will generate a core file.
+   */
+  signal(SIGSEGV, SIG_DFL);
+  signal(SIGBUS, SIG_DFL);
+  signal(SIGFPE, SIG_DFL);
+  signal(SIGINT, SIG_DFL);
+}
+#endif
+
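+/* Parses /proc/meminfo (where values are reported in kB) and returns an estimate of the memory that can
+ * still be committed on this node, namely MemTotal - Committed_AS. */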
+long long sim::report_comittable_memory(long long *MemTotal, long long *Committed_AS, long long *SwapTotal, long long *SwapFree)
+{
+  FILE *fd;
+  char buf[1024];
+
+  if((fd = fopen("/proc/meminfo", "r")))
+    {
+      while(1)
+        {
+          if(fgets(buf, 500, fd) != buf)
+            break;
+
+          if(bcmp(buf, "MemTotal", 8) == 0)
+            {
+              *MemTotal = atoll(buf + 10);
+            }
+          if(strncmp(buf, "Committed_AS", 12) == 0)
+            {
+              *Committed_AS = atoll(buf + 14);
+            }
+          if(strncmp(buf, "SwapTotal", 9) == 0)
+            {
+              *SwapTotal = atoll(buf + 11);
+            }
+          if(strncmp(buf, "SwapFree", 8) == 0)
+            {
+              *SwapFree = atoll(buf + 10);
+            }
+        }
+      fclose(fd);
+    }
+
+  return (*MemTotal - *Committed_AS);
+}
+
+// returns kByte
+long long sim::report_free_size_in_tmpfs(void)
+{
+  struct statvfs buf;
+
+  if(statvfs("/dev/shm", &buf) == 0)
+    {
+      return (1024.0 * (uint64_t)buf.f_bsize * ((uint64_t)(buf.f_bavail))) * TO_MBYTE_FAC;
+    }
+  else
+    return 0;
+}
+
+void sim::mpi_report_comittable_memory(void)
+{
+  long long *sizelist, maxsize[7], minsize[7];
+  double avgsize[7];
+  int i, imem, mintask[7], maxtask[7];
+  long long Mem[7];
+  char label[512];
+
+  Mem[0] = report_comittable_memory(&Mem[1], &Mem[2], &Mem[3], &Mem[4]);
+  Mem[5] = Mem[1] - Mem[0];
+
+  Mem[6] = report_free_size_in_tmpfs();
+
+  MemoryOnNode       = Mem[1];
+  SharedMemoryOnNode = Mem[6];
+
+  for(imem = 0; imem < 7; imem++)
+    {
+      sizelist = (long long *)malloc(NTask * sizeof(long long));
+      MPI_Allgather(&Mem[imem], sizeof(long long), MPI_BYTE, sizelist, sizeof(long long), MPI_BYTE, Communicator);
+
+      for(i = 1, mintask[imem] = 0, maxtask[imem] = 0, maxsize[imem] = minsize[imem] = sizelist[0], avgsize[imem] = sizelist[0];
+          i < NTask; i++)
+        {
+          if(sizelist[i] > maxsize[imem])
+            {
+              maxsize[imem] = sizelist[i];
+              maxtask[imem] = i;
+            }
+          if(sizelist[i] < minsize[imem])
+            {
+              minsize[imem] = sizelist[i];
+              mintask[imem] = i;
+            }
+          avgsize[imem] += sizelist[i];
+        }
+
+      free(sizelist);
+    }
+
+  if(ThisTask == 0)
+    {
+      printf(
+          "\n-------------------------------------------------------------------------------------------------------------------------"
+          "\n");
+      for(imem = 0; imem < 7; imem++)
+        {
+          switch(imem)
+            {
+              case 0:
+                sprintf(label, "AvailMem");
+                break;
+              case 1:
+                sprintf(label, "Total Mem");
+                break;
+              case 2:
+                sprintf(label, "Committed_AS");
+                break;
+              case 3:
+                sprintf(label, "SwapTotal");
+                break;
+              case 4:
+                sprintf(label, "SwapFree");
+                break;
+              case 5:
+                sprintf(label, "AllocMem");
+                break;
+              case 6:
+                sprintf(label, "avail /dev/shm");
+                break;
+            }
+          printf("%s:\t Largest = %10.2f Mb (on task=%4d), Smallest = %10.2f Mb (on task=%4d), Average = %10.2f Mb\n", label,
+                 maxsize[imem] / (1024.0), maxtask[imem], minsize[imem] / (1024.0), mintask[imem], avgsize[imem] / (1024.0 * NTask));
+        }
+      printf(
+          "-------------------------------------------------------------------------------------------------------------------------"
+          "\n");
+    }
+
+  char name[MPI_MAX_PROCESSOR_NAME];
+
+  if(ThisTask == maxtask[2])
+    {
+      int len;
+      MPI_Get_processor_name(name, &len);
+    }
+
+  MPI_Bcast(name, MPI_MAX_PROCESSOR_NAME, MPI_BYTE, maxtask[2], Communicator);
+
+  if(ThisTask == 0)
+    {
+      printf("Task=%d has the maximum commited memory and is host: %s\n", ThisTask, name);
+      printf(
+          "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+          "\n");
+    }
+
+  fflush(stdout);
+}
+
+/* the following function finds the last (most significant) set bit, analogous to fls() in the Linux kernel */
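+/* Explanatory examples: my_fls(0) = 0, my_fls(1) = 1, my_fls(6) = 3, my_fls(0x80000000u) = 32,
+ * i.e. the return value is the 1-based position of the highest set bit (floor(log2(x)) + 1 for x > 0). */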
+int my_fls(unsigned int x)
+{
+  int r = 32;
+
+  if(!x)
+    return 0;
+  if(!(x & 0xffff0000u))
+    {
+      x <<= 16;
+      r -= 16;
+    }
+  if(!(x & 0xff000000u))
+    {
+      x <<= 8;
+      r -= 8;
+    }
+  if(!(x & 0xf0000000u))
+    {
+      x <<= 4;
+      r -= 4;
+    }
+  if(!(x & 0xc0000000u))
+    {
+      x <<= 2;
+      r -= 2;
+    }
+  if(!(x & 0x80000000u))
+    {
+      x <<= 1;
+      r -= 1;
+    }
+  return r;
+}
diff --git a/src/system/system.h b/src/system/system.h
new file mode 100644
index 0000000000000000000000000000000000000000..882325ee481044ac23fbe32807e0885619d1eced
--- /dev/null
+++ b/src/system/system.h
@@ -0,0 +1,41 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  system.h
+ *
+ *  \brief declares functions for various low level helper routines
+ */
+
+#ifndef SYSTEM_H
+#define SYSTEM_H
+
+#include <gsl/gsl_rng.h>
+#include <stdio.h>
+
+extern gsl_rng *random_generator; /**< the random number generator used */
+
+void myflush(FILE *fstream);
+
+void enable_core_dumps_and_fpu_exceptions(void);
+
+void permutate_chunks_in_list(int ncount, int *list);
+
+void subdivide_evenly(long long N, int pieces, int index_bin, long long *first, int *count);
+void subdivide_evenly(int N, int pieces, int index, int *first, int *count);
+void subdivide_evenly_get_bin(int N, int pieces, int index, int *bin);
+
+void init_rng(int thistask);
+double get_random_number(void);
+
+int my_fls(unsigned int x);
+
+template <typename T>
+inline T square(T const value)
+{
+  return value * value;
+}
+
+#endif
diff --git a/src/time_integration/driftfac.cc b/src/time_integration/driftfac.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0197eed6b2396e1dfde657d892efdf5a68b46417
--- /dev/null
+++ b/src/time_integration/driftfac.cc
@@ -0,0 +1,268 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  driftfac.cc
+ *
+ *  \brief tabulates cosmological drift/kick factors for fast look-up
+ */
+
+#include "gadgetconfig.h"
+
+#include <gsl/gsl_integration.h>
+#include <gsl/gsl_math.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../time_integration/driftfac.h"
+
+void driftfac::init_drift_table(void)
+{
+#define WORKSIZE 100000
+
+  gsl_function F;
+  gsl_integration_workspace *workspace;
+
+  logTimeBegin = log(All.TimeBegin);
+  logTimeMax   = log(All.TimeMax);
+
+  workspace = gsl_integration_workspace_alloc(WORKSIZE);
+
+  for(int i = 0; i < DRIFT_TABLE_LENGTH; i++)
+    {
+      double result, abserr;
+
+      F.function = &driftfac::drift_integ;
+      gsl_integration_qag(&F, exp(logTimeBegin), exp(logTimeBegin + ((logTimeMax - logTimeBegin) / DRIFT_TABLE_LENGTH) * (i + 1)), 0,
+                          1.0e-8, WORKSIZE, GSL_INTEG_GAUSS41, workspace, &result, &abserr);
+      DriftTable[i] = result;
+
+      F.function = &gravkick_integ;
+      gsl_integration_qag(&F, exp(logTimeBegin), exp(logTimeBegin + ((logTimeMax - logTimeBegin) / DRIFT_TABLE_LENGTH) * (i + 1)), 0,
+                          1.0e-8, WORKSIZE, GSL_INTEG_GAUSS41, workspace, &result, &abserr);
+      GravKickTable[i] = result;
+
+      F.function = &hydrokick_integ;
+      gsl_integration_qag(&F, exp(logTimeBegin), exp(logTimeBegin + ((logTimeMax - logTimeBegin) / DRIFT_TABLE_LENGTH) * (i + 1)), 0,
+                          1.0e-8, WORKSIZE, GSL_INTEG_GAUSS41, workspace, &result, &abserr);
+      HydroKickTable[i] = result;
+    }
+
+  gsl_integration_workspace_free(workspace);
+}
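+
+/* Explanatory note on the table layout set up above: with dloga = (logTimeMax - logTimeBegin) / DRIFT_TABLE_LENGTH,
+ * entry i of DriftTable holds the integral of da / (a^3 H(a)) from All.TimeBegin up to the scale factor
+ * exp(logTimeBegin + (i+1)*dloga); GravKickTable and HydroKickTable store the analogous integrals of their
+ * respective integrands. The get_*_factor() routines below then obtain the factor between two times by linearly
+ * interpolating in this table and differencing the two interpolated values. */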
+
+/*! This function integrates the cosmological prefactor for a drift
+ *   step between time0 and time1. The value returned is
+ *  \f[ \int_{a_0}^{a_1} \frac{{\rm d}a}{a^3 \, H(a)}
+ *  \f]
+ *
+ *  A lookup-table is used for reasons of speed.
+ */
+double driftfac::get_drift_factor(integertime time0, integertime time1)
+{
+  static integertime last_time0 = -1, last_time1 = -1;
+  static double last_value;
+
+  if(time0 == last_time0 && time1 == last_time1)
+    return last_value;
+
+  /* note: will only be called for cosmological integration */
+
+  double a1 = logTimeBegin + time0 * All.Timebase_interval;
+  double a2 = logTimeBegin + time1 * All.Timebase_interval;
+
+  double u1 = (a1 - logTimeBegin) / (logTimeMax - logTimeBegin) * DRIFT_TABLE_LENGTH;
+  int i1    = (int)u1;
+  if(i1 >= DRIFT_TABLE_LENGTH)
+    i1 = DRIFT_TABLE_LENGTH - 1;
+
+  double df1;
+  if(i1 <= 1)
+    df1 = u1 * DriftTable[0];
+  else
+    df1 = DriftTable[i1 - 1] + (DriftTable[i1] - DriftTable[i1 - 1]) * (u1 - i1);
+
+  double u2 = (a2 - logTimeBegin) / (logTimeMax - logTimeBegin) * DRIFT_TABLE_LENGTH;
+  int i2    = (int)u2;
+  if(i2 >= DRIFT_TABLE_LENGTH)
+    i2 = DRIFT_TABLE_LENGTH - 1;
+
+  double df2;
+  if(i2 <= 1)
+    df2 = u2 * DriftTable[0];
+  else
+    df2 = DriftTable[i2 - 1] + (DriftTable[i2] - DriftTable[i2 - 1]) * (u2 - i2);
+
+  last_time0 = time0;
+  last_time1 = time1;
+
+  return last_value = (df2 - df1);
+}
+
+double driftfac::get_gravkick_factor(integertime time0, integertime time1)
+{
+  static integertime last_time0 = -1, last_time1 = -1;
+  static double last_value;
+
+  if(time0 == last_time0 && time1 == last_time1)
+    return last_value;
+
+  /* note: will only be called for cosmological integration */
+
+  double a1 = logTimeBegin + time0 * All.Timebase_interval;
+  double a2 = logTimeBegin + time1 * All.Timebase_interval;
+
+  double u1 = (a1 - logTimeBegin) / (logTimeMax - logTimeBegin) * DRIFT_TABLE_LENGTH;
+  int i1    = (int)u1;
+  if(i1 >= DRIFT_TABLE_LENGTH)
+    i1 = DRIFT_TABLE_LENGTH - 1;
+
+  double df1;
+  if(i1 <= 1)
+    df1 = u1 * GravKickTable[0];
+  else
+    df1 = GravKickTable[i1 - 1] + (GravKickTable[i1] - GravKickTable[i1 - 1]) * (u1 - i1);
+
+  double u2 = (a2 - logTimeBegin) / (logTimeMax - logTimeBegin) * DRIFT_TABLE_LENGTH;
+  int i2    = (int)u2;
+  if(i2 >= DRIFT_TABLE_LENGTH)
+    i2 = DRIFT_TABLE_LENGTH - 1;
+
+  double df2;
+  if(i2 <= 1)
+    df2 = u2 * GravKickTable[0];
+  else
+    df2 = GravKickTable[i2 - 1] + (GravKickTable[i2] - GravKickTable[i2 - 1]) * (u2 - i2);
+
+  last_time0 = time0;
+  last_time1 = time1;
+
+  return last_value = (df2 - df1);
+}
+
+double driftfac::get_hydrokick_factor(integertime time0, integertime time1)
+{
+  static integertime last_time0 = -1, last_time1 = -1;
+  static double last_value;
+
+  if(time0 == last_time0 && time1 == last_time1)
+    return last_value;
+
+  /* note: will only be called for cosmological integration */
+
+  double a1 = logTimeBegin + time0 * All.Timebase_interval;
+  double a2 = logTimeBegin + time1 * All.Timebase_interval;
+
+  double u1 = (a1 - logTimeBegin) / (logTimeMax - logTimeBegin) * DRIFT_TABLE_LENGTH;
+  int i1    = (int)u1;
+  if(i1 >= DRIFT_TABLE_LENGTH)
+    i1 = DRIFT_TABLE_LENGTH - 1;
+
+  double df1;
+  if(i1 <= 1)
+    df1 = u1 * HydroKickTable[0];
+  else
+    df1 = HydroKickTable[i1 - 1] + (HydroKickTable[i1] - HydroKickTable[i1 - 1]) * (u1 - i1);
+
+  double u2 = (a2 - logTimeBegin) / (logTimeMax - logTimeBegin) * DRIFT_TABLE_LENGTH;
+  int i2    = (int)u2;
+  if(i2 >= DRIFT_TABLE_LENGTH)
+    i2 = DRIFT_TABLE_LENGTH - 1;
+
+  double df2;
+  if(i2 <= 1)
+    df2 = u2 * HydroKickTable[0];
+  else
+    df2 = HydroKickTable[i2 - 1] + (HydroKickTable[i2] - HydroKickTable[i2 - 1]) * (u2 - i2);
+
+  last_time0 = time0;
+  last_time1 = time1;
+
+  return last_value = (df2 - df1);
+}
+
+double driftfac::get_comoving_distance(integertime time0)
+{
+  /* we just need to multiply this with the speed of light to get the correct cosmological factor */
+  double fac = get_gravkick_factor(time0, TIMEBASE);
+
+  return fac * (CLIGHT / All.UnitVelocity_in_cm_per_s);
+}
+
+double driftfac::get_comoving_distance_for_scalefactor(double ascale)
+{
+  integertime time0 = log(ascale / All.TimeBegin) / All.Timebase_interval;
+
+  /* we just need to multiply this with the speed of light to get the correct cosmological factor */
+  double fac = get_gravkick_factor(time0, TIMEBASE);
+
+  return fac * (CLIGHT / All.UnitVelocity_in_cm_per_s);
+}
+
+double driftfac::get_scalefactor_for_comoving_distance(double dist)
+{
+  double fac = dist / (CLIGHT / All.UnitVelocity_in_cm_per_s);
+
+  integertime time0 = get_gravkick_factor_inverse(fac);
+
+  double ascale = All.TimeBegin * exp(time0 * All.Timebase_interval);
+
+  return ascale;
+}
+
+integertime driftfac::get_gravkick_factor_inverse(double fac)
+{
+  integertime time1 = TIMEBASE;
+
+  double a2 = logTimeBegin + time1 * All.Timebase_interval;
+
+  double u2 = (a2 - logTimeBegin) / (logTimeMax - logTimeBegin) * DRIFT_TABLE_LENGTH;
+  int i2    = (int)u2;
+  if(i2 >= DRIFT_TABLE_LENGTH)
+    i2 = DRIFT_TABLE_LENGTH - 1;
+
+  double df2;
+  if(i2 <= 1)
+    df2 = u2 * GravKickTable[0];
+  else
+    df2 = GravKickTable[i2 - 1] + (GravKickTable[i2] - GravKickTable[i2 - 1]) * (u2 - i2);
+
+  double df1 = df2 - fac;
+
+  int i0 = 0;
+  int i1 = DRIFT_TABLE_LENGTH - 1;
+
+  if(df1 < 0 || df1 > GravKickTable[i1])
+    Terminate("out of range:  df1=%g  GravKickTable[i0]=%g  GravKickTable[i1]=%g\n", df1, GravKickTable[i0], GravKickTable[i1]);
+
+  double u1;
+
+  if(df1 <= GravKickTable[0])
+    u1 = df1 / GravKickTable[0];
+  else
+    {
+      while(i1 - i0 > 1)
+        {
+          int im = (i0 + i1) / 2;
+          if(df1 < GravKickTable[im])
+            i1 = im;
+          else
+            i0 = im;
+        }
+
+      u1 = (df1 - GravKickTable[i0]) / (GravKickTable[i1] - GravKickTable[i0]) + i1;
+    }
+
+  double a1 = u1 * (logTimeMax - logTimeBegin) / DRIFT_TABLE_LENGTH + logTimeBegin;
+
+  integertime time0 = (a1 - logTimeBegin) / All.Timebase_interval;
+
+  return time0;
+}
diff --git a/src/time_integration/driftfac.h b/src/time_integration/driftfac.h
new file mode 100644
index 0000000000000000000000000000000000000000..d9aa070f648600ba016bdd1ca7509a292582a37f
--- /dev/null
+++ b/src/time_integration/driftfac.h
@@ -0,0 +1,87 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  driftfac.h
+ *
+ *  \brief declares a class for supporting cosmological drift/kick factor calculations
+ */
+
+#ifndef DRIFTFAC_H
+#define DRIFTFAC_H
+
+#include <gsl/gsl_integration.h>
+#include <gsl/gsl_math.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/dtypes.h"
+#include "../main/main.h"
+#include "gadgetconfig.h"
+
+class driftfac
+{
+ public:
+  void init_drift_table(void);
+  double get_drift_factor(integertime time0, integertime time1);
+  double get_gravkick_factor(integertime time0, integertime time1);
+  double get_hydrokick_factor(integertime time0, integertime time1);
+  double get_comoving_distance(integertime time0);
+  double get_comoving_distance_for_scalefactor(double ascale);
+  double get_scalefactor_for_comoving_distance(double dist);
+  integertime get_gravkick_factor_inverse(double fac);
+
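+  /* Returns H(a) = All.Hubble * sqrt(Omega0 / a^3 + (1 - Omega0 - OmegaLambda) / a^2 + OmegaLambda),
+   * i.e. matter, curvature and a cosmological constant; radiation is not included in this expression. */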
+  static double hubble_function(double a)
+  {
+    double hubble_a = All.Omega0 / (a * a * a) + (1 - All.Omega0 - All.OmegaLambda) / (a * a) + All.OmegaLambda;
+
+    hubble_a = All.Hubble * sqrt(hubble_a);
+
+    return hubble_a;
+  }
+
+ private:
+#define DRIFT_TABLE_LENGTH 1000
+
+  /** table for the cosmological drift factors */
+  double DriftTable[DRIFT_TABLE_LENGTH];
+
+  /** table for the cosmological kick factor for gravitational forces */
+  double GravKickTable[DRIFT_TABLE_LENGTH];
+
+  /** table for the cosmological kick factor for hydrodynamical forces */
+  double HydroKickTable[DRIFT_TABLE_LENGTH];
+
+  double logTimeBegin;
+  double logTimeMax;
+
+  static double drift_integ(double a, void *param)
+  {
+    double h = hubble_function(a);
+
+    return 1 / (h * a * a * a);
+  }
+
+  static double gravkick_integ(double a, void *param)
+  {
+    double h = hubble_function(a);
+
+    return 1 / (h * a * a);
+  }
+
+  static double hydrokick_integ(double a, void *param)
+  {
+    double h = hubble_function(a);
+
+    return 1 / (h * pow(a, 3 * GAMMA_MINUS1) * a);
+  }
+};
+
+extern driftfac Driftfac;
+
+#endif
diff --git a/src/time_integration/kicks.cc b/src/time_integration/kicks.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b66beea17d171df21cad1be10b015e3172c25879
--- /dev/null
+++ b/src/time_integration/kicks.cc
@@ -0,0 +1,590 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  kicks.cc
+ *
+ *  \brief drives gravitational and hydrodynamical force calculations and applies corresponding kicks to particles
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../domain/domain.h"
+#include "../gravtree/gravtree.h"
+#include "../logs/logs.h"
+#include "../logs/timer.h"
+#include "../main/main.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../ngbtree/ngbtree.h"
+#include "../system/system.h"
+#include "../time_integration/driftfac.h"
+#include "../time_integration/timestep.h"
+
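+/* Rough sketch of the integration scheme these routines implement (explanatory only, details are handled
+ * per timebin below): a kick-drift-kick leapfrog in which each particle receives a half-step velocity kick
+ *     v <- v + a_grav * dt_kick / 2
+ * at the beginning of its step, is drifted in position elsewhere in the code, and receives the second
+ * half-step kick at the end of the step; in comoving integrations dt_kick is replaced by the tabulated
+ * cosmological kick factors from driftfac. */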
+/*! \brief performs the first gravity half-step kick operator
+ *
+ * This function applies a half-step kick analogous to do_gravity_step_second_half().
+ * If we are on a PM step, the kick due to the particle mesh's long-range gravity
+ * is applied first. Afterwards the short-range kick due to the tree force is added.
+ */
+void sim::find_timesteps_and_do_gravity_step_first_half(void)
+{
+  TIMER_START(CPU_DRIFTS);
+
+  All.set_cosmo_factors_for_current_time();
+
+#if defined(PMGRID) && !defined(TREEPM_NOTIMESPLIT)
+  if(All.PM_Ti_endstep == All.Ti_Current) /* need to do long-range kick */
+    {
+      /* first, determine the new PM timestep */
+      integertime ti_step = Sp.get_timestep_pm();
+
+      All.PM_Ti_begstep = All.PM_Ti_endstep;
+      All.PM_Ti_endstep = All.PM_Ti_begstep + ti_step;
+
+      integertime tstart = All.PM_Ti_begstep;
+      integertime tend   = tstart + ti_step / 2;
+
+      double dt_gravkick;
+
+      if(All.ComovingIntegrationOn)
+        dt_gravkick = Driftfac.get_gravkick_factor(tstart, tend);
+      else
+        dt_gravkick = (tend - tstart) * All.Timebase_interval;
+
+      for(int i = 0; i < Sp.NumPart; i++)
+        {
+          for(int j = 0; j < 3; j++)
+            Sp.P[i].Vel[j] += Sp.P[i].GravPM[j] * dt_gravkick;
+        }
+    }
+#endif
+
+  Sp.TimeBinsGravity.timebin_make_list_of_active_particles_up_to_timebin(All.HighestSynchronizedTimeBin);
+  sumup_large_ints(1, &Sp.TimeBinsGravity.NActiveParticles, &Sp.TimeBinsGravity.GlobalNActiveParticles, Communicator);
+
+#ifdef FORCE_EQUAL_TIMESTEPS
+  find_global_timesteps();
+#endif
+
+#ifdef HIERARCHICAL_GRAVITY
+  /* First, move all active particles to the highest allowed timestep for this synchronization time.
+   * They will then cascade down to smaller timesteps as needed.
+   */
+  for(int i = 0; i < Sp.TimeBinsGravity.NActiveParticles; i++)
+    {
+      int target = Sp.TimeBinsGravity.ActiveParticleList[i];
+      int bin    = All.HighestSynchronizedTimeBin;
+      int binold = Sp.P[target].TimeBinGrav;
+
+      Sp.TimeBinsGravity.timebin_move_particle(target, binold, bin);
+      Sp.P[target].TimeBinGrav = bin;
+    }
+
+  long long Previous_GlobalNActiveGravity = Sp.TimeBinsGravity.GlobalNActiveParticles;
+
+  double dt_gravsum = 0;
+
+  int bin_highest_occupied = 0;
+
+  /* go over all timebins */
+  for(int timebin = All.HighestSynchronizedTimeBin; timebin >= 0; timebin--)
+    {
+      Sp.TimeBinsGravity.NActiveParticles = 0;
+
+      Sp.TimeBinsGravity.timebin_add_particles_of_timebin_to_list_of_active_particles(timebin);
+      sumup_large_ints(1, &Sp.TimeBinsGravity.NActiveParticles, &Sp.TimeBinsGravity.GlobalNActiveParticles, Communicator);
+
+      if(Sp.TimeBinsGravity.GlobalNActiveParticles == 0) /* we are done at this point */
+        break;
+
+      /* calculate gravity for all active particles */
+      if(Sp.TimeBinsGravity.GlobalNActiveParticles != Previous_GlobalNActiveGravity)
+        {
+          TIMER_STOP(CPU_DRIFTS);
+
+          compute_grav_accelerations(timebin);
+
+          TIMER_START(CPU_DRIFTS);
+        }
+
+      /* now check whether the current timestep should be reduced */
+
+      int nfine = 0;
+      for(int i = 0; i < Sp.TimeBinsGravity.NActiveParticles; i++)
+        {
+          int target = Sp.TimeBinsGravity.ActiveParticleList[i];
+          int binold = Sp.P[target].TimeBinGrav;
+
+          if(Sp.test_if_grav_timestep_is_too_large(target, binold))
+            nfine++;
+        }
+
+      long long nfine_tot;
+      sumup_large_ints(1, &nfine, &nfine_tot, Communicator);
+
+      int push_down_flag = 0;
+      if(Sp.TimeBinsGravity.GlobalNActiveParticles == Sp.TotNumPart && nfine_tot < Sp.TimeBinsGravity.GlobalNActiveParticles &&
+         nfine_tot > 0.33 * Sp.TimeBinsGravity.GlobalNActiveParticles)
+        {
+          mpi_printf(
+              "KICKS: We reduce the highest occupied timestep by pushing %lld particles on timebin=%d down in timestep (fraction "
+              "wanting lower bin is: %g)\n",
+              Sp.TimeBinsGravity.GlobalNActiveParticles - nfine_tot, timebin,
+              ((double)nfine_tot) / Sp.TimeBinsGravity.GlobalNActiveParticles);
+
+          push_down_flag = 1;
+        }
+
+      for(int i = 0; i < Sp.TimeBinsGravity.NActiveParticles; i++)
+        {
+          int target = Sp.TimeBinsGravity.ActiveParticleList[i];
+          int binold = Sp.P[target].TimeBinGrav;
+
+          if(push_down_flag || Sp.test_if_grav_timestep_is_too_large(target, binold))
+            {
+              int bin = binold - 1;
+
+              if(bin == 0)
+                {
+                  Sp.print_particle_info(target);
+                  Terminate("timestep too small");
+                }
+
+              Sp.TimeBinsGravity.timebin_move_particle(target, binold, bin);
+              Sp.P[target].TimeBinGrav = bin;
+            }
+          else if(binold > bin_highest_occupied)
+            bin_highest_occupied = binold;
+        }
+
+      if(All.HighestOccupiedGravTimeBin == 0) /* this will only be the case in the very first step */
+        {
+          MPI_Allreduce(&bin_highest_occupied, &All.HighestOccupiedGravTimeBin, 1, MPI_INT, MPI_MAX, Communicator);
+          if(All.HighestOccupiedTimeBin > 0)
+            mpi_printf("KICKS: Special Start-up: All.HighestOccupiedGravTimeBin=%d\n", All.HighestOccupiedGravTimeBin);
+        }
+
+      if(Sp.TimeBinsGravity.GlobalNActiveParticles)
+        {
+          integertime ti_step = timebin ? (((integertime)1) << timebin) : 0;
+
+          integertime tstart = All.Ti_begstep[timebin]; /* beginning of step */
+          integertime tend   = tstart + ti_step / 2;    /* midpoint of step */
+
+          double dt_gravkick;
+
+          if(All.ComovingIntegrationOn)
+            dt_gravkick = Driftfac.get_gravkick_factor(tstart, tend);
+          else
+            dt_gravkick = (tend - tstart) * All.Timebase_interval;
+
+          double dt_save = dt_gravkick;
+
+          if(timebin < All.HighestSynchronizedTimeBin)
+            {
+              ti_step = (timebin + 1) ? (((integertime)1) << (timebin + 1)) : 0;
+
+              tstart = All.Ti_begstep[timebin + 1]; /* beginning of step */
+              tend   = tstart + ti_step / 2;        /* midpoint of step */
+
+              if(All.ComovingIntegrationOn)
+                dt_gravkick -= Driftfac.get_gravkick_factor(tstart, tend);
+              else
+                dt_gravkick -= (tend - tstart) * All.Timebase_interval;
+            }
+
+          dt_gravsum += dt_gravkick;
+
+          mpi_printf("KICKS: 1st gravity for hierarchical timebin=%d:  %lld particles   dt_gravkick=%g  %g %g\n", timebin,
+                     Sp.TimeBinsGravity.GlobalNActiveParticles, dt_gravkick, dt_gravsum, dt_save);
+
+          for(int i = 0; i < Sp.TimeBinsGravity.NActiveParticles; i++)
+            {
+              int target = Sp.TimeBinsGravity.ActiveParticleList[i];
+
+              for(int j = 0; j < 3; j++)
+                Sp.P[target].Vel[j] += Sp.P[target].GravAccel[j] * dt_gravkick;
+            }
+        }
+
+      Previous_GlobalNActiveGravity = Sp.TimeBinsGravity.GlobalNActiveParticles;
+    }
+
+#else
+
+  mpi_printf("KICKS: 1st gravity for highest active timebin=%d:  particles %lld\n", All.HighestActiveTimeBin,
+             Sp.TimeBinsGravity.GlobalNActiveParticles);
+
+  for(int i = 0; i < Sp.TimeBinsGravity.NActiveParticles; i++)
+    {
+      int target          = Sp.TimeBinsGravity.ActiveParticleList[i];
+
+#ifndef FORCE_EQUAL_TIMESTEPS
+      integertime ti_step = Sp.get_timestep_grav(target);
+      int timebin;
+
+      Sp.timebins_get_bin_and_do_validity_checks(ti_step, &timebin, Sp.P[target].TimeBinGrav);
+
+      ti_step = timebin ? (((integertime)1) << timebin) : 0;
+
+      Sp.TimeBinsGravity.timebin_move_particle(target, Sp.P[target].TimeBinGrav, timebin);
+      Sp.P[target].TimeBinGrav = timebin;
+#else
+      int timebin         = Sp.P[target].TimeBinGrav;
+      integertime ti_step = timebin ? (((integertime)1) << timebin) : 0;
+#endif
+
+      integertime tstart = All.Ti_begstep[timebin]; /* beginning of step */
+      integertime tend   = tstart + ti_step / 2;    /* midpoint of step */
+
+      double dt_gravkick;
+
+      if(All.ComovingIntegrationOn)
+        dt_gravkick = Driftfac.get_gravkick_factor(tstart, tend);
+      else
+        dt_gravkick = (tend - tstart) * All.Timebase_interval;
+
+      for(int j = 0; j < 3; j++)
+        Sp.P[target].Vel[j] += Sp.P[target].GravAccel[j] * dt_gravkick;
+    }
+
+#endif
+
+  TIMER_STOP(CPU_DRIFTS);
+}
+
+/*! \brief performs the second gravity half-step kick operator
+ *
+ * This function applies a half-step kick analogous to do_gravity_step_first_half().
+ * First the short-range kick due to the tree force is added. If we are on a PM step, the kick
+ * due to the particle mesh's long-range gravity is applied as well. In both cases
+ * the momenta, and for SPH particles the predicted velocities, are updated.
+ */
+void sim::do_gravity_step_second_half(void)
+{
+  TIMER_START(CPU_DRIFTS);
+
+  char fullmark[8];
+
+  if(All.HighestActiveTimeBin == All.HighestOccupiedTimeBin)
+    sprintf(fullmark, "(*)");
+  else
+    fullmark[0] = 0;
+
+  if(ThisTask == 0)
+    {
+      fprintf(Logs.FdTimings, "\nStep%s: %d, t: %g, dt: %g, highest active timebin: %d  (lowest active: %d, highest occupied: %d)\n",
+              fullmark, All.NumCurrentTiStep, All.Time, All.TimeStep, All.HighestActiveTimeBin, All.LowestActiveTimeBin,
+              All.HighestOccupiedTimeBin);
+
+      fprintf(Logs.FdDensity, "\nStep%s: %d, t: %g, dt: %g, highest active timebin: %d  (lowest active: %d, highest occupied: %d)\n",
+              fullmark, All.NumCurrentTiStep, All.Time, All.TimeStep, All.HighestActiveTimeBin, All.LowestActiveTimeBin,
+              All.HighestOccupiedTimeBin);
+
+      fprintf(Logs.FdHydro, "\nStep%s: %d, t: %g, dt: %g, highest active timebin: %d  (lowest active: %d, highest occupied: %d)\n",
+              fullmark, All.NumCurrentTiStep, All.Time, All.TimeStep, All.HighestActiveTimeBin, All.LowestActiveTimeBin,
+              All.HighestOccupiedTimeBin);
+    }
+
+  double dt_gravkick;
+
+#ifdef HIERARCHICAL_GRAVITY
+  /* go over all timebins, in inverse sequence so that we end up getting the cumulative force at the end */
+  for(int timebin = 0; timebin <= All.HighestActiveTimeBin; timebin++)
+    {
+      if(Sp.TimeBinSynchronized[timebin])
+        {
+          /* need to make all timebins below the current one active */
+          Sp.TimeBinsGravity.timebin_make_list_of_active_particles_up_to_timebin(timebin);
+          sumup_large_ints(1, &Sp.TimeBinsGravity.NActiveParticles, &Sp.TimeBinsGravity.GlobalNActiveParticles, Communicator);
+
+          if(Sp.TimeBinsGravity.GlobalNActiveParticles)
+            {
+              /* calculate gravity for all active particles */
+
+              TIMER_STOP(CPU_DRIFTS);
+
+              compute_grav_accelerations(timebin);
+
+              TIMER_START(CPU_DRIFTS);
+
+              mpi_printf("KICKS: 2nd gravity for hierarchical timebin=%d:  particles %lld\n", timebin,
+                         Sp.TimeBinsGravity.GlobalNActiveParticles);
+
+              integertime ti_step = timebin ? (((integertime)1) << timebin) : 0;
+
+              integertime tend = All.Ti_begstep[timebin]; /* end of step (Note: All.Ti_begstep[] has already been advanced for the next
+                                                             step at this point)   */
+              integertime tstart = tend - ti_step / 2;    /* midpoint of step */
+
+              if(All.ComovingIntegrationOn)
+                dt_gravkick = Driftfac.get_gravkick_factor(tstart, tend);
+              else
+                dt_gravkick = (tend - tstart) * All.Timebase_interval;
+
+              if(timebin < All.HighestActiveTimeBin)
+                {
+                  ti_step = (timebin + 1) ? (((integertime)1) << (timebin + 1)) : 0;
+
+                  tend = All.Ti_begstep[timebin + 1]; /* end of step (Note: All.Ti_begstep[] has already been advanced for the next
+                                                         step at this point)   */
+                  tstart = tend - ti_step / 2;        /* midpoint of step */
+
+                  if(All.ComovingIntegrationOn)
+                    dt_gravkick -= Driftfac.get_gravkick_factor(tstart, tend);
+                  else
+                    dt_gravkick -= (tend - tstart) * All.Timebase_interval;
+                }
+
+              for(int i = 0; i < Sp.TimeBinsGravity.NActiveParticles; i++)
+                {
+                  int target = Sp.TimeBinsGravity.ActiveParticleList[i];
+
+                  for(int j = 0; j < 3; j++)
+                    Sp.P[target].Vel[j] += Sp.P[target].GravAccel[j] * dt_gravkick;
+
+                  if(Sp.P[target].getType() == 0 && All.HighestOccupiedGravTimeBin == timebin)
+                    {
+                      for(int j = 0; j < 3; j++)
+                        {
+                          Sp.SphP[target].VelPred[j]       = Sp.P[target].Vel[j];
+                          Sp.SphP[target].FullGravAccel[j] = Sp.P[target].GravAccel[j];
+                        }
+                    }
+                }
+            }
+        }
+    }
+#else
+
+  Sp.TimeBinsGravity.timebin_make_list_of_active_particles_up_to_timebin(All.HighestActiveTimeBin);
+  sumup_large_ints(1, &Sp.TimeBinsGravity.NActiveParticles, &Sp.TimeBinsGravity.GlobalNActiveParticles, Communicator);
+
+  if(Sp.TimeBinsGravity.GlobalNActiveParticles)
+    {
+      TIMER_STOP(CPU_DRIFTS);
+
+      /* calculate gravity for all active particles */
+      compute_grav_accelerations(All.HighestActiveTimeBin);
+
+      TIMER_START(CPU_DRIFTS);
+
+      mpi_printf("KICKS: 2nd gravity for highest active timebin=%d:  particles %lld\n", All.HighestActiveTimeBin,
+                 Sp.TimeBinsGravity.GlobalNActiveParticles);
+
+      for(int i = 0; i < Sp.TimeBinsGravity.NActiveParticles; i++)
+        {
+          int target  = Sp.TimeBinsGravity.ActiveParticleList[i];
+          int timebin = Sp.P[target].TimeBinGrav;
+
+          integertime ti_step = (timebin) ? (((integertime)1) << (timebin)) : 0;
+          integertime tend    = All.Ti_Current;
+          integertime tstart  = tend - ti_step / 2; /* midpoint of step */
+
+          if(All.ComovingIntegrationOn)
+            dt_gravkick = Driftfac.get_gravkick_factor(tstart, tend);
+          else
+            dt_gravkick = (tend - tstart) * All.Timebase_interval;
+
+          for(int j = 0; j < 3; j++)
+            Sp.P[target].Vel[j] += Sp.P[target].GravAccel[j] * dt_gravkick;
+
+          if(Sp.P[target].getType() == 0)
+            {
+              for(int j = 0; j < 3; j++)
+                Sp.SphP[target].VelPred[j] = Sp.P[target].Vel[j];
+            }
+        }
+    }
+
+#endif
+
+#if defined(PMGRID) && defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+  if(All.PM_Ti_endstep == All.Ti_Current) /* need to do long-range kick */
+    {
+      TIMER_STOP(CPU_DRIFTS);
+
+      gravity_long_range_force();
+
+      TIMER_START(CPU_DRIFTS);
+
+      integertime ti_step = All.PM_Ti_endstep - All.PM_Ti_begstep;
+      integertime tstart  = All.PM_Ti_begstep + ti_step / 2;
+      integertime tend    = tstart + ti_step / 2;
+
+      if(All.ComovingIntegrationOn)
+        dt_gravkick = Driftfac.get_gravkick_factor(tstart, tend);
+      else
+        dt_gravkick = (tend - tstart) * All.Timebase_interval;
+
+      for(int i = 0; i < Sp.NumPart; i++)
+        for(int j = 0; j < 3; j++)
+          Sp.P[i].Vel[j] += Sp.P[i].GravPM[j] * dt_gravkick;
+
+      for(int i = 0; i < Sp.NumGas; i++)
+        if(Sp.P[i].getType() == 0)
+          for(int j = 0; j < 3; j++)
+            Sp.SphP[i].VelPred[j] = Sp.P[i].Vel[j];
+
+      gravity_set_oldacc(All.HighestActiveTimeBin);
+    }
+#else
+  gravity_set_oldacc(All.HighestActiveTimeBin);
+#endif
+
+  TIMER_STOP(CPU_DRIFTS);
+}
+
+void sim::do_hydro_step_first_half(void)
+{
+  if(All.NumCurrentTiStep == 0) /* special domain decomposition now that we know the timebins of both gravity and hydro */
+    {
+      Sp.mark_active_timebins();
+
+      NgbTree.treefree();
+
+      Domain.domain_free();
+      Domain.domain_decomposition(STANDARD);
+
+      NgbTree.treeallocate(Sp.NumGas, &Sp, &Domain);
+      NgbTree.treebuild(Sp.NumGas, NULL);
+    }
+
+  /* now we can calculate the hydro forces */
+  hydro_force(FIRST_HALF_STEP); /* computes hydrodynamical accelerations and the rate of change of entropy,
+                                   and applies them where appropriate directly (half-step kicks)  */
+}
+
+void sim::do_hydro_step_second_half(void)
+{
+  /* now we can calculate the hydro forces */
+  hydro_force(SECOND_HALF_STEP); /* computes hydrodynamical accelerations and the rate of change of entropy,
+                                    and applies them where appropriate directly (half-step kicks)  */
+}
+
+/*! This function is the driver routine for the calculation of the hydrodynamical
+ *  forces and the rate of change of entropy due to shock heating for all active
+ *  particles.
+ */
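+/* Illustrative note (hypothetical integer values): for a particle in hydro timebin `timebin`, the kicks below act
+ * over half an integer step ti_step = 2^timebin. On the first half step the kick interval is
+ * [Ti_Current, Ti_Current + ti_step/2], on the second half step it is [Ti_Current - ti_step/2, Ti_Current], i.e.
+ * always from/to the midpoint of the step. For example, with Ti_Current = 16 and timebin = 3 (ti_step = 8), the
+ * second-half kick covers the integer interval [12, 16].
+ */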
+void sim::hydro_force(int step_indicator)
+{
+  if(Sp.TimeBinsHydro.GlobalNActiveParticles == 0)
+    return;
+
+  /* Create list of targets. */
+  int *targetlist = (int *)Mem.mymalloc("targetlist", Sp.NumGas * sizeof(int));
+
+  struct old_hydro_accel
+  {
+    MyFloat HydroAccel[3];
+  };
+  old_hydro_accel *Old = NULL;
+
+  if(step_indicator == SECOND_HALF_STEP)
+    Old = (old_hydro_accel *)Mem.mymalloc("Old", Sp.TimeBinsHydro.NActiveParticles * sizeof(old_hydro_accel));
+
+  int Nhydroforces = 0;
+
+  for(int i = 0; i < Sp.TimeBinsHydro.NActiveParticles; i++)
+    {
+      int target = Sp.TimeBinsHydro.ActiveParticleList[i];
+
+      if(target < 0 || target >= Sp.NumGas)
+        Terminate("target=%d i=%d\n", target, i);
+
+      if(step_indicator == SECOND_HALF_STEP)
+        {
+          Old[i].HydroAccel[0] = Sp.SphP[target].HydroAccel[0];
+          Old[i].HydroAccel[1] = Sp.SphP[target].HydroAccel[1];
+          Old[i].HydroAccel[2] = Sp.SphP[target].HydroAccel[2];
+        }
+
+      targetlist[Nhydroforces++] = target;
+    }
+
+#ifdef REUSE_HYDRO_ACCELERATIONS_FROM_PREVIOUS_STEP
+  /* in this case, the hydro accelerations from the previous step are simply reused for the first half-step kick */
+  if(step_indicator != FIRST_HALF_STEP)
+#endif
+    {
+      NgbTree.hydro_forces_determine(Nhydroforces, targetlist);
+    }
+
+  /* let's now do the hydrodynamical kicks */
+  for(int i = 0; i < Nhydroforces; i++)
+    {
+      int target = Sp.TimeBinsHydro.ActiveParticleList[i];
+
+      int timebin = Sp.P[target].getTimeBinHydro();
+
+      integertime tstart, tend, ti_step = timebin ? (((integertime)1) << timebin) : 0;
+
+      if(step_indicator == SECOND_HALF_STEP)
+        {
+          tend   = All.Ti_Current;
+          tstart = tend - ti_step / 2; /* midpoint of step */
+        }
+      else
+        {
+          tstart = All.Ti_Current;
+          tend   = tstart + ti_step / 2; /* midpoint of step */
+        }
+
+      double dt_hydrokick, dt_entr = (tend - tstart) * All.Timebase_interval;
+
+      if(All.ComovingIntegrationOn)
+        dt_hydrokick = Driftfac.get_hydrokick_factor(tstart, tend);
+      else
+        dt_hydrokick = dt_entr;
+
+      Sp.SphP[target].Entropy += Sp.SphP[target].DtEntropy * dt_entr;
+
+      Sp.P[target].Vel[0] += Sp.SphP[target].HydroAccel[0] * dt_hydrokick;
+      Sp.P[target].Vel[1] += Sp.SphP[target].HydroAccel[1] * dt_hydrokick;
+      Sp.P[target].Vel[2] += Sp.SphP[target].HydroAccel[2] * dt_hydrokick;
+
+      if(step_indicator == SECOND_HALF_STEP)
+        {
+          Sp.SphP[target].EntropyPred = Sp.SphP[target].Entropy;
+          Sp.SphP[target].set_thermodynamic_variables();
+
+          Sp.SphP[target].VelPred[0] += (Sp.SphP[target].HydroAccel[0] - Old[i].HydroAccel[0]) * dt_hydrokick;
+          Sp.SphP[target].VelPred[1] += (Sp.SphP[target].HydroAccel[1] - Old[i].HydroAccel[1]) * dt_hydrokick;
+          Sp.SphP[target].VelPred[2] += (Sp.SphP[target].HydroAccel[2] - Old[i].HydroAccel[2]) * dt_hydrokick;
+
+          /* note: if there is no gravity, we should instead set VelPred = Vel (if this is not done anymore in the gravity
+           * routine)
+           */
+        }
+
+#ifdef TIMEDEP_ART_VISC
+        /*     double csnd_over_h = Sp.SphP[i].Csnd / Sp.SphP[i].Hsml;
+             double f           = fabs(Sp.SphP[i].DivVel) / (fabs(Sp.SphP[i].DivVel) + Sp.SphP[i].CurlVel + 0.0001 * csnd_over_h /
+           NgbTree.fac_mu);
+
+             Sp.SphP[i].Dtalpha =
+                 -(Sp.SphP[i].alpha - All.AlphaMin) * All.DecayTime * 0.5 * Sp.SphP[i].MaxSignalVel / (Sp.SphP[i].Hsml *
+           NgbTree.fac_mu) + f * All.ViscSource * std::max<double>(0.0, -Sp.SphP[i].DivVel); if(All.ComovingIntegrationOn)
+               {
+                 Sp.SphP[i].Dtalpha /= (All.cf_hubble_a * All.Time * All.Time);
+               }*/
+        // TODO check
+#endif
+    }
+
+  if(step_indicator == SECOND_HALF_STEP)
+    Mem.myfree(Old);
+
+  Mem.myfree(targetlist);
+}
diff --git a/src/time_integration/predict.cc b/src/time_integration/predict.cc
new file mode 100644
index 0000000000000000000000000000000000000000..922500d350e6c7f58bb3fff338b477c7fa174aac
--- /dev/null
+++ b/src/time_integration/predict.cc
@@ -0,0 +1,461 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  predict.cc
+ *
+ *  \brief find the next sync point, drift particles forward in time, and (re)build the timebin lists
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/simparticles.h"
+#include "../lightcone/lightcone.h"
+#include "../logs/logs.h"
+#include "../logs/timer.h"
+#include "../main/main.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../system/system.h"
+#include "../time_integration/driftfac.h"
+#include "../time_integration/timestep.h"
+
+/*! \brief Rebuilds the timebin linked lists from scratch.
+ *
+ *  This function counts the number of particles in each timebin and rebuilds the
+ *  linked lists containing the particles of each timebin. Afterwards the
+ *  list of active particles is updated by make_list_of_active_particles().
+ *
+ *  The linked lists for each timebin are stored in #FirstInTimeBin[], #LastInTimeBin[],
+ *  #PrevInTimeBin[] and #NextInTimeBin[]. The per-bin particle counters are
+ *  #TimeBinCount, kept separately in the gravity and hydro timebin structures.
+ */
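+/* Illustrative sketch (hypothetical indices): if timebin 5 ends up containing the particles 2, 7 and 9 (in this
+ * order), then after this function
+ *
+ *     FirstInTimeBin[5] = 2,  LastInTimeBin[5] = 9,  TimeBinCount[5] = 3,
+ *     NextInTimeBin[2] = 7,  NextInTimeBin[7] = 9,  NextInTimeBin[9] = -1,
+ *     PrevInTimeBin[2] = -1, PrevInTimeBin[7] = 2,  PrevInTimeBin[9] = 7.
+ */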
+void simparticles::reconstruct_timebins(void)
+{
+  TIMER_START(CPU_TIMELINE);
+
+  for(int bin = 0; bin < TIMEBINS; bin++)
+    {
+      TimeBinsHydro.TimeBinCount[bin]   = 0;
+      TimeBinsHydro.FirstInTimeBin[bin] = -1;
+      TimeBinsHydro.LastInTimeBin[bin]  = -1;
+
+      TimeBinsGravity.TimeBinCount[bin]   = 0;
+      TimeBinsGravity.FirstInTimeBin[bin] = -1;
+      TimeBinsGravity.LastInTimeBin[bin]  = -1;
+
+#ifdef STARFORMATION
+      TimeBinSfr[bin] = 0;
+#endif
+    }
+
+  for(int i = 0; i < NumPart; i++)
+    {
+      int bin = P[i].TimeBinGrav;
+
+      if(TimeBinsGravity.TimeBinCount[bin] > 0)
+        {
+          TimeBinsGravity.PrevInTimeBin[i]                                  = TimeBinsGravity.LastInTimeBin[bin];
+          TimeBinsGravity.NextInTimeBin[i]                                  = -1;
+          TimeBinsGravity.NextInTimeBin[TimeBinsGravity.LastInTimeBin[bin]] = i;
+          TimeBinsGravity.LastInTimeBin[bin]                                = i;
+        }
+      else
+        {
+          TimeBinsGravity.FirstInTimeBin[bin] = TimeBinsGravity.LastInTimeBin[bin] = i;
+          TimeBinsGravity.PrevInTimeBin[i] = TimeBinsGravity.NextInTimeBin[i] = -1;
+        }
+
+      TimeBinsGravity.TimeBinCount[bin]++;
+
+      if(P[i].getType() == 0)
+        {
+          bin = P[i].getTimeBinHydro();
+
+          if(TimeBinsHydro.TimeBinCount[bin] > 0)
+            {
+              TimeBinsHydro.PrevInTimeBin[i]                                = TimeBinsHydro.LastInTimeBin[bin];
+              TimeBinsHydro.NextInTimeBin[i]                                = -1;
+              TimeBinsHydro.NextInTimeBin[TimeBinsHydro.LastInTimeBin[bin]] = i;
+              TimeBinsHydro.LastInTimeBin[bin]                              = i;
+            }
+          else
+            {
+              TimeBinsHydro.FirstInTimeBin[bin] = TimeBinsHydro.LastInTimeBin[bin] = i;
+              TimeBinsHydro.PrevInTimeBin[i] = TimeBinsHydro.NextInTimeBin[i] = -1;
+            }
+
+          TimeBinsHydro.TimeBinCount[bin]++;
+
+#ifdef STARFORMATION
+          TimeBinSfr[bin] += SphP[i].Sfr;
+#endif
+        }
+    }
+
+  make_list_of_active_particles();
+
+  TIMER_STOP(CPU_TIMELINE);
+}
+
+/*! \brief This function finds the next synchronization point of the system
+ *  (i.e. the earliest point in time at which any of the particles needs a new
+ *  force computation).
+ *
+ *  The function itself only determines the next kick time on the integer timeline.
+ *  The actual drifting of all particles (including inactive ones) to this sync point
+ *  is done by drift_all_particles(), and the linked list of active particles is then
+ *  updated by make_list_of_active_particles(), where particles become active/inactive.
+ */
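+/* Worked example with hypothetical values: if All.Ti_Current = 24 and only timebins 2 and 3 are occupied
+ * (dt_bin = 4 and 8), the candidate kick times are (24/4)*4 + 4 = 28 and (24/8)*8 + 8 = 32, so the function
+ * returns ti_next_kick_global = 28 after the MPI minimum reduction.
+ */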
+integertime simparticles::find_next_sync_point(void)
+{
+  /* find the next kick time */
+  integertime ti_next_kick = TIMEBASE;
+
+  for(int n = 0; n < TIMEBINS; n++)
+    {
+      if(TimeBinsGravity.TimeBinCount[n] || TimeBinsHydro.TimeBinCount[n])
+        {
+          integertime ti_next_for_bin;
+          if(n > 0)
+            {
+              integertime dt_bin = (((integertime)1) << n);
+              ti_next_for_bin    = (All.Ti_Current / dt_bin) * dt_bin + dt_bin; /* next kick time for this timebin */
+            }
+          else
+            {
+              ti_next_for_bin = All.Ti_Current;
+            }
+
+          if(ti_next_for_bin < ti_next_kick)
+            ti_next_kick = ti_next_for_bin;
+        }
+    }
+
+  integertime ti_next_kick_global;
+#ifdef ENLARGE_DYNAMIC_RANGE_IN_TIME
+  minimum_large_ints(1, &ti_next_kick, &ti_next_kick_global, Communicator);
+#else
+  MPI_Allreduce(&ti_next_kick, &ti_next_kick_global, 1, MPI_INT, MPI_MIN, Communicator);
+#endif
+
+  return ti_next_kick_global;
+}
+
+void simparticles::mark_active_timebins(void)
+{
+  int lowest_active_bin = TIMEBINS, highest_active_bin = 0;
+  int lowest_occupied_bin = TIMEBINS, highest_occupied_bin = 0;
+  int lowest_occupied_gravity_bin = TIMEBINS, highest_occupied_gravity_bin = 0;
+  int highest_synchronized_bin = 0;
+  int nsynchronized_gravity = 0, nsynchronized_hydro = 0;
+
+  /* mark the bins that will be synchronized/active */
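+  /* Illustrative example with hypothetical values: at All.Ti_Current = 24, the bins with dt_bin = 1, 2, 4, 8
+     (n = 0..3) satisfy Ti_Current % dt_bin == 0 and are marked as synchronized, while the bins with
+     dt_bin = 16, 32, ... are not. */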
+
+  for(int n = 0; n < TIMEBINS; n++)
+    {
+      if(TimeBinsGravity.TimeBinCount[n])
+        {
+          if(highest_occupied_gravity_bin < n)
+            highest_occupied_gravity_bin = n;
+
+          if(lowest_occupied_gravity_bin > n)
+            lowest_occupied_gravity_bin = n;
+        }
+
+      int active = TimeBinsHydro.TimeBinCount[n] + TimeBinsGravity.TimeBinCount[n];
+
+      if(active)
+        {
+          if(highest_occupied_bin < n)
+            highest_occupied_bin = n;
+
+          if(lowest_occupied_bin > n)
+            lowest_occupied_bin = n;
+        }
+
+      integertime dt_bin = (((integertime)1) << n);
+
+      if((All.Ti_Current % dt_bin) == 0)
+        {
+          TimeBinSynchronized[n] = 1;
+          All.Ti_begstep[n]      = All.Ti_Current;
+
+          nsynchronized_gravity += TimeBinsGravity.TimeBinCount[n];
+          nsynchronized_hydro += TimeBinsHydro.TimeBinCount[n];
+
+          if(highest_synchronized_bin < n)
+            highest_synchronized_bin = n;
+
+          if(active)
+            {
+              if(highest_active_bin < n)
+                highest_active_bin = n;
+
+              if(lowest_active_bin > n)
+                lowest_active_bin = n;
+            }
+        }
+      else
+        TimeBinSynchronized[n] = 0;
+    }
+
+  int lowest_in[3], lowest_out[3];
+  lowest_in[0] = lowest_occupied_bin;
+  lowest_in[1] = lowest_occupied_gravity_bin;
+  lowest_in[2] = lowest_active_bin;
+  MPI_Allreduce(lowest_in, lowest_out, 3, MPI_INT, MPI_MIN, Communicator);
+  All.LowestOccupiedTimeBin     = lowest_out[0];
+  All.LowestOccupiedGravTimeBin = lowest_out[1];
+  All.LowestActiveTimeBin       = lowest_out[2];
+
+  int highest_in[4], highest_out[4];
+  highest_in[0] = highest_occupied_bin;
+  highest_in[1] = highest_occupied_gravity_bin;
+  highest_in[2] = highest_active_bin;
+  highest_in[3] = highest_synchronized_bin;
+  MPI_Allreduce(highest_in, highest_out, 4, MPI_INT, MPI_MAX, Communicator);
+  All.HighestOccupiedTimeBin     = highest_out[0];
+  All.HighestOccupiedGravTimeBin = highest_out[1];
+  All.HighestActiveTimeBin       = highest_out[2];
+  All.HighestSynchronizedTimeBin = highest_out[3];
+
+  /* note: the lowest synchronized bin is always 1 */
+
+  int input_ints[2 + 2 * TIMEBINS];
+  long long output_longs[2 + 2 * TIMEBINS];
+
+  input_ints[0] = nsynchronized_hydro;
+  input_ints[1] = nsynchronized_gravity;
+  memcpy(input_ints + 2, TimeBinsGravity.TimeBinCount, TIMEBINS * sizeof(int));
+  memcpy(input_ints + 2 + TIMEBINS, TimeBinsHydro.TimeBinCount, TIMEBINS * sizeof(int));
+
+  sumup_large_ints(2 + 2 * TIMEBINS, input_ints, output_longs, Communicator);
+
+  All.GlobalNSynchronizedHydro   = output_longs[0];
+  All.GlobalNSynchronizedGravity = output_longs[1];
+  long long *tot_count_grav      = output_longs + 2;
+  long long *tot_count_sph       = output_longs + 2 + TIMEBINS;
+
+  long long tot_grav = 0, tot_sph = 0;
+
+  for(int n = 0; n < TIMEBINS; n++)
+    {
+      tot_grav += tot_count_grav[n];
+      tot_sph += tot_count_sph[n];
+
+      if(n > 0)
+        {
+          tot_count_grav[n] += tot_count_grav[n - 1];
+          tot_count_sph[n] += tot_count_sph[n - 1];
+        }
+    }
+
+  All.SmallestTimeBinWithDomainDecomposition = All.HighestOccupiedTimeBin;
+
+  for(int n = All.HighestOccupiedTimeBin; n >= All.LowestOccupiedTimeBin; n--)
+    {
+      if(tot_count_grav[n] > All.ActivePartFracForNewDomainDecomp * tot_grav ||
+         tot_count_sph[n] > All.ActivePartFracForNewDomainDecomp * tot_sph)
+        All.SmallestTimeBinWithDomainDecomposition = n;
+    }
+}
+
+void simparticles::drift_all_particles(void)
+{
+  TIMER_START(CPU_DRIFTS);
+
+  for(int i = 0; i < NumPart; i++)
+    {
+#ifdef LIGHTCONE_MASSMAPS
+      int flag = drift_particle(&P[i], &SphP[i], All.Ti_Current);
+
+      if(flag)
+        {
+          MPI_Allreduce(MPI_IN_PLACE, &flag, 1, MPI_INT, MPI_MAX, Communicator);
+          LightCone->lightcone_massmap_flush(0);
+        }
+#else
+      drift_particle(&P[i], &SphP[i], All.Ti_Current);
+#endif
+    }
+
+#ifdef LIGHTCONE_MASSMAPS
+  int flag = 0;
+  do
+    {
+      flag = 0;
+      MPI_Allreduce(MPI_IN_PLACE, &flag, 1, MPI_INT, MPI_MAX, Communicator);
+      if(flag)
+        LightCone->lightcone_massmap_flush(0);
+    }
+  while(flag);
+#endif
+
+  TIMER_STOP(CPU_DRIFTS);
+}
+
+/*! \brief This function drifts a single particle to the integer time time1
+ *
+ * @param P     pointer to the particle to be drifted
+ * @param SphP  pointer to the corresponding SPH particle data (only used for gas particles)
+ * @param time1 integer time to which the particle is drifted
+ * @param ignore_light_cone if set, the test for light-cone crossings is skipped
+ */
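+/* Illustrative note: in the comoving case the drift interval is taken from Driftfac.get_drift_factor(), which
+ * (schematically) evaluates the cosmological drift factor integral  int dt / a^2  between the two integer times;
+ * in the non-comoving case it reduces to the plain difference (time1 - time0) * All.Timebase_interval.
+ */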
+int simparticles::drift_particle(particle_data *P, sph_particle_data *SphP, integertime time1, bool ignore_light_cone)
+{
+  int buffer_full_flag = 0;
+#ifndef LEAN
+  while(P->access.test_and_set(std::memory_order_acquire))
+    {
+      // acquire spin lock
+    }
+#endif
+
+  integertime time0 = P->Ti_Current.load(std::memory_order_acquire);
+
+  if(time1 == time0)
+    {
+#ifndef LEAN
+      P->access.clear(std::memory_order_release);
+#endif
+      return buffer_full_flag;
+    }
+
+  if(time1 < time0)
+    Terminate("no prediction into past allowed: time0=%lld time1=%lld\n", (long long)time0, (long long)time1);
+
+  double dt_drift;
+
+  if(All.ComovingIntegrationOn)
+    dt_drift = Driftfac.get_drift_factor(time0, time1);
+  else
+    dt_drift = (time1 - time0) * All.Timebase_interval;
+
+#ifdef LIGHTCONE
+  if(ignore_light_cone == false)
+    buffer_full_flag = LightCone->lightcone_test_for_particle_addition(P, time0, time1, dt_drift);
+#endif
+
+  double posdiff[3];
+  for(int j = 0; j < 3; j++)
+    posdiff[j] = P->Vel[j] * dt_drift;
+
+  MyIntPosType delta[3];
+  pos_to_signedintpos(posdiff, (MySignedIntPosType *)delta);
+
+  for(int j = 0; j < 3; j++)
+    P->IntPos[j] += delta[j];
+
+  constrain_intpos(P->IntPos); /* will only do something if we have a stretched box */
+
+  if(P->getType() == 0)
+    {
+      double dt_hydrokick, dt_entr, dt_gravkick;
+
+      if(All.ComovingIntegrationOn)
+        {
+          dt_entr      = (time1 - time0) * All.Timebase_interval;
+          dt_hydrokick = Driftfac.get_hydrokick_factor(time0, time1);
+          dt_gravkick  = Driftfac.get_gravkick_factor(time0, time1);
+        }
+      else
+        dt_gravkick = dt_entr = dt_hydrokick = dt_drift;
+
+      for(int j = 0; j < 3; j++)
+        {
+          SphP->VelPred[j] += SphP->HydroAccel[j] * dt_hydrokick;
+#ifdef HIERARCHICAL_GRAVITY
+          SphP->VelPred[j] += SphP->FullGravAccel[j] * dt_gravkick;
+#else
+          SphP->VelPred[j] += P->GravAccel[j] * dt_gravkick;
+#endif
+#if defined(PMGRID) && !defined(TREEPM_NOTIMESPLIT)
+          SphP->VelPred[j] += P->GravPM[j] * dt_gravkick;
+#endif
+        }
+
+      SphP->EntropyPred += SphP->DtEntropy * dt_entr;
+
+      SphP->Density += SphP->DtDensity * dt_drift;
+
+      SphP->Hsml += SphP->DtHsml * dt_drift;
+
+#ifdef PRESSURE_ENTROPY_SPH
+      SphP->PressureSphDensity += SphP->DtPressureSphDensity * dt_drift;
+
+      SphP->EntropyToInvGammaPred = pow(SphP->EntropyPred, 1.0 / GAMMA);
+#endif
+
+      SphP->set_thermodynamic_variables();
+    }
+
+  P->Ti_Current = time1;
+
+#ifndef LEAN
+  P->access.clear(std::memory_order_release);
+#endif
+
+  return buffer_full_flag;
+}
+
+void simparticles::make_list_of_active_particles(void)
+{
+  TIMER_START(CPU_DRIFTS);
+
+  TimeBinsHydro.NActiveParticles = 0;
+
+  for(int n = 0; n < TIMEBINS; n++)
+    {
+      if(TimeBinSynchronized[n])
+        {
+          for(int i = TimeBinsHydro.FirstInTimeBin[n]; i >= 0; i = TimeBinsHydro.NextInTimeBin[i])
+            {
+              if(P[i].getType() == 0)
+                {
+                  if(P[i].getTimeBinHydro() != n)
+                    Terminate("P[i].TimeBinHydro=%d != timebin=%d", P[i].getTimeBinHydro(), n);
+
+                  if(P[i].Ti_Current.load(std::memory_order_acquire) != All.Ti_Current)
+                    drift_particle(&P[i], &SphP[i], All.Ti_Current);
+
+                  TimeBinsHydro.ActiveParticleList[TimeBinsHydro.NActiveParticles++] = i;
+                }
+            }
+        }
+    }
+
+  TimeBinsGravity.NActiveParticles = 0;
+
+  for(int n = 0; n < TIMEBINS; n++)
+    {
+      if(TimeBinSynchronized[n])
+        {
+          for(int i = TimeBinsGravity.FirstInTimeBin[n]; i >= 0; i = TimeBinsGravity.NextInTimeBin[i])
+            {
+              if(P[i].Ti_Current.load(std::memory_order_acquire) != All.Ti_Current)
+                drift_particle(&P[i], &SphP[i], All.Ti_Current);
+
+              TimeBinsGravity.ActiveParticleList[TimeBinsGravity.NActiveParticles++] = i;
+            }
+        }
+    }
+
+  int in[2] = {TimeBinsGravity.NActiveParticles, TimeBinsHydro.NActiveParticles};
+  long long out[2];
+
+  sumup_large_ints(2, in, out, Communicator);
+
+  TimeBinsGravity.GlobalNActiveParticles = out[0];
+  TimeBinsHydro.GlobalNActiveParticles   = out[1];
+
+  TIMER_STOP(CPU_DRIFTS);
+}
diff --git a/src/time_integration/timestep.cc b/src/time_integration/timestep.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c6087b717e80eb6b47db56436ee602e345350de9
--- /dev/null
+++ b/src/time_integration/timestep.cc
@@ -0,0 +1,677 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  timestep.cc
+ *
+ *  \brief routines for determining the timesteps of particles
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../cooling_sfr/cooling.h"
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../data/simparticles.h"
+#include "../logs/logs.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../system/system.h"
+#include "../time_integration/driftfac.h"
+#include "../time_integration/timestep.h"
+
+/*! The time integration advances the system in momentum space by applying the 'kick' operation after the
+ *  forces have been computed, and assigns new timesteps to the particles. At start-up only a half-step kick is
+ *  carried out, as is also the case at the very end of the simulation. In between, the half-step kick that ends
+ *  the previous timestep and the half-step kick that starts the new one are combined into a single operation.
+ *  The routines below determine the sizes of these timesteps.
+ */
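+
+/* Illustrative sketch of the resulting leapfrog (kick-drift-kick) pattern for a single particle with timestep dt
+ * (hypothetical, ignoring the splitting into separate gravity and hydro bins):
+ *
+ *     v(t + dt/2) = v(t)        + a(t)      * dt/2     (half-step kick)
+ *     x(t + dt)   = x(t)        + v(t+dt/2) * dt       (drift)
+ *     v(t + dt)   = v(t + dt/2) + a(t+dt)   * dt/2     (half-step kick)
+ *
+ * where, except at start-up and at the very end, the closing half-kick of one step and the opening half-kick of
+ * the next are applied together.
+ */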
+
+void sim::find_hydro_timesteps(void)
+{
+#ifndef FORCE_EQUAL_TIMESTEPS
+
+  All.set_cosmo_factors_for_current_time();
+
+  NgbTree.tree_based_timesteps();
+
+  TIMER_START(CPU_TIMELINE);
+
+  Sp.assign_hydro_timesteps();
+
+  TIMER_STOP(CPU_TIMELINE);
+
+#endif
+}
+
+void simparticles::assign_hydro_timesteps(void)
+{
+  /* Now assign new timesteps for the hydro particles that are synchronized */
+  for(int i = 0; i < TimeBinsHydro.NActiveParticles; i++)
+    {
+      int target = TimeBinsHydro.ActiveParticleList[i];
+      if(P[target].getType() != 0)
+        continue;
+
+      if(TimeBinSynchronized[P[target].getTimeBinHydro()])
+        {
+          integertime ti_step = get_timestep_hydro(target);
+
+          int bin;
+          timebins_get_bin_and_do_validity_checks(ti_step, &bin, P[target].getTimeBinHydro());
+
+#ifdef SELFGRAVITY
+          /* we enforce that the hydro timestep is nested inside the gravity step */
+          if(bin > P[target].TimeBinGrav)
+            bin = P[target].TimeBinGrav;
+#endif
+
+          TimeBinsHydro.timebin_move_particle(target, P[target].getTimeBinHydro(), bin);
+
+          P[target].setTimeBinHydro(bin);
+        }
+    }
+}
+
+#ifdef FORCE_EQUAL_TIMESTEPS
+void sim::find_global_timesteps(void)
+{
+  All.set_cosmo_factors_for_current_time();
+
+  NgbTree.tree_based_timesteps();
+
+  TIMER_START(CPU_TIMELINE);
+
+  integertime globTimeStep = TIMEBASE;
+
+#if defined(PMGRID) && !defined(TREEPM_NOTIMESPLIT)
+  globTimeStep = Sp.get_timestep_pm();
+#endif
+
+#if defined(SELFGRAVITY) || defined(EXTERNALGRAVITY)
+  for(int idx = 0; idx < Sp.TimeBinsGravity.NActiveParticles; idx++)
+    {
+      int i = Sp.TimeBinsGravity.ActiveParticleList[idx];
+
+      integertime ti_step = Sp.get_timestep_grav(i);
+      if(ti_step < globTimeStep)
+        globTimeStep = ti_step;
+    }
+#endif
+
+  for(int idx = 0; idx < Sp.TimeBinsHydro.NActiveParticles; idx++)
+    {
+      int i = Sp.TimeBinsHydro.ActiveParticleList[idx];
+      if(Sp.P[i].getType() != 0)
+        continue;
+
+      integertime ti_step = Sp.get_timestep_hydro(i);
+      if(ti_step < globTimeStep)
+        globTimeStep = ti_step;
+    }
+
+#ifdef ENLARGE_DYNAMIC_RANGE_IN_TIME
+  minimum_large_ints(1, &globTimeStep, &All.GlobalTimeStep, Communicator);
+#else
+  MPI_Allreduce(&globTimeStep, &All.GlobalTimeStep, 1, MPI_INT, MPI_MIN, Communicator);
+#endif
+
+  for(int idx = 0; idx < Sp.TimeBinsGravity.NActiveParticles; idx++)
+    {
+      int target = Sp.TimeBinsGravity.ActiveParticleList[idx];
+
+      int bin;
+
+      Sp.timebins_get_bin_and_do_validity_checks(All.GlobalTimeStep, &bin, Sp.P[target].TimeBinGrav);
+      Sp.TimeBinsGravity.timebin_move_particle(target, Sp.P[target].TimeBinGrav, bin);
+      Sp.P[target].TimeBinGrav = bin;
+    }
+
+  for(int idx = 0; idx < Sp.TimeBinsHydro.NActiveParticles; idx++)
+    {
+      int target = Sp.TimeBinsHydro.ActiveParticleList[idx];
+      if(Sp.P[target].getType() != 0)
+        continue;
+
+      int bin;
+
+      Sp.timebins_get_bin_and_do_validity_checks(All.GlobalTimeStep, &bin, Sp.P[target].getTimeBinHydro());
+      Sp.TimeBinsHydro.timebin_move_particle(target, Sp.P[target].getTimeBinHydro(), bin);
+#ifndef LEAN
+      Sp.P[target].TimeBinHydro = bin;
+#endif
+    }
+
+  TIMER_STOP(CPU_TIMELINE);
+}
+#endif
+
+int simparticles::test_if_grav_timestep_is_too_large(int p, int bin)
+{
+  integertime ti_step_bin = bin ? (((integertime)1) << bin) : 0;
+
+  integertime ti_step = get_timestep_grav(p);
+
+  if(ti_step < ti_step_bin)
+    return 1;
+  else
+    return 0;
+}
+
+/*! This function returns the maximum allowed gravitational timestep of a particle, expressed in terms of the
+ *  integer mapping that is used to represent the total simulated timespan. The timestep follows from the
+ *  "kinematic" criterion based on the gravitational softening and the current physical acceleration, and is
+ *  additionally capped by MaxSizeTimestep (and, for time-split TreePM runs, by the long-range displacement
+ *  constraint DtDisplacement), while values below MinSizeTimestep abort the run unless NO_STOP_BELOW_MINTIMESTEP
+ *  is set.
+ */
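+/* Illustrative sketch (not a separate algorithm): the "kinematic" criterion implemented below is, schematically,
+ *
+ *     dt_grav = sqrt( 2 * ErrTolIntAccuracy * a * epsilon_soft / |a_phys| ) ,
+ *
+ * i.e. roughly the time needed to fall through a fraction of the softening length epsilon_soft at the current
+ * physical acceleration a_phys; for comoving integration it is converted with the factor All.cf_hubble_a and then
+ * mapped onto the integer timeline with ti_step = dt / All.Timebase_interval.
+ */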
+integertime simparticles::get_timestep_grav(int p /*!< particle index */)
+{
+  double ax = All.cf_a2inv * P[p].GravAccel[0];
+  double ay = All.cf_a2inv * P[p].GravAccel[1];
+  double az = All.cf_a2inv * P[p].GravAccel[2];
+
+#if defined(PMGRID) && !defined(TREEPM_NOTIMESPLIT)
+  ax += All.cf_a2inv * P[p].GravPM[0];
+  ay += All.cf_a2inv * P[p].GravPM[1];
+  az += All.cf_a2inv * P[p].GravPM[2];
+#endif
+
+  double ac = sqrt(ax * ax + ay * ay + az * az); /* this is now the physical acceleration */
+
+  if(ac == 0)
+    ac = MIN_FLOAT_NUMBER;
+
+    /* determine the "kinematic" timestep dt_grav, in physical units */
+#if NSOFTCLASSES > 1
+  double dt_grav = sqrt(2 * All.ErrTolIntAccuracy * All.cf_atime * All.SofteningTable[P[p].getSofteningClass()] / ac);
+#else
+  double dt_grav = sqrt(2 * All.ErrTolIntAccuracy * All.cf_atime * All.SofteningTable[0] / ac);
+#endif
+
+  double dt = dt_grav;
+
+  /* convert the physical timestep to dloga if needed. Note: If comoving integration has not been selected,
+     All.cf_hubble_a=1.
+   */
+  dt *= All.cf_hubble_a;
+
+  if(dt >= All.MaxSizeTimestep)
+    dt = All.MaxSizeTimestep;
+
+#if defined(SELFGRAVITY) && defined(PMGRID) && !defined(TREEPM_NOTIMESPLIT)
+  if(dt >= All.DtDisplacement)
+    dt = All.DtDisplacement;
+#endif
+
+  if(dt < All.MinSizeTimestep)
+    {
+      dt = All.MinSizeTimestep;
+#ifndef NO_STOP_BELOW_MINTIMESTEP
+      Terminate(
+          "Timestep wants to be below the limit MinSizeTimestep=%g\n"
+          "Part-ID=%lld task=%d type=%d dtgrav=%g ac=%g soft=%g\n",
+          All.MinSizeTimestep, (long long)P[p].ID.get(), ThisTask, P[p].getType(), dt, ac,
+          All.SofteningTable[P[p].getSofteningClass()]);
+#endif
+    }
+
+  integertime ti_step = (integertime)(dt / All.Timebase_interval);
+
+  if(!(ti_step > 0 && ti_step < TIMEBASE))
+    {
+      double pos[3];
+      intpos_to_pos(P[p].IntPos, pos); /* converts the integer coordinates to floating point */
+
+      Terminate(
+          "\nError: A timestep of size zero was assigned on the integer timeline!\n"
+          "We better stop.\n"
+          "Task=%d Part-ID=%lld type=%d dt_grav=%g dt=%g tibase=%g ac=%g xyz=(%g|%g|%g) vel=(%g|%g|%g) tree=(%g|%g|%g) mass=%g\n\n",
+          ThisTask, (long long)P[p].ID.get(), P[p].getType(), dt_grav, dt, All.Timebase_interval, ac, pos[0], pos[1], pos[2],
+          P[p].Vel[0], P[p].Vel[1], P[p].Vel[2], P[p].GravAccel[0], P[p].GravAccel[1], P[p].GravAccel[2], P[p].getMass());
+
+      myflush(stdout);
+      Terminate("integer timestep reached zero");
+    }
+
+  return ti_step;
+}
+
+#if defined(PMGRID) && !defined(TREEPM_NOTIMESPLIT)
+integertime simparticles::get_timestep_pm(void)
+{
+  integertime ti_step = TIMEBASE;
+  while(ti_step > (All.DtDisplacement / All.Timebase_interval))
+    ti_step >>= 1;
+
+  if(ti_step > (All.PM_Ti_endstep - All.PM_Ti_begstep)) /* PM-timestep wants to increase */
+    {
+      int bin    = get_timestep_bin(ti_step);
+      int binold = get_timestep_bin(All.PM_Ti_endstep - All.PM_Ti_begstep);
+
+      while(TimeBinSynchronized[bin] == 0 && bin > binold) /* make sure the new step is synchronized */
+        bin--;
+
+      ti_step = bin ? (((integertime)1) << bin) : 0;
+    }
+
+  if(All.Ti_Current == TIMEBASE) /* here we finish the last timestep */
+    ti_step = 0;
+
+  return ti_step;
+}
+#endif
+
+integertime simparticles::get_timestep_hydro(int p /*!< particle index */)
+{
+  if(P[p].getType() != 0)
+    Terminate("P[p].getType() != 0");
+
+  double ax = All.cf_afac2 * SphP[p].HydroAccel[0];
+  double ay = All.cf_afac2 * SphP[p].HydroAccel[1];
+  double az = All.cf_afac2 * SphP[p].HydroAccel[2];
+
+  ax += All.cf_a2inv * P[p].GravAccel[0];
+  ay += All.cf_a2inv * P[p].GravAccel[1];
+  az += All.cf_a2inv * P[p].GravAccel[2];
+
+#if defined(PMGRID) && !defined(TREEPM_NOTIMESPLIT)
+  ax += All.cf_a2inv * P[p].GravPM[0];
+  ay += All.cf_a2inv * P[p].GravPM[1];
+  az += All.cf_a2inv * P[p].GravPM[2];
+#endif
+
+  double ac = sqrt(ax * ax + ay * ay + az * az); /* this is now the physical acceleration */
+
+  if(ac == 0)
+    ac = MIN_FLOAT_NUMBER;
+
+  /* determine the "kinematic" timestep dt_kin, in physical units */
+  double dt_kin = sqrt(2 * All.ErrTolIntAccuracy * All.cf_atime * SphP[p].Hsml / ac);
+
+  /* calculate local Courant timestep and treebased maximum timestep in physical units */
+
+  double dt_courant = (All.cf_atime / All.cf_afac3) * All.CourantFac * 2.0 * SphP[p].Hsml / (SphP[p].MaxSignalVel + MIN_FLOAT_NUMBER);
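+  /* Illustrative note: this is the usual Courant-type condition dt <= CourantFac * 2h / v_sig, with
+     v_sig = MaxSignalVel the maximum signal velocity estimated for this particle; the cosmological prefactors
+     convert the result to a physical timestep. */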
+
+  double dt_treebased = (All.cf_atime / All.cf_afac3) * SphP[p].CurrentMaxTiStep;
+
+  /* calculate a timestep that restricts the rate at which the smoothing length may change,
+   * in physical units
+   */
+  double dt_hsml = All.cf_atime2 * All.CourantFac * fabs(SphP[p].Hsml / (SphP[p].DtHsml + MIN_FLOAT_NUMBER));
+
+  /* now take the smallest of these four criteria */
+  double dt = dt_kin;
+  if(dt > dt_courant)
+    dt = dt_courant;
+  if(dt > dt_treebased)
+    dt = dt_treebased;
+  if(dt > dt_hsml)
+    dt = dt_hsml;
+
+#ifdef STARFORMATION
+  if(P[p].getType() == 0) /* to protect against using a particle that has been turned into a star */
+    {
+      if(SphP[p].Sfr > 0)
+        {
+          double dt_sfr = 0.1 * P[p].getMass() / SphP[p].Sfr;
+          if(dt_sfr < dt)
+            dt = dt_sfr;
+        }
+    }
+#endif
+
+  /* convert the physical timestep to dloga in the cosmological case.
+   * Note: If comoving integration has not been selected, All.cf_hubble_a = 1.0.
+   */
+  dt *= All.cf_hubble_a;
+
+  if(dt >= All.MaxSizeTimestep)
+    dt = All.MaxSizeTimestep;
+
+#if defined(PMGRID) && !defined(TREEPM_NOTIMESPLIT)
+  if(dt >= All.DtDisplacement)
+    dt = All.DtDisplacement;
+#endif
+
+  if(dt < All.MinSizeTimestep)
+    {
+      if(P[p].getType() == 0)
+        Terminate(
+            "Timestep wants to be below the limit MinSizeTimestep=%g\n"
+            "Part-ID=%lld task=%d dtkin=%g dtcourant=%g ac=%g\n",
+            All.MinSizeTimestep, (long long)P[p].ID.get(), ThisTask, dt_kin * All.cf_hubble_a, dt_courant * All.cf_hubble_a, ac);
+      dt = All.MinSizeTimestep;
+    }
+
+  integertime ti_step = (integertime)(dt / All.Timebase_interval);
+
+  if(!(ti_step > 0 && ti_step < TIMEBASE))
+    {
+      double pos[3];
+      intpos_to_pos(P[p].IntPos, pos); /* converts the integer coordinates to floating point */
+
+      Terminate(
+          "\nError: A timestep of size zero was assigned on the integer timeline!\n"
+          "We better stop.\n"
+          "Task=%d Part-ID=%lld type=%d dt=%g dtc=%g dt_kin=%g dt_treebased=%g dt_hsml=%g tibase=%g ti_step=%d ac=%g xyz=(%g|%g|%g) "
+          "vel=(%g|%g|%g) "
+          "tree=(%g|%g|%g) mass=%g  All.cf_hubble_a=%g\n\n",
+          ThisTask, (long long)P[p].ID.get(), P[p].getType(), dt, dt_courant, dt_kin, dt_treebased, dt_hsml, All.Timebase_interval,
+          (int)ti_step, ac, pos[0], pos[1], pos[2], P[p].Vel[0], P[p].Vel[1], P[p].Vel[2], P[p].GravAccel[0], P[p].GravAccel[1],
+          P[p].GravAccel[2], P[p].getMass(), All.cf_hubble_a);
+    }
+
+  return ti_step;
+}
+
+#if defined(PMGRID) && defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
+void simparticles::find_long_range_step_constraint(void)
+{
+  double dtmin = MAX_DOUBLE_NUMBER;
+
+  for(int p = 0; p < NumPart; p++)
+    {
+      if(P[p].getType() == 0)
+        continue;
+
+      /* calculate acceleration */
+      double ax = All.cf_a2inv * P[p].GravPM[0];
+      double ay = All.cf_a2inv * P[p].GravPM[1];
+      double az = All.cf_a2inv * P[p].GravPM[2];
+
+      double ac = sqrt(ax * ax + ay * ay + az * az); /* this is now the physical acceleration */
+
+      if(ac < 1.0e-30)
+        ac = 1.0e-30;
+
+#if NSOFTCLASSES > 1
+      double dt = sqrt(2 * All.ErrTolIntAccuracy * All.cf_atime * All.ForceSoftening[P[p].getSofteningClass()] / 2.8 / ac);
+#else
+      double dt = sqrt(2 * All.ErrTolIntAccuracy * All.cf_atime * All.ForceSoftening[0] / 2.8 / ac);
+#endif
+      dt *= All.cf_hubble_a;
+
+      if(dt < dtmin)
+        dtmin = dt;
+    }
+
+  dtmin *= 2.0; /* move it one timebin higher to prevent being too conservative */
+
+  MPI_Allreduce(&dtmin, &All.DtDisplacement, 1, MPI_DOUBLE, MPI_MIN, Communicator);
+
+  mpi_printf("TIMESTEPS: displacement time constraint: %g  (%g)\n", All.DtDisplacement, All.MaxSizeTimestep);
+
+  if(All.DtDisplacement > All.MaxSizeTimestep)
+    All.DtDisplacement = All.MaxSizeTimestep;
+}
+#endif
+
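+/*! Returns the timebin corresponding to an integer timestep, i.e. the exponent b with ti_step = 2^b.
+ *  Worked example (hypothetical value): ti_step = 8 yields bin 3; ti_step = 0 returns bin 0, and an integer step
+ *  of size 1 is not allowed and terminates the run.
+ */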
+int simparticles::get_timestep_bin(integertime ti_step)
+{
+  int bin = -1;
+
+  if(ti_step == 0)
+    return 0;
+
+  if(ti_step == 1)
+    Terminate("time-step of integer size 1 not allowed\n");
+
+  while(ti_step)
+    {
+      bin++;
+      ti_step >>= 1;
+    }
+
+  return bin;
+}
+
+void TimeBinData::timebins_init(const char *name, int *maxPart)
+{
+  NActiveParticles   = 0;
+  ActiveParticleList = 0;
+
+  for(int i = 0; i < TIMEBINS; i++)
+    {
+      FirstInTimeBin[i] = -1;
+      LastInTimeBin[i]  = -1;
+    }
+
+  NextInTimeBin = 0;
+  PrevInTimeBin = 0;
+
+  strncpy(Name, name, 99);
+  Name[99] = 0;
+  MaxPart  = maxPart;
+}
+
+void TimeBinData::timebins_allocate(void)
+{
+  char Identifier[200];
+  Identifier[199] = 0;
+
+  snprintf(Identifier, 199, "NextActiveParticle%s", Name);
+  ActiveParticleList = (int *)Mem.mymalloc_movable(&ActiveParticleList, Identifier, *(MaxPart) * sizeof(int));
+
+  snprintf(Identifier, 199, "NextInTimeBin%s", Name);
+  NextInTimeBin = (int *)Mem.mymalloc_movable(&NextInTimeBin, Identifier, *(MaxPart) * sizeof(int));
+
+  snprintf(Identifier, 199, "PrevInTimeBin%s", Name);
+  PrevInTimeBin = (int *)Mem.mymalloc_movable(&PrevInTimeBin, Identifier, *(MaxPart) * sizeof(int));
+}
+
+void TimeBinData::timebins_free(void)
+{
+  Mem.myfree_movable(PrevInTimeBin);
+  Mem.myfree_movable(NextInTimeBin);
+  Mem.myfree_movable(ActiveParticleList);
+
+  PrevInTimeBin      = NULL;
+  NextInTimeBin      = NULL;
+  ActiveParticleList = NULL;
+}
+
+void TimeBinData::timebins_reallocate(void)
+{
+  if(ActiveParticleList != NULL)
+    {
+      ActiveParticleList = (int *)Mem.myrealloc_movable(ActiveParticleList, *(MaxPart) * sizeof(int));
+      NextInTimeBin      = (int *)Mem.myrealloc_movable(NextInTimeBin, *(MaxPart) * sizeof(int));
+      PrevInTimeBin      = (int *)Mem.myrealloc_movable(PrevInTimeBin, *(MaxPart) * sizeof(int));
+    }
+}
+
+void simparticles::timebins_get_bin_and_do_validity_checks(integertime ti_step, int *bin_new, int bin_old)
+{
+  /* make it a power-of-two subdivision */
+  integertime ti_min = TIMEBASE;
+  while(ti_min > ti_step)
+    ti_min >>= 1;
+  ti_step = ti_min;
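+  /* e.g. a (hypothetical) requested ti_step = 5 is reduced to 4, the largest power of two not exceeding it,
+     which corresponds to timebin 2 below */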
+
+  /* get timestep bin */
+  int bin = -1;
+
+  if(ti_step == 0)
+    bin = 0;
+
+  if(ti_step == 1)
+    Terminate("time-step of integer size 1 not allowed\n");
+
+  while(ti_step)
+    {
+      bin++;
+      ti_step >>= 1;
+    }
+
+  if(bin > bin_old) /* timestep wants to increase */
+    {
+      while(TimeBinSynchronized[bin] == 0 && bin > bin_old) /* make sure the new step is synchronized */
+        bin--;
+
+      ti_step = bin ? (((integertime)1) << bin) : 0;
+    }
+
+  if(All.Ti_Current >= TIMEBASE) /* here we finish the last timestep */
+    {
+      ti_step = 0;
+      bin     = 0;
+    }
+
+  if((TIMEBASE - All.Ti_Current) < ti_step) /* check that we don't run beyond the end */
+    {
+      Terminate("we are beyond the end of the timeline"); /* should not happen */
+    }
+
+  *bin_new = bin;
+}
+
+void TimeBinData::timebin_move_particle(int p, int timeBin_old, int timeBin_new)
+{
+  if(timeBin_old == timeBin_new)
+    return;
+
+  TimeBinCount[timeBin_old]--;
+
+  int prev = PrevInTimeBin[p];
+  int next = NextInTimeBin[p];
+
+  if(FirstInTimeBin[timeBin_old] == p)
+    FirstInTimeBin[timeBin_old] = next;
+  if(LastInTimeBin[timeBin_old] == p)
+    LastInTimeBin[timeBin_old] = prev;
+  if(prev >= 0)
+    NextInTimeBin[prev] = next;
+  if(next >= 0)
+    PrevInTimeBin[next] = prev;
+
+  if(TimeBinCount[timeBin_new] > 0)
+    {
+      PrevInTimeBin[p]                          = LastInTimeBin[timeBin_new];
+      NextInTimeBin[LastInTimeBin[timeBin_new]] = p;
+      NextInTimeBin[p]                          = -1;
+      LastInTimeBin[timeBin_new]                = p;
+    }
+  else
+    {
+      FirstInTimeBin[timeBin_new] = LastInTimeBin[timeBin_new] = p;
+      PrevInTimeBin[p] = NextInTimeBin[p] = -1;
+    }
+
+  TimeBinCount[timeBin_new]++;
+}
+
+void TimeBinData::timebin_remove_particle(int idx, int bin)
+{
+  int p                   = ActiveParticleList[idx];
+  ActiveParticleList[idx] = -1;
+
+  TimeBinCount[bin]--;
+
+  if(p >= 0)
+    {
+      int prev = PrevInTimeBin[p];
+      int next = NextInTimeBin[p];
+
+      if(prev >= 0)
+        NextInTimeBin[prev] = next;
+      if(next >= 0)
+        PrevInTimeBin[next] = prev;
+
+      if(FirstInTimeBin[bin] == p)
+        FirstInTimeBin[bin] = next;
+      if(LastInTimeBin[bin] == p)
+        LastInTimeBin[bin] = prev;
+    }
+}
+
+/* insert a particle into the timebin struct behind another already existing particle */
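+/* For example (hypothetical call): timebin_add_particle(i_new, -1, bin, 1) appends the new particle at the end of
+ * timebin `bin` (or opens the bin if it is currently empty) and also puts it on the list of active particles. */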
+void TimeBinData::timebin_add_particle(int i_new, int i_old, int timeBin, int addToListOfActiveParticles)
+{
+  TimeBinCount[timeBin]++;
+
+  if(i_old < 0)
+    {
+      /* if we don't have an existing particle to add it after, let's take the last one in this timebin */
+      i_old = LastInTimeBin[timeBin];
+
+      if(i_old < 0)
+        {
+          /* the timebin is empty at the moment, so just add the new particle */
+          FirstInTimeBin[timeBin] = i_new;
+          LastInTimeBin[timeBin]  = i_new;
+          NextInTimeBin[i_new]    = -1;
+          PrevInTimeBin[i_new]    = -1;
+        }
+    }
+
+  if(i_old >= 0)
+    {
+      /* insert the new particle directly behind i_old */
+      PrevInTimeBin[i_new] = i_old;
+      NextInTimeBin[i_new] = NextInTimeBin[i_old];
+      if(NextInTimeBin[i_old] >= 0)
+        PrevInTimeBin[NextInTimeBin[i_old]] = i_new;
+      NextInTimeBin[i_old] = i_new;
+      if(LastInTimeBin[timeBin] == i_old)
+        LastInTimeBin[timeBin] = i_new;
+    }
+
+  if(addToListOfActiveParticles)
+    {
+      ActiveParticleList[NActiveParticles] = i_new;
+      NActiveParticles++;
+    }
+}
+
+void simparticles::timebin_cleanup_list_of_active_particles(void)
+{
+  for(int idx = 0; idx < TimeBinsGravity.NActiveParticles; idx++)
+    {
+      int i = TimeBinsGravity.ActiveParticleList[idx];
+      if(i < 0)
+        continue;
+
+      if(P[i].ID.get() == 0 && P[i].getMass() == 0)
+        {
+          TimeBinsGravity.timebin_remove_particle(idx, P[i].TimeBinGrav);
+        }
+    }
+
+  for(int idx = 0; idx < TimeBinsHydro.NActiveParticles; idx++)
+    {
+      int i = TimeBinsHydro.ActiveParticleList[idx];
+      if(i < 0)
+        continue;
+
+      if(P[i].ID.get() == 0 && P[i].getMass() == 0 && P[i].getType() == 0)
+        {
+          TimeBinsHydro.timebin_remove_particle(idx, P[i].getTimeBinHydro());
+        }
+    }
+}
+
+void TimeBinData::timebin_make_list_of_active_particles_up_to_timebin(int timebin)
+{
+  NActiveParticles = 0;
+  for(int tbin = timebin; tbin >= 0; tbin--)
+    timebin_add_particles_of_timebin_to_list_of_active_particles(tbin);
+}
+
+void TimeBinData::timebin_add_particles_of_timebin_to_list_of_active_particles(int timebin)
+{
+  for(int i = FirstInTimeBin[timebin]; i >= 0; i = NextInTimeBin[i])
+    {
+      ActiveParticleList[NActiveParticles] = i;
+      NActiveParticles++;
+    }
+}
diff --git a/src/time_integration/timestep.h b/src/time_integration/timestep.h
new file mode 100644
index 0000000000000000000000000000000000000000..3cf04b18aed4b9d14989d9eb3946a5baef7a6361
--- /dev/null
+++ b/src/time_integration/timestep.h
@@ -0,0 +1,45 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  timestep.h
+ *
+ *  \brief declarations of a structure for organizing the timebins
+ */
+
+#ifndef TIMESTEP_H
+#define TIMESTEP_H
+
+struct TimeBinData
+{
+ public:
+  int NActiveParticles;
+  long long GlobalNActiveParticles;
+  int *ActiveParticleList;
+  int TimeBinCount[TIMEBINS];
+
+  int FirstInTimeBin[TIMEBINS];
+  int LastInTimeBin[TIMEBINS];
+  int *NextInTimeBin;
+  int *PrevInTimeBin;
+  char Name[100];
+  int *MaxPart;
+
+  /* TimeBinData stuff */
+  void timebins_init(const char *name, int *MaxPart);
+  void timebins_allocate(void);
+  void timebins_free(void);
+  void timebins_reallocate(void);
+  void timebin_move_particle(int p, int timeBin_old, int timeBin_new);
+  void timebin_add_particle(int i_new, int i_old, int timeBin, int addToListOfActiveParticles);
+  void timebin_remove_particle(int idx, int bin);
+  void timebin_cleanup_list_of_active_particles(void);
+  void timebin_move_sfr(int p, int timeBin_old, int timeBin_new);
+  void timebin_move_bh(int p, int timeBin_old, int timeBin_new);
+  void timebin_make_list_of_active_particles_up_to_timebin(int timebin);
+  void timebin_add_particles_of_timebin_to_list_of_active_particles(int timebin);
+};
+
+#endif /* TIMESTEP */
diff --git a/src/time_integration/timestep_treebased.cc b/src/time_integration/timestep_treebased.cc
new file mode 100644
index 0000000000000000000000000000000000000000..03a6aa4e6f03d16f8e7d1c1f85a39a8f792cba69
--- /dev/null
+++ b/src/time_integration/timestep_treebased.cc
@@ -0,0 +1,489 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file timestep_treebased.cc
+ *
+ *  \brief routines to find the timestep by checking for the arrival of the first waves from anywhere
+ */
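+
+/* Illustrative note on the idea behind this tree walk: for an active gas particle i, the maximum allowed timestep
+ * is limited by the earliest possible arrival of a signal (sound wave or approaching particle) from anywhere,
+ * i.e. schematically
+ *
+ *     dt_i <= min_j ( dist_ij + 2 h_i ) / v_sig,ij ,   with   v_sig,ij = c_i + c_j - (v_j - v_i) . rhat_ij ,
+ *
+ * where the minimum runs over all other gas particles j. The tree walk below evaluates this bound without an
+ * O(N^2) pair loop by discarding tree nodes that cannot tighten the current estimate SphP->CurrentMaxTiStep.
+ */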
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../data/simparticles.h"
+#include "../domain/domain.h"
+#include "../gravtree/gravtree.h"
+#include "../logs/logs.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../ngbtree/ngbtree.h"
+#include "../sort/cxxsort.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+
+inline int sph::sph_treetimestep_evaluate_particle_node_opening_criterion(pinfo &pdat, ngbnode *nop)
+{
+  if(nop->level <= LEVEL_ALWAYS_OPEN)  // always open the root node (note: full node length does not fit in the integer type)
+    return NODE_OPEN;
+
+  if(nop->Ti_Current != All.Ti_Current)
+    nop->drift_node(All.Ti_Current, Tp);
+
+  sph_particle_data *SphP = &Tp->SphP[pdat.target];
+
+  double vsig = SphP->Csnd + nop->MaxCsnd;
+
+  MyIntPosType new_range_min[3], new_range_max[3];
+  MySignedIntPosType left[3], right[3];
+  double vleft, vright;
+
+  // ----------- x-checks
+
+  vleft = (-vsig + (nop->vmin[0] - SphP->VelPred[0]));
+  if(fabs(vleft) * SphP->CurrentMaxTiStep > MaxBoxDist)
+    SphP->CurrentMaxTiStep = MaxBoxDist / fabs(vleft);
+
+  vright = (vsig + (nop->vmax[0] - SphP->VelPred[0]));
+  if(fabs(vright) * SphP->CurrentMaxTiStep > MaxBoxDist)
+    SphP->CurrentMaxTiStep = MaxBoxDist / fabs(vright);
+
+  new_range_min[0] = nop->center_offset_min[0] + nop->center[0] +
+                     (MyIntPosType)Tp->pos_to_signedintpos(SphP->CurrentMaxTiStep * vleft + 2 * pdat.hsml);
+  new_range_max[0] = nop->center_offset_max[0] + nop->center[0] +
+                     (MyIntPosType)Tp->pos_to_signedintpos(SphP->CurrentMaxTiStep * vright - 2 * pdat.hsml);
+
+  left[0]  = (MySignedIntPosType)Tp->nearest_image_intpos_to_intpos_X(new_range_min[0], pdat.searchcenter[0]);
+  right[0] = (MySignedIntPosType)Tp->nearest_image_intpos_to_intpos_X(new_range_max[0], pdat.searchcenter[0]);
+
+  /* check whether we can stop walking along this branch */
+  if(right[0] < 0 || left[0] > 0)
+    return NODE_DISCARD;
+
+  // ----------- y-checks
+
+  vleft = (-vsig + (nop->vmin[1] - SphP->VelPred[1]));
+  if(fabs(vleft) * SphP->CurrentMaxTiStep > MaxBoxDist)
+    SphP->CurrentMaxTiStep = MaxBoxDist / fabs(vleft);
+
+  vright = (vsig + (nop->vmax[1] - SphP->VelPred[1]));
+  if(fabs(vright) * SphP->CurrentMaxTiStep > MaxBoxDist)
+    SphP->CurrentMaxTiStep = MaxBoxDist / fabs(vright);
+
+  new_range_min[1] = nop->center_offset_min[1] + nop->center[1] +
+                     (MyIntPosType)Tp->pos_to_signedintpos(SphP->CurrentMaxTiStep * vleft + 2 * pdat.hsml);
+  new_range_max[1] = nop->center_offset_max[1] + nop->center[1] +
+                     (MyIntPosType)Tp->pos_to_signedintpos(SphP->CurrentMaxTiStep * vright - 2 * pdat.hsml);
+
+  left[1]  = (MySignedIntPosType)Tp->nearest_image_intpos_to_intpos_Y(new_range_min[1], pdat.searchcenter[1]);
+  right[1] = (MySignedIntPosType)Tp->nearest_image_intpos_to_intpos_Y(new_range_max[1], pdat.searchcenter[1]);
+
+  /* check whether we can stop walking along this branch */
+  if(right[1] < 0 || left[1] > 0)
+    return NODE_DISCARD;
+
+  // ----------- z-checks
+
+  vleft = (-vsig + (nop->vmin[2] - SphP->VelPred[2]));
+  if(fabs(vleft) * SphP->CurrentMaxTiStep > MaxBoxDist)
+    SphP->CurrentMaxTiStep = MaxBoxDist / fabs(vleft);
+
+  vright = (vsig + (nop->vmax[2] - SphP->VelPred[2]));
+  if(fabs(vright) * SphP->CurrentMaxTiStep > MaxBoxDist)
+    SphP->CurrentMaxTiStep = MaxBoxDist / fabs(vright);
+
+  new_range_min[2] = nop->center_offset_min[2] + nop->center[2] +
+                     (MyIntPosType)Tp->pos_to_signedintpos(SphP->CurrentMaxTiStep * vleft + 2 * pdat.hsml);
+  new_range_max[2] = nop->center_offset_max[2] + nop->center[2] +
+                     (MyIntPosType)Tp->pos_to_signedintpos(SphP->CurrentMaxTiStep * vright - 2 * pdat.hsml);
+
+  left[2]  = (MySignedIntPosType)Tp->nearest_image_intpos_to_intpos_Z(new_range_min[2], pdat.searchcenter[2]);
+  right[2] = (MySignedIntPosType)Tp->nearest_image_intpos_to_intpos_Z(new_range_max[2], pdat.searchcenter[2]);
+
+  /* check whether we can stop walking along this branch */
+  if(right[2] < 0 || left[2] > 0)
+    return NODE_DISCARD;
+
+  /*
+  double ctr[3];
+  Tp->intpos_to_pos(nop->center.da, ctr);
+
+  MyIntPosType halflen = ((MyIntPosType)1) << ((BITS_FOR_POSITIONS - 1) - nop->level);
+
+  vector<MyReal> dxyz;
+  Tp->nearest_image_intpos_to_pos(nop->center.da, pdat.searchcenter, dxyz.da);
+
+  double r2 = dxyz.r2();
+
+  printf("level=%d  center=(%g|%g|%g)  dist=%g lenhalf=%g   dt*vsig=%g \n ", nop->level, ctr[0], ctr[1], ctr[2], sqrt(r2),
+
+         halflen * Tp->FacIntToCoord,
+
+         SphP->CurrentMaxTiStep * vsig);
+*/
+
+  return NODE_OPEN;
+}
+
+inline void sph::sph_treetimestep_check_particle_particle_interaction(pinfo &pdat, int p, int p_type, unsigned char shmrank)
+{
+#ifdef PRESERVE_SHMEM_BINARY_INVARIANCE
+  if(skip_actual_force_computation)
+    return;
+#endif
+
+  sph_particle_data *SphP_i = &Tp->SphP[pdat.target];
+
+  if(p_type == NODE_TYPE_LOCAL_PARTICLE) /* local particle */
+    {
+      particle_data *P_j        = get_Pp(p, shmrank);
+      sph_particle_data *SphP_j = get_SphPp(p, shmrank);
+
+      if(P_j->getType() > 0)
+        return;
+
+      if(P_j->get_Ti_Current() != All.Ti_Current)
+        Tp->drift_particle(P_j, SphP_j, All.Ti_Current);  // this function avoids race conditions
+
+      double dxyz[3];
+      Tp->nearest_image_intpos_to_pos(P_j->IntPos, pdat.searchcenter, dxyz); /* converts the integer distance to floating point */
+
+      double dist2 = dxyz[0] * dxyz[0] + dxyz[1] * dxyz[1] + dxyz[2] * dxyz[2];
+
+      if(dist2 > 0)
+        {
+          double dist = sqrt(dist2);
+          double vsig = SphP_i->Csnd + SphP_j->Csnd -
+                        ((SphP_j->VelPred[0] - SphP_i->VelPred[0]) * dxyz[0] + (SphP_j->VelPred[1] - SphP_i->VelPred[1]) * dxyz[1] +
+                         (SphP_j->VelPred[2] - SphP_i->VelPred[2]) * dxyz[2]) /
+                            dist;
+
+          if(vsig > 0)
+            {
+              dist += 2 * SphP_i->Hsml; /* add twice the smoothing length as a minimum distance in order to protect
+                              against unreasonably small steps if two particles are very close */
+
+              double dt = dist / vsig;
+              if(SphP_i->CurrentMaxTiStep > dt)
+                SphP_i->CurrentMaxTiStep = dt;
+            }
+        }
+    }
+  else if(p_type == NODE_TYPE_FETCHED_PARTICLE)
+    {
+      foreign_sphpoint_data *foreignpoint = get_foreignpointsp(p - EndOfForeignNodes, shmrank);
+
+      sph_particle_data_hydrocore *SphP_j = &foreignpoint->SphCore;
+
+      /* converts the integer distance to floating point */
+      double dxyz[3];
+      Tp->nearest_image_intpos_to_pos(foreignpoint->IntPos, pdat.searchcenter, dxyz);
+
+      double dist2 = dxyz[0] * dxyz[0] + dxyz[1] * dxyz[1] + dxyz[2] * dxyz[2];
+
+      if(dist2 > 0)
+        {
+          double dist = sqrt(dist2);
+          double vsig = SphP_i->Csnd + SphP_j->Csnd -
+                        ((SphP_j->VelPred[0] - SphP_i->VelPred[0]) * dxyz[0] + (SphP_j->VelPred[1] - SphP_i->VelPred[1]) * dxyz[1] +
+                         (SphP_j->VelPred[2] - SphP_i->VelPred[2]) * dxyz[2]) /
+                            dist;
+
+          if(vsig > 0)
+            {
+              dist += 2 * SphP_i->Hsml; /* add twice the smoothing length as a minimum distance in order to protect
+                              against unreasonably small steps if two particles are very close */
+
+              double dt = dist / vsig;
+              if(SphP_i->CurrentMaxTiStep > dt)
+                SphP_i->CurrentMaxTiStep = dt;
+            }
+        }
+    }
+  else
+    Terminate("unexpected");
+}
+
+inline void sph::sph_treetimestep_open_node(pinfo &pdat, ngbnode *nop, int mintopleafnode, int committed)
+{
+  /* open node */
+  int p                 = nop->nextnode;
+  unsigned char shmrank = nop->nextnode_shmrank;
+
+  while(p != nop->sibling || (shmrank != nop->sibling_shmrank && nop->sibling >= MaxPart + D->NTopnodes))
+    {
+      if(p < 0)
+        Terminate(
+            "p=%d < 0  nop->sibling=%d nop->nextnode=%d shmrank=%d nop->sibling_shmrank=%d nop->foreigntask=%d  "
+            "first_nontoplevelnode=%d",
+            p, nop->sibling, nop->nextnode, shmrank, nop->sibling_shmrank, nop->OriginTask, MaxPart + D->NTopnodes);
+
+      int next;
+      unsigned char next_shmrank;
+      char type;
+
+      if(p < MaxPart) /* a local particle */
+        {
+          /* note: here shmrank cannot change */
+          next         = get_nextnodep(shmrank)[p];
+          next_shmrank = shmrank;
+          type         = NODE_TYPE_LOCAL_PARTICLE;
+        }
+      else if(p < MaxPart + MaxNodes) /* an internal node  */
+        {
+          ngbnode *nop = get_nodep(p, shmrank);
+          next         = nop->sibling;
+          next_shmrank = nop->sibling_shmrank;
+          type         = NODE_TYPE_LOCAL_NODE;
+        }
+      else if(p >= ImportedNodeOffset && p < EndOfTreePoints) /* an imported Treepoint particle  */
+        {
+          Terminate("not expected for SPH");
+        }
+      else if(p >= EndOfTreePoints && p < EndOfForeignNodes) /* an imported tree node */
+        {
+          ngbnode *nop = get_nodep(p, shmrank);
+          next         = nop->sibling;
+          next_shmrank = nop->sibling_shmrank;
+          type         = NODE_TYPE_FETCHED_NODE;
+        }
+      else if(p >= EndOfForeignNodes) /* an imported particle below an imported tree node */
+        {
+          foreign_sphpoint_data *foreignpoint = get_foreignpointsp(p - EndOfForeignNodes, shmrank);
+
+          next         = foreignpoint->Nextnode;
+          next_shmrank = foreignpoint->Nextnode_shmrank;
+          type         = NODE_TYPE_FETCHED_PARTICLE;
+        }
+      else
+        {
+          /* a pseudo point */
+          Terminate(
+              "should not happen: p=%d MaxPart=%d MaxNodes=%d  ImportedNodeOffset=%d  EndOfTreePoints=%d  EndOfForeignNodes=%d "
+              "shmrank=%d",
+              p, MaxPart, MaxNodes, ImportedNodeOffset, EndOfTreePoints, EndOfForeignNodes, shmrank);
+        }
+
+      sph_treetimestep_interact(pdat, p, type, shmrank, mintopleafnode, committed);
+
+      p       = next;
+      shmrank = next_shmrank;
+    }
+}
+
+inline void sph::sph_treetimestep_interact(pinfo &pdat, int no, char no_type, unsigned char shmrank, int mintopleafnode, int committed)
+{
+  if(no_type <= NODE_TYPE_FETCHED_PARTICLE)  // we are interacting with a particle
+    {
+      sph_treetimestep_check_particle_particle_interaction(pdat, no, no_type, shmrank);
+    }
+  else  // we are interacting with a node
+    {
+      ngbnode *nop = get_nodep(no, shmrank);
+
+      if(nop->not_empty == 0)
+        return;
+
+      if(no < MaxPart + MaxNodes)                // we have a top-level node
+        if(nop->nextnode >= MaxPart + MaxNodes)  // if the next node is not a top-level node, we have a leaf node
+          mintopleafnode = no;
+
+      int openflag = sph_treetimestep_evaluate_particle_node_opening_criterion(pdat, nop);
+
+      if(openflag == NODE_OPEN) /* we need to open it */
+        {
+          if(nop->cannot_be_opened_locally.load(std::memory_order_acquire))
+            {
+              // are we in the same shared memory node?
+              if(Shmem.GetNodeIDForSimulCommRank[nop->OriginTask] == Shmem.GetNodeIDForSimulCommRank[D->ThisTask])
+                {
+                  Terminate("this should not happen any more");
+                }
+              else
+                {
+                  tree_add_to_fetch_stack(nop, no, shmrank);  // will only add unique copies
+
+                  tree_add_to_work_stack(pdat.target, no, shmrank, mintopleafnode);
+                }
+            }
+          else
+            {
+              int min_buffer_space =
+                  std::min<int>(MaxOnWorkStack - (NumOnWorkStack + NewOnWorkStack), MaxOnFetchStack - NumOnFetchStack);
+
+              if(min_buffer_space >= committed + 8 * TREE_NUM_BEFORE_NODESPLIT)
+                sph_treetimestep_open_node(pdat, nop, mintopleafnode, committed + 8 * TREE_NUM_BEFORE_NODESPLIT);
+              else
+                tree_add_to_work_stack(pdat.target, no, shmrank, mintopleafnode);
+            }
+        }
+    }
+}
+
+void sph::tree_based_timesteps(void)
+{
+  if(Tp->TotNumGas > 0)
+    {
+      TIMER_START(CPU_TREE_TIMESTEPS);
+
+      D->mpi_printf("TIMESTEP-TREEWALK: Begin\n");
+
+      MaxBoxDist = 0.25 * Tp->RegionLen / (1 << LEVEL_ALWAYS_OPEN);
+
+      double ta = Logs.second();
+
+      // let's grab at most about a third of the still available memory for imported points and nodes
+      int nspace = (0.33 * Mem.FreeBytes) / (sizeof(ngbnode) + 8 * sizeof(foreign_sphpoint_data));
+
+      MaxForeignNodes  = nspace;
+      MaxForeignPoints = 8 * nspace;
+      NumForeignNodes  = 0;
+      NumForeignPoints = 0;
+
+      sum_NumForeignNodes  = 0;
+      sum_NumForeignPoints = 0;
+
+      /* the following two arrays hold imported tree nodes and imported points to augment the local tree */
+      Foreign_Nodes  = (ngbnode *)Mem.mymalloc_movable(&Foreign_Nodes, "Foreign_Nodes", MaxForeignNodes * sizeof(ngbnode));
+      Foreign_Points = (foreign_sphpoint_data *)Mem.mymalloc_movable(&Foreign_Points, "Foreign_Points",
+                                                                     MaxForeignPoints * sizeof(foreign_sphpoint_data));
+
+      tree_initialize_leaf_node_access_info();
+
+      max_ncycles = 0;
+
+      prepare_shared_memory_access();
+
+      NumOnWorkStack         = 0;
+      AllocWorkStackBaseLow  = std::max<int>(1.5 * (Tp->NumPart + NumPartImported), TREE_MIN_WORKSTACK_SIZE);
+      AllocWorkStackBaseHigh = AllocWorkStackBaseLow + TREE_EXPECTED_CYCLES * TREE_MIN_WORKSTACK_SIZE;
+      MaxOnWorkStack         = AllocWorkStackBaseLow;
+
+      WorkStack = (workstack_data *)Mem.mymalloc("WorkStack", AllocWorkStackBaseHigh * sizeof(workstack_data));
+
+      for(int i = 0; i < Tp->TimeBinsHydro.NActiveParticles; i++)
+        {
+          int target = Tp->TimeBinsHydro.ActiveParticleList[i];
+
+          if(Tp->SphP[target].Csnd < MIN_FLOAT_NUMBER)
+            Tp->SphP[target].Csnd = MIN_FLOAT_NUMBER;
+
+          Tp->SphP[target].CurrentMaxTiStep = 2.0 * Tp->SphP[target].Hsml / (Tp->SphP[target].MaxSignalVel + MIN_FLOAT_NUMBER);
+
+          /* note: for cosmological integration, CurrentMaxTiStep stores  1/a^2 times the maximum allowed physical timestep */
+          if(Tp->SphP[target].CurrentMaxTiStep >= All.MaxSizeTimestep / All.cf_atime2_hubble_a / All.CourantFac)
+            Tp->SphP[target].CurrentMaxTiStep = All.MaxSizeTimestep / All.cf_atime2_hubble_a / All.CourantFac;
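+          /* note: All.CourantFac enters the cap above because the multiplication with the Courant factor is only
+           * performed after the tree walk, in the loop near the end of this function */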
+
+          WorkStack[NumOnWorkStack].Target         = target;
+          WorkStack[NumOnWorkStack].Node           = MaxPart;
+          WorkStack[NumOnWorkStack].ShmRank        = Shmem.Island_ThisTask;
+          WorkStack[NumOnWorkStack].MinTopLeafNode = MaxPart + D->NTopnodes;
+          NumOnWorkStack++;
+        }
+
+      // set a default size of the fetch stack equal to a tenth of the particle number (this may still be somewhat too large)
+      MaxOnFetchStack = std::max<int>(0.1 * (Tp->NumPart + NumPartImported), TREE_MIN_WORKSTACK_SIZE);
+      StackToFetch    = (fetch_data *)Mem.mymalloc_movable(&StackToFetch, "StackToFetch", MaxOnFetchStack * sizeof(fetch_data));
+
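+      /* the walk proceeds in cycles: in each cycle we process as many work items as the current buffer budget allows,
+       * deferring interactions that need data from other shared-memory islands to the fetch stack; after fetching the
+       * missing foreign nodes and points, the remaining and newly created work items are compacted, sorted and
+       * processed in the next cycle, until the work stack is empty */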
+      while(NumOnWorkStack > 0)  // repeat until we are out of work
+        {
+          NewOnWorkStack  = 0;  // gives the new entries
+          NumOnFetchStack = 0;
+          MaxOnWorkStack  = std::min<int>(AllocWorkStackBaseLow + max_ncycles * TREE_MIN_WORKSTACK_SIZE, AllocWorkStackBaseHigh);
+
+          int item = 0;
+
+          while(item < NumOnWorkStack)
+            {
+              int committed = 8 * TREE_NUM_BEFORE_NODESPLIT;
+              int min_buffer_space =
+                  std::min<int>(MaxOnWorkStack - (NumOnWorkStack + NewOnWorkStack), MaxOnFetchStack - NumOnFetchStack);
+              if(min_buffer_space >= committed)
+                {
+                  int target     = WorkStack[item].Target;
+                  int no         = WorkStack[item].Node;
+                  int shmrank    = WorkStack[item].ShmRank;
+                  int mintopleaf = WorkStack[item].MinTopLeafNode;
+                  item++;
+
+                  pinfo pdat;
+                  get_pinfo(target, pdat);
+
+                  if(no == MaxPart)
+                    {
+                      // we have a pristine particle that's processed for the first time
+                      sph_treetimestep_interact(pdat, no, NODE_TYPE_LOCAL_NODE, shmrank, mintopleaf, committed);
+                    }
+                  else
+                    {
+                      // we have a node that we previously could not open
+                      ngbnode *nop = get_nodep(no, shmrank);
+
+                      if(nop->cannot_be_opened_locally)
+                        {
+                          Terminate("item=%d:  no=%d  now we should be able to open it!", item, no);
+                        }
+                      else
+                        sph_treetimestep_open_node(pdat, nop, mintopleaf, committed);
+                    }
+                }
+              else
+                break;
+            }
+
+          if(item == 0 && NumOnWorkStack > 0)
+            Terminate("Can't even process a single particle");
+
+          tree_fetch_foreign_nodes(FETCH_SPH_TREETIMESTEP);
+
+          /* now reorder the workstack such that we are first going to do residual pristine particles, and then
+           * imported nodes that hang below the first leaf nodes */
+          NumOnWorkStack = NumOnWorkStack - item + NewOnWorkStack;
+          memmove(WorkStack, WorkStack + item, NumOnWorkStack * sizeof(workstack_data));
+
+          /* now let's sort such that we can go deep on top-level node branches, allowing us to clear them out eventually */
+          mycxxsort(WorkStack, WorkStack + NumOnWorkStack, compare_workstack);
+
+          max_ncycles++;
+        }
+
+      Mem.myfree(StackToFetch);
+      Mem.myfree(WorkStack);
+
+      /* now multiply the determined values with the specified CourantFactor */
+      for(int i = 0; i < Tp->TimeBinsHydro.NActiveParticles; i++)
+        {
+          int target = Tp->TimeBinsHydro.ActiveParticleList[i];
+
+          Tp->SphP[target].CurrentMaxTiStep *= All.CourantFac;
+        }
+
+      MPI_Allreduce(MPI_IN_PLACE, &max_ncycles, 1, MPI_INT, MPI_MAX, D->Communicator);
+
+      cleanup_shared_memory_access();
+
+      /* free temporary buffers */
+      Mem.myfree(Foreign_Points);
+      Mem.myfree(Foreign_Nodes);
+
+      double tb = Logs.second();
+
+      TIMER_STOPSTART(CPU_TREE_TIMESTEPS, CPU_LOGS);
+
+      D->mpi_printf("TIMESTEP-TREEWALK: took %g sec, max_ncycles = %d, part/sec = %g\n", Logs.timediff(ta, tb), max_ncycles,
+                    Tp->TimeBinsHydro.GlobalNActiveParticles / (D->NTask * Logs.timediff(ta, tb) + MIN_FLOAT_NUMBER));
+
+      TIMER_STOP(CPU_LOGS);
+    }
+}
diff --git a/src/tree/tree.cc b/src/tree/tree.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c26561d494558184281136412d4c993056f643dc
--- /dev/null
+++ b/src/tree/tree.cc
@@ -0,0 +1,1357 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file tree.cc
+ *
+ *  \brief basic routines for oct-tree building
+ */
+
+#include "gadgetconfig.h"
+
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../data/allvars.h"
+#include "../data/dtypes.h"
+#include "../data/intposconvert.h"
+#include "../data/mymalloc.h"
+#include "../domain/domain.h"
+#include "../logs/logs.h"
+#include "../logs/timer.h"
+#include "../main/simulation.h"
+#include "../mpi_utils/mpi_utils.h"
+#include "../sort/cxxsort.h"
+#include "../sort/peano.h"
+#include "../system/system.h"
+#include "../time_integration/timestep.h"
+#include "../tree/tree.h"
+
+/*! This file contains the construction of the tree used for calculating the gravitational force
+ *  and the neighbor tree for SPH.
+ *  The type of tree implemented is a geometrical oct-tree, starting from a cube encompassing
+ *  all particles. This cube is automatically found in the domain decomposition, which also
+ *  splits up the global "top-level" tree along node boundaries, moving the particles
+ *  of different parts of the tree to separate processors. In the present version of the code, the tree
+ *  construction may be repeated without a renewed domain decomposition. In this case,
+ *  if particles are on the "wrong" processor because a new domain decomposition has not been
+ *  carried out, they are sent as temporary points to the right insertion processor according
+ *  to the layout of the top-level nodes.
+ */
+
+/*! This function is a driver routine for constructing the oct-tree.
+ *
+ *  \return number of local nodes (including top level nodes) of the constructed tree
+ */
+template <typename node, typename partset, typename point_data, typename foreign_point_data>
+int tree<node, partset, point_data, foreign_point_data>::treebuild(int ninsert, int *indexlist)
+{
+  if(MaxPart == 0 && ninsert == 0)
+    return 0;  // nothing to be done
+
+  if(MaxPart == 0 && ninsert > 0)
+    Terminate("Strange, we'll try to construct a tree for %d particles, but it appears not be allocated\n", ninsert);
+
+  if(ninsert == Tp->NumPart)
+    D->mpi_printf("TREE: Full tree construction for all particles. (presently allocated=%g MB)\n", Mem.getAllocatedBytesInMB());
+
+  Ninsert   = ninsert;
+  IndexList = indexlist;
+
+  TIMER_START(CPU_TREEBUILD);
+
+  double t0 = Logs.second();
+
+  int flag, iter = 0;
+  do /* try constructing tree until successful in terms of storage allocation */
+    {
+      TIMER_START(CPU_TREEBUILD_INSERT);
+
+      int flag_single = treebuild_construct();
+
+      TIMER_STOP(CPU_TREEBUILD_INSERT);
+
+      MPI_Allreduce(&flag_single, &flag, 1, MPI_INT, MPI_MIN, D->Communicator);
+
+      if(flag < 0)
+        {
+          /* tree construction was not successful and needs to be repeated */
+          treefree();
+          All.TreeAllocFactor *= 1.15;
+          // D->mpi_printf("TREE: Increasing TreeAllocFactor, new value=%g\n", All.TreeAllocFactor);
+          treeallocate(Tp->NumPart, Tp, D);
+        }
+      else
+        {
+          /* treebuild was successful, but let's check if we allocated clearly too much storage, and if so repeat it */
+          int max_numnodes;
+          MPI_Allreduce(&NumNodes, &max_numnodes, 1, MPI_INT, MPI_MAX, D->Communicator);
+
+          if((MaxNodes - D->NTopnodes) > 1.5 * (max_numnodes - D->NTopnodes))
+            {
+              double oldvalue = All.TreeAllocFactor;
+              double newvalue = std::max<double>(1.1 * (max_numnodes - D->NTopnodes) / (MaxPart + BASENUMBER), 0.02);
+
+              if(newvalue < oldvalue)
+                {
+                  treefree();
+                  /*
+                     D->mpi_printf("TREE: max_numnodes=%d  MaxNodes=%d D->NTopnodes=%d\n", max_numnodes, MaxNodes,
+                     D->NTopnodes);
+                     D->mpi_printf("TREE: Decreasing TreeAllocFactor, new value=%g, old value=%g\n", newvalue,
+                     oldvalue);
+                  */
+                  All.TreeAllocFactor = newvalue;
+                  flag                = -1;
+                  treeallocate(Tp->NumPart, Tp, D);
+                }
+            }
+        }
+      iter++;
+
+      if(iter > TREE_MAX_ITER)
+        Terminate("tree construction failed\n");
+    }
+  while(flag < 0);
+
+  TIMER_STOPSTART(CPU_TREEBUILD, CPU_TREEBUILD_BRANCHES);
+
+  /* first, construct the properties of the tree branches below the top leaves */
+
+  int ntopleaves = D->NumTopleafOfTask[D->ThisTask];
+  int *list      = D->ListOfTopleaves + D->FirstTopleafOfTask[D->ThisTask];
+
+  for(int i = 0; i < ntopleaves; i++)
+    {
+      int no  = NodeIndex[list[i]];
+      int sib = NodeSibling[list[i]];
+
+      update_node_recursive(no, sib, TREE_MODE_BRANCH);
+    }
+
+  TIMER_STOPSTART(CPU_TREEBUILD_BRANCHES, CPU_TREEBUILD_TOPLEVEL);
+
+  exchange_topleafdata();
+
+  /* now update the top-level tree nodes */
+  if(TreeSharedMem_ThisTask == 0)
+    update_node_recursive(MaxPart, -1, TREE_MODE_TOPLEVEL);
+
+  for(int k = D->NTopnodes; k < NumNodes; k++)
+    {
+      int index               = MaxPart + k;
+      Nodes[index].OriginTask = D->ThisTask;
+      Nodes[index].OriginNode = index;
+      Nodes[index].access.clear();
+      Nodes[index].flag_already_fetched = 0;
+    }
+
+  if(TreeSharedMem_ThisTask == 0)
+    {
+      for(int k = 0; k < D->NTopnodes; k++)
+        {
+          int index                  = MaxPart + k;
+          TopNodes[index].OriginTask = D->ThisTask;
+          TopNodes[index].OriginNode = index;
+          TopNodes[index].access.clear();
+          TopNodes[index].flag_already_fetched = 0;
+        }
+    }
+
+  tree_initialize_leaf_node_access_info();
+
+  double t1 = Logs.second();
+  Buildtime = Logs.timediff(t0, t1);
+
+  report_log_message();
+
+  TIMER_STOP(CPU_TREEBUILD_TOPLEVEL);
+
+  return NumNodes;
+}
+
+template <typename node, typename partset, typename point_data, typename foreign_point_data>
+void tree<node, partset, point_data, foreign_point_data>::tree_initialize_leaf_node_access_info(void)
+{
+  if(TreeSharedMem_ThisTask == 0)
+    {
+      for(int k = 0; k < D->NTopleaves; k++)
+        {
+          int index = NodeIndex[k];
+
+          if(Shmem.GetNodeIDForSimulCommRank[D->TaskOfLeaf[k]] == Shmem.GetNodeIDForSimulCommRank[D->ThisTask])
+            TopNodes[index].cannot_be_opened_locally = 0;
+          else
+            {
+              TopNodes[index].cannot_be_opened_locally = 1;
+              TopNodes[index].flag_already_fetched     = 0;
+              TopNodes[index].nextnode                 = MaxPart + MaxNodes + k;
+              TopNodes[index].nextnode_shmrank         = TreeSharedMem_ThisTask;
+            }
+          TopNodes[index].OriginTask = D->TaskOfLeaf[k];
+          TopNodes[index].OriginNode = index;
+        }
+    }
+}
+
+/*! Constructs the gravitational oct-tree.
+ *
+ *  The index convention for accessing tree nodes is the following: \n
+ *  node index \n
+ *  [0...  MaxPart-1]                                                                references single particles \n
+ *  [MaxPart...  MaxPart+MaxNodes-1]                                                 references tree nodes \n
+ *  [MaxPart+MaxNodes...  MaxPart+MaxNodes+NTopleaves-1]                             references "pseudo particles", i.e. markers for branches on foreign CPUs \n
+ *  [MaxPart+MaxNodes+NTopleaves...  MaxPart+MaxNodes+NTopleaves+NumPartImported-1]  references imported points \n
+ *
+ *  the pointer `Nodes' is shifted such that Nodes[MaxPart] gives the first tree node (i.e. the root node).
+ *
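+ *  As an illustrative sketch (not code that appears anywhere), a tree walk can classify an index `no' obtained from a
+ *  nextnode/sibling pointer roughly as follows, using the member variables of this class: \n
+ *    no < MaxPart                                :  local particle with index `no' \n
+ *    no < MaxPart + MaxNodes                     :  tree node, accessed via get_nodep(no) \n
+ *    no < MaxPart + MaxNodes + D->NTopleaves    :  pseudo particle marking a branch stored on another task \n
+ *    no < EndOfTreePoints                        :  imported point, stored in Points[no - ImportedNodeOffset] \n
+ *    no < EndOfForeignNodes                      :  fetched foreign node, stored in Foreign_Nodes[no - EndOfTreePoints] \n
+ *    otherwise                                   :  fetched foreign point, stored in Foreign_Points[no - EndOfForeignNodes] \n
+ *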
+ *  \return if successful returns the number of local+top nodes of the constructed tree \n
+ *          -1 if the number of allocated tree nodes is too small \n
+ *          -2 if the number of allocated tree nodes is even too small to fit the top nodes \n
+ *          -3 if a particle out of domain box condition was encountered
+ */
+template <typename node, typename partset, typename point_data, typename foreign_point_data>
+int tree<node, partset, point_data, foreign_point_data>::treebuild_construct(void)
+{
+  if(TreeSharedMem_ThisTask == 0)
+    {
+      /* create an empty root node  */
+      NextFreeNode = MaxPart; /* index of first free node */
+
+      node *nfreep             = &TopNodes[NextFreeNode]; /* select first node        */
+      nfreep->nextnode         = -1;
+      nfreep->sibling          = -1;
+      nfreep->father           = -1;
+      nfreep->level            = 0;
+      nfreep->sibling_shmrank  = TreeSharedMem_ThisTask;
+      nfreep->nextnode_shmrank = TreeSharedMem_ThisTask;
+      nfreep->father_shmrank   = TreeSharedMem_ThisTask;
+
+      for(int j = 0; j < 3; j++)
+        nfreep->center[j] = ((MyIntPosType)1) << (BITS_FOR_POSITIONS - 1);
+
+      NodeIndex[0]   = NextFreeNode;
+      NodeLevel[0]   = 0;
+      NodeSibling[0] = -1;
+      NumNodes       = 1;
+      NextFreeNode++;
+
+      /* create a set of empty nodes corresponding to the top-level domain
+       * grid. We need to generate these nodes first to make sure that we have a
+       * complete top-level tree which allows the easy insertion of the
+       * pseudo-particles at the right place.
+       */
+
+      if(create_empty_nodes(MaxPart, 1, 0, 1, -1, 0, 0, 0) < 0)
+        return -2;
+    }
+
+  MPI_Bcast(&NumNodes, 1, MPI_INT, 0, D->Communicator);
+  MPI_Bcast(&NextFreeNode, 1, MPI_INT, 0, D->Communicator);
+
+  if(NumNodes != D->NTopnodes)
+    Terminate("NumNodes=%d != D->NTopnodes=%d", NumNodes, D->NTopnodes);
+
+  FirstNonTopLevelNode = NextFreeNode;
+
+  if(MaxPart + D->NTopnodes != FirstNonTopLevelNode)
+    Terminate("unexpected");
+
+  for(int j = 0; j < D->NTask; j++)
+    Send_count[j] = 0;
+
+#ifndef LEAN
+  int *node_list = (int *)Mem.mymalloc_movable(&node_list, "node_list", Tp->NumPart * sizeof(int));
+  int *task_list = (int *)Mem.mymalloc_movable(&task_list, "task_list", Tp->NumPart * sizeof(int));
+#endif
+
+  /* now we determine for each point the insertion top-level node, and the task on which this lies */
+  for(int idx = 0; idx < Ninsert; idx++)
+    {
+      int i;
+      if(IndexList)
+        i = IndexList[idx];
+      else
+        i = idx;
+
+      if(Tp->P[i].get_Ti_Current() != All.Ti_Current)
+        Tp->drift_particle(&Tp->P[i], &Tp->SphP[i], All.Ti_Current);
+
+      int no;
+      int task;
+      tree_get_node_and_task(i, no, task);
+
+#ifndef LEAN
+      node_list[i] = no;
+      task_list[i] = task;
+#endif
+
+      if(task != D->ThisTask)
+        Send_count[task]++;
+    }
+
+  MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, D->Communicator);
+
+  NumPartImported = 0;
+  NumPartExported = 0;
+  Recv_offset[0]  = 0;
+  Send_offset[0]  = 0;
+
+  for(int j = 0; j < D->NTask; j++)
+    {
+      NumPartImported += Recv_count[j];
+      NumPartExported += Send_count[j];
+      if(j > 0)
+        {
+          Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
+          Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
+        }
+    }
+
+  Points   = (point_data *)Mem.mymalloc_movable(&Points, "Points", NumPartImported * sizeof(point_data));
+  Nextnode = (int *)Mem.mymalloc_movable(&Nextnode, "Nextnode", (MaxPart + D->NTopleaves + NumPartImported) * sizeof(int));
+  Father   = (int *)Mem.mymalloc_movable(&Father, "Father", (MaxPart + NumPartImported) * sizeof(int));
+
+  /* now put in markers ("pseudo" particles) in top-leaf nodes to indicate on which task the branch lies */
+  for(int i = 0; i < D->NTopleaves; i++)
+    {
+      int index = NodeIndex[i];
+
+      if(TreeSharedMem_ThisTask == 0)
+        TopNodes[index].nextnode = MaxPart + MaxNodes + i;
+
+      /* set nextnode for pseudo-particle (Nextnode exists on all ranks) */
+      Nextnode[MaxPart + i] = TopNodes[index].sibling;
+    }
+
+  point_data *export_Points = (point_data *)Mem.mymalloc("export_Points", NumPartExported * sizeof(point_data));
+
+  for(int j = 0; j < D->NTask; j++)
+    Send_count[j] = 0;
+
+  for(int idx = 0; idx < Ninsert; idx++) /* prepare particle data to be copied to other tasks */
+    {
+      int i;
+      if(IndexList)
+        i = IndexList[idx];
+      else
+        i = idx;
+
+#ifdef LEAN
+      int no, task;
+      tree_get_node_and_task(i, no, task);
+#else
+      int task = task_list[i];
+      int no   = node_list[i];
+#endif
+
+      if(task != D->ThisTask)
+        {
+          int n = Send_offset[task] + Send_count[task]++;
+
+          fill_in_export_points(&export_Points[n], i, no);
+        }
+    }
+
+  /* exchange data */
+  for(int ngrp = 1; ngrp < (1 << D->PTask); ngrp++)
+    {
+      int recvTask = D->ThisTask ^ ngrp;
+      if(recvTask < D->NTask)
+        if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
+          MPI_Sendrecv(&export_Points[Send_offset[recvTask]], Send_count[recvTask] * sizeof(point_data), MPI_BYTE, recvTask,
+                       TAG_DENS_A, &Points[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(point_data), MPI_BYTE, recvTask,
+                       TAG_DENS_A, D->Communicator, MPI_STATUS_IGNORE);
+    }
+
+  Mem.myfree(export_Points);
+
+  ImportedNodeOffset = MaxPart + MaxNodes + D->NTopleaves;
+
+  MPI_Allreduce(&NumPartImported, &EndOfTreePoints, 1, MPI_INT, MPI_MAX, D->Communicator);
+  EndOfTreePoints += ImportedNodeOffset;
+  EndOfForeignNodes = EndOfTreePoints + (INT_MAX - EndOfTreePoints) / 2;
+
+  /* make a list that holds the particles that belong to a certain node */
+  index_data *index_list =
+      (index_data *)Mem.mymalloc_movable(&index_list, "index_list", (Ninsert + NumPartImported) * sizeof(index_data));
+  int count = 0;
+
+  for(int idx = 0; idx < Ninsert; idx++)
+    {
+      int i;
+      if(IndexList)
+        i = IndexList[idx];
+      else
+        i = idx;
+
+#ifdef LEAN
+      int no, task;
+      tree_get_node_and_task(i, no, task);
+#else
+      int task = task_list[i];
+      int no   = node_list[i];
+#endif
+
+      if(task == D->ThisTask)
+        {
+          index_list[count].p       = i;
+          index_list[count].subnode = no;
+          count++;
+        }
+    }
+
+  for(int i = 0; i < NumPartImported; i++)
+    {
+      index_list[count].p       = i + ImportedNodeOffset;
+      index_list[count].subnode = Points[i].no;
+      count++;
+    }
+
+#ifndef LEAN
+  Mem.myfree_movable(task_list);
+  Mem.myfree_movable(node_list);
+#endif
+
+  /* sort according to node so that particle indices in the same node are grouped together */
+  mycxxsort(index_list, index_list + count, compare_index_data_subnode);
+
+  int full_flag  = 0;
+  int ntopleaves = D->NumTopleafOfTask[D->ThisTask];
+  int start      = 0;
+
+  for(int n = 0; n < ntopleaves; n++)
+    {
+      int no              = D->ListOfTopleaves[D->FirstTopleafOfTask[D->ThisTask] + n];
+      int th              = NodeIndex[no];
+      unsigned char level = NodeLevel[no];
+      int sibling         = NodeSibling[no];
+
+      while(start < count && index_list[start].subnode < no)
+        start++;
+
+      if(start < count)
+        {
+          int last = start;
+          while(last < count && index_list[last].subnode == no)
+            last++;
+
+          int num = last - start;
+
+          if(treebuild_insert_group_of_points(num, &index_list[start], th, level, sibling))
+            {
+              full_flag = 1;
+              break;
+            }
+
+          start += num;
+        }
+    }
+
+  if((NumNodes = NextFreeNode - MaxPart) >= MaxNodes)
+    {
+      if(All.TreeAllocFactor > MAX_TREE_ALLOC_FACTOR)
+        {
+          Tp->dump_particles();
+          Terminate(
+              "task %d: looks like a serious problem, stopping with particle dump.  NumNodes=%d MaxNodes=%d  NumPartImported=%d "
+              "NumPart=%d\n",
+              D->ThisTask, NumNodes, MaxNodes, NumPartImported, Tp->NumPart);
+        }
+    }
+
+  Mem.myfree(index_list);
+
+  if(full_flag)
+    return -1;
+
+  return NumNodes;
+}
+
+/*! inserts a group of particles into the gravitational tree
+ *
+ *  level - level of target node
+ *  th    - target node
+ *
+ *  \return 0 if successful \n
+ *          1 if too few nodes have been allocated in the Nodes array
+ */
+template <typename node, typename partset, typename point_data, typename foreign_point_data>
+int tree<node, partset, point_data, foreign_point_data>::treebuild_insert_group_of_points(int num, index_data *index_list, int th,
+                                                                                          unsigned char level, int sibling)
+{
+  if(level >= BITS_FOR_POSITIONS)
+    Terminate(
+        "It appears we have reached the bottom of the tree because there are more than TREE_NUM_BEFORE_NODESPLIT=%d particles in the "
+        "smallest tree node representable for  BITS_FOR_POSITIONS=%d.\n"
+        "Either eliminate the particles at (nearly) indentical coordinates, increase the setting for TREE_NUM_BEFORE_NODESPLIT, or "
+        "possibly enlarge BITS_FOR_POSITIONS if you have really not enough dynamic range\n",
+        (int)TREE_NUM_BEFORE_NODESPLIT, (int)BITS_FOR_POSITIONS);
+
+  MyIntPosType mask       = (((MyIntPosType)1) << (BITS_FOR_POSITIONS - 1 - level));
+  unsigned char shiftx    = (BITS_FOR_POSITIONS - 1 - level);
+  unsigned char shifty    = (BITS_FOR_POSITIONS - 2 - level);
+  unsigned char shiftz    = (BITS_FOR_POSITIONS - 3 - level);
+  MyIntPosType centermask = ~(~((MyIntPosType)0) >> level);
+
+  int subcount[8] = {0, 0, 0, 0, 0, 0, 0, 0}, subnode[8];
+  MyIntPosType *subintpos[8];
+
+  for(int i = 0; i < num; i++)
+    {
+      MyIntPosType *intpos;
+
+      int p = index_list[i].p;
+      if(p < MaxPart)
+        {
+          intpos = Tp->P[p].IntPos;
+        }
+      else
+        {
+          int n  = p - ImportedNodeOffset;
+          intpos = Points[n].IntPos;
+        }
+
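+      /* extract the octant of this point relative to the current node: the coordinate bit at the current level ends
+       * up in bit 0 (x), bit 1 (y) and bit 2 (z) of 'subnode', giving a value in the range 0..7 */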
+      unsigned char subnode = (((unsigned char)((intpos[0] & mask) >> shiftx)) | ((unsigned char)((intpos[1] & mask) >> shifty)) |
+                               ((unsigned char)((intpos[2] & mask) >> shiftz)));
+      if(subnode > 7)
+        Terminate("stop: subnode > 7");
+
+      subcount[subnode]++;
+
+      subintpos[subnode] = intpos;
+
+      index_list[i].subnode = subnode;
+    }
+
+  /* sort */
+  mycxxsort(index_list, index_list + num, compare_index_data_subnode);
+
+  centermask >>= 1;
+  centermask |= ~(~((MyIntPosType)0) >> 1); /* this sets the MSB */
+  mask >>= 1;
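+  /* after the halving above, 'centermask' selects the top (level+1) coordinate bits and 'mask' marks the bit just
+   * below them, so a daughter node center can be formed as (particle position & centermask) | mask, as done below */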
+
+  node *nfreep_last = NULL;
+
+  /* create the daughter nodes */
+  for(int i = 0; i < 8; i++)
+    {
+      if(subcount[i] > TREE_NUM_BEFORE_NODESPLIT)
+        {
+          int thnew;
+
+          thnew = NextFreeNode++;
+
+          if(thnew - MaxPart >= MaxNodes)
+            return 1; /* we are out of space */
+
+          subnode[i] = thnew;
+
+          if(nfreep_last)
+            {
+              nfreep_last->sibling  = thnew;
+              nfreep_last->nextnode = thnew;
+
+              nfreep_last->sibling_shmrank  = TreeSharedMem_ThisTask;
+              nfreep_last->nextnode_shmrank = TreeSharedMem_ThisTask;
+            }
+          else
+            {
+              get_nodep(th)->nextnode         = thnew;
+              get_nodep(th)->nextnode_shmrank = TreeSharedMem_ThisTask;
+            }
+
+          if(thnew < MaxPart + D->NTopnodes)
+            Terminate("thnew = %d   <  MaxPart=%d + D->NTopnodes=%d", thnew, MaxPart, D->NTopnodes);
+
+          node *nfreep = get_nodep(thnew);
+
+          nfreep->father = th;
+          nfreep->level  = level + 1;
+
+          nfreep->center[0] = ((subintpos[i][0] & centermask) | mask);
+          nfreep->center[1] = ((subintpos[i][1] & centermask) | mask);
+          nfreep->center[2] = ((subintpos[i][2] & centermask) | mask);
+
+          nfreep->father_shmrank   = TreeSharedMem_ThisTask;
+          nfreep->nextnode_shmrank = TreeSharedMem_ThisTask;
+          nfreep->sibling_shmrank  = TreeSharedMem_ThisTask;
+
+          nfreep_last = nfreep;
+        }
+    }
+
+  /* now insert the particles that are chained to the node */
+
+  int p_last = -1, p_first = -1;
+
+  /* now insert the particle groups into the created daughter nodes, or chain them to the node */
+  for(int i = 0, off = 0; i < 8; i++)
+    {
+      if(subcount[i] <= TREE_NUM_BEFORE_NODESPLIT)
+        {
+          /* put the particles into the node as a chain */
+          for(int j = 0; j < subcount[i]; j++)
+            {
+              int p = index_list[off + j].p;
+
+              if(nfreep_last == NULL && p_first == -1)
+                {
+                  get_nodep(th)->nextnode         = p;
+                  get_nodep(th)->nextnode_shmrank = TreeSharedMem_ThisTask;
+                }
+
+              if(p < MaxPart)
+                Father[p] = th;
+              else
+                Father[p - MaxNodes - D->NTopleaves] = th;
+
+              if(p_last >= 0)
+                {
+                  if(p_last < MaxPart)
+                    Nextnode[p_last] = p;
+                  else
+                    Nextnode[p_last - MaxNodes] = p; /* imported point */
+                }
+
+              p_last = p;
+
+              if(p_first == -1)
+                p_first = p;
+            }
+        }
+
+      off += subcount[i];
+    }
+
+  if(p_last >= 0)
+    {
+      if(p_last < MaxPart)
+        Nextnode[p_last] = sibling;
+      else
+        Nextnode[p_last - MaxNodes] = sibling; /* imported point */
+    }
+
+  if(nfreep_last)
+    {
+      if(p_first >= 0)
+        {
+          nfreep_last->sibling  = p_first;
+          nfreep_last->nextnode = p_first;
+
+          nfreep_last->sibling_shmrank  = TreeSharedMem_ThisTask;
+          nfreep_last->nextnode_shmrank = TreeSharedMem_ThisTask;
+        }
+      else
+        {
+          nfreep_last->sibling  = sibling;
+          nfreep_last->nextnode = sibling;
+
+          nfreep_last->sibling_shmrank  = TreeSharedMem_ThisTask;
+          nfreep_last->nextnode_shmrank = TreeSharedMem_ThisTask;
+        }
+    }
+
+  for(int i = 0, off = 0; i < 8; i++)
+    {
+      if(subcount[i] > TREE_NUM_BEFORE_NODESPLIT)
+        {
+          if(subnode[i] < MaxPart + D->NTopnodes)
+            Terminate("subnode[i]=%d < MaxPart=%d + D->NTopnodes=%d", subnode[i], MaxPart, D->NTopnodes);
+
+          int out_of_space =
+              treebuild_insert_group_of_points(subcount[i], &index_list[off], subnode[i], level + 1, get_nodep(subnode[i])->sibling);
+          if(out_of_space)
+            return out_of_space;
+        }
+
+      off += subcount[i];
+    }
+
+  return 0; /* success */
+}
+
+/*! This function recursively creates a set of empty tree nodes which
+ *  corresponds to the top-level tree for the domain grid. This is done to
+ *  ensure that this top-level tree is always "complete" so that we can easily
+ *  associate the pseudo-particles of other CPUs with tree-nodes at a given
+ *  level in the tree, even when the particle population is so sparse that
+ *  some of these nodes are actually empty.
+ *
+ * \return 0 if successful \n
+ *         -1 if number of allocated tree nodes is too small to fit the newly created nodes
+ */
+template <typename node, typename partset, typename point_data, typename foreign_point_data>
+int tree<node, partset, point_data, foreign_point_data>::create_empty_nodes(
+    int no_parent /*!< parent node for which daughter nodes shall be created */, int level /*!< level of new nodes */,
+    int topnode /*!< index of the parent node in the TopNodes array */,
+    int bits /*!< 2^bits is the number of nodes per dimension at the level of the daughter nodes */, int sibling,
+    MyIntPosType x /*!< position of the parent node in the x direction, in the range [0,2^(bits-1) - 1] */,
+    MyIntPosType y /*!< position of the parent node in the y direction, in the range [0,2^(bits-1) - 1] */,
+    MyIntPosType z /*!< position of the parent node in the z direction, in the range [0,2^(bits-1) - 1] */)
+{
+  if(D->TopNodes[topnode].Daughter >= 0)
+    {
+      int firstflag = 0;
+      int nostart   = NextFreeNode;
+
+      /* loop over daughter nodes */
+      for(int i = 0; i < 2; i++)
+        for(int j = 0; j < 2; j++)
+          for(int k = 0; k < 2; k++)
+            {
+              if(NumNodes >= MaxNodes)
+                {
+                  if(All.TreeAllocFactor > MAX_TREE_ALLOC_FACTOR)
+                    {
+                      char buf[MAXLEN_PATH];
+                      sprintf(buf, "task %d: looks like a serious problem (NTopnodes=%d), stopping with particle dump.\n", D->ThisTask,
+                              D->NTopnodes);
+                      Tp->dump_particles();
+                      Terminate(buf);
+                    }
+                  return -1;
+                }
+
+              int no = NextFreeNode++;
+              NumNodes++;
+
+              if(firstflag == 0)
+                {
+                  TopNodes[no_parent].nextnode = no;
+                  firstflag                    = 1;
+                }
+
+              TopNodes[no].father = no_parent;
+
+              if(i + 2 * j + 4 * k == 7)
+                TopNodes[no].sibling = sibling;
+              else
+                TopNodes[no].sibling = no + 1;
+
+              TopNodes[no].nextnode = TopNodes[no].sibling;
+
+              TopNodes[no].level = level;
+
+              MyIntPosType lenhalf   = ((MyIntPosType)1) << (BITS_FOR_POSITIONS - level - 1);
+              TopNodes[no].center[0] = TopNodes[no_parent].center[0] + (2 * i - 1) * lenhalf;
+              TopNodes[no].center[1] = TopNodes[no_parent].center[1] + (2 * j - 1) * lenhalf;
+              TopNodes[no].center[2] = TopNodes[no_parent].center[2] + (2 * k - 1) * lenhalf;
+
+              TopNodes[no].sibling_shmrank  = TreeSharedMem_ThisTask;
+              TopNodes[no].nextnode_shmrank = TreeSharedMem_ThisTask;
+              TopNodes[no].father_shmrank   = TreeSharedMem_ThisTask;
+            }
+
+      /* loop over daughter nodes */
+      for(int i = 0; i < 2; i++)
+        for(int j = 0; j < 2; j++)
+          for(int k = 0; k < 2; k++)
+            {
+              int no = nostart++;
+
+              peanokey key = peano_hilbert_key((x << 1) + i, (y << 1) + j, (z << 1) + k, bits);
+              int sub      = 7 & key.ls;
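+              /* the spatial daughter cell (i,j,k) is mapped through its Peano-Hilbert key to the matching daughter
+               * slot 'sub' of this top node in the domain decomposition's top-node tree */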
+
+              if(D->TopNodes[D->TopNodes[topnode].Daughter + sub].Daughter == -1)
+                {
+                  NodeIndex[D->TopNodes[D->TopNodes[topnode].Daughter + sub].Leaf]   = no;
+                  NodeLevel[D->TopNodes[D->TopNodes[topnode].Daughter + sub].Leaf]   = level;
+                  NodeSibling[D->TopNodes[D->TopNodes[topnode].Daughter + sub].Leaf] = TopNodes[no].sibling;
+                }
+
+              /* create grand daughter nodes for current daughter node */
+              if(create_empty_nodes(no, level + 1, D->TopNodes[topnode].Daughter + sub, bits + 1, TopNodes[no].sibling, 2 * x + i,
+                                    2 * y + j, 2 * z + k) < 0)
+                return -1;
+            }
+    }
+
+  return 0;
+}
+
+/*! This function allocates the memory used for storage of the tree nodes. Usually,
+ *  the number of required nodes is of order 0.7*maxpart, but if this is insufficient,
+ *  the code will try to allocate more space by increasing TreeAllocFactor.
+ */
+template <typename node, typename partset, typename point_data, typename foreign_point_data>
+void tree<node, partset, point_data, foreign_point_data>::treeallocate(int max_partindex, partset *Tp_ptr, domain<partset> *Dptr)
+{
+  D  = Dptr;
+  Tp = Tp_ptr;
+
+  if(max_partindex != -1)
+    {
+      MPI_Allreduce(&max_partindex, &MaxPart, 1, MPI_INT, MPI_MAX, D->Communicator);
+    }
+
+  if(MaxPart == 0)
+    return;  // nothing to be done
+
+  if(Nodes)
+    Terminate("Nodes already allocated");
+
+  Send_count  = (int *)Mem.mymalloc_movable(&Send_count, "Send_count", sizeof(int) * D->NTask);
+  Send_offset = (int *)Mem.mymalloc_movable(&Send_offset, "Send_offset", sizeof(int) * D->NTask);
+  Recv_count  = (int *)Mem.mymalloc_movable(&Recv_count, "Recv_count", sizeof(int) * D->NTask);
+  Recv_offset = (int *)Mem.mymalloc_movable(&Recv_offset, "Recv_offset", sizeof(int) * D->NTask);
+
+  if(max_partindex != -1)
+    {
+      MPI_Allreduce(MPI_IN_PLACE, &All.TreeAllocFactor, 1, MPI_DOUBLE, MPI_MAX, D->Communicator);
+
+      MaxNodes = (int)(All.TreeAllocFactor * (MaxPart + BASENUMBER)) + D->NTopnodes;
+
+      int max_nodes;
+      MPI_Allreduce(&MaxNodes, &max_nodes, 1, MPI_INT, MPI_MAX, D->Communicator);
+
+      if(max_nodes != MaxNodes)
+        Terminate("Strange: different maxnodes detected: %d %d", max_nodes, MaxNodes);
+
+      int max_leaves;
+      MPI_Allreduce(&D->NTopleaves, &max_leaves, 1, MPI_INT, MPI_MAX, D->Communicator);
+      if(max_leaves != D->NTopleaves)
+        Terminate("Strange: different maxnodes detected: %d %d", max_leaves, D->NTopleaves);
+    }
+  else
+    {
+      max_partindex = MaxPart;
+    }
+
+  /* now split up the communicator into pieces that overlap with different shared memory regions */
+  MPI_Comm_split(D->Communicator, Shmem.Island_Smallest_WorldTask, 0, &TreeSharedMemComm);
+
+  MPI_Comm_rank(TreeSharedMemComm, &TreeSharedMem_ThisTask);
+  MPI_Comm_size(TreeSharedMemComm, &TreeSharedMem_NTask);
+
+  TreeNodes_offsets          = (ptrdiff_t *)Mem.mymalloc("TreeNodes_offsets", TreeSharedMem_NTask * sizeof(ptrdiff_t));
+  TreePoints_offsets         = (ptrdiff_t *)Mem.mymalloc("TreePoints_offsets", TreeSharedMem_NTask * sizeof(ptrdiff_t));
+  TreeNextnode_offsets       = (ptrdiff_t *)Mem.mymalloc("TreeNextnode_offsets", TreeSharedMem_NTask * sizeof(ptrdiff_t));
+  TreeForeign_Nodes_offsets  = (ptrdiff_t *)Mem.mymalloc("TreeForeign_Nodes_offsets", TreeSharedMem_NTask * sizeof(ptrdiff_t));
+  TreeForeign_Points_offsets = (ptrdiff_t *)Mem.mymalloc("TreeForeign_Points_offsets", TreeSharedMem_NTask * sizeof(ptrdiff_t));
+  TreeP_offsets              = (ptrdiff_t *)Mem.mymalloc("TreeP_offsets", TreeSharedMem_NTask * sizeof(ptrdiff_t));
+  TreeSphP_offsets           = (ptrdiff_t *)Mem.mymalloc("TreeSphP_offsets", TreeSharedMem_NTask * sizeof(ptrdiff_t));
+  TreePS_offsets             = (ptrdiff_t *)Mem.mymalloc("TreePS_offsets", TreeSharedMem_NTask * sizeof(ptrdiff_t));
+
+  TreeSharedMemBaseAddr = (void **)Mem.mymalloc("TreeSharedMemBaseAddr", TreeSharedMem_NTask * sizeof(void *));
+
+  for(int i = 0; i < TreeSharedMem_NTask; i++)
+    {
+      int island_rank = Shmem.Island_ThisTask + (i - TreeSharedMem_ThisTask);
+
+      if(island_rank < 0 || island_rank >= Shmem.Island_NTask)
+        Terminate("island_rank=%d  < 0 || island_rank >= Shmem.Island_NTask=%d", island_rank, Shmem.Island_NTask);
+
+      TreeSharedMemBaseAddr[i] = Shmem.SharedMemBaseAddr[island_rank];
+    }
+
+  /* allocate the Top-Level tree only once per shared-memory section in the communicator */
+  if(TreeSharedMem_ThisTask == 0)
+    {
+      /* if we have a single shared memory node, or we are building a tree on a subcommunicator, allocate locally */
+      if(TreeSharedMem_NTask == Shmem.World_NTask || D->NTask < Shmem.Sim_NTask)
+        {
+          NodeLevel   = (unsigned char *)Mem.mymalloc("NodeLevel", D->NTopleaves * sizeof(unsigned char));
+          NodeSibling = (int *)Mem.mymalloc("NodeSibling", D->NTopleaves * sizeof(int));
+          NodeIndex   = (int *)Mem.mymalloc("NodeIndex", D->NTopleaves * sizeof(int));
+
+          TopNodes = (node *)Mem.mymalloc("TopNodes", D->NTopnodes * sizeof(node));
+          TopNodes -= MaxPart;
+        }
+      else /* otherwise, allocate the storage on the ghost processor, and obtain the address in our own address space from it */
+        {
+          int ghost_rank = Shmem.GetGhostRankForSimulCommRank[Shmem.Sim_ThisTask];
+
+          /* request the storage from the responsible ghost rank, and map it into the local address space */
+          size_t tab_len[4] = {D->NTopleaves * sizeof(unsigned char), D->NTopleaves * sizeof(int), D->NTopleaves * sizeof(int),
+                               D->NTopnodes * sizeof(node)};
+
+          MPI_Send(tab_len, 4 * sizeof(size_t), MPI_BYTE, ghost_rank, TAG_TOPNODE_ALLOC, MPI_COMM_WORLD);
+
+          ptrdiff_t off[4];
+          MPI_Recv(off, 4 * sizeof(ptrdiff_t), MPI_BYTE, ghost_rank, TAG_TOPNODE_OFFSET, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+          NodeLevel   = (unsigned char *)((char *)Shmem.SharedMemBaseAddr[Shmem.Island_NTask - 1] + off[0]);
+          NodeSibling = (int *)((char *)Shmem.SharedMemBaseAddr[Shmem.Island_NTask - 1] + off[1]);
+          NodeIndex   = (int *)((char *)Shmem.SharedMemBaseAddr[Shmem.Island_NTask - 1] + off[2]);
+
+          TopNodes = (node *)((char *)Shmem.SharedMemBaseAddr[Shmem.Island_NTask - 1] + off[3]);
+          TopNodes -= MaxPart;
+
+          MPI_Recv(&TreeInfoHandle, 1, MPI_INT, ghost_rank, TAG_N, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+        }
+    }
+
+  MPI_Bcast(&TreeInfoHandle, 1, MPI_INT, 0, TreeSharedMemComm);
+
+  ptrdiff_t off[4] = {((char *)NodeLevel - Mem.Base), ((char *)NodeSibling - Mem.Base), ((char *)NodeIndex - Mem.Base),
+                      ((char *)TopNodes - Mem.Base)};
+
+  MPI_Bcast(off, 4 * sizeof(ptrdiff_t), MPI_BYTE, 0, TreeSharedMemComm);
+
+  int shmrank = Shmem.GetShmRankForSimulCommRank[Shmem.Sim_ThisTask];
+  MPI_Bcast(&shmrank, 1, MPI_INT, 0, TreeSharedMemComm);
+
+  NodeLevel   = (unsigned char *)((char *)Shmem.SharedMemBaseAddr[shmrank] + off[0]);
+  NodeSibling = (int *)((char *)Shmem.SharedMemBaseAddr[shmrank] + off[1]);
+  NodeIndex   = (int *)((char *)Shmem.SharedMemBaseAddr[shmrank] + off[2]);
+  TopNodes    = (node *)((char *)Shmem.SharedMemBaseAddr[shmrank] + off[3]);
+
+  Nodes = (node *)Mem.mymalloc_movable(&Nodes, "Nodes", (MaxNodes - D->NTopnodes + 1) * sizeof(node));
+  Nodes -= (MaxPart + D->NTopnodes);
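+  /* like TopNodes above, the Nodes pointer is shifted so that it can be indexed with global node numbers directly:
+   * the first locally stored non-top-level node is Nodes[MaxPart + D->NTopnodes] */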
+}
+
+template <typename node, typename partset, typename point_data, typename foreign_point_data>
+void tree<node, partset, point_data, foreign_point_data>::prepare_shared_memory_access(void)
+{
+  if(D->NTask != Shmem.Sim_NTask)
+    Terminate("This version of the tree communication algorithm only works with the full simulation partition");
+
+  ptrdiff_t off;
+  off = (char *)Nodes - Mem.Base;
+  MPI_Allgather(&off, sizeof(ptrdiff_t), MPI_BYTE, TreeNodes_offsets, sizeof(ptrdiff_t), MPI_BYTE, TreeSharedMemComm);
+  off = (char *)Points - Mem.Base;
+  MPI_Allgather(&off, sizeof(ptrdiff_t), MPI_BYTE, TreePoints_offsets, sizeof(ptrdiff_t), MPI_BYTE, TreeSharedMemComm);
+  off = (char *)Nextnode - Mem.Base;
+  MPI_Allgather(&off, sizeof(ptrdiff_t), MPI_BYTE, TreeNextnode_offsets, sizeof(ptrdiff_t), MPI_BYTE, TreeSharedMemComm);
+  off = (char *)Foreign_Nodes - Mem.Base;
+  MPI_Allgather(&off, sizeof(ptrdiff_t), MPI_BYTE, TreeForeign_Nodes_offsets, sizeof(ptrdiff_t), MPI_BYTE, TreeSharedMemComm);
+  off = (char *)Foreign_Points - Mem.Base;
+  MPI_Allgather(&off, sizeof(ptrdiff_t), MPI_BYTE, TreeForeign_Points_offsets, sizeof(ptrdiff_t), MPI_BYTE, TreeSharedMemComm);
+  off = (char *)Tp->P - Mem.Base;
+  MPI_Allgather(&off, sizeof(ptrdiff_t), MPI_BYTE, TreeP_offsets, sizeof(ptrdiff_t), MPI_BYTE, TreeSharedMemComm);
+  off = (char *)Tp->SphP - Mem.Base;
+  MPI_Allgather(&off, sizeof(ptrdiff_t), MPI_BYTE, TreeSphP_offsets, sizeof(ptrdiff_t), MPI_BYTE, TreeSharedMemComm);
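+  /* the gathered values are byte offsets relative to each task's Mem.Base; together with TreeSharedMemBaseAddr they
+   * allow a task to translate the node/point pointers of the other tasks in its shared-memory island into its own
+   * address space */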
+
+  if(Shmem.Sim_NTask < Shmem.World_NTask)
+    {
+      Shmem.tree_info[TreeInfoHandle].Bd.MaxPart            = MaxPart;
+      Shmem.tree_info[TreeInfoHandle].Bd.MaxNodes           = MaxNodes;
+      Shmem.tree_info[TreeInfoHandle].Bd.NTopnodes          = D->NTopnodes;
+      Shmem.tree_info[TreeInfoHandle].Bd.ImportedNodeOffset = ImportedNodeOffset;
+      Shmem.tree_info[TreeInfoHandle].Bd.EndOfTreePoints    = EndOfTreePoints;
+      Shmem.tree_info[TreeInfoHandle].Bd.EndOfForeignNodes  = EndOfForeignNodes;
+
+      // we also need to inform our shared-memory ghost processor
+      if(TreeSharedMem_ThisTask == 0)
+        {
+          MPI_Send(&TreeInfoHandle, 1, MPI_INT, Shmem.MyShmRankInGlobal, TAG_METDATA, MPI_COMM_WORLD);
+          MPI_Send(&All.Ti_Current, sizeof(All.Ti_Current), MPI_BYTE, Shmem.MyShmRankInGlobal, TAG_METDATA + 1, MPI_COMM_WORLD);
+          MPI_Send(&Shmem.tree_info[TreeInfoHandle].Bd, sizeof(Shmem.tree_info[TreeInfoHandle].Bd), MPI_BYTE, Shmem.MyShmRankInGlobal,
+                   TAG_METDATA + 2, MPI_COMM_WORLD);
+
+          intposconvert *convfac = Tp;
+          MPI_Send(convfac, sizeof(intposconvert), MPI_BYTE, Shmem.MyShmRankInGlobal, TAG_METDATA + 3, MPI_COMM_WORLD);
+        }
+
+      Shmem.inform_offset_table(TopNodes);
+      Shmem.inform_offset_table(Nodes);
+      Shmem.inform_offset_table(Nextnode);
+      Shmem.inform_offset_table(Points);
+      Shmem.inform_offset_table(Tp->P);
+      Shmem.inform_offset_table(Tp->SphP);
+      Shmem.inform_offset_table(Foreign_Nodes);
+      Shmem.inform_offset_table(Foreign_Points);
+
+      /* the following is needed to make sure that the shared memory handler on different nodes is already properly initialized */
+      MPI_Barrier(D->Communicator);
+    }
+}
+
+template <typename node, typename partset, typename point_data, typename foreign_point_data>
+void tree<node, partset, point_data, foreign_point_data>::cleanup_shared_memory_access(void)
+{
+  if(Shmem.Sim_NTask < Shmem.World_NTask)
+    {
+      if(TreeSharedMem_ThisTask == 0)
+        {
+          // need to send this flag to the correct processor rank (our shared-memory ghost) in the global communicator
+          MPI_Send(&TreeInfoHandle, 1, MPI_INT, Shmem.MyShmRankInGlobal, TAG_HEADER, MPI_COMM_WORLD);
+        }
+    }
+}
+
+template <typename node, typename partset, typename point_data, typename foreign_point_data>
+void tree<node, partset, point_data, foreign_point_data>::tree_fetch_foreign_nodes(enum ftype fetch_type)
+{
+  // find out how many we have to fetch from each node
+  int *CountFetch  = (int *)Mem.mymalloc_movable_clear(&CountFetch, "CountFetch", Shmem.World_NTask * sizeof(int));
+  int *OffsetFetch = (int *)Mem.mymalloc_movable(&OffsetFetch, "OffsetFetch", Shmem.World_NTask * sizeof(int));
+
+  for(int i = 0; i < NumOnFetchStack; i++)
+    CountFetch[StackToFetch[i].GhostRank]++;
+
+  OffsetFetch[0] = 0;
+  for(int i = 1; i < Shmem.World_NTask; i++)
+    OffsetFetch[i] = OffsetFetch[i - 1] + CountFetch[i - 1];
+
+  mycxxsort(StackToFetch, StackToFetch + NumOnFetchStack, compare_ghostrank);
+
+  /* now go through the ghost ranks in turn, and import the requested tree nodes from them */
+  int CommPTask;
+  for(CommPTask = 0; Shmem.World_NTask > (1 << CommPTask); CommPTask++)
+    ;
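+  /* CommPTask is the smallest exponent with 2^CommPTask >= Shmem.World_NTask; pairing tasks via World_ThisTask ^ ngrp
+   * below lets every task visit every other rank exactly once (hypercube communication pattern) */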
+
+  for(int ngrp = 1; ngrp < (1 << CommPTask); ngrp++)
+    {
+      int ghost_rank = Shmem.World_ThisTask ^ ngrp;  // this is already the responsible ghost rank
+
+      if(ghost_rank < Shmem.World_NTask)
+        {
+          if(CountFetch[ghost_rank] > 0)
+            {
+              node_req *node_req_send =
+                  (node_req *)Mem.mymalloc_movable(&node_req_send, "node_req_send", CountFetch[ghost_rank] * sizeof(node_req));
+
+              for(int i = 0; i < CountFetch[ghost_rank]; i++)
+                {
+                  int k = OffsetFetch[ghost_rank] + i;
+
+                  node *nop = get_nodep(StackToFetch[k].NodeToOpen, StackToFetch[k].ShmRank);
+
+                  node_req_send[i].foreigntask = nop->OriginTask;
+                  node_req_send[i].foreignnode = nop->OriginNode;
+                }
+
+              /* now make the node request */
+              int tag;
+              if(fetch_type == FETCH_GRAVTREE)
+                tag = TAG_FETCH_GRAVTREE + TreeInfoHandle;
+              else if(fetch_type == FETCH_SPH_DENSITY)
+                tag = TAG_FETCH_SPH_DENSITY + TreeInfoHandle;
+              else if(fetch_type == FETCH_SPH_HYDRO)
+                tag = TAG_FETCH_SPH_HYDRO + TreeInfoHandle;
+              else if(fetch_type == FETCH_SPH_TREETIMESTEP)
+                tag = TAG_FETCH_SPH_TREETIMESTEP + TreeInfoHandle;
+              else
+                {
+                  tag = 0;
+                  Terminate("tag undefined");
+                }
+
+              MPI_Send(node_req_send, CountFetch[ghost_rank] * sizeof(node_req), MPI_BYTE, ghost_rank, tag, MPI_COMM_WORLD);
+              Mem.myfree(node_req_send);
+
+              // get the information about how many nodes and particles hang below each of the nodes
+              node_count_info *node_info_send = (node_count_info *)Mem.mymalloc_movable(
+                  &node_info_send, "node_info_send", CountFetch[ghost_rank] * sizeof(node_count_info));
+
+              MPI_Recv(node_info_send, CountFetch[ghost_rank] * sizeof(node_count_info), MPI_BYTE, ghost_rank, TAG_N, MPI_COMM_WORLD,
+                       MPI_STATUS_IGNORE);
+
+              /* now find out how many nodes and points we want to import in total */
+              int n_sendpoints = 0;
+              int n_sendnodes  = 0;
+
+              for(int i = 0; i < CountFetch[ghost_rank]; i++)
+                {
+                  n_sendpoints += node_info_send[i].count_parts;
+                  n_sendnodes += node_info_send[i].count_nodes;
+                }
+
+              foreign_point_data *buf_foreignpoints =
+                  (foreign_point_data *)Mem.mymalloc("buf_foreignpoints", n_sendpoints * sizeof(foreign_point_data));
+
+              node *buf_foreignnodes = (node *)Mem.mymalloc("buf_foreignnodes", n_sendnodes * sizeof(node));
+
+              /* now receive the points and nodes */
+              if(n_sendpoints > 0)
+                MPI_Recv(buf_foreignpoints, n_sendpoints * sizeof(foreign_point_data), MPI_BYTE, ghost_rank, TAG_PDATA, MPI_COMM_WORLD,
+                         MPI_STATUS_IGNORE);
+
+              if(n_sendnodes > 0)
+                MPI_Recv(buf_foreignnodes, n_sendnodes * sizeof(node), MPI_BYTE, ghost_rank, TAG_SPHDATA, MPI_COMM_WORLD,
+                         MPI_STATUS_IGNORE);
+
+              /* now we have to link the nodes and particles into the tree */
+
+              int n_nodes_used = 0;
+              int n_parts_used = 0;
+
+              for(int i = 0; i < CountFetch[ghost_rank]; i++)
+                {
+                  int k = OffsetFetch[ghost_rank] + i;
+
+                  int no      = StackToFetch[k].NodeToOpen;
+                  int shmrank = StackToFetch[k].ShmRank;
+
+                  node *nop = get_nodep(no, shmrank);
+
+                  int n_nodes = node_info_send[i].count_nodes;
+                  int n_parts = node_info_send[i].count_parts;
+
+                  while(nop->access.test_and_set(std::memory_order_acquire))
+                    {
+                      // acquire spin lock
+                    }
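+                  /* the test_and_set(acquire) loop above together with access.clear(release) at the end of this block
+                   * implements a simple spin lock, so that only one task at a time links fetched data below this node */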
+
+                  if(nop->cannot_be_opened_locally)  // make sure the node hasn't been inserted by another task yet
+                    {
+                      int pfirst = -1;
+                      int plast  = -1;
+
+                      for(int j = 0; j < n_nodes; j++)
+                        {
+                          if(NumForeignNodes >= MaxForeignNodes)
+                            Terminate(
+                                "We are out of storage for foreign nodes: NumForeignNodes=%d MaxForeignNodes=%d  j=%d n_parts=%d",
+                                NumForeignNodes, MaxForeignNodes, j, n_parts);
+
+                          /* tree index of this node */
+                          int p = EndOfTreePoints + NumForeignNodes++;
+
+                          //  cannot do a Foreign_Nodes[p - EndOfTreePoints] = buf_foreignnodes[n_nodes_used++]; because our
+                          //  std::atomic_flag member has a deleted copy assignment operator
+                          memcpy(static_cast<void *>(&Foreign_Nodes[p - EndOfTreePoints]),
+                                 static_cast<void *>(&buf_foreignnodes[n_nodes_used++]), sizeof(node));
+
+                          Foreign_Nodes[p - EndOfTreePoints].access.clear();
+                          Foreign_Nodes[p - EndOfTreePoints].flag_already_fetched = 0;
+
+                          Foreign_Nodes[p - EndOfTreePoints].nextnode = -1;
+
+                          if(plast >= 0) /* this is here still the previous one */
+                            {
+                              if(plast < EndOfForeignNodes) /* have a foreign node */
+                                {
+                                  Foreign_Nodes[plast - EndOfTreePoints].sibling         = p;
+                                  Foreign_Nodes[plast - EndOfTreePoints].sibling_shmrank = Shmem.Island_ThisTask;
+                                }
+                              else
+                                {
+                                  Foreign_Points[plast - EndOfForeignNodes].Nextnode         = p;
+                                  Foreign_Points[plast - EndOfForeignNodes].Nextnode_shmrank = Shmem.Island_ThisTask;
+                                }
+                            }
+
+                          if(pfirst < 0)
+                            pfirst = p;
+
+                          plast = p;
+                        }
+
+                      for(int j = 0; j < n_parts; j++)
+                        {
+                          if(NumForeignPoints >= MaxForeignPoints)
+                            Terminate(
+                                "We are out of storage for foreign points: NumForeignPoints=%d MaxForeignPoints=%d  j=%d n_parts=%d",
+                                NumForeignPoints, MaxForeignPoints, j, n_parts);
+
+                          /* global tree index of this foreign point */
+                          int p                                 = EndOfForeignNodes + NumForeignPoints++;
+                          Foreign_Points[p - EndOfForeignNodes] = buf_foreignpoints[n_parts_used++];
+
+                          if(plast >= 0) /* this is here still the previous one */
+                            {
+                              if(plast < EndOfForeignNodes) /* have a foreign node */
+                                {
+                                  Foreign_Nodes[plast - EndOfTreePoints].sibling         = p;
+                                  Foreign_Nodes[plast - EndOfTreePoints].sibling_shmrank = Shmem.Island_ThisTask;
+                                }
+                              else
+                                {
+                                  Foreign_Points[plast - EndOfForeignNodes].Nextnode         = p;
+                                  Foreign_Points[plast - EndOfForeignNodes].Nextnode_shmrank = Shmem.Island_ThisTask;
+                                }
+                            }
+
+                          if(pfirst < 0)
+                            pfirst = p;
+
+                          plast = p;
+                        }
+
+                      if(plast < 0 || pfirst < 0)
+                        Terminate("plast=%d < 0 || pfirst=%d < 0   n_nodes=%d n_parts=%d", plast, pfirst, n_nodes, n_parts);
+
+                      if(plast >= 0) /* this is here still the previous one */
+                        {
+                          if(plast < EndOfForeignNodes) /* have a foreign node */
+                            {
+                              Foreign_Nodes[plast - EndOfTreePoints].sibling         = nop->sibling;
+                              Foreign_Nodes[plast - EndOfTreePoints].sibling_shmrank = nop->sibling_shmrank;
+
+                              if(Foreign_Nodes[plast - EndOfTreePoints].cannot_be_opened_locally == 0)
+                                if(D->ThisTask == 0)
+                                  Terminate("what?    plast - EndOfTreePoints=%d", plast - EndOfTreePoints);
+                            }
+                          else
+                            {
+                              Foreign_Points[plast - EndOfForeignNodes].Nextnode         = nop->sibling;
+                              Foreign_Points[plast - EndOfForeignNodes].Nextnode_shmrank = nop->sibling_shmrank;
+                            }
+                        }
+
+                      nop->nextnode         = pfirst;
+                      nop->nextnode_shmrank = Shmem.Island_ThisTask;
+
+                      // if(nop->cannot_be_opened_locally == 0)   // don't remove this: acts as a barrier to prevent the compiler
+                      //   Terminate("bummer");                   // from reordering the store operation on 'cannot_be_opened_locally'
+
+                      nop->cannot_be_opened_locally.store(0, std::memory_order_release);
+
+                      sum_NumForeignNodes += n_nodes;
+                      sum_NumForeignPoints += n_parts;
+                    }
+                  else
+                    {
+                      // skip this node, because apparently it was fetched in the meantime by another MPI task
+                      n_nodes_used += n_nodes;
+                      n_parts_used += n_parts;
+                    }
+
+                  nop->access.clear(std::memory_order_release);
+                }
+
+              if(n_sendpoints != n_parts_used || n_sendnodes != n_nodes_used)
+                Terminate("n_sendpoints != n_parts_used || n_sendnodes != n_nodes_used");
+
+              Mem.myfree(buf_foreignnodes);
+              Mem.myfree(buf_foreignpoints);
+              Mem.myfree(node_info_send);
+            }
+        }
+    }
+
+  NumOnFetchStack = 0;
+
+  Mem.myfree(OffsetFetch);
+  Mem.myfree(CountFetch);
+}
+
+/*! This function frees the memory allocated for the tree, i.e. it frees
+ *  the space allocated by the function treeallocate().
+ */
+template <typename node, typename partset, typename point_data, typename foreign_point_data>
+void tree<node, partset, point_data, foreign_point_data>::treefree(void)
+{
+  if(MaxPart == 0)
+    return;  // nothing to be done
+
+  if(Nodes)
+    {
+      MPI_Comm_free(&TreeSharedMemComm);
+
+      if(Father)
+        {
+          Mem.myfree_movable(Father);
+          Mem.myfree_movable(Nextnode);
+        }
+      if(Points)
+        {
+          Mem.myfree_movable(Points);
+        }
+
+      Mem.myfree_movable(Nodes + MaxPart + D->NTopnodes);
+
+      if(TreeSharedMem_ThisTask == 0)
+        {
+          if(TreeSharedMem_NTask == Shmem.World_NTask || D->NTask < Shmem.Sim_NTask)
+            {
+              Mem.myfree_movable(TopNodes + MaxPart);
+              Mem.myfree_movable(NodeIndex);
+              Mem.myfree_movable(NodeSibling);
+              Mem.myfree_movable(NodeLevel);
+            }
+          else
+            {
+              int ghost_rank = Shmem.GetGhostRankForSimulCommRank[Shmem.Sim_ThisTask];
+
+              // tell the ghost rank to free the storage
+              MPI_Send(&TreeInfoHandle, 1, MPI_INT, ghost_rank, TAG_TOPNODE_FREE, MPI_COMM_WORLD);
+            }
+        }
+
+      Mem.myfree(TreeSharedMemBaseAddr);
+
+      Mem.myfree(TreePS_offsets);
+      Mem.myfree(TreeSphP_offsets);
+      Mem.myfree(TreeP_offsets);
+      Mem.myfree(TreeForeign_Points_offsets);
+      Mem.myfree(TreeForeign_Nodes_offsets);
+      Mem.myfree(TreeNextnode_offsets);
+      Mem.myfree(TreePoints_offsets);
+      Mem.myfree(TreeNodes_offsets);
+
+      Mem.myfree_movable(Recv_offset);
+      Mem.myfree_movable(Recv_count);
+      Mem.myfree_movable(Send_offset);
+      Mem.myfree_movable(Send_count);
+
+      Nodes       = NULL;
+      TopNodes    = NULL;
+      NodeIndex   = NULL;
+      NodeSibling = NULL;
+      NodeLevel   = NULL;
+      Points      = NULL;
+      Nextnode    = NULL;
+      Father      = NULL;
+    }
+  else
+    Terminate("trying to free the tree even though it's not allocated");
+}
+
+template <typename node, typename partset, typename point_data, typename foreign_point_data>
+void tree<node, partset, point_data, foreign_point_data>::tree_export_node_threads(int no, int i, thread_data *thread,
+                                                                                   offset_tuple off)
+{
+  int task      = D->TaskOfLeaf[no - (MaxPart + MaxNodes)];
+  int nodeindex = NodeIndex[no - (MaxPart + MaxNodes)];
+
+  tree_export_node_threads_by_task_and_node(task, nodeindex, i, thread, off);
+}
+
+template <typename node, typename partset, typename point_data, typename foreign_point_data>
+void tree<node, partset, point_data, foreign_point_data>::tree_export_node_threads_by_task_and_node(int task, int nodeindex, int i,
+                                                                                                    thread_data *thread,
+                                                                                                    offset_tuple off)
+{
+  if(task < 0 || task >= D->NTask)
+    Terminate("task < 0 || task >= D->NTask");
+
+  if(task != D->ThisTask)
+    {
+      if(thread->Exportflag[task] != i)
+        {
+          thread->Exportflag[task]     = i;
+          int nexp                     = thread->Nexport++;
+          thread->PartList[nexp].Task  = task;
+          thread->PartList[nexp].Index = i;
+          thread->ExportSpace -= thread->ItemSize;
+        }
+
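+      /* the node entries are stored backwards from the end of the export buffer (negative
+         indices relative to 'nodelist'), while the particle entries in PartList grow upwards
+         from its start, so both lists share the same memory block */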
+      int nexp                     = thread->NexportNodes++;
+      nexp                         = -1 - nexp;
+      data_nodelist *nodelist      = (data_nodelist *)(((char *)thread->PartList) + thread->InitialSpace);
+      nodelist[nexp].Task          = task;
+      nodelist[nexp].Index         = i;
+      nodelist[nexp].NodeInfo.Node = nodeindex;
+      thread->ExportSpace -= (sizeof(data_nodelist) + sizeof(int));
+    }
+}
+
+#include "../ngbtree/ngbtree.h"
+template class tree<ngbnode, simparticles, ngbpoint_data, foreign_sphpoint_data>;
+
+#include "../gravtree/gravtree.h"
+template class tree<gravnode, simparticles, gravpoint_data, foreign_gravpoint_data>;
+
+#ifdef FOF
+#include "../fof/foftree.h"
+template class tree<fofnode, simparticles, fofpoint_data, foreign_fofpoint_data>;
+#if defined(LIGHTCONE) && (defined(LIGHTCONE_PARTICLES_GROUPS) || defined(LIGHTCONE_IMAGE_COMP_HSML_VELDISP))
+/* make sure that we instantiate the template */
+#include "../data/lcparticles.h"
+template class tree<fofnode, lcparticles, fofpoint_data, foreign_fofpoint_data>;
+template class tree<gravnode, lcparticles, gravpoint_data, foreign_gravpoint_data>;
+#endif
+#endif
diff --git a/src/tree/tree.h b/src/tree/tree.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d27963452c1743e7e7e238e64516b5fc01f8c63
--- /dev/null
+++ b/src/tree/tree.h
@@ -0,0 +1,429 @@
+/*******************************************************************************
+ * \copyright   This file is part of the GADGET4 N-body/SPH code developed
+ * \copyright   by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
+ * \copyright   (vspringel@mpa-garching.mpg.de) and all contributing authors.
+ *******************************************************************************/
+
+/*! \file  tree.h
+ *
+ *  \brief declaration of the base class for building oct-trees
+ */
+
+#ifndef TREE_H
+#define TREE_H
+
+#ifndef TREE_NUM_BEFORE_NODESPLIT
+#define TREE_NUM_BEFORE_NODESPLIT 3  // daughter nodes are only created if there are more than this number of particles in a node
+#endif
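+// note: because of the #ifndef guard above, a different value can be supplied at compile time,
+// e.g. by adding -DTREE_NUM_BEFORE_NODESPLIT=8 to the compiler flags (8 is only an illustrative value)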
+
+#define TREE_MODE_BRANCH 0
+#define TREE_MODE_TOPLEVEL 1
+
+#define MAX_TREE_ALLOC_FACTOR 30.0
+
+#define TREE_MAX_ITER 100
+
+#include <mpi.h>
+
+#include "../domain/domain.h"
+#include "../mpi_utils/shared_mem_handler.h"
+#include "../sort/peano.h"
+#include "../system/system.h"
+
+class sim;
+
+/** The tree node data structure. Nodes points to the actual memory
+    allocated for the internal nodes, but is shifted such that
+    Nodes[Sp.MaxPart] gives the first allocated node. Note that node
+    numbers less than Sp.MaxPart are the leaf nodes that contain a
+    single particle, and node numbers >= MaxPart+MaxNodes are "pseudo
+    particles" that hang off the toplevel leaf nodes belonging to
+    other tasks. These are not represented by this structure. Instead,
+    the tree traversal for these is saved in the Nextnode, Prevnode
+    and Father arrays, indexed with the node number in the case of
+    real particles and by nodenumber-MaxNodes for pseudo
+    particles.  */
+
+struct basenode
+{
+  std::atomic<node_bit_field> flag_already_fetched;
+
+  vector<MyIntPosType> center; /**< geometrical center of node */
+
+  int sibling;
+  /** The next node in case the current node needs to be
+      opened. Applying nextnode repeatedly results in a pure
+      depth-first traversal of the tree. */
+  int nextnode;
+  /** The parent node of the node. (Is -1 for the root node.) */
+  int father;
+
+  int OriginTask; /* MPI rank (in full compute communicator) on which this node and its daughter nodes are natively stored */
+  int OriginNode; /* Index of the node on the MPI rank that stores it and its daughter nodes natively */
+
+  unsigned char level; /**< hold the tree level, used to store the side length of node in space efficient way */
+  unsigned char sibling_shmrank;
+  unsigned char nextnode_shmrank;
+  unsigned char father_shmrank;  // CHECK: can be deleted
+
+  std::atomic_flag access;
+
+  std::atomic<unsigned char> cannot_be_opened_locally;
+
+  // unsigned char cannot_be_opened_locally : 1;
+  unsigned char not_empty : 1;
+};
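+
+/* Illustrative sketch of the node-number ranges described in the comment above (not used by the code):
+ *
+ *   no < MaxPart                       -> leaf holding a single local particle
+ *   MaxPart <= no < MaxPart+MaxNodes   -> internal node stored in TopNodes/Nodes
+ *   no >= MaxPart+MaxNodes             -> pseudo particle standing in for a top-level leaf on another task
+ */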
+
+struct node_info
+{
+  int Node;
+};
+
+struct data_nodelist
+{
+  int Task;           /**< target process */
+  int Index;          /**< local index that wants to open this node */
+  node_info NodeInfo; /**< info about node to be opened on foreign process, as well as periodic box offset (needed for Ewald summation
+                      algorithm for periodic gravity) */
+};
+
+template <typename node, typename partset, typename point_data, typename foreign_point_data>
+class tree
+{
+  /* class for oct tree */
+
+ public:
+  struct index_data
+  {
+    int p;
+    int subnode;
+  };
+
+  domain<partset> *D;
+  partset *Tp;
+
+  int *Father;
+  int *Nextnode;
+  int *NodeSibling;
+  int *NodeIndex;
+
+  node *TopNodes;
+  node *Nodes;
+  node *Foreign_Nodes;
+  foreign_point_data *Foreign_Points;
+
+  ptrdiff_t *TreeNodes_offsets;
+  ptrdiff_t *TreePoints_offsets;
+  ptrdiff_t *TreeNextnode_offsets;
+  ptrdiff_t *TreeForeign_Nodes_offsets;
+  ptrdiff_t *TreeForeign_Points_offsets;
+  ptrdiff_t *TreeP_offsets;
+  ptrdiff_t *TreeSphP_offsets;
+  ptrdiff_t *TreePS_offsets;
+
+  void **TreeSharedMemBaseAddr;
+
+  unsigned char *NodeLevel;
+
+  point_data *Points;
+  int *IndexList;
+  int *ResultIndexList;
+
+  int *Send_offset;
+  int *Send_count;
+  int *Recv_count;
+  int *Recv_offset;
+
+  int MaxPart;
+  int MaxNodes;
+  int NumNodes;
+  int NumPartImported;
+  int NumPartExported;
+
+  int NumForeignNodes;  // number of imported foreign tree nodes
+  int MaxForeignNodes;
+
+  int NumForeignPoints;  // number of imported foreign particles to allow completion of local tree walks
+  int MaxForeignPoints;
+
+  // for some statistics about the number of imported nodes and points
+  long long sum_NumForeignNodes;
+  long long sum_NumForeignPoints;
+
+  int FirstNonTopLevelNode;
+
+  int EndOfTreePoints;
+  int EndOfForeignNodes;
+
+  int ImportedNodeOffset;
+  int Ninsert;
+  int NextFreeNode;
+
+  MPI_Comm TreeSharedMemComm;
+  int TreeSharedMem_ThisTask;
+  int TreeSharedMem_NTask;
+
+  int TreeInfoHandle;
+
+  /** Number of tree nodes allocated (and counted in NumNodes) but actually
+   * unused (can happen with NUM_THREADS > 1 && TAKE_NSLOTS_IN_ONE_GO > 1) */
+
+  double Buildtime;
+
+  int NumOnFetchStack;
+  int MaxOnFetchStack;
+
+  struct fetch_data
+  {
+    int NodeToOpen;
+    int ShmRank;
+    int GhostRank;
+  };
+  fetch_data *StackToFetch;
+
+  static bool compare_ghostrank(const fetch_data &a, const fetch_data &b) { return a.GhostRank < b.GhostRank; }
+
+  int NumOnWorkStack;
+  int MaxOnWorkStack;
+  int NewOnWorkStack;
+  int AllocWorkStackBaseLow;
+  int AllocWorkStackBaseHigh;
+
+  struct workstack_data
+  {
+    int Target;
+    int Node;
+    int ShmRank;
+    int MinTopLeafNode;
+  };
+  workstack_data *WorkStack;
+
+  static bool compare_workstack(const workstack_data &a, const workstack_data &b)
+  {
+    if(a.MinTopLeafNode < b.MinTopLeafNode)
+      return true;
+    if(a.MinTopLeafNode > b.MinTopLeafNode)
+      return false;
+
+    return a.Target < b.Target;
+  }
+
+  void tree_add_to_fetch_stack(node *nop, int nodetoopen, unsigned char shmrank)
+  {
+    if(NumOnFetchStack >= MaxOnFetchStack)
+      {
+        Terminate("we shouldn't get here");
+        MaxOnFetchStack *= 1.1;
+        StackToFetch = (fetch_data *)Mem.myrealloc_movable(StackToFetch, MaxOnFetchStack * sizeof(fetch_data));
+      }
+
+    node_bit_field mybit = (((node_bit_field)1) << Shmem.Island_ThisTask);
+
+    node_bit_field oldval = nop->flag_already_fetched.fetch_or(mybit);
+
+    if((oldval & mybit) == 0)  // it wasn't fetched by me yet
+      {
+        int ghostrank = Shmem.GetGhostRankForSimulCommRank[nop->OriginTask];
+
+        StackToFetch[NumOnFetchStack].NodeToOpen = nodetoopen;
+        StackToFetch[NumOnFetchStack].ShmRank    = shmrank;
+        StackToFetch[NumOnFetchStack].GhostRank  = ghostrank;
+
+        NumOnFetchStack++;
+      }
+  }
+
+  void tree_add_to_work_stack(int target, int no, unsigned char shmrank, int mintopleafnode)
+  {
+    if(NumOnWorkStack + NewOnWorkStack >= MaxOnWorkStack)
+      {
+        Terminate("we shouldn't get here");
+        MaxOnWorkStack *= 1.1;
+        WorkStack = (workstack_data *)Mem.myrealloc_movable(WorkStack, MaxOnWorkStack * sizeof(workstack_data));
+      }
+
+    WorkStack[NumOnWorkStack + NewOnWorkStack].Target         = target;
+    WorkStack[NumOnWorkStack + NewOnWorkStack].Node           = no;
+    WorkStack[NumOnWorkStack + NewOnWorkStack].ShmRank        = shmrank;
+    WorkStack[NumOnWorkStack + NewOnWorkStack].MinTopLeafNode = mintopleafnode;
+
+    NewOnWorkStack++;
+  }
+
+  struct node_count_info
+  {
+    int count_nodes;
+    int count_parts;
+  };
+
+  struct node_req
+  {
+    int foreigntask;
+    int foreignnode;
+  };
+
+  void prepare_shared_memory_access(void);
+  void cleanup_shared_memory_access(void);
+
+  enum ftype
+  {
+    FETCH_GRAVTREE,
+    FETCH_SPH_DENSITY,
+    FETCH_SPH_HYDRO,
+    FETCH_SPH_TREETIMESTEP,
+  };
+
+  void tree_fetch_foreign_nodes(enum ftype fetch_type);
+  void tree_initialize_leaf_node_access_info(void);
+
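+  /* the following accessors translate a local index plus a shared-memory rank into a pointer
+     into that rank's copy of the corresponding array, using the stored base addresses and
+     per-array byte offsets */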
+  inline foreign_point_data *get_foreignpointsp(int n, unsigned char shmrank)
+  {
+    return (foreign_point_data *)((char *)TreeSharedMemBaseAddr[shmrank] + TreeForeign_Points_offsets[shmrank]) + n;
+  }
+
+  inline subfind_data *get_PSp(int n, unsigned char shmrank)
+  {
+    return (subfind_data *)((char *)TreeSharedMemBaseAddr[shmrank] + TreePS_offsets[shmrank]) + n;
+  }
+
+  typedef decltype(Tp->P) pdata;
+
+  inline pdata get_Pp(int n, unsigned char shmrank)
+  {
+    return (pdata)((char *)TreeSharedMemBaseAddr[shmrank] + TreeP_offsets[shmrank]) + n;
+  }
+
+  inline sph_particle_data *get_SphPp(int n, unsigned char shmrank)
+  {
+    return (sph_particle_data *)((char *)TreeSharedMemBaseAddr[shmrank] + TreeSphP_offsets[shmrank]) + n;
+  }
+
+ private:
+  /** Gives next node in tree walk for the "particle" nodes. Entries 0
+      -- MaxPart-1 are the real particles, and the "pseudoparticles" are
+      indexed by the node number-MaxNodes. */
+
+  /** Gives previous node in tree walk for the leaf (particle)
+      nodes. Entries 0 -- MaxPart-1 are the real particles, and the
+      "pseudoparticles" are indexed by the node number-MaxNodes. */
+
+ public:
+  tree() /* constructor */
+  {
+    TopNodes    = NULL;
+    Nodes       = NULL;
+    NodeIndex   = NULL;
+    NodeSibling = NULL;
+    NodeLevel   = NULL;
+    Points      = NULL;
+    Nextnode    = NULL;
+    Father      = NULL;
+    D           = NULL;
+  }
+
+  /** public functions */
+  int treebuild(int ninsert, int *indexlist);
+  void treefree(void);
+  void treeallocate(int max_partindex, partset *Pptr, domain<partset> *Dptr);
+
+  void tree_export_node_threads(int no, int i, thread_data *thread, offset_tuple off = 0);
+  void tree_export_node_threads_by_task_and_node(int task, int nodeindex, int i, thread_data *thread, offset_tuple off = 0);
+
+  virtual void update_node_recursive(int no, int sib, int mode)            = 0;
+  virtual void exchange_topleafdata(void)                                  = 0;
+  virtual void report_log_message(void)                                    = 0;
+  virtual void fill_in_export_points(point_data *exp_point, int i, int no) = 0;
+
+  inline node *get_nodep(int no)
+  {
+    node *nop;
+
+    if(no < MaxPart + D->NTopnodes)
+      nop = &TopNodes[no];
+    else if(no < MaxPart + MaxNodes)
+      nop = &Nodes[no];
+    else
+      Terminate("illegal node index");
+
+    return nop;
+  }
+
+  inline node *get_nodep(int no, unsigned char shmrank)
+  {
+    node *nop;
+
+    if(no < MaxPart + D->NTopnodes)
+      nop = &TopNodes[no];
+    else if(no < MaxPart + MaxNodes) /* an internal node */
+      {
+        node *Nodes_shmrank = (node *)((char *)TreeSharedMemBaseAddr[shmrank] + TreeNodes_offsets[shmrank]);
+        nop                 = &Nodes_shmrank[no];
+      }
+    else if(no >= EndOfTreePoints && no < EndOfForeignNodes) /* an imported tree node */
+      {
+        node *Foreign_Nodes_shmrank = (node *)((char *)TreeSharedMemBaseAddr[shmrank] + TreeForeign_Nodes_offsets[shmrank]);
+
+        nop = &Foreign_Nodes_shmrank[no - EndOfTreePoints];
+      }
+    else
+      Terminate("illegal node index");
+
+    return nop;
+  }
+
+  inline int *get_nextnodep(unsigned char shmrank)
+  {
+    return (int *)((char *)TreeSharedMemBaseAddr[shmrank] + TreeNextnode_offsets[shmrank]);
+  }
+
+  inline point_data *get_pointsp(int no, unsigned char shmrank)
+  {
+    return (point_data *)((char *)TreeSharedMemBaseAddr[shmrank] + TreePoints_offsets[shmrank]) + no;
+  }
+
+  inline void tree_get_node_and_task(int i, int &no, int &task)
+  {
+    MyIntPosType xxb       = Tp->P[i].IntPos[0];
+    MyIntPosType yyb       = Tp->P[i].IntPos[1];
+    MyIntPosType zzb       = Tp->P[i].IntPos[2];
+    MyIntPosType mask      = (((MyIntPosType)1) << (BITS_FOR_POSITIONS - 1));
+    unsigned char shiftx   = (BITS_FOR_POSITIONS - 3);
+    unsigned char shifty   = (BITS_FOR_POSITIONS - 2);
+    unsigned char shiftz   = (BITS_FOR_POSITIONS - 1);
+    unsigned char level    = 0;
+    unsigned char rotation = 0;
+
+#if defined(PMGRID) && defined(PLACEHIGHRESREGION)
+    Tp->P[i].InsideOutsideFlag = Tp->check_high_res_point_location(Tp->P[i].IntPos);
+#endif
+
+    no = 0;
+    while(D->TopNodes[no].Daughter >= 0)  // walk down top tree to find correct leaf
+      {
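+        // pick out the current bit of each integer coordinate and combine them into the 3-bit
+        // octant index for this level; peano_incremental_key() then maps it to the subnode ordering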
+        unsigned char pix     = (((unsigned char)((xxb & mask) >> (shiftx--))) | ((unsigned char)((yyb & mask) >> (shifty--))) |
+                             ((unsigned char)((zzb & mask) >> (shiftz--))));
+        unsigned char subnode = peano_incremental_key(pix, &rotation);
+
+        mask >>= 1;
+        level++;
+
+        no = D->TopNodes[no].Daughter + subnode;
+      }
+
+    no   = D->TopNodes[no].Leaf;
+    task = D->TaskOfLeaf[no];
+  }
+
+ private:
+  /* private member functions */
+
+  int treebuild_construct(void);
+  int treebuild_insert_group_of_points(int num, index_data *index_list, int th, unsigned char level, int sibling);
+  int create_empty_nodes(int no, int level, int topnode, int bits, int sibling, MyIntPosType x, MyIntPosType y, MyIntPosType z);
+
+ private:
+  /* sort kernel */
+  static inline bool compare_index_data_subnode(const index_data &a, const index_data &b) { return a.subnode < b.subnode; }
+};
+
+#endif
diff --git a/src/vectorclass/changelog.txt b/src/vectorclass/changelog.txt
new file mode 100644
index 0000000000000000000000000000000000000000..554f7ea20c3e46b5f848e494159fa504bc5ff861
--- /dev/null
+++ b/src/vectorclass/changelog.txt
@@ -0,0 +1,176 @@
+Change log for vectorclass.zip
+------------------------------
+
+Projected future version, 2018:
+  * use mask registers for boolean vectors of all sizes with AVX512VL
+  * use C++14
+  * replace long template argument lists by constant arrays
+  
+2017-07-27 version 1.30
+  * fixed bug in permute8f for a particular combination of indexes  
+  
+2017-05-10 version 1.29
+  * Reversed Apple Clang patch in version 1.28 because the problem has reoccurred in
+    later versions of Clang
+  
+2017-05-02 version 1.28
+  * Fixed problem with Apple Clang version 6.2 in vectorf128.h
+  * Fixed return type for Vec8sb operator > (Vec8us, Vec8us)  
+  * cpuid function modified in instrset_detect.cpp
+  
+2017-02-19 version 1.27
+  * fixed problem with scatter functions in MS Visual Studio  
+  
+2016-12-21 version 1.26
+  * added constant4ui template
+  * fixed error for complexvec.h with clang  
+  * fixed error in vectormath_exp.h for MAX_VECTOR_SIZE < 512
+
+2016-11-25 version 1.25
+  * scatter functions
+  * new functions to_float for unsigned integer vectors
+  * instrset_detect function can detect AVX512VL, AVX512BW, AVX512DQ
+  * functions hasF16C and hasAVX512ER for detecting instruction set extensions
+  * fix bugs in horizontal_and and pow(0,0) for AVX512
+  * functions improved for AVX512 and AVX512VL: pow, approx_recipr,  
+      approx_rsqrt
+  * functions improved for AVX512DQ: 64 bit multiplication, to_double,
+      32 and 64 bit rotate_left, round_to_int64, truncate_to_int64
+  * functions improved for AVX512ER: approx_recipr, approx_rsqrt,
+      exponential functions
+
+2016-10-31 version 1.24
+  * fix bug in Vec8uq constructor in vectori512e.h
+
+2016-09-27 version 1.23
+  * temporary fix of a problem in Clang version 3.9 inserted in vectorf128.h
+
+2016-05-03 version 1.22
+  * added optional namespace
+  * fixed problem with decimal.h  
+
+2016-04-24 version 1.21
+  * fix problems with XOP option in gcc
+  * improved horizontal_and/or for sse2
+  * improved Vec2q and Vec4q constructor on Microsoft Visual Studio 2015
+  * removed warnings by gcc option -Wcast-qual  
+
+2015-12-04 version 1.20
+  * round functions: suppress precision exception under SSE4.1 and higher
+  * fix compiler problems with AVX512 multiplication in gcc version 5.1
+  * fix compiler problems with pow function in Microsoft Visual Studio 2015
+
+2015-11-14 version 1.19
+  * fix various problems with Clang compiler
+
+2015-09-25 version 1.18
+  * fix compiler error for Vec8s divide_by_i(Vec8s const & x) under Clang compiler
+  * fix error in Vec4d::size() in vectorf256e.h
+
+2015-07-31 version 1.17
+  * improved operator > for Vec4uq
+  * more special cases in blend4q
+  * nan_code functions made static inline
+  * template parameter BTYPE renamed to BVTYPE in mathematical functions to avoid clash
+      with macro named BTYPE in winnt.h
+  * fixed bug in Vec4db constructor
+
+2014-10-24 version 1.16
+  * workaround for problem in Clang compiler extended to version 3.09 because not 
+      fixed yet by Clang (vectorf128.h line 134)
+  * recognize problem with Apple version of Clang reporting wrong version number
+  * remove various minor problems with Clang
+  * function pow(vector, int) modified to strengthen type checking and avoid compiler warnings
+  * manual discusses dynamic allocation of arrays of vectors
+  * various minor changes
+
+2014-10-17 version 1.15
+  * added files ranvec1.h and ranvec1.cpp for random number generator
+  * constructors to make boolean vectors from their elements
+  * constructors and = operators to broadcast boolean scalar into boolean vectors
+  * various lookup functions improved
+  * operators &, |, ^, ~, etc. defined for various boolean vectors to avoid conversion
+      to integer vectors
+  * nmul_add functions
+  * mul_add etc. moved to main header files
+  * explicit fused multiply-and-add used in math functions to improve performance 
+      on compilers that don't automatically insert FMA
+
+2014-07-24 version 1.14
+  * support for AVX-512f instruction set and 512-bit vectors:
+      Vec16i, Vec16ui, Vec8q, Vec8uq, Vec16f, Vec8d, and corresponding boolean vectors
+  * new define MAX_VECTOR_SIZE, valid values are 128, 256 and 512
+  * added hyperbolic functions sinh, cosh, tanh, asinh, acosh, atanh
+  * size() member function on all vector classes returns the number of elements
+  * functions for conversion between boolean vectors and integer bitfields
+  * extracting an element from a boolean vector now returns a bool, not an int
+  * improved precision in exp2 and exp10 functions
+  * various bug fixes
+
+2014-05-11 version 1.13
+  * pow function improved
+  * mul_add, mul_sub, mul_sub_x functions
+  * propagation of error codes through nan_code function
+  * "denormal" renamed to "subnormal" everywhere, in accordance with IEEE 754-2008 standard
+
+2014-04-20 version 1.12
+  * inline implementation of mathematical functions added (vectormath_exp.h vectormath_trig.h
+      vectormath_common.h)
+  * vectormath.h renamed to vectormath_lib.h because a new alternative is added
+  * gather functions with constant indexes
+  * function sign_combine
+  * function pow_const(vector, const int)
+  * function pow_ratio(vector, const int, const int)
+  * functions horizontal_find_first, horizontal_count
+  * function recipr_sqrt removed
+  * functions round_to_int64_limited, truncate_to_int64_limited, to_double_limited
+  * function cubic_root renamed to cbrt
+  * function atan(vector,vector) renamed to atan2
+  * function if_mul
+  * function Vec4i round_to_int(Vec2d)
+  * operator & (float vector, boolean vector)
+  * operator &= (int vector, int vector)
+  * removed constructor Vec128b(int) and Vec256b(int) to avoid implicit conversion
+  * removed signalling nan function
+  * minor improvements in various blend and lookup functions
+
+2014-03-01 version 1.11
+  * fixed missing unsigned operators >>= in vectori256.h
+
+2013-10-04 version 1.10
+  * clear distinction between boolean vectors and integer vectors for the sake of 
+      compatibility with mask registers in forthcoming AVX512 instruction set
+  * added function if_add
+  * tentative support for clang version 3.3 with workaround for bugs
+  * remove ambiguity for builtin m128i operator == in clang compiler. 
+  * problems in clang compiler, bug reports filed at clang
+      (http://llvm.org/bugs/show_bug.cgi?id=17164, 17312)
+  * instrset.h fixes problem with macros named min and max in MS windows.h
+  * workaround problem in MS Visual Studio 11.0. Bug report 735861 and 804274
+  * minor bug fixes
+
+2013-03-31 version 1.03 beta
+  * bug fix for Vec2d cos (Vec2d const & x), VECTORMATH = 1
+
+2012-08-01 version 1.02 beta
+  * added file vector3d.h for 3-dimensional vectors
+  * added file complexvec.h for complex numbers and complex vectors
+  * added file quaternion.h for quaternions
+  * added function change_sign for floating point vectors
+  * added operators +, -, *, / between floating point vectors and scalars to remove 
+      overloading ambiguity
+
+2012-07-08 version 1.01 beta
+  * added file decimal.h with Number <-> string conversion functions: 
+      bin2bcd, bin2ascii, bin2hex_ascii, ascii2bin
+  * added andnot function for boolean vectors
+  * added functions shift_bytes_up and shift_bytes_down
+  * added operators for unsigned integer vector classes: >>=, &, &&, |, ||, ^, ~
+  * inteldispatchpatch.cpp removed. Use asmlib instead (www.agner.org/optimize/#asmlib)
+  * prefix ++ and -- operators now return a reference, postfix operators return a value
+  * various improvements in permute and blend functions
+  * minor improvement in abs function
+  * added version number to VECTORCLASS_H
+
+2012-05-30 version 1.00 beta
+  * first public release
diff --git a/src/vectorclass/dispatch_example.cpp b/src/vectorclass/dispatch_example.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6a53d5cd680423fa13502f7a501a32ce313359d2
--- /dev/null
+++ b/src/vectorclass/dispatch_example.cpp
@@ -0,0 +1,112 @@
+/*************************  dispatch_example.cpp   ****************************
+| Author:        Agner Fog
+| Date created:  2012-05-30
+* Last modified: 2016-04-26
+* Version:       1.22
+| Project:       vector classes
+| Description:
+| Example of CPU dispatching.
+|
+| # Example of compiling this with GCC compiler:
+| # Compile dispatch_example.cpp five times for different instruction sets:
+| g++ -O3 -msse2    -c dispatch_example.cpp -od2.o
+| g++ -O3 -msse4.1  -c dispatch_example.cpp -od5.o
+| g++ -O3 -mavx     -c dispatch_example.cpp -od7.o
+| g++ -O3 -mavx2    -c dispatch_example.cpp -od8.o
+| g++ -O3 -mavx512f -c dispatch_example.cpp -od9.o
+| g++ -O3 -msse2 -otest instrset_detect.cpp d2.o d5.o d7.o d8.o d9.o
+| ./test
+|
+| (c) Copyright 2012-2016 GNU General Public License http://www.gnu.org/licenses
+\*****************************************************************************/
+
+#include <stdio.h>
+
+#define MAX_VECTOR_SIZE 512
+#include "vectorclass.h"
+
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE
+{
+#endif
+
+// define function type (change this to fit your purpose. Should not contain vector types)
+typedef float MyFuncType(float*);
+
+// function prototypes for each version
+MyFuncType myfunc, myfunc_SSE2, myfunc_SSE41, myfunc_AVX, myfunc_AVX2, myfunc_AVX512, myfunc_dispatch;
+
+// Define function name depending on which instruction set we compile for
+#if INSTRSET == 2  // SSE2
+#define FUNCNAME myfunc_SSE2
+#elif INSTRSET == 5  // SSE4.1
+#define FUNCNAME myfunc_SSE41
+#elif INSTRSET == 7  // AVX
+#define FUNCNAME myfunc_AVX
+#elif INSTRSET == 8  // AVX2
+#define FUNCNAME myfunc_AVX2
+#elif INSTRSET == 9  // AVX512
+#define FUNCNAME myfunc_AVX512
+#endif
+
+// specific version of the function. Compile once for each version
+float FUNCNAME(float* f)
+{
+  Vec16f a;                  // vector of 16 floats
+  a.load(f);                 // load array into vector
+  return horizontal_add(a);  // return sum of 16 elements
+}
+
+#if INSTRSET == 2
+// make dispatcher in only the lowest of the compiled versions
+
+// Function pointer initially points to the dispatcher.
+// After first call it points to the selected version
+MyFuncType* myfunc_pointer = &myfunc_dispatch;  // function pointer
+
+// Dispatcher
+float myfunc_dispatch(float* f)
+{
+  int iset = instrset_detect();  // Detect supported instruction set
+  if(iset >= 9)
+    myfunc_pointer = &myfunc_AVX512;  // AVX512 version
+  else if(iset >= 8)
+    myfunc_pointer = &myfunc_AVX2;  // AVX2 version
+  else if(iset >= 7)
+    myfunc_pointer = &myfunc_AVX;  // AVX version
+  else if(iset >= 5)
+    myfunc_pointer = &myfunc_SSE41;  // SSE4.1 version
+  else if(iset >= 2)
+    myfunc_pointer = &myfunc_SSE2;  // SSE2 version
+  else
+    {
+      // Error: lowest instruction set not supported (put your own error message here:)
+      fprintf(stderr, "\nError: Instruction set SSE2 not supported on this computer");
+      return 0.f;
+    }
+  // continue in dispatched version
+  return (*myfunc_pointer)(f);
+}
+
+// Entry to dispatched function call
+inline float myfunc(float* f)
+{
+  return (*myfunc_pointer)(f);  // go to dispatched version
+}
+
+// Example: main calls myfunc
+int main(int argc, char* argv[])
+{
+  float a[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};  // array of 16 floats
+
+  float sum = myfunc(a);  // call function with dispatching
+
+  printf("\nsum = %8.3f \n", sum);  // print result
+  return 0;
+}
+
+#endif  // INSTRSET == 2
+
+#ifdef VCL_NAMESPACE
+}
+#endif
diff --git a/src/vectorclass/instrset.h b/src/vectorclass/instrset.h
new file mode 100644
index 0000000000000000000000000000000000000000..c025df3f3a4f460c5f72434711200bf46be62e76
--- /dev/null
+++ b/src/vectorclass/instrset.h
@@ -0,0 +1,228 @@
+/****************************  instrset.h   **********************************
+ * Author:        Agner Fog
+ * Date created:  2012-05-30
+ * Last modified: 2016-11-25
+ * Version:       1.25
+ * Project:       vector classes
+ * Description:
+ * Header file for various compiler-specific tasks and other common tasks to
+ * vector class library:
+ * > selects the supported instruction set
+ * > defines integer types
+ * > defines compiler version macros
+ * > undefines certain macros that prevent function overloading
+ * > defines template class to represent compile-time integer constant
+ * > defines template for compile-time error messages
+ *
+ * (c) Copyright 2012-2016 GNU General Public License www.gnu.org/licenses
+ ******************************************************************************/
+
+#ifndef INSTRSET_H
+#define INSTRSET_H 125
+
+// Detect 64 bit mode
+#if(defined(_M_AMD64) || defined(_M_X64) || defined(__amd64)) && !defined(__x86_64__)
+#define __x86_64__ 1  // There are many different macros for this, decide on only one
+#endif
+
+// Find instruction set from compiler macros if INSTRSET not defined
+// Note: Most of these macros are not defined in Microsoft compilers
+#ifndef INSTRSET
+#if defined(__AVX512F__) || defined(__AVX512__)
+#define INSTRSET 9
+#elif defined(__AVX2__)
+#define INSTRSET 8
+#elif defined(__AVX__)
+#define INSTRSET 7
+#elif defined(__SSE4_2__)
+#define INSTRSET 6
+#elif defined(__SSE4_1__)
+#define INSTRSET 5
+#elif defined(__SSSE3__)
+#define INSTRSET 4
+#elif defined(__SSE3__)
+#define INSTRSET 3
+#elif defined(__SSE2__) || defined(__x86_64__)
+#define INSTRSET 2
+#elif defined(__SSE__)
+#define INSTRSET 1
+#elif defined(_M_IX86_FP)  // Defined in MS compiler. 1: SSE, 2: SSE2
+#define INSTRSET _M_IX86_FP
+#else
+#define INSTRSET 0
+#endif  // instruction set defines
+#endif  // INSTRSET
+
+// Include the appropriate header file for intrinsic functions
+#if INSTRSET > 7  // AVX2 and later
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
+#include <x86intrin.h>  // x86intrin.h includes header files for whatever instruction
+                        // sets are specified on the compiler command line, such as:
+                        // xopintrin.h, fma4intrin.h
+#else
+#include <immintrin.h>  // MS version of immintrin.h covers AVX, AVX2 and FMA3
+#endif                  // __GNUC__
+#elif INSTRSET == 7
+#include <immintrin.h>  // AVX
+#elif INSTRSET == 6
+#include <nmmintrin.h>  // SSE4.2
+#elif INSTRSET == 5
+#include <smmintrin.h>  // SSE4.1
+#elif INSTRSET == 4
+#include <tmmintrin.h>  // SSSE3
+#elif INSTRSET == 3
+#include <pmmintrin.h>  // SSE3
+#elif INSTRSET == 2
+#include <emmintrin.h>  // SSE2
+#elif INSTRSET == 1
+#include <xmmintrin.h>  // SSE
+#endif                  // INSTRSET
+
+#if INSTRSET >= 8 && !defined(__FMA__)
+// Assume that all processors that have AVX2 also have FMA3
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
+// Prevent error message in g++ when using FMA intrinsics with avx2:
+#pragma message "It is recommended to specify also option -mfma when using -mavx2 or higher"
+#else
+#define __FMA__ 1
+#endif
+#endif
+
+// AMD  instruction sets
+#if defined(__XOP__) || defined(__FMA4__)
+#ifdef __GNUC__
+#include <x86intrin.h>  // AMD XOP (Gnu)
+#else
+#include <ammintrin.h>    // AMD XOP (Microsoft)
+#endif                    //  __GNUC__
+#elif defined(__SSE4A__)  // AMD SSE4A
+#include <ammintrin.h>
+#endif  // __XOP__
+
+// FMA3 instruction set
+#if defined(__FMA__) && (defined(__GNUC__) || defined(__clang__)) && !defined(__INTEL_COMPILER)
+#include <fmaintrin.h>
+#endif  // __FMA__
+
+// FMA4 instruction set
+#if defined(__FMA4__) && (defined(__GNUC__) || defined(__clang__))
+#include <fma4intrin.h>  // must have both x86intrin.h and fma4intrin.h, don't know why
+#endif                   // __FMA4__
+
+// Define integer types with known size
+#if defined(__GNUC__) || defined(__clang__) || (defined(_MSC_VER) && _MSC_VER >= 1600)
+// Compilers supporting C99 or C++0x have stdint.h defining these integer types
+#include <stdint.h>
+#elif defined(_MSC_VER)
+// Older Microsoft compilers have their own definitions
+typedef signed __int8 int8_t;
+typedef unsigned __int8 uint8_t;
+typedef signed __int16 int16_t;
+typedef unsigned __int16 uint16_t;
+typedef signed __int32 int32_t;
+typedef unsigned __int32 uint32_t;
+typedef signed __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+#ifndef _INTPTR_T_DEFINED
+#define _INTPTR_T_DEFINED
+#ifdef __x86_64__
+typedef int64_t intptr_t;
+#else
+typedef int32_t intptr_t;
+#endif
+#endif
+#else
+// This works with most compilers
+typedef signed char int8_t;
+typedef unsigned char uint8_t;
+typedef signed short int int16_t;
+typedef unsigned short int uint16_t;
+typedef signed int int32_t;
+typedef unsigned int uint32_t;
+typedef long long int64_t;
+typedef unsigned long long uint64_t;
+#ifdef __x86_64__
+typedef int64_t intptr_t;
+#else
+typedef int32_t intptr_t;
+#endif
+#endif
+
+#include <stdlib.h>  // define abs(int)
+
+#ifdef _MSC_VER      // Microsoft compiler or compatible Intel compiler
+#include <intrin.h>  // define _BitScanReverse(int), __cpuid(int[4],int), _xgetbv(int)
+#endif               // _MSC_VER
+
+// functions in instrset_detect.cpp
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE
+{
+#endif
+int instrset_detect(void);  // tells which instruction sets are supported
+bool hasFMA3(void);         // true if FMA3 instructions supported
+bool hasFMA4(void);         // true if FMA4 instructions supported
+bool hasXOP(void);          // true if XOP  instructions supported
+bool hasAVX512ER(void);     // true if AVX512ER instructions supported
+#ifdef VCL_NAMESPACE
+}
+#endif
+
+// GCC version
+#if defined(__GNUC__) && !defined(GCC_VERSION) && !defined(__clang__)
+#define GCC_VERSION ((__GNUC__)*10000 + (__GNUC_MINOR__)*100 + (__GNUC_PATCHLEVEL__))
+#endif
+
+// Clang version
+#if defined(__clang__)
+#define CLANG_VERSION ((__clang_major__)*10000 + (__clang_minor__)*100 + (__clang_patchlevel__))
+// Problem: The version number is not consistent across platforms
+// http://llvm.org/bugs/show_bug.cgi?id=12643
+// Apple bug 18746972
+#endif
+
+// Fix problem with non-overloadable macros named min and max in WinDef.h
+#ifdef _MSC_VER
+#if defined(_WINDEF_) && defined(min) && defined(max)
+#undef min
+#undef max
+#endif
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#endif
+
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE
+{
+#endif
+// Template class to represent compile-time integer constant
+template <int32_t n>
+class Const_int_t
+{
+};  // represent compile-time signed integer constant
+template <uint32_t n>
+class Const_uint_t
+{
+};                                         // represent compile-time unsigned integer constant
+#define const_int(n) (Const_int_t<n>())    // n must be compile-time integer constant
+#define const_uint(n) (Const_uint_t<n>())  // n must be compile-time unsigned integer constant
+
+// Template for compile-time error messages
+template <bool>
+class Static_error_check
+{
+ public:
+  Static_error_check(){};
+};
+template <>
+class Static_error_check<false>
+{  // generate compile-time error if false
+ private:
+  Static_error_check(){};
+};
+#ifdef VCL_NAMESPACE
+}
+#endif
+
+#endif  // INSTRSET_H
diff --git a/src/vectorclass/instrset_detect.cpp b/src/vectorclass/instrset_detect.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a0cf8dfd8a50e0d4e93e519194e52dce43275c97
--- /dev/null
+++ b/src/vectorclass/instrset_detect.cpp
@@ -0,0 +1,209 @@
+/**************************  instrset_detect.cpp   ****************************
+* Author:        Agner Fog
+* Date created:  2012-05-30
+* Last modified: 2017-05-02
+* Version:       1.28
+* Project:       vector classes
+* Description:
+* Functions for checking which instruction sets are supported.
+*
+* (c) Copyright 2012-2017 GNU General Public License http://www.gnu.org/licenses
+\*****************************************************************************/
+
+#include "instrset.h"
+
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE
+{
+#endif
+
+// Define interface to cpuid instruction.
+// input:  eax = functionnumber, ecx = 0
+// output: eax = output[0], ebx = output[1], ecx = output[2], edx = output[3]
+static inline void cpuid(int output[4], int functionnumber)
+{
+#if defined(__GNUC__) || defined(__clang__)  // use inline assembly, Gnu/AT&T syntax
+
+  int a, b, c, d;
+  __asm("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "a"(functionnumber), "c"(0) :);
+  output[0] = a;
+  output[1] = b;
+  output[2] = c;
+  output[3] = d;
+
+#elif defined(_MSC_VER) || defined(__INTEL_COMPILER)  // Microsoft or Intel compiler, intrin.h included
+
+  __cpuidex(output, functionnumber, 0);  // intrinsic function for CPUID
+
+#else  // unknown platform. try inline assembly with masm/intel syntax
+
+  __asm {
+        mov eax, functionnumber
+        xor ecx, ecx
+        cpuid;
+        mov esi, output
+        mov [esi],    eax
+        mov [esi+4],  ebx
+        mov [esi+8],  ecx
+        mov [esi+12], edx
+  }
+
+#endif
+}
+
+// Define interface to xgetbv instruction
+static inline int64_t xgetbv(int ctr)
+{
+#if(defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || \
+    (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1200)  // Microsoft or Intel compiler supporting _xgetbv intrinsic
+
+  return _xgetbv(ctr);  // intrinsic function for XGETBV
+
+#elif defined(__GNUC__)  // use inline assembly, Gnu/AT&T syntax
+
+  uint32_t a, d;
+  __asm("xgetbv" : "=a"(a), "=d"(d) : "c"(ctr) :);
+  return a | (uint64_t(d) << 32);
+
+#else  // #elif defined (_WIN32)                           // other compiler. try inline assembly with masm/intel/MS syntax
+
+  uint32_t a, d;
+  __asm {
+        mov ecx, ctr
+        _emit 0x0f
+        _emit 0x01
+        _emit 0xd0 ;  // xgetbv
+        mov a, eax
+        mov d, edx
+  }
+  return a | (uint64_t(d) << 32);
+
+#endif
+}
+
+/* find supported instruction set
+    return value:
+    0           = 80386 instruction set
+    1  or above = SSE (XMM) supported by CPU (not testing for O.S. support)
+    2  or above = SSE2
+    3  or above = SSE3
+    4  or above = Supplementary SSE3 (SSSE3)
+    5  or above = SSE4.1
+    6  or above = SSE4.2
+    7  or above = AVX supported by CPU and operating system
+    8  or above = AVX2
+    9  or above = AVX512F
+    10 or above = AVX512VL
+    11 or above = AVX512BW, AVX512DQ
+*/
+int instrset_detect(void)
+{
+  static int iset = -1;  // remember value for next call
+  if(iset >= 0)
+    {
+      return iset;  // called before
+    }
+  iset        = 0;             // default value
+  int abcd[4] = {0, 0, 0, 0};  // cpuid results
+  cpuid(abcd, 0);              // call cpuid function 0
+  if(abcd[0] == 0)
+    return iset;   // no further cpuid function supported
+  cpuid(abcd, 1);  // call cpuid function 1 for feature flags
+  if((abcd[3] & (1 << 0)) == 0)
+    return iset;  // no floating point
+  if((abcd[3] & (1 << 23)) == 0)
+    return iset;  // no MMX
+  if((abcd[3] & (1 << 15)) == 0)
+    return iset;  // no conditional move
+  if((abcd[3] & (1 << 24)) == 0)
+    return iset;  // no FXSAVE
+  if((abcd[3] & (1 << 25)) == 0)
+    return iset;  // no SSE
+  iset = 1;       // 1: SSE supported
+  if((abcd[3] & (1 << 26)) == 0)
+    return iset;  // no SSE2
+  iset = 2;       // 2: SSE2 supported
+  if((abcd[2] & (1 << 0)) == 0)
+    return iset;  // no SSE3
+  iset = 3;       // 3: SSE3 supported
+  if((abcd[2] & (1 << 9)) == 0)
+    return iset;  // no SSSE3
+  iset = 4;       // 4: SSSE3 supported
+  if((abcd[2] & (1 << 19)) == 0)
+    return iset;  // no SSE4.1
+  iset = 5;       // 5: SSE4.1 supported
+  if((abcd[2] & (1 << 23)) == 0)
+    return iset;  // no POPCNT
+  if((abcd[2] & (1 << 20)) == 0)
+    return iset;  // no SSE4.2
+  iset = 6;       // 6: SSE4.2 supported
+  if((abcd[2] & (1 << 27)) == 0)
+    return iset;  // no OSXSAVE
+  if((xgetbv(0) & 6) != 6)
+    return iset;  // AVX not enabled in O.S.
+  if((abcd[2] & (1 << 28)) == 0)
+    return iset;   // no AVX
+  iset = 7;        // 7: AVX supported
+  cpuid(abcd, 7);  // call cpuid leaf 7 for feature flags
+  if((abcd[1] & (1 << 5)) == 0)
+    return iset;  // no AVX2
+  iset = 8;
+  if((abcd[1] & (1 << 16)) == 0)
+    return iset;     // no AVX512
+  cpuid(abcd, 0xD);  // call cpuid leaf 0xD for feature flags
+  if((abcd[0] & 0x60) != 0x60)
+    return iset;  // no AVX512
+  iset = 9;
+  cpuid(abcd, 7);  // call cpuid leaf 7 for feature flags
+  if((abcd[1] & (1 << 31)) == 0)
+    return iset;  // no AVX512VL
+  iset = 10;
+  if((abcd[1] & 0x40020000) != 0x40020000)
+    return iset;  // no AVX512BW, AVX512DQ
+  iset = 11;
+  return iset;
+}
+
+// detect if CPU supports the FMA3 instruction set
+bool hasFMA3(void)
+{
+  if(instrset_detect() < 7)
+    return false;                       // must have AVX
+  int abcd[4];                          // cpuid results
+  cpuid(abcd, 1);                       // call cpuid function 1
+  return ((abcd[2] & (1 << 12)) != 0);  // ecx bit 12 indicates FMA3
+}
+
+// detect if CPU supports the FMA4 instruction set
+bool hasFMA4(void)
+{
+  if(instrset_detect() < 7)
+    return false;                       // must have AVX
+  int abcd[4];                          // cpuid results
+  cpuid(abcd, 0x80000001);              // call cpuid function 0x80000001
+  return ((abcd[2] & (1 << 16)) != 0);  // ecx bit 16 indicates FMA4
+}
+
+// detect if CPU supports the XOP instruction set
+bool hasXOP(void)
+{
+  if(instrset_detect() < 7)
+    return false;                       // must have AVX
+  int abcd[4];                          // cpuid results
+  cpuid(abcd, 0x80000001);              // call cpuid function 0x80000001
+  return ((abcd[2] & (1 << 11)) != 0);  // ecx bit 11 indicates XOP
+}
+
+// detect if CPU supports the AVX512ER instruction set
+bool hasAVX512ER(void)
+{
+  if(instrset_detect() < 9)
+    return false;                       // must have AVX512F
+  int abcd[4];                          // cpuid results
+  cpuid(abcd, 7);                       // call cpuid function 7
+  return ((abcd[1] & (1 << 27)) != 0);  // ebx bit 27 indicates AVX512ER
+}
+
+#ifdef VCL_NAMESPACE
+}
+#endif
diff --git a/src/vectorclass/license.txt b/src/vectorclass/license.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5f5801aea1ea6595db4dbb66f3d936eaddf14bf4
--- /dev/null
+++ b/src/vectorclass/license.txt
@@ -0,0 +1,617 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+
+  "This License" refers to version 3 of the GNU General Public License.
+
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+  1. Source Code.
+
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+  2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+  4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+  5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+  6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+  7. Additional Terms.
+
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+  8. Termination.
+
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+  9. Acceptance Not Required for Having Copies.
+
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+  10. Automatic Licensing of Downstream Recipients.
+
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+  11. Patents.
+
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+  12. No Surrender of Others' Freedom.
+
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+  13. Use with the GNU Affero General Public License.
+
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+  14. Revised Versions of this License.
+
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+  If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+  15. Disclaimer of Warranty.
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
diff --git a/src/vectorclass/vectorclass.h b/src/vectorclass/vectorclass.h
new file mode 100644
index 0000000000000000000000000000000000000000..7ef293f6452980b36a65e7584e1ea4518e99354e
--- /dev/null
+++ b/src/vectorclass/vectorclass.h
@@ -0,0 +1,69 @@
+/****************************  vectorclass.h   ********************************
+ * Author:        Agner Fog
+ * Date created:  2012-05-30
+ * Last modified: 2017-05-10
+ * Version:       1.29
+ * Project:       vector classes
+ * Description:
+ * Header file defining vector classes as interface to intrinsic functions
+ * in x86 microprocessors with SSE2 and later instruction sets up to AVX512.
+ *
+ * Instructions:
+ * Use Gnu, Clang, Intel or Microsoft C++ compiler. Compile for the desired
+ * instruction set, which must be at least SSE2. Specify the supported
+ * instruction set by a command line define, e.g. __SSE4_1__ if the
+ * compiler does not automatically do so.
+ *
+ * Each vector object is represented internally in the CPU as a vector
+ * register with 128, 256 or 512 bits.
+ *
+ * This header file includes the appropriate header files depending on the
+ * supported instruction set
+ *
+ * For detailed instructions, see VectorClass.pdf
+ *
+ * (c) Copyright 2012-2017 GNU General Public License www.gnu.org/licenses
+ ******************************************************************************/
+#ifndef VECTORCLASS_H
+#define VECTORCLASS_H 129
+
+// Maximum vector size, bits. Allowed values are 128, 256, 512
+#ifndef MAX_VECTOR_SIZE
+#define MAX_VECTOR_SIZE 256
+#endif
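+
+// Note added for illustration (not part of the original header): the default above
+// can be overridden by defining MAX_VECTOR_SIZE before this header is included, e.g.
+//
+//   #define MAX_VECTOR_SIZE 512   // also enable the 512-bit classes
+//   #include "vectorclass.h"
+//
+// Allowed values are 128, 256 and 512, as stated above.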
+
+#include "instrset.h"  // Select supported instruction set
+
+#if INSTRSET < 2  // SSE2 required
+#error Please compile for the SSE2 instruction set or higher
+#else
+
+#include "vectorf128.h"  // 128-bit floating point vectors
+#include "vectori128.h"  // 128-bit integer vectors
+
+#if MAX_VECTOR_SIZE >= 256
+#if INSTRSET >= 8
+#include "vectori256.h"  // 256-bit integer vectors, requires AVX2 instruction set
+#else
+#include "vectori256e.h"  // 256-bit integer vectors, emulated
+#endif                    // INSTRSET >= 8
+#if INSTRSET >= 7
+#include "vectorf256.h"  // 256-bit floating point vectors, requires AVX instruction set
+#else
+#include "vectorf256e.h"  // 256-bit floating point vectors, emulated
+#endif                    //  INSTRSET >= 7
+#endif                    //  MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+#if INSTRSET >= 9
+#include "vectorf512.h"  // 512-bit floating point vectors, requires AVX512 instruction set
+#include "vectori512.h"  // 512-bit integer vectors, requires AVX512 instruction set
+#else
+#include "vectorf512e.h"  // 512-bit floating point vectors, emulated
+#include "vectori512e.h"  // 512-bit integer vectors, emulated
+#endif                    //  INSTRSET >= 9
+#endif                    //  MAX_VECTOR_SIZE >= 512
+
+#endif  // INSTRSET >= 2
+
+#endif  // VECTORCLASS_H
diff --git a/src/vectorclass/vectorclass.pdf b/src/vectorclass/vectorclass.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..96753304c590005c78488cc05101c3f9ada82069
Binary files /dev/null and b/src/vectorclass/vectorclass.pdf differ
diff --git a/src/vectorclass/vectorf128.h b/src/vectorclass/vectorf128.h
new file mode 100644
index 0000000000000000000000000000000000000000..1592a091a7eecca31e526ecad5081c98becb9de3
--- /dev/null
+++ b/src/vectorclass/vectorf128.h
@@ -0,0 +1,2758 @@
+/****************************  vectorf128.h   *******************************
+ * Author:        Agner Fog
+ * Date created:  2012-05-30
+ * Last modified: 2017-05-10
+ * Version:       1.29
+ * Project:       vector classes
+ * Description:
+ * Header file defining floating point vector classes as interface to
+ * intrinsic functions in x86 microprocessors with SSE2 and later instruction
+ * sets up to AVX.
+ *
+ * Instructions:
+ * Use Gnu, Intel or Microsoft C++ compiler. Compile for the desired
+ * instruction set, which must be at least SSE2. Specify the supported
+ * instruction set by a command line define, e.g. __SSE4_1__ if the
+ * compiler does not automatically do so.
+ *
+ * The following vector classes are defined here:
+ * Vec4f     Vector of 4 single precision floating point numbers
+ * Vec4fb    Vector of 4 Booleans for use with Vec4f
+ * Vec2d     Vector of 2 double precision floating point numbers
+ * Vec2db    Vector of 2 Booleans for use with Vec2d
+ *
+ * Each vector object is represented internally in the CPU as a 128-bit register.
+ * This header file defines operators and functions for these vectors.
+ *
+ * For example:
+ * Vec2d a(1.0, 2.0), b(3.0, 4.0), c;
+ * c = a + b;     // now c contains (4.0, 6.0)
+ *
+ * For detailed instructions, see VectorClass.pdf
+ *
+ * (c) Copyright 2012-2017 GNU General Public License http://www.gnu.org/licenses
+ *****************************************************************************/
+#ifndef VECTORF128_H
+#define VECTORF128_H
+
+#if defined _MSC_VER && _MSC_VER >= 1800
+// solve problem with ambiguous overloading of pow function in Microsoft math.h:
+// make sure math.h is included first rather than last
+#include <math.h>
+#endif
+
+#include "vectori128.h"  // Define integer vectors
+
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE
+{
+#endif
+
+/*****************************************************************************
+ *
+ *          select functions
+ *
+ *****************************************************************************/
+// Select between two __m128 sources, element by element. Used in various functions
+// and operators. Corresponds to this pseudocode:
+// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either 0 (false) or 0xFFFFFFFF (true). No other values are
+// allowed. The implementation depends on the instruction set:
+// If SSE4.1 is supported then only bit 31 in each dword of s is checked,
+// otherwise all bits in s are used.
+static inline __m128 selectf(__m128 const& s, __m128 const& a, __m128 const& b)
+{
+#if INSTRSET >= 5  // SSE4.1 supported
+  return _mm_blendv_ps(b, a, s);
+#else
+  return _mm_or_ps(_mm_and_ps(s, a), _mm_andnot_ps(s, b));
+#endif
+}
+
+// Same, with two __m128d sources. Used in various functions
+// and operators. Corresponds to this pseudocode:
+// for (int i = 0; i < 2; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true). No other
+// values are allowed. The implementation depends on the instruction set:
+// If SSE4.1 is supported then only bit 63 in each qword of s is checked,
+// otherwise all bits in s are used.
+static inline __m128d selectd(__m128d const& s, __m128d const& a, __m128d const& b)
+{
+#if INSTRSET >= 5  // SSE4.1 supported
+  return _mm_blendv_pd(b, a, s);
+#else
+  return _mm_or_pd(_mm_and_pd(s, a), _mm_andnot_pd(s, b));
+#endif
+}
+
+/*****************************************************************************
+ *
+ *          Vec4fb: Vector of 4 Booleans for use with Vec4f
+ *
+ *****************************************************************************/
+
+class Vec4fb
+{
+ protected:
+  __m128 xmm;  // Float vector
+ public:
+  // Default constructor:
+  Vec4fb() {}
+  // Constructor to build from all elements:
+  Vec4fb(bool b0, bool b1, bool b2, bool b3) { xmm = _mm_castsi128_ps(_mm_setr_epi32(-(int)b0, -(int)b1, -(int)b2, -(int)b3)); }
+  // Constructor to convert from type __m128 used in intrinsics:
+  Vec4fb(__m128 const& x) { xmm = x; }
+  // Assignment operator to convert from type __m128 used in intrinsics:
+  Vec4fb& operator=(__m128 const& x)
+  {
+    xmm = x;
+    return *this;
+  }
+  // Constructor to broadcast scalar value:
+  Vec4fb(bool b) { xmm = _mm_castsi128_ps(_mm_set1_epi32(-int32_t(b))); }
+  // Assignment operator to broadcast scalar value:
+  Vec4fb& operator=(bool b)
+  {
+    *this = Vec4fb(b);
+    return *this;
+  }
+
+ private:  // Prevent constructing from int, etc.
+  Vec4fb(int b);
+  Vec4fb& operator=(int x);
+
+ public:
+  // Constructor to convert from type Vec4ib used as Boolean for integer vectors
+  Vec4fb(Vec4ib const& x) { xmm = _mm_castsi128_ps(x); }
+  // Assignment operator to convert from type Vec4ib used as Boolean for integer vectors
+  Vec4fb& operator=(Vec4ib const& x)
+  {
+    xmm = _mm_castsi128_ps(x);
+    return *this;
+  }
+  // Type cast operator to convert to __m128 used in intrinsics
+  operator __m128() const { return xmm; }
+  /* Clang problem:
+  The Clang compiler treats the intrinsic vector types __m128, __m128i, and __m128f as identical.
+  I have reported this problem in 2013 but it is still not fixed in 2017!
+  See the bug report at http://llvm.org/bugs/show_bug.cgi?id=17164
+  Additional problem: The version number is not consistent across platforms. The Apple build has
+  different version numbers. We have to rely on __apple_build_version__ on the Mac platform:
+  http://llvm.org/bugs/show_bug.cgi?id=12643
+  I have received reports that there was no aliasing of vector types on __apple_build_version__ = 6020053
+  but apparently the problem has come back. The aliasing of vector types has been reported on
+  __apple_build_version__ = 8000042
+  We have to make switches here when - hopefully - the error some day has been fixed.
+  We need different version checks with and without __apple_build_version__
+  */
+
+//#if (defined (__clang__) && !defined(__apple_build_version__)) || (defined(__apple_build_version__) && __apple_build_version__ <
+// 6020000)
+#if defined(__clang__) /* && CLANG_VERSION < xxxxx */ || defined(__apple_build_version__)
+#define FIX_CLANG_VECTOR_ALIAS_AMBIGUITY
+#else
+  // Type cast operator to convert to type Vec4ib used as Boolean for integer vectors
+  operator Vec4ib() const { return _mm_castps_si128(xmm); }
+#endif
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec4fb const& insert(uint32_t index, bool value)
+  {
+    static const int32_t maskl[8] = {0, 0, 0, 0, -1, 0, 0, 0};
+    __m128 mask                   = _mm_loadu_ps((float const*)(maskl + 4 - (index & 3)));  // mask with FFFFFFFF at index position
+    if(value)
+      {
+        xmm = _mm_or_ps(xmm, mask);
+      }
+    else
+      {
+        xmm = _mm_andnot_ps(mask, xmm);
+      }
+    return *this;
+  }
+  // Member function extract a single element from vector
+  bool extract(uint32_t index) const
+  {
+    // return Vec4ib(*this).extract(index);
+    return Vec4ib(_mm_castps_si128(xmm)).extract(index);
+  }
+  // Extract a single element. Operator [] can only read an element, not write.
+  bool operator[](uint32_t index) const { return extract(index); }
+  static int size() { return 4; }
+};
+
+/*****************************************************************************
+ *
+ *          Operators for Vec4fb
+ *
+ *****************************************************************************/
+
+// vector operator & : bitwise and
+static inline Vec4fb operator&(Vec4fb const& a, Vec4fb const& b) { return _mm_and_ps(a, b); }
+static inline Vec4fb operator&&(Vec4fb const& a, Vec4fb const& b) { return a & b; }
+
+// vector operator &= : bitwise and
+static inline Vec4fb& operator&=(Vec4fb& a, Vec4fb const& b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec4fb operator|(Vec4fb const& a, Vec4fb const& b) { return _mm_or_ps(a, b); }
+static inline Vec4fb operator||(Vec4fb const& a, Vec4fb const& b) { return a | b; }
+
+// vector operator |= : bitwise or
+static inline Vec4fb& operator|=(Vec4fb& a, Vec4fb const& b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec4fb operator^(Vec4fb const& a, Vec4fb const& b) { return _mm_xor_ps(a, b); }
+
+// vector operator ^= : bitwise xor
+static inline Vec4fb& operator^=(Vec4fb& a, Vec4fb const& b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec4fb operator~(Vec4fb const& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(-1))); }
+
+// vector operator ! : logical not
+// (operator ! is less efficient than operator ~. Use only where not
+// all bits in an element are the same)
+static inline Vec4fb operator!(Vec4fb const& a) { return Vec4fb(!Vec4ib(a)); }
+
+// Functions for Vec4fb
+
+// andnot: a & ~ b
+static inline Vec4fb andnot(Vec4fb const& a, Vec4fb const& b) { return _mm_andnot_ps(b, a); }
+
+/*****************************************************************************
+ *
+ *          Horizontal Boolean functions
+ *
+ *****************************************************************************/
+
+// horizontal_and. Returns true if all bits are 1
+static inline bool horizontal_and(Vec4fb const& a)
+{
+  return _mm_movemask_ps(a) == 0x0F;
+  // return horizontal_and(Vec128b(_mm_castps_si128(a)));
+}
+
+// horizontal_or. Returns true if at least one bit is 1
+static inline bool horizontal_or(Vec4fb const& a)
+{
+  return _mm_movemask_ps(a) != 0;
+  // return horizontal_or(Vec128b(_mm_castps_si128(a)));
+}
+
+/*****************************************************************************
+ *
+ *          Vec2db: Vector of 2 Booleans for use with Vec2d
+ *
+ *****************************************************************************/
+
+class Vec2db
+{
+ protected:
+  __m128d xmm;  // Double vector
+ public:
+  // Default constructor:
+  Vec2db() {}
+  // Constructor to build from all elements:
+  Vec2db(bool b0, bool b1) { xmm = _mm_castsi128_pd(_mm_setr_epi32(-(int)b0, -(int)b0, -(int)b1, -(int)b1)); }
+  // Constructor to convert from type __m128d used in intrinsics:
+  Vec2db(__m128d const& x) { xmm = x; }
+  // Assignment operator to convert from type __m128d used in intrinsics:
+  Vec2db& operator=(__m128d const& x)
+  {
+    xmm = x;
+    return *this;
+  }
+  // Constructor to broadcast scalar value:
+  Vec2db(bool b) { xmm = _mm_castsi128_pd(_mm_set1_epi32(-int32_t(b))); }
+  // Assignment operator to broadcast scalar value:
+  Vec2db& operator=(bool b)
+  {
+    *this = Vec2db(b);
+    return *this;
+  }
+
+ private:  // Prevent constructing from int, etc.
+  Vec2db(int b);
+  Vec2db& operator=(int x);
+
+ public:
+  // Constructor to convert from type Vec2qb used as Boolean for integer vectors
+  Vec2db(Vec2qb const& x) { xmm = _mm_castsi128_pd(x); }
+  // Assignment operator to convert from type Vec2qb used as Boolean for integer vectors
+  Vec2db& operator=(Vec2qb const& x)
+  {
+    xmm = _mm_castsi128_pd(x);
+    return *this;
+  }
+  // Type cast operator to convert to __m128d used in intrinsics
+  operator __m128d() const { return xmm; }
+#ifndef FIX_CLANG_VECTOR_ALIAS_AMBIGUITY
+  // Type cast operator to convert to type Vec2qb used as Boolean for integer vectors
+  operator Vec2qb() const { return _mm_castpd_si128(xmm); }
+#endif
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec2db const& insert(uint32_t index, bool value)
+  {
+    static const int32_t maskl[8] = {0, 0, 0, 0, -1, -1, 0, 0};
+    __m128 mask = _mm_loadu_ps((float const*)(maskl + 4 - (index & 1) * 2));  // mask with FFFFFFFFFFFFFFFF at index position
+    if(value)
+      {
+        xmm = _mm_or_pd(xmm, _mm_castps_pd(mask));
+      }
+    else
+      {
+        xmm = _mm_andnot_pd(_mm_castps_pd(mask), xmm);
+      }
+    return *this;
+  }
+  // Member function extract a single element from vector
+  bool extract(uint32_t index) const { return Vec2qb(*this).extract(index); }
+  // Extract a single element. Operator [] can only read an element, not write.
+  bool operator[](uint32_t index) const { return extract(index); }
+  static int size() { return 2; }
+};
+
+/*****************************************************************************
+ *
+ *          Operators for Vec2db
+ *
+ *****************************************************************************/
+
+// vector operator & : bitwise and
+static inline Vec2db operator&(Vec2db const& a, Vec2db const& b) { return _mm_and_pd(a, b); }
+static inline Vec2db operator&&(Vec2db const& a, Vec2db const& b) { return a & b; }
+
+// vector operator &= : bitwise and
+static inline Vec2db& operator&=(Vec2db& a, Vec2db const& b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec2db operator|(Vec2db const& a, Vec2db const& b) { return _mm_or_pd(a, b); }
+static inline Vec2db operator||(Vec2db const& a, Vec2db const& b) { return a | b; }
+
+// vector operator |= : bitwise or
+static inline Vec2db& operator|=(Vec2db& a, Vec2db const& b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec2db operator^(Vec2db const& a, Vec2db const& b) { return _mm_xor_pd(a, b); }
+
+// vector operator ^= : bitwise xor
+static inline Vec2db& operator^=(Vec2db& a, Vec2db const& b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec2db operator~(Vec2db const& a) { return _mm_xor_pd(a, _mm_castsi128_pd(_mm_set1_epi32(-1))); }
+
+// vector operator ! : logical not
+// (operator ! is less efficient than operator ~. Use only where not
+// all bits in an element are the same)
+static inline Vec2db operator!(Vec2db const& a) { return Vec2db(!Vec2qb(a)); }
+
+// Functions for Vec2db
+
+// andnot: a & ~ b
+static inline Vec2db andnot(Vec2db const& a, Vec2db const& b) { return _mm_andnot_pd(b, a); }
+
+/*****************************************************************************
+ *
+ *          Horizontal Boolean functions
+ *
+ *****************************************************************************/
+
+// horizontal_and. Returns true if all bits are 1
+static inline bool horizontal_and(Vec2db const& a)
+{
+  return _mm_movemask_pd(a) == 3;
+  // return horizontal_and(Vec128b(_mm_castpd_si128(a)));
+}
+
+// horizontal_or. Returns true if at least one bit is 1
+static inline bool horizontal_or(Vec2db const& a)
+{
+  return _mm_movemask_pd(a) != 0;
+  // return horizontal_or(Vec128b(_mm_castpd_si128(a)));
+}
+
+/*****************************************************************************
+ *
+ *          Vec4f: Vector of 4 single precision floating point values
+ *
+ *****************************************************************************/
+
+class Vec4f
+{
+ protected:
+  __m128 xmm;  // Float vector
+ public:
+  // Default constructor:
+  Vec4f() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec4f(float f) { xmm = _mm_set1_ps(f); }
+  // Constructor to build from all elements:
+  Vec4f(float f0, float f1, float f2, float f3) { xmm = _mm_setr_ps(f0, f1, f2, f3); }
+  // Constructor to convert from type __m128 used in intrinsics:
+  Vec4f(__m128 const& x) { xmm = x; }
+  // Assignment operator to convert from type __m128 used in intrinsics:
+  Vec4f& operator=(__m128 const& x)
+  {
+    xmm = x;
+    return *this;
+  }
+  // Type cast operator to convert to __m128 used in intrinsics
+  operator __m128() const { return xmm; }
+  // Member function to load from array (unaligned)
+  Vec4f& load(float const* p)
+  {
+    xmm = _mm_loadu_ps(p);
+    return *this;
+  }
+  // Member function to load from array, aligned by 16
+  // "load_a" is faster than "load" on older Intel processors (Pentium 4, Pentium M, Core 1,
+  // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA.
+  // You may use load_a instead of load if you are certain that p points to an address
+  // divisible by 16.
+  Vec4f& load_a(float const* p)
+  {
+    xmm = _mm_load_ps(p);
+    return *this;
+  }
+  // Member function to store into array (unaligned)
+  void store(float* p) const { _mm_storeu_ps(p, xmm); }
+  // Member function to store into array, aligned by 16
+  // "store_a" is faster than "store" on older Intel processors (Pentium 4, Pentium M, Core 1,
+  // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA.
+  // You may use store_a instead of store if you are certain that p points to an address
+  // divisible by 16.
+  void store_a(float* p) const { _mm_store_ps(p, xmm); }
+  // Partial load. Load n elements and set the rest to 0
+  Vec4f& load_partial(int n, float const* p)
+  {
+    __m128 t1, t2;
+    switch(n)
+      {
+        case 1:
+          xmm = _mm_load_ss(p);
+          break;
+        case 2:
+          xmm = _mm_castpd_ps(_mm_load_sd((double const*)p));
+          break;
+        case 3:
+          t1  = _mm_castpd_ps(_mm_load_sd((double const*)p));
+          t2  = _mm_load_ss(p + 2);
+          xmm = _mm_movelh_ps(t1, t2);
+          break;
+        case 4:
+          load(p);
+          break;
+        default:
+          xmm = _mm_setzero_ps();
+      }
+    return *this;
+  }
+  // Partial store. Store n elements
+  void store_partial(int n, float* p) const
+  {
+    __m128 t1;
+    switch(n)
+      {
+        case 1:
+          _mm_store_ss(p, xmm);
+          break;
+        case 2:
+          _mm_store_sd((double*)p, _mm_castps_pd(xmm));
+          break;
+        case 3:
+          _mm_store_sd((double*)p, _mm_castps_pd(xmm));
+          t1 = _mm_movehl_ps(xmm, xmm);
+          _mm_store_ss(p + 2, t1);
+          break;
+        case 4:
+          store(p);
+          break;
+        default:;
+      }
+  }
+  // cut off vector to n elements. The last 4-n elements are set to zero
+  Vec4f& cutoff(int n)
+  {
+    if(uint32_t(n) >= 4)
+      return *this;
+    static const union
+    {
+      int32_t i[8];
+      float f[8];
+    } mask = {{1, -1, -1, -1, 0, 0, 0, 0}};
+    xmm    = _mm_and_ps(xmm, Vec4f().load(mask.f + 4 - n));
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec4f const& insert(uint32_t index, float value)
+  {
+#if INSTRSET >= 5  // SSE4.1 supported
+    switch(index & 3)
+      {
+        case 0:
+          xmm = _mm_insert_ps(xmm, _mm_set_ss(value), 0 << 4);
+          break;
+        case 1:
+          xmm = _mm_insert_ps(xmm, _mm_set_ss(value), 1 << 4);
+          break;
+        case 2:
+          xmm = _mm_insert_ps(xmm, _mm_set_ss(value), 2 << 4);
+          break;
+        default:
+          xmm = _mm_insert_ps(xmm, _mm_set_ss(value), 3 << 4);
+          break;
+      }
+#else
+    static const int32_t maskl[8] = {0, 0, 0, 0, -1, 0, 0, 0};
+    __m128 broad                  = _mm_set1_ps(value);                                     // broadcast value into all elements
+    __m128 mask                   = _mm_loadu_ps((float const*)(maskl + 4 - (index & 3)));  // mask with FFFFFFFF at index position
+    xmm                           = selectf(mask, broad, xmm);
+#endif
+    return *this;
+  };
+  // Member function extract a single element from vector
+  float extract(uint32_t index) const
+  {
+    float x[4];
+    store(x);
+    return x[index & 3];
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  float operator[](uint32_t index) const { return extract(index); }
+  static int size() { return 4; }
+};
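+
+// Usage sketch added for illustration (not part of the original header): a minimal
+// round trip through the member functions defined above.
+//
+//   float in[4] = {1.0f, 2.0f, 3.0f, 4.0f}, out[4];
+//   Vec4f v;
+//   v.load(in);           // unaligned load of four floats
+//   v.insert(2, 30.0f);   // v now holds (1, 2, 30, 4)
+//   v.store(out);         // out[2] == 30.0f, other elements unchanged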
+
+/*****************************************************************************
+ *
+ *          Operators for Vec4f
+ *
+ *****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec4f operator+(Vec4f const& a, Vec4f const& b) { return _mm_add_ps(a, b); }
+
+// vector operator + : add vector and scalar
+static inline Vec4f operator+(Vec4f const& a, float b) { return a + Vec4f(b); }
+static inline Vec4f operator+(float a, Vec4f const& b) { return Vec4f(a) + b; }
+
+// vector operator += : add
+static inline Vec4f& operator+=(Vec4f& a, Vec4f const& b)
+{
+  a = a + b;
+  return a;
+}
+
+// postfix operator ++
+static inline Vec4f operator++(Vec4f& a, int)
+{
+  Vec4f a0 = a;
+  a        = a + 1.0f;
+  return a0;
+}
+
+// prefix operator ++
+static inline Vec4f& operator++(Vec4f& a)
+{
+  a = a + 1.0f;
+  return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec4f operator-(Vec4f const& a, Vec4f const& b) { return _mm_sub_ps(a, b); }
+
+// vector operator - : subtract vector and scalar
+static inline Vec4f operator-(Vec4f const& a, float b) { return a - Vec4f(b); }
+static inline Vec4f operator-(float a, Vec4f const& b) { return Vec4f(a) - b; }
+
+// vector operator - : unary minus
+// Change sign bit, even for 0, INF and NAN
+static inline Vec4f operator-(Vec4f const& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); }
+
+// vector operator -= : subtract
+static inline Vec4f& operator-=(Vec4f& a, Vec4f const& b)
+{
+  a = a - b;
+  return a;
+}
+
+// postfix operator --
+static inline Vec4f operator--(Vec4f& a, int)
+{
+  Vec4f a0 = a;
+  a        = a - 1.0f;
+  return a0;
+}
+
+// prefix operator --
+static inline Vec4f& operator--(Vec4f& a)
+{
+  a = a - 1.0f;
+  return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec4f operator*(Vec4f const& a, Vec4f const& b) { return _mm_mul_ps(a, b); }
+
+// vector operator * : multiply vector and scalar
+static inline Vec4f operator*(Vec4f const& a, float b) { return a * Vec4f(b); }
+static inline Vec4f operator*(float a, Vec4f const& b) { return Vec4f(a) * b; }
+
+// vector operator *= : multiply
+static inline Vec4f& operator*=(Vec4f& a, Vec4f const& b)
+{
+  a = a * b;
+  return a;
+}
+
+// vector operator / : divide element by element
+static inline Vec4f operator/(Vec4f const& a, Vec4f const& b) { return _mm_div_ps(a, b); }
+
+// vector operator / : divide vector and scalar
+static inline Vec4f operator/(Vec4f const& a, float b) { return a / Vec4f(b); }
+static inline Vec4f operator/(float a, Vec4f const& b) { return Vec4f(a) / b; }
+
+// vector operator /= : divide
+static inline Vec4f& operator/=(Vec4f& a, Vec4f const& b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec4fb operator==(Vec4f const& a, Vec4f const& b) { return _mm_cmpeq_ps(a, b); }
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec4fb operator!=(Vec4f const& a, Vec4f const& b) { return _mm_cmpneq_ps(a, b); }
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec4fb operator<(Vec4f const& a, Vec4f const& b) { return _mm_cmplt_ps(a, b); }
+
+// vector operator <= : returns true for elements for which a <= b
+static inline Vec4fb operator<=(Vec4f const& a, Vec4f const& b) { return _mm_cmple_ps(a, b); }
+
+// vector operator > : returns true for elements for which a > b
+static inline Vec4fb operator>(Vec4f const& a, Vec4f const& b) { return b < a; }
+
+// vector operator >= : returns true for elements for which a >= b
+static inline Vec4fb operator>=(Vec4f const& a, Vec4f const& b) { return b <= a; }
+
+// Bitwise logical operators
+
+// vector operator & : bitwise and
+static inline Vec4f operator&(Vec4f const& a, Vec4f const& b) { return _mm_and_ps(a, b); }
+
+// vector operator &= : bitwise and
+static inline Vec4f& operator&=(Vec4f& a, Vec4f const& b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator & : bitwise and of Vec4f and Vec4fb
+static inline Vec4f operator&(Vec4f const& a, Vec4fb const& b) { return _mm_and_ps(a, b); }
+static inline Vec4f operator&(Vec4fb const& a, Vec4f const& b) { return _mm_and_ps(a, b); }
+
+// vector operator | : bitwise or
+static inline Vec4f operator|(Vec4f const& a, Vec4f const& b) { return _mm_or_ps(a, b); }
+
+// vector operator |= : bitwise or
+static inline Vec4f& operator|=(Vec4f& a, Vec4f const& b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec4f operator^(Vec4f const& a, Vec4f const& b) { return _mm_xor_ps(a, b); }
+
+// vector operator ^= : bitwise xor
+static inline Vec4f& operator^=(Vec4f& a, Vec4f const& b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ! : logical not. Returns Boolean vector
+static inline Vec4fb operator!(Vec4f const& a) { return a == Vec4f(0.0f); }
+
+/*****************************************************************************
+ *
+ *          Functions for Vec4f
+ *
+ *****************************************************************************/
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either 0 (false) or 0xFFFFFFFF (true). No other values are allowed.
+static inline Vec4f select(Vec4fb const& s, Vec4f const& a, Vec4f const& b) { return selectf(s, a, b); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec4f if_add(Vec4fb const& f, Vec4f const& a, Vec4f const& b) { return a + (Vec4f(f) & b); }
+
+// Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i]
+static inline Vec4f if_mul(Vec4fb const& f, Vec4f const& a, Vec4f const& b) { return a * select(f, b, 1.f); }
+
+// General arithmetic functions, etc.
+
+// Horizontal add: Calculates the sum of all vector elements.
+static inline float horizontal_add(Vec4f const& a)
+{
+#if INSTRSET >= 3  // SSE3
+  __m128 t1 = _mm_hadd_ps(a, a);
+  __m128 t2 = _mm_hadd_ps(t1, t1);
+  return _mm_cvtss_f32(t2);
+#else
+  __m128 t1 = _mm_movehl_ps(a, a);
+  __m128 t2 = _mm_add_ps(a, t1);
+  __m128 t3 = _mm_shuffle_ps(t2, t2, 1);
+  __m128 t4 = _mm_add_ss(t2, t3);
+  return _mm_cvtss_f32(t4);
+#endif
+}
+
+// function max: a > b ? a : b
+static inline Vec4f max(Vec4f const& a, Vec4f const& b) { return _mm_max_ps(a, b); }
+
+// function min: a < b ? a : b
+static inline Vec4f min(Vec4f const& a, Vec4f const& b) { return _mm_min_ps(a, b); }
+
+// function abs: absolute value
+// Removes sign bit, even for -0.0f, -INF and -NAN
+static inline Vec4f abs(Vec4f const& a)
+{
+  __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
+  return _mm_and_ps(a, mask);
+}
+
+// function sqrt: square root
+static inline Vec4f sqrt(Vec4f const& a) { return _mm_sqrt_ps(a); }
+
+// function square: a * a
+static inline Vec4f square(Vec4f const& a) { return a * a; }
+
+// pow(vector,int) function template
+template <typename VTYPE>
+static inline VTYPE pow_template_i(VTYPE const& x0, int n)
+{
+  VTYPE x = x0;   // a^(2^i)
+  VTYPE y(1.0f);  // accumulator
+  if(n >= 0)
+    {  // make sure n is not negative
+      while(true)
+        {  // loop for each bit in n
+          if(n & 1)
+            y *= x;  // multiply if bit = 1
+          n >>= 1;   // get next bit of n
+          if(n == 0)
+            return y;  // finished
+          x *= x;      // x = a^2, a^4, a^8, etc.
+        }
+    }
+  else
+    {                                                      // n < 0
+      return VTYPE(1.0f) / pow_template_i<VTYPE>(x0, -n);  // reciprocal
+    }
+}
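+
+// Worked example added as an illustrative comment (not in the original source): for
+// n = 5 (binary 101) the loop above runs as follows, starting from y = 1, x = x0:
+//   bit 0 = 1:  y *= x  -> y = x0^1,  then x *= x -> x = x0^2
+//   bit 1 = 0:  y unchanged,          then x *= x -> x = x0^4
+//   bit 2 = 1:  y *= x  -> y = x0^5,  n is now 0, so y is returned
+// For large n this takes O(log n) multiplications instead of n - 1.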
+
+// pow(Vec4f, int):
+// The purpose of this template is to prevent implicit conversion of a float
+// exponent to int when calling pow(vector, float) and vectormath_exp.h is
+// not included
+
+template <typename TT>
+static Vec4f pow(Vec4f const& a, TT const& n);
+
+// Raise floating point numbers to integer power n
+template <>
+inline Vec4f pow<int>(Vec4f const& x0, int const& n)
+{
+  return pow_template_i<Vec4f>(x0, n);
+}
+
+// allow conversion from unsigned int
+template <>
+inline Vec4f pow<uint32_t>(Vec4f const& x0, uint32_t const& n)
+{
+  return pow_template_i<Vec4f>(x0, (int)n);
+}
+
+// Raise floating point numbers to integer power n, where n is a compile-time constant
+template <int n>
+static inline Vec4f pow_n(Vec4f const& a)
+{
+  if(n < 0)
+    return Vec4f(1.0f) / pow_n<-n>(a);
+  if(n == 0)
+    return Vec4f(1.0f);
+  if(n >= 256)
+    return pow(a, n);
+  Vec4f x = a;                           // a^(2^i)
+  Vec4f y;                               // accumulator
+  const int lowest = n - (n & (n - 1));  // lowest set bit in n
+  if(n & 1)
+    y = x;
+  if(n < 2)
+    return y;
+  x = x * x;  // x^2
+  if(n & 2)
+    {
+      if(lowest == 2)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 4)
+    return y;
+  x = x * x;  // x^4
+  if(n & 4)
+    {
+      if(lowest == 4)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 8)
+    return y;
+  x = x * x;  // x^8
+  if(n & 8)
+    {
+      if(lowest == 8)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 16)
+    return y;
+  x = x * x;  // x^16
+  if(n & 16)
+    {
+      if(lowest == 16)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 32)
+    return y;
+  x = x * x;  // x^32
+  if(n & 32)
+    {
+      if(lowest == 32)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 64)
+    return y;
+  x = x * x;  // x^64
+  if(n & 64)
+    {
+      if(lowest == 64)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 128)
+    return y;
+  x = x * x;  // x^128
+  if(n & 128)
+    {
+      if(lowest == 128)
+        y = x;
+      else
+        y *= x;
+    }
+  return y;
+}
+
+// implement as function pow(vector, const_int)
+template <int n>
+static inline Vec4f pow(Vec4f const& a, Const_int_t<n>)
+{
+  return pow_n<n>(a);
+}
+
+// implement the same as macro pow_const(vector, int)
+#define pow_const(x, n) pow_n<n>(x)
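+
+// Illustrative note (not part of the original source): with the exponent known at
+// compile time, pow_n unrolls to a short chain of multiplies, e.g.
+//   Vec4f v(2.0f);
+//   Vec4f c = pow_const(v, 3);  // y = v; x = v*v; y *= x  -> c = v^3 in two multiplies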
+
+// avoid unsafe optimization in function round
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) && INSTRSET < 5
+static inline Vec4f round(Vec4f const& a) __attribute__((optimize("-fno-unsafe-math-optimizations")));
+#elif defined(__clang__) && INSTRSET < 5
+// static inline Vec4f round(Vec4f const & a) __attribute__ ((optnone));
+// This doesn't work, but current versions of Clang (3.5) don't optimize away signedmagic, even with -funsafe-math-optimizations
+// Add volatile to b if future versions fail
+#elif defined(_MSC_VER) || defined(__INTEL_COMPILER) && INSTRSET < 5
+#pragma float_control(push)
+#pragma float_control(precise, on)
+#define FLOAT_CONTROL_PRECISE_FOR_ROUND
+#endif
+// function round: round to nearest integer (even). (result as float vector)
+static inline Vec4f round(Vec4f const& a)
+{
+#if INSTRSET >= 5  // SSE4.1 supported
+  return _mm_round_ps(a, 8);
+#else  // SSE2. Use magic number method
+  // Note: assume MXCSR control register is set to rounding
+  // (don't use conversion to int, it will limit the value to +/- 2^31)
+  Vec4f signmask = _mm_castsi128_ps(constant4ui<0x80000000, 0x80000000, 0x80000000, 0x80000000>());  // -0.0
+  Vec4f magic = _mm_castsi128_ps(constant4ui<0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000>());     // magic number = 2^23
+  Vec4f sign = _mm_and_ps(a, signmask);                                                              // signbit of a
+  Vec4f signedmagic = _mm_or_ps(magic, sign);                                                        // magic number with sign of a
+  // volatile
+  Vec4f b = a + signedmagic;     // round by adding magic number
+  return b - signedmagic;        // .. and subtracting it again
+#endif
+}
+#ifdef FLOAT_CONTROL_PRECISE_FOR_ROUND
+#pragma float_control(pop)
+#endif
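+
+// Worked example for the SSE2 magic-number branch above (illustrative comment, not
+// in the original source): for a = 2.7f the sign is +, so signedmagic = 2^23.
+// Floats in [2^23, 2^24) are spaced 1.0 apart, so a + signedmagic = 8388610.7
+// is rounded to 8388611.0f; subtracting 8388608.0f again gives 3.0f = round(2.7f).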
+
+// function truncate: round towards zero. (result as float vector)
+static inline Vec4f truncate(Vec4f const& a)
+{
+#if INSTRSET >= 5  // SSE4.1 supported
+  return _mm_round_ps(a, 3 + 8);
+#else  // SSE2. Use magic number method (conversion to int would limit the value to 2^31)
+  uint32_t t1 = _mm_getcsr();    // MXCSR
+  uint32_t t2 = t1 | (3 << 13);  // bit 13-14 = 11
+  _mm_setcsr(t2);                // change MXCSR
+  Vec4f r = round(a);            // use magic number method
+  _mm_setcsr(t1);                // restore MXCSR
+  return r;
+#endif
+}
+
+// function floor: round towards minus infinity. (result as float vector)
+static inline Vec4f floor(Vec4f const& a)
+{
+#if INSTRSET >= 5  // SSE4.1 supported
+  return _mm_round_ps(a, 1 + 8);
+#else  // SSE2. Use magic number method (conversion to int would limit the value to 2^31)
+  uint32_t t1 = _mm_getcsr();    // MXCSR
+  uint32_t t2 = t1 | (1 << 13);  // bit 13-14 = 01
+  _mm_setcsr(t2);                // change MXCSR
+  Vec4f r = round(a);            // use magic number method
+  _mm_setcsr(t1);                // restore MXCSR
+  return r;
+#endif
+}
+
+// function ceil: round towards plus infinity. (result as float vector)
+static inline Vec4f ceil(Vec4f const& a)
+{
+#if INSTRSET >= 5  // SSE4.1 supported
+  return _mm_round_ps(a, 2 + 8);
+#else  // SSE2. Use magic number method (conversion to int would limit the value to 2^31)
+  uint32_t t1 = _mm_getcsr();    // MXCSR
+  uint32_t t2 = t1 | (2 << 13);  // bit 13-14 = 10
+  _mm_setcsr(t2);                // change MXCSR
+  Vec4f r = round(a);            // use magic number method
+  _mm_setcsr(t1);                // restore MXCSR
+  return r;
+#endif
+}
+
+// function round_to_int: round to nearest integer (even). (result as integer vector)
+static inline Vec4i round_to_int(Vec4f const& a)
+{
+  // Note: assume MXCSR control register is set to rounding
+  return _mm_cvtps_epi32(a);
+}
+
+// function truncate_to_int: round towards zero. (result as integer vector)
+static inline Vec4i truncate_to_int(Vec4f const& a) { return _mm_cvttps_epi32(a); }
+
+// function to_float: convert integer vector to float vector
+static inline Vec4f to_float(Vec4i const& a) { return _mm_cvtepi32_ps(a); }
+
+// function to_float: convert unsigned integer vector to float vector
+static inline Vec4f to_float(Vec4ui const& a)
+{
+#ifdef __AVX512VL__
+  return _mm_cvtepu32_ps(a);
+#else
+  Vec4f b = to_float(Vec4i(a & 0x7FFFFFFF));                   // 31 bits
+  Vec4i c = Vec4i(a) >> 31;                                    // generate mask from highest bit
+  Vec4f d = Vec4f(2147483648.f) & Vec4f(_mm_castsi128_ps(c));  // mask floating point constant 2^31
+  return b + d;
+#endif
+}
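+
+// Worked example for the fallback above (illustrative): for an element 0x80000001u
+// (2147483649), b becomes 1.0f from the low 31 bits, the mask c is all ones, so d
+// selects the constant 2^31 and b + d = 2147483648.0f + 1.0f rounds to 2147483648.0f,
+// the float closest to the unsigned input.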
+
+// Approximate math functions
+
+// approximate reciprocal (Faster than 1.f / a. Relative accuracy better than 2^-11)
+static inline Vec4f approx_recipr(Vec4f const& a)
+{
+#if INSTRSET >= 9    // use more accurate version if available. (none of these will raise exceptions on zero)
+#ifdef __AVX512ER__  // AVX512ER: full precision
+  // todo: if future processors have both AVX512ER and AVX512VL: _mm128_rcp28_round_ps(a, _MM_FROUND_NO_EXC);
+  return _mm512_castps512_ps128(_mm512_rcp28_round_ps(_mm512_castps128_ps512(a), _MM_FROUND_NO_EXC));
+#elif defined __AVX512VL__  // AVX512VL: 14 bit precision
+  return _mm_rcp14_ps(a);
+#else                       // AVX512F: 14 bit precision
+  return _mm512_castps512_ps128(_mm512_rcp14_ps(_mm512_castps128_ps512(a)));
+#endif
+#else  // SSE: 11 bit precision
+  return _mm_rcp_ps(a);
+#endif
+}
+
+// approximate reciprocal square root (Faster than 1.f / sqrt(a). Relative accuracy better than 2^-11)
+static inline Vec4f approx_rsqrt(Vec4f const& a)
+{
+#if INSTRSET >= 9    // use more accurate version if available. (none of these will raise exceptions on zero)
+#ifdef __AVX512ER__  // AVX512ER: full precision
+  // todo: if future processors have both AVX512ER and AVX512VL: _mm128_rsqrt28_round_ps(a, _MM_FROUND_NO_EXC);
+  return _mm512_castps512_ps128(_mm512_rsqrt28_round_ps(_mm512_castps128_ps512(a), _MM_FROUND_NO_EXC));
+#elif defined __AVX512VL__  // AVX512VL: 14 bit precision
+  return _mm_rsqrt14_ps(a);
+#else                       // AVX512F: 14 bit precision
+  return _mm512_castps512_ps128(_mm512_rsqrt14_ps(_mm512_castps128_ps512(a)));
+#endif
+#else  // SSE: 11 bit precision
+  return _mm_rsqrt_ps(a);
+#endif
+}
+
+// Fused multiply and add functions
+
+// Multiply and add
+static inline Vec4f mul_add(Vec4f const& a, Vec4f const& b, Vec4f const& c)
+{
+#ifdef __FMA__
+  return _mm_fmadd_ps(a, b, c);
+#elif defined(__FMA4__)
+  return _mm_macc_ps(a, b, c);
+#else
+  return a * b + c;
+#endif
+}
+
+// Multiply and subtract
+static inline Vec4f mul_sub(Vec4f const& a, Vec4f const& b, Vec4f const& c)
+{
+#ifdef __FMA__
+  return _mm_fmsub_ps(a, b, c);
+#elif defined(__FMA4__)
+  return _mm_msub_ps(a, b, c);
+#else
+  return a * b - c;
+#endif
+}
+
+// Multiply and inverse subtract
+static inline Vec4f nmul_add(Vec4f const& a, Vec4f const& b, Vec4f const& c)
+{
+#ifdef __FMA__
+  return _mm_fnmadd_ps(a, b, c);
+#elif defined(__FMA4__)
+  return _mm_nmacc_ps(a, b, c);
+#else
+  return c - a * b;
+#endif
+}
+
+// Multiply and subtract with extra precision on the intermediate calculations,
+// even if FMA instructions not supported, using Veltkamp-Dekker split
+static inline Vec4f mul_sub_x(Vec4f const& a, Vec4f const& b, Vec4f const& c)
+{
+#ifdef __FMA__
+  return _mm_fmsub_ps(a, b, c);
+#elif defined(__FMA4__)
+  return _mm_msub_ps(a, b, c);
+#else
+  // calculate a * b - c with extra precision
+  Vec4i upper_mask = -(1 << 12);                               // mask to remove lower 12 bits
+  Vec4f a_high     = a & Vec4f(_mm_castsi128_ps(upper_mask));  // split into high and low parts
+  Vec4f b_high     = b & Vec4f(_mm_castsi128_ps(upper_mask));
+  Vec4f a_low      = a - a_high;
+  Vec4f b_low      = b - b_high;
+  Vec4f r1         = a_high * b_high;                                         // this product is exact
+  Vec4f r2         = r1 - c;                                                  // subtract c from high product
+  Vec4f r3         = r2 + (a_high * b_low + b_high * a_low) + a_low * b_low;  // add rest of product
+  return r3;                                                                  // + ((r2 - r1) + c);
+#endif
+}
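+
+// Sketch of the split used above (illustrative): with the lower 12 mantissa bits masked
+// off, a = a_high + a_low and b = b_high + b_low, so
+//   a * b - c = (a_high * b_high - c) + (a_high * b_low + b_high * a_low) + a_low * b_low
+// The product a_high * b_high has at most 2 * 12 = 24 significant bits and is therefore
+// exact in single precision, which is what preserves the extra accuracy without FMA.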
+
+// Math functions using fast bit manipulation
+
+// Extract the exponent as an integer
+// exponent(a) = floor(log2(abs(a)));
+// exponent(1.0f) = 0, exponent(0.0f) = -127, exponent(INF) = +128, exponent(NAN) = +128
+static inline Vec4i exponent(Vec4f const& a)
+{
+  Vec4ui t1 = _mm_castps_si128(a);  // reinterpret as 32-bit integer
+  Vec4ui t2 = t1 << 1;              // shift out sign bit
+  Vec4ui t3 = t2 >> 24;             // shift down logical to position 0
+  Vec4i t4  = Vec4i(t3) - 0x7F;     // subtract bias from exponent
+  return t4;
+}
+
+// Extract the fraction part of a floating point number
+// a = 2^exponent(a) * fraction(a), except for a = 0
+// fraction(1.0f) = 1.0f, fraction(5.0f) = 1.25f
+// NOTE: The name fraction clashes with an ENUM in MAC XCode CarbonCore script.h !
+static inline Vec4f fraction(Vec4f const& a)
+{
+  Vec4ui t1 = _mm_castps_si128(a);                     // reinterpret as 32-bit integer
+  Vec4ui t2 = Vec4ui((t1 & 0x007FFFFF) | 0x3F800000);  // set exponent to 0 + bias
+  return _mm_castsi128_ps(t2);
+}
+
+// Fast calculation of pow(2,n) with n integer
+// n  =    0 gives 1.0f
+// n >=  128 gives +INF
+// n <= -127 gives 0.0f
+// This function will never produce denormals, and never raise exceptions
+static inline Vec4f exp2(Vec4i const& n)
+{
+  Vec4i t1 = max(n, -0x7F);  // limit to allowed range
+  Vec4i t2 = min(t1, 0x80);
+  Vec4i t3 = t2 + 0x7F;         // add bias
+  Vec4i t4 = t3 << 23;          // put exponent into position 23
+  return _mm_castsi128_ps(t4);  // reinterpret as float
+}
+// static Vec4f exp2(Vec4f const & x); // defined in vectormath_exp.h
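+
+// Worked example of the bit manipulation above (illustrative):
+//   n = 3  ->  t3 = 3 + 0x7F = 130  ->  t4 = 130 << 23 = 0x41000000,
+// which is the IEEE-754 single precision bit pattern of 8.0f = 2^3.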
+
+// Control word manipulation
+// ------------------------
+// The MXCSR control word has the following bits:
+//  0:    Invalid Operation Flag
+//  1:    Denormal Flag (=subnormal)
+//  2:    Divide-by-Zero Flag
+//  3:    Overflow Flag
+//  4:    Underflow Flag
+//  5:    Precision Flag
+//  6:    Denormals Are Zeros (=subnormals)
+//  7:    Invalid Operation Mask
+//  8:    Denormal Operation Mask (=subnormal)
+//  9:    Divide-by-Zero Mask
+// 10:    Overflow Mask
+// 11:    Underflow Mask
+// 12:    Precision Mask
+// 13-14: Rounding control
+//        00: round to nearest or even
+//        01: round down towards -infinity
+//        10: round up   towards +infinity
+//        11: round towards zero (truncate)
+// 15: Flush to Zero
+
+// Function get_control_word:
+// Read the MXCSR control word
+static inline uint32_t get_control_word() { return _mm_getcsr(); }
+
+// Function set_control_word:
+// Write the MXCSR control word
+static inline void set_control_word(uint32_t w) { _mm_setcsr(w); }
+
+// Function no_subnormals:
+// Set "Denormals Are Zeros" and "Flush to Zero" mode to avoid the extremely
+// time-consuming denormals in case of underflow
+static inline void no_subnormals()
+{
+  uint32_t t1 = get_control_word();
+  t1 |= (1 << 6) | (1 << 15);  // set bit 6 and 15 in MXCSR
+  set_control_word(t1);
+}
+
+// Function reset_control_word:
+// Set the MXCSR control word to the default value 0x1F80.
+// This will mask floating point exceptions, set rounding mode to nearest (or even),
+// and allow denormals.
+static inline void reset_control_word() { set_control_word(0x1F80); }
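+
+// Usage sketch (illustrative only; assumes the caller owns the thread's FP environment):
+//   uint32_t saved = get_control_word();   // remember MXCSR
+//   set_control_word(saved | (3 << 13));   // bits 13-14 = 11: round towards zero
+//   ...                                    // code relying on truncation
+//   set_control_word(saved);               // restore the previous rounding mode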
+
+// Categorization functions
+
+// Function sign_bit: gives true for elements that have the sign bit set
+// even for -0.0f, -INF and -NAN
+// Note that sign_bit(Vec4f(-0.0f)) gives true, while Vec4f(-0.0f) < Vec4f(0.0f) gives false
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec4fb sign_bit(Vec4f const& a)
+{
+  Vec4i t1 = _mm_castps_si128(a);  // reinterpret as 32-bit integer
+  Vec4i t2 = t1 >> 31;             // extend sign bit
+  return _mm_castsi128_ps(t2);     // reinterpret as 32-bit Boolean
+}
+
+// Function sign_combine: changes the sign of a when b has the sign bit set
+// same as select(sign_bit(b), -a, a)
+static inline Vec4f sign_combine(Vec4f const& a, Vec4f const& b)
+{
+  Vec4f signmask = _mm_castsi128_ps(constant4ui<0x80000000, 0x80000000, 0x80000000, 0x80000000>());  // -0.0
+  return a ^ (b & signmask);
+}
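+
+// Example (illustrative): sign_combine(Vec4f(1,2,3,4), Vec4f(-1,1,-1,1)) gives (-1,2,-3,4).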
+
+// Function is_finite: gives true for elements that are normal, denormal or zero,
+// false for INF and NAN
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec4fb is_finite(Vec4f const& a)
+{
+  Vec4i t1 = _mm_castps_si128(a);                   // reinterpret as 32-bit integer
+  Vec4i t2 = t1 << 1;                               // shift out sign bit
+  Vec4i t3 = Vec4i(t2 & 0xFF000000) != 0xFF000000;  // exponent field is not all 1s
+  return Vec4ib(t3);
+}
+
+// Function is_inf: gives true for elements that are +INF or -INF
+// false for finite numbers and NAN
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec4fb is_inf(Vec4f const& a)
+{
+  Vec4i t1 = _mm_castps_si128(a);  // reinterpret as 32-bit integer
+  Vec4i t2 = t1 << 1;              // shift out sign bit
+  return t2 == Vec4i(0xFF000000);  // exponent is all 1s, fraction is 0
+}
+
+// Function is_nan: gives true for elements that are +NAN or -NAN
+// false for finite numbers and +/-INF
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec4fb is_nan(Vec4f const& a)
+{
+  Vec4i t1 = _mm_castps_si128(a);         // reinterpret as 32-bit integer
+  Vec4i t2 = t1 << 1;                     // shift out sign bit
+  Vec4i t3 = 0xFF000000;                  // exponent mask
+  Vec4i t4 = t2 & t3;                     // exponent
+  Vec4i t5 = _mm_andnot_si128(t3, t2);    // fraction
+  return Vec4ib((t4 == t3) & (t5 != 0));  // exponent = all 1s and fraction != 0
+}
+
+// Function is_subnormal: gives true for elements that are denormal (subnormal)
+// false for finite numbers, zero, NAN and INF
+static inline Vec4fb is_subnormal(Vec4f const& a)
+{
+  Vec4i t1 = _mm_castps_si128(a);        // reinterpret as 32-bit integer
+  Vec4i t2 = t1 << 1;                    // shift out sign bit
+  Vec4i t3 = 0xFF000000;                 // exponent mask
+  Vec4i t4 = t2 & t3;                    // exponent
+  Vec4i t5 = _mm_andnot_si128(t3, t2);   // fraction
+  return Vec4ib((t4 == 0) & (t5 != 0));  // exponent = 0 and fraction != 0
+}
+
+// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
+// false for finite numbers, NAN and INF
+static inline Vec4fb is_zero_or_subnormal(Vec4f const& a)
+{
+  Vec4i t = _mm_castps_si128(a);  // reinterpret as 32-bit integer
+  t &= 0x7F800000;                // isolate exponent
+  return t == 0;                  // exponent = 0
+}
+
+// Function infinite4f: returns a vector where all elements are +INF
+static inline Vec4f infinite4f() { return _mm_castsi128_ps(_mm_set1_epi32(0x7F800000)); }
+
+// Function nan4f: returns a vector where all elements are NAN (quiet)
+static inline Vec4f nan4f(int n = 0x10) { return _mm_castsi128_ps(_mm_set1_epi32(0x7FC00000 + n)); }
+
+/*****************************************************************************
+ *
+ *          Vector Vec4f permute and blend functions
+ *
+ ******************************************************************************
+ *
+ * The permute function can reorder the elements of a vector and optionally
+ * set some elements to zero.
+ *
+ * The indexes are inserted as template parameters in <>. These indexes must be
+ * constants. Each template parameter is an index to the element you want to
+ * select. A negative index will generate zero.
+ *
+ * Example:
+ * Vec4f a(10.f,11.f,12.f,13.f);        // a is (10,11,12,13)
+ * Vec4f b, c;
+ * b = permute4f<0,0,2,2>(a);           // b is (10,10,12,12)
+ * c = permute4f<3,2,-1,-1>(a);         // c is (13,12, 0, 0)
+ *
+ *
+ * The blend function can mix elements from two different vectors and
+ * optionally set some elements to zero.
+ *
+ * The indexes are inserted as template parameters in <>. These indexes must be
+ * constants. Each template parameter is an index to the element you want to
+ * select, where indexes 0 - 3 indicate an element from the first source
+ * vector and indexes 4 - 7 indicate an element from the second source vector.
+ * A negative index will generate zero.
+ *
+ *
+ * Example:
+ * Vec4f a(10.f,11.f,12.f,13.f);        // a is (10, 11, 12, 13)
+ * Vec4f b(20.f,21.f,22.f,23.f);        // b is (20, 21, 22, 23)
+ * Vec4f c;
+ * c = blend4f<1,4,-1,7> (a,b);         // c is (11, 20,  0, 23)
+ *
+ * Don't worry about the complicated code for these functions. Most of the
+ * code is resolved at compile time to generate only a few instructions.
+ *****************************************************************************/
+
+// permute vector Vec4f
+template <int i0, int i1, int i2, int i3>
+static inline Vec4f permute4f(Vec4f const& a)
+{
+  // is shuffling needed
+  const bool do_shuffle = (i0 > 0) || (i1 != 1 && i1 >= 0) || (i2 != 2 && i2 >= 0) || (i3 != 3 && i3 >= 0);
+  // is zeroing needed
+  const bool do_zero = (i0 | i1 | i2 | i3) < 0 && ((i0 | i1 | i2 | i3) & 0x80);
+
+  if(!do_shuffle && !do_zero)
+    {
+      return a;  // trivial case: do nothing
+    }
+  if(do_zero && !do_shuffle)
+    {  // zeroing, not shuffling
+      if((i0 & i1 & i2 & i3) < 0)
+        return _mm_setzero_ps();  // zero everything
+      // zero some elements
+      __m128i mask1 = constant4i<-int(i0 >= 0), -int(i1 >= 0), -int(i2 >= 0), -int(i3 >= 0)>();
+      return _mm_and_ps(a, _mm_castsi128_ps(mask1));  // zero with AND mask
+    }
+  if(do_shuffle && !do_zero)
+    {  // shuffling, not zeroing
+      return _mm_shuffle_ps(a, a, (i0 & 3) | (i1 & 3) << 2 | (i2 & 3) << 4 | (i3 & 3) << 6);
+    }
+  // both shuffle and zero
+  if((i0 & i1) < 0 && (i2 | i3) >= 0)
+    {  // zero low half, shuffle high half
+      return _mm_shuffle_ps(_mm_setzero_ps(), a, (i2 & 3) << 4 | (i3 & 3) << 6);
+    }
+  if((i0 | i1) >= 0 && (i2 & i3) < 0)
+    {  // shuffle low half, zero high half
+      return _mm_shuffle_ps(a, _mm_setzero_ps(), (i0 & 3) | (i1 & 3) << 2);
+    }
+#if INSTRSET >= 4  // SSSE3
+  // With SSSE3 we can do both with the PSHUFB instruction
+  const int j0  = (i0 & 3) << 2;
+  const int j1  = (i1 & 3) << 2;
+  const int j2  = (i2 & 3) << 2;
+  const int j3  = (i3 & 3) << 2;
+  __m128i mask2 = constant4i < i0 < 0 ? -1 : j0 | (j0 + 1) << 8 | (j0 + 2) << 16 | (j0 + 3) << 24,
+          i1 < 0 ? -1 : j1 | (j1 + 1) << 8 | (j1 + 2) << 16 | (j1 + 3) << 24,
+          i2 < 0 ? -1 : j2 | (j2 + 1) << 8 | (j2 + 2) << 16 | (j2 + 3) << 24,
+          i3 < 0 ? -1 : j3 | (j3 + 1) << 8 | (j3 + 2) << 16 | (j3 + 3) << 24 > ();
+  return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a), mask2));
+#else
+  __m128 t1 = _mm_shuffle_ps(a, a, (i0 & 3) | (i1 & 3) << 2 | (i2 & 3) << 4 | (i3 & 3) << 6);  // shuffle
+  __m128i mask3 = constant4i<-int(i0 >= 0), -int(i1 >= 0), -int(i2 >= 0), -int(i3 >= 0)>();
+  return _mm_and_ps(t1, _mm_castsi128_ps(mask3));  // zero with AND mask
+#endif
+}
+
+// blend vectors Vec4f
+template <int i0, int i1, int i2, int i3>
+static inline Vec4f blend4f(Vec4f const& a, Vec4f const& b)
+{
+  // Combine all the indexes into a single bitfield, with 8 bits for each
+  const int m1 = (i0 & 7) | (i1 & 7) << 8 | (i2 & 7) << 16 | (i3 & 7) << 24;
+
+  // Mask to zero out negative indexes
+  const int m2 = (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF) << 8 | (i2 < 0 ? 0 : 0xFF) << 16 | (i3 < 0 ? 0 : 0xFF) << 24;
+
+  if((m1 & 0x04040404 & m2) == 0)
+    {
+      // no elements from b
+      return permute4f<i0, i1, i2, i3>(a);
+    }
+  if(((m1 ^ 0x04040404) & 0x04040404 & m2) == 0)
+    {
+      // no elements from a
+      return permute4f<i0 & ~4, i1 & ~4, i2 & ~4, i3 & ~4>(b);
+    }
+  if(((m1 & ~0x04040404) ^ 0x03020100) == 0 && m2 == -1)
+    {
+      // selecting without shuffling or zeroing
+      __m128i sel = constant4i < i0 & 4 ? 0 : -1, i1 & 4 ? 0 : -1, i2 & 4 ? 0 : -1, i3 & 4 ? 0 : -1 > ();
+      return selectf(_mm_castsi128_ps(sel), a, b);
+    }
+#ifdef __XOP__  // Use AMD XOP instruction PPERM
+  __m128i maska = constant4i < i0 < 0
+                      ? 0x80808080
+                      : (i0 * 4 & 31) + (((i0 * 4 & 31) + 1) << 8) + (((i0 * 4 & 31) + 2) << 16) + (((i0 * 4 & 31) + 3) << 24),
+          i1 < 0 ? 0x80808080 : (i1 * 4 & 31) + (((i1 * 4 & 31) + 1) << 8) + (((i1 * 4 & 31) + 2) << 16) + (((i1 * 4 & 31) + 3) << 24),
+          i2 < 0 ? 0x80808080 : (i2 * 4 & 31) + (((i2 * 4 & 31) + 1) << 8) + (((i2 * 4 & 31) + 2) << 16) + (((i2 * 4 & 31) + 3) << 24),
+          i3 < 0 ? 0x80808080
+                 : (i3 * 4 & 31) + (((i3 * 4 & 31) + 1) << 8) + (((i3 * 4 & 31) + 2) << 16) + (((i3 * 4 & 31) + 3) << 24) > ();
+  return _mm_castsi128_ps(_mm_perm_epi8(_mm_castps_si128(a), _mm_castps_si128(b), maska));
+#else
+  if((((m1 & ~0x04040404) ^ 0x03020100) & m2) == 0)
+    {
+      // selecting and zeroing, not shuffling
+      __m128i sel1 = constant4i < i0 & 4 ? 0 : -1, i1 & 4 ? 0 : -1, i2 & 4 ? 0 : -1, i3 & 4 ? 0 : -1 > ();
+      __m128i mask1 = constant4i<-int(i0 >= 0), -int(i1 >= 0), -int(i2 >= 0), -int(i3 >= 0)>();
+      __m128 t1 = selectf(_mm_castsi128_ps(sel1), a, b);  // select
+      return _mm_and_ps(t1, _mm_castsi128_ps(mask1));     // zero
+    }
+  // special cases unpckhps, unpcklps, shufps
+  Vec4f t;
+  if(((m1 ^ 0x05010400) & m2) == 0)
+    {
+      t = _mm_unpacklo_ps(a, b);
+      goto DOZERO;
+    }
+  if(((m1 ^ 0x01050004) & m2) == 0)
+    {
+      t = _mm_unpacklo_ps(b, a);
+      goto DOZERO;
+    }
+  if(((m1 ^ 0x07030602) & m2) == 0)
+    {
+      t = _mm_unpackhi_ps(a, b);
+      goto DOZERO;
+    }
+  if(((m1 ^ 0x03070206) & m2) == 0)
+    {
+      t = _mm_unpackhi_ps(b, a);
+      goto DOZERO;
+    }
+  // first two elements from a, last two from b
+  if(((m1 ^ 0x04040000) & 0x04040404 & m2) == 0)
+    {
+      t = _mm_shuffle_ps(a, b, (i0 & 3) + ((i1 & 3) << 2) + ((i2 & 3) << 4) + ((i3 & 3) << 6));
+      goto DOZERO;
+    }
+  // first two elements from b, last two from a
+  if(((m1 ^ 0x00000404) & 0x04040404 & m2) == 0)
+    {
+      t = _mm_shuffle_ps(b, a, (i0 & 3) + ((i1 & 3) << 2) + ((i2 & 3) << 4) + ((i3 & 3) << 6));
+      goto DOZERO;
+    }
+  {  // general case. combine two permutes
+    __m128 a1 = permute4f < (uint32_t)i0 < 4 ? i0 : -1, (uint32_t)i1 < 4 ? i1 : -1, (uint32_t)i2 < 4 ? i2 : -1,
+           (uint32_t)i3 < 4 ? i3 : -1 > (a);
+    __m128 b1 = permute4f < (uint32_t)(i0 ^ 4) < 4 ? (i0 ^ 4) : -1, (uint32_t)(i1 ^ 4) < 4 ? (i1 ^ 4) : -1,
+           (uint32_t)(i2 ^ 4) < 4 ? (i2 ^ 4) : -1, (uint32_t)(i3 ^ 4) < 4 ? (i3 ^ 4) : -1 > (b);
+    return _mm_or_ps(a1, b1);
+  }
+DOZERO:
+  if((i0 | i1 | i2 | i3) & 0x80)
+    {
+      // zero some elements
+      __m128i mask1 = constant4i<-int(i0 >= 0), -int(i1 >= 0), -int(i2 >= 0), -int(i3 >= 0)>();
+      t = _mm_and_ps(t, _mm_castsi128_ps(mask1));  // zero with AND mask
+    }
+  return t;
+
+#endif  // __XOP__
+}
+
+// change signs on vectors Vec4f
+// Each index i0 - i3 is 1 for changing sign on the corresponding element, 0 for no change
+template <int i0, int i1, int i2, int i3>
+static inline Vec4f change_sign(Vec4f const& a)
+{
+  if((i0 | i1 | i2 | i3) == 0)
+    return a;
+  __m128i mask = constant4ui < i0 ? 0x80000000 : 0, i1 ? 0x80000000 : 0, i2 ? 0x80000000 : 0, i3 ? 0x80000000 : 0 > ();
+  return _mm_xor_ps(a, _mm_castsi128_ps(mask));  // flip sign bits
+}
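+
+// Example (illustrative): change_sign<1,0,0,1>(Vec4f(1,2,3,4)) gives (-1,2,3,-4).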
+
+/*****************************************************************************
+ *
+ *          Vec2d: Vector of 2 double precision floating point values
+ *
+ *****************************************************************************/
+
+class Vec2d
+{
+ protected:
+  __m128d xmm;  // double vector
+ public:
+  // Default constructor:
+  Vec2d() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec2d(double d) { xmm = _mm_set1_pd(d); }
+  // Constructor to build from all elements:
+  Vec2d(double d0, double d1) { xmm = _mm_setr_pd(d0, d1); }
+  // Constructor to convert from type __m128d used in intrinsics:
+  Vec2d(__m128d const& x) { xmm = x; }
+  // Assignment operator to convert from type __m128d used in intrinsics:
+  Vec2d& operator=(__m128d const& x)
+  {
+    xmm = x;
+    return *this;
+  }
+  // Type cast operator to convert to __m128d used in intrinsics
+  operator __m128d() const { return xmm; }
+  // Member function to load from array (unaligned)
+  Vec2d& load(double const* p)
+  {
+    xmm = _mm_loadu_pd(p);
+    return *this;
+  }
+  // Member function to load from array, aligned by 16
+  // "load_a" is faster than "load" on older Intel processors (Pentium 4, Pentium M, Core 1,
+  // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA.
+  // You may use load_a instead of load if you are certain that p points to an address
+  // divisible by 16.
+  Vec2d const& load_a(double const* p)
+  {
+    xmm = _mm_load_pd(p);
+    return *this;
+  }
+  // Member function to store into array (unaligned)
+  void store(double* p) const { _mm_storeu_pd(p, xmm); }
+  // Member function to store into array, aligned by 16
+  // "store_a" is faster than "store" on older Intel processors (Pentium 4, Pentium M, Core 1,
+  // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA.
+  // You may use store_a instead of store if you are certain that p points to an address
+  // divisible by 16.
+  void store_a(double* p) const { _mm_store_pd(p, xmm); }
+  // Partial load. Load n elements and set the rest to 0
+  Vec2d& load_partial(int n, double const* p)
+  {
+    if(n == 1)
+      {
+        xmm = _mm_load_sd(p);
+      }
+    else if(n == 2)
+      {
+        load(p);
+      }
+    else
+      {
+        xmm = _mm_setzero_pd();
+      }
+    return *this;
+  }
+  // Partial store. Store n elements
+  void store_partial(int n, double* p) const
+  {
+    if(n == 1)
+      {
+        _mm_store_sd(p, xmm);
+      }
+    else if(n == 2)
+      {
+        store(p);
+      }
+  }
+  // Cut off vector to n elements. The last 2-n elements are set to zero
+  Vec2d& cutoff(int n)
+  {
+    xmm = _mm_castps_pd(Vec4f(_mm_castpd_ps(xmm)).cutoff(n * 2));
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec2d const& insert(uint32_t index, double value)
+  {
+    __m128d v2 = _mm_set_sd(value);
+    if(index == 0)
+      {
+        xmm = _mm_shuffle_pd(v2, xmm, 2);
+      }
+    else
+      {
+        xmm = _mm_shuffle_pd(xmm, v2, 0);
+      }
+    return *this;
+  };
+  // Member function extract a single element from vector
+  double extract(uint32_t index) const
+  {
+    double x[2];
+    store(x);
+    return x[index & 1];
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  double operator[](uint32_t index) const { return extract(index); }
+  static int size() { return 2; }
+};
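+
+// Usage sketch for Vec2d (illustrative only; the array name is hypothetical):
+//   double buf[2] = {3.0, 4.0};
+//   Vec2d v;
+//   v.load(buf);        // v is (3, 4)
+//   v.insert(1, 7.5);   // v is (3, 7.5)
+//   double hi = v[1];   // hi == 7.5
+//   v.store(buf);       // buf is {3.0, 7.5}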
+
+/*****************************************************************************
+ *
+ *          Operators for Vec2d
+ *
+ *****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec2d operator+(Vec2d const& a, Vec2d const& b) { return _mm_add_pd(a, b); }
+
+// vector operator + : add vector and scalar
+static inline Vec2d operator+(Vec2d const& a, double b) { return a + Vec2d(b); }
+static inline Vec2d operator+(double a, Vec2d const& b) { return Vec2d(a) + b; }
+
+// vector operator += : add
+static inline Vec2d& operator+=(Vec2d& a, Vec2d const& b)
+{
+  a = a + b;
+  return a;
+}
+
+// postfix operator ++
+static inline Vec2d operator++(Vec2d& a, int)
+{
+  Vec2d a0 = a;
+  a        = a + 1.0;
+  return a0;
+}
+
+// prefix operator ++
+static inline Vec2d& operator++(Vec2d& a)
+{
+  a = a + 1.0;
+  return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec2d operator-(Vec2d const& a, Vec2d const& b) { return _mm_sub_pd(a, b); }
+
+// vector operator - : subtract vector and scalar
+static inline Vec2d operator-(Vec2d const& a, double b) { return a - Vec2d(b); }
+static inline Vec2d operator-(double a, Vec2d const& b) { return Vec2d(a) - b; }
+
+// vector operator - : unary minus
+// Change sign bit, even for 0, INF and NAN
+static inline Vec2d operator-(Vec2d const& a) { return _mm_xor_pd(a, _mm_castsi128_pd(_mm_setr_epi32(0, 0x80000000, 0, 0x80000000))); }
+
+// vector operator -= : subtract
+static inline Vec2d& operator-=(Vec2d& a, Vec2d const& b)
+{
+  a = a - b;
+  return a;
+}
+
+// postfix operator --
+static inline Vec2d operator--(Vec2d& a, int)
+{
+  Vec2d a0 = a;
+  a        = a - 1.0;
+  return a0;
+}
+
+// prefix operator --
+static inline Vec2d& operator--(Vec2d& a)
+{
+  a = a - 1.0;
+  return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec2d operator*(Vec2d const& a, Vec2d const& b) { return _mm_mul_pd(a, b); }
+
+// vector operator * : multiply vector and scalar
+static inline Vec2d operator*(Vec2d const& a, double b) { return a * Vec2d(b); }
+static inline Vec2d operator*(double a, Vec2d const& b) { return Vec2d(a) * b; }
+
+// vector operator *= : multiply
+static inline Vec2d& operator*=(Vec2d& a, Vec2d const& b)
+{
+  a = a * b;
+  return a;
+}
+
+// vector operator / : divide element by element
+static inline Vec2d operator/(Vec2d const& a, Vec2d const& b) { return _mm_div_pd(a, b); }
+
+// vector operator / : divide vector and scalar
+static inline Vec2d operator/(Vec2d const& a, double b) { return a / Vec2d(b); }
+static inline Vec2d operator/(double a, Vec2d const& b) { return Vec2d(a) / b; }
+
+// vector operator /= : divide
+static inline Vec2d& operator/=(Vec2d& a, Vec2d const& b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec2db operator==(Vec2d const& a, Vec2d const& b) { return _mm_cmpeq_pd(a, b); }
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec2db operator!=(Vec2d const& a, Vec2d const& b) { return _mm_cmpneq_pd(a, b); }
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec2db operator<(Vec2d const& a, Vec2d const& b) { return _mm_cmplt_pd(a, b); }
+
+// vector operator <= : returns true for elements for which a <= b
+static inline Vec2db operator<=(Vec2d const& a, Vec2d const& b) { return _mm_cmple_pd(a, b); }
+
+// vector operator > : returns true for elements for which a > b
+static inline Vec2db operator>(Vec2d const& a, Vec2d const& b) { return b < a; }
+
+// vector operator >= : returns true for elements for which a >= b
+static inline Vec2db operator>=(Vec2d const& a, Vec2d const& b) { return b <= a; }
+
+// Bitwise logical operators
+
+// vector operator & : bitwise and
+static inline Vec2d operator&(Vec2d const& a, Vec2d const& b) { return _mm_and_pd(a, b); }
+
+// vector operator &= : bitwise and
+static inline Vec2d& operator&=(Vec2d& a, Vec2d const& b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator & : bitwise and of Vec2d and Vec2db
+static inline Vec2d operator&(Vec2d const& a, Vec2db const& b) { return _mm_and_pd(a, b); }
+static inline Vec2d operator&(Vec2db const& a, Vec2d const& b) { return _mm_and_pd(a, b); }
+
+// vector operator | : bitwise or
+static inline Vec2d operator|(Vec2d const& a, Vec2d const& b) { return _mm_or_pd(a, b); }
+
+// vector operator |= : bitwise or
+static inline Vec2d& operator|=(Vec2d& a, Vec2d const& b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec2d operator^(Vec2d const& a, Vec2d const& b) { return _mm_xor_pd(a, b); }
+
+// vector operator ^= : bitwise xor
+static inline Vec2d& operator^=(Vec2d& a, Vec2d const& b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ! : logical not. Returns Boolean vector
+static inline Vec2db operator!(Vec2d const& a) { return a == Vec2d(0.0); }
+
+/*****************************************************************************
+ *
+ *          Functions for Vec2d
+ *
+ *****************************************************************************/
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 2; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true).
+// No other values are allowed.
+static inline Vec2d select(Vec2db const& s, Vec2d const& a, Vec2d const& b) { return selectd(s, a, b); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec2d if_add(Vec2db const& f, Vec2d const& a, Vec2d const& b) { return a + (Vec2d(f) & b); }
+
+// Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i]
+static inline Vec2d if_mul(Vec2db const& f, Vec2d const& a, Vec2d const& b) { return a * select(f, b, 1.); }
+
+// General arithmetic functions, etc.
+
+// Horizontal add: Calculates the sum of all vector elements.
+static inline double horizontal_add(Vec2d const& a)
+{
+#if INSTRSET >= 3  // SSE3
+  __m128d t1 = _mm_hadd_pd(a, a);
+  return _mm_cvtsd_f64(t1);
+#else
+  __m128 t0 = _mm_castpd_ps(a);
+  __m128d t1 = _mm_castps_pd(_mm_movehl_ps(t0, t0));
+  __m128d t2 = _mm_add_sd(a, t1);
+  return _mm_cvtsd_f64(t2);
+#endif
+}
+
+// function max: a > b ? a : b
+static inline Vec2d max(Vec2d const& a, Vec2d const& b) { return _mm_max_pd(a, b); }
+
+// function min: a < b ? a : b
+static inline Vec2d min(Vec2d const& a, Vec2d const& b) { return _mm_min_pd(a, b); }
+
+// function abs: absolute value
+// Removes sign bit, even for -0.0f, -INF and -NAN
+static inline Vec2d abs(Vec2d const& a)
+{
+  __m128d mask = _mm_castsi128_pd(_mm_setr_epi32(-1, 0x7FFFFFFF, -1, 0x7FFFFFFF));
+  return _mm_and_pd(a, mask);
+}
+
+// function sqrt: square root
+static inline Vec2d sqrt(Vec2d const& a) { return _mm_sqrt_pd(a); }
+
+// function square: a * a
+static inline Vec2d square(Vec2d const& a) { return a * a; }
+
+// pow(Vec2d, int):
+// The purpose of this template is to prevent implicit conversion of a float
+// exponent to int when calling pow(vector, float) and vectormath_exp.h is
+// not included
+
+template <typename TT>
+static Vec2d pow(Vec2d const& a, TT const& n);
+
+// Raise floating point numbers to integer power n
+template <>
+inline Vec2d pow<int>(Vec2d const& x0, int const& n)
+{
+  return pow_template_i<Vec2d>(x0, n);
+}
+
+// allow conversion from unsigned int
+template <>
+inline Vec2d pow<uint32_t>(Vec2d const& x0, uint32_t const& n)
+{
+  return pow_template_i<Vec2d>(x0, (int)n);
+}
+
+// Raise floating point numbers to integer power n, where n is a compile-time constant
+template <int n>
+static inline Vec2d pow_n(Vec2d const& a)
+{
+  if(n < 0)
+    return Vec2d(1.0) / pow_n<-n>(a);
+  if(n == 0)
+    return Vec2d(1.0);
+  if(n >= 256)
+    return pow(a, n);
+  Vec2d x = a;                           // a^(2^i)
+  Vec2d y;                               // accumulator
+  const int lowest = n - (n & (n - 1));  // lowest set bit in n
+  if(n & 1)
+    y = x;
+  if(n < 2)
+    return y;
+  x = x * x;  // x^2
+  if(n & 2)
+    {
+      if(lowest == 2)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 4)
+    return y;
+  x = x * x;  // x^4
+  if(n & 4)
+    {
+      if(lowest == 4)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 8)
+    return y;
+  x = x * x;  // x^8
+  if(n & 8)
+    {
+      if(lowest == 8)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 16)
+    return y;
+  x = x * x;  // x^16
+  if(n & 16)
+    {
+      if(lowest == 16)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 32)
+    return y;
+  x = x * x;  // x^32
+  if(n & 32)
+    {
+      if(lowest == 32)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 64)
+    return y;
+  x = x * x;  // x^64
+  if(n & 64)
+    {
+      if(lowest == 64)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 128)
+    return y;
+  x = x * x;  // x^128
+  if(n & 128)
+    {
+      if(lowest == 128)
+        y = x;
+      else
+        y *= x;
+    }
+  return y;
+}
+
+template <int n>
+static inline Vec2d pow(Vec2d const& a, Const_int_t<n>)
+{
+  return pow_n<n>(a);
+}
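+
+// Example (illustrative): pow_n<5>(Vec2d(2.0)) evaluates a^5 with three multiplications
+// (x^2, x^4, then x * x^4) and returns (32, 32).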
+
+// avoid unsafe optimization in function round
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) && INSTRSET < 5
+static inline Vec2d round(Vec2d const& a) __attribute__((optimize("-fno-unsafe-math-optimizations")));
+#elif defined(FLOAT_CONTROL_PRECISE_FOR_ROUND)
+#pragma float_control(push)
+#pragma float_control(precise, on)
+#endif
+// function round: round to nearest integer (even). (result as double vector)
+static inline Vec2d round(Vec2d const& a)
+{
+#if INSTRSET >= 5  // SSE4.1 supported
+  return _mm_round_pd(a, 0 + 8);
+#else  // SSE2. Use magic number method
+  // Note: assume MXCSR control register is set to rounding
+  // (don't use conversion to int, it will limit the value to +/- 2^31)
+  Vec2d signmask = _mm_castsi128_pd(constant4ui<0, 0x80000000, 0, 0x80000000>());  // -0.0
+  Vec2d magic = _mm_castsi128_pd(constant4ui<0, 0x43300000, 0, 0x43300000>());     // magic number = 2^52
+  Vec2d sign = _mm_and_pd(a, signmask);                                            // signbit of a
+  Vec2d signedmagic = _mm_or_pd(magic, sign);                                      // magic number with sign of a
+  return a + signedmagic - signedmagic;                                            // round by adding magic number
+#endif
+}
+#if defined(FLOAT_CONTROL_PRECISE_FOR_ROUND)
+#pragma float_control(pop)
+#endif
+
+// function truncate: round towards zero. (result as double vector)
+static inline Vec2d truncate(Vec2d const& a)
+{
+// (note: may fail on MS Visual Studio 2008, works in later versions)
+#if INSTRSET >= 5  // SSE4.1 supported
+  return _mm_round_pd(a, 3 + 8);
+#else  // SSE2. Use magic number method (conversion to int would limit the value to 2^31)
+  uint32_t t1 = _mm_getcsr();                                                      // MXCSR
+  uint32_t t2 = t1 | (3 << 13);                                                    // bit 13-14 = 11
+  _mm_setcsr(t2);                                                                  // change MXCSR
+  Vec2d r = round(a);                                                              // use magic number method
+  _mm_setcsr(t1);                                                                  // restore MXCSR
+  return r;
+#endif
+}
+
+// function floor: round towards minus infinity. (result as double vector)
+// (note: may fail on MS Visual Studio 2008, works in later versions)
+static inline Vec2d floor(Vec2d const& a)
+{
+#if INSTRSET >= 5  // SSE4.1 supported
+  return _mm_round_pd(a, 1 + 8);
+#else  // SSE2. Use magic number method (conversion to int would limit the value to 2^31)
+  uint32_t t1 = _mm_getcsr();    // MXCSR
+  uint32_t t2 = t1 | (1 << 13);  // bit 13-14 = 01
+  _mm_setcsr(t2);                // change MXCSR
+  Vec2d r = round(a);            // use magic number method
+  _mm_setcsr(t1);                // restore MXCSR
+  return r;
+#endif
+}
+
+// function ceil: round towards plus infinity. (result as double vector)
+static inline Vec2d ceil(Vec2d const& a)
+{
+#if INSTRSET >= 5  // SSE4.1 supported
+  return _mm_round_pd(a, 2 + 8);
+#else  // SSE2. Use magic number method (conversion to int would limit the value to 2^31)
+  uint32_t t1 = _mm_getcsr();    // MXCSR
+  uint32_t t2 = t1 | (2 << 13);  // bit 13-14 = 10
+  _mm_setcsr(t2);                // change MXCSR
+  Vec2d r = round(a);            // use magic number method
+  _mm_setcsr(t1);                // restore MXCSR
+  return r;
+#endif
+}
+
+// function truncate_to_int: round towards zero.
+static inline Vec4i truncate_to_int(Vec2d const& a, Vec2d const& b)
+{
+  Vec4i t1 = _mm_cvttpd_epi32(a);
+  Vec4i t2 = _mm_cvttpd_epi32(b);
+  return blend4i<0, 1, 4, 5>(t1, t2);
+}
+
+// function round_to_int: round to nearest integer (even).
+// result as 32-bit integer vector
+static inline Vec4i round_to_int(Vec2d const& a, Vec2d const& b)
+{
+  // Note: assume MXCSR control register is set to rounding
+  Vec4i t1 = _mm_cvtpd_epi32(a);
+  Vec4i t2 = _mm_cvtpd_epi32(b);
+  return blend4i<0, 1, 4, 5>(t1, t2);
+}
+// function round_to_int: round to nearest integer (even).
+// result as 32-bit integer vector. Upper two values of result are 0
+static inline Vec4i round_to_int(Vec2d const& a)
+{
+  Vec4i t1 = _mm_cvtpd_epi32(a);
+  return t1;
+}
+
+// function truncate_to_int64: round towards zero. (inefficient)
+static inline Vec2q truncate_to_int64(Vec2d const& a)
+{
+#if defined(__AVX512DQ__) && defined(__AVX512VL__)
+  // return _mm_maskz_cvttpd_epi64( __mmask8(0xFF), a);
+  return _mm_cvttpd_epi64(a);
+#else
+  double aa[2];
+  a.store(aa);
+  return Vec2q(int64_t(aa[0]), int64_t(aa[1]));
+#endif
+}
+
+// function truncate_to_int64_limited: round towards zero. (inefficient)
+// result as 64-bit integer vector, but with limited range. Deprecated!
+static inline Vec2q truncate_to_int64_limited(Vec2d const& a)
+{
+#if defined(__AVX512DQ__) && defined(__AVX512VL__)
+  return truncate_to_int64(a);
+#else
+  // Note: assume MXCSR control register is set to rounding
+  Vec4i t1 = _mm_cvttpd_epi32(a);
+  return extend_low(t1);
+#endif
+}
+
+// function round_to_int64: round to nearest or even. (inefficient)
+static inline Vec2q round_to_int64(Vec2d const& a)
+{
+#if defined(__AVX512DQ__) && defined(__AVX512VL__)
+  return _mm_cvtpd_epi64(a);
+#else
+  return truncate_to_int64(round(a));
+#endif
+}
+
+// function round_to_int64_limited: round to nearest integer (even)
+// result as 64-bit integer vector, but with limited range. Deprecated!
+static inline Vec2q round_to_int64_limited(Vec2d const& a)
+{
+#if defined(__AVX512DQ__) && defined(__AVX512VL__)
+  return round_to_int64(a);
+#else
+  // Note: assume MXCSR control register is set to rounding
+  Vec4i t1 = _mm_cvtpd_epi32(a);
+  return extend_low(t1);
+#endif
+}
+
+// function to_double: convert integer vector elements to double vector (inefficient)
+static inline Vec2d to_double(Vec2q const& a)
+{
+#if defined(__AVX512DQ__) && defined(__AVX512VL__)
+  return _mm_maskz_cvtepi64_pd(__mmask8(0xFF), a);
+#else
+  int64_t aa[2];
+  a.store(aa);
+  return Vec2d(double(aa[0]), double(aa[1]));
+#endif
+}
+
+// function to_double_limited: convert integer vector elements to double vector
+// limited to abs(x) < 2^31. Deprecated!
+static inline Vec2d to_double_limited(Vec2q const& x)
+{
+#if defined(__AVX512DQ__) && defined(__AVX512VL__)
+  return to_double(x);
+#else
+  Vec4i compressed = permute4i<0, 2, -256, -256>(Vec4i(x));
+  return _mm_cvtepi32_pd(compressed);
+#endif
+}
+
+// function to_double_low: convert integer vector elements [0] and [1] to double vector
+static inline Vec2d to_double_low(Vec4i const& a) { return _mm_cvtepi32_pd(a); }
+
+// function to_double_high: convert integer vector elements [2] and [3] to double vector
+static inline Vec2d to_double_high(Vec4i const& a) { return to_double_low(_mm_srli_si128(a, 8)); }
+
+// function compress: convert two Vec2d to one Vec4f
+static inline Vec4f compress(Vec2d const& low, Vec2d const& high)
+{
+  Vec4f t1 = _mm_cvtpd_ps(low);
+  Vec4f t2 = _mm_cvtpd_ps(high);
+  return blend4f<0, 1, 4, 5>(t1, t2);
+}
+
+// Function extend_low : convert Vec4f vector elements [0] and [1] to Vec2d
+static inline Vec2d extend_low(Vec4f const& a) { return _mm_cvtps_pd(a); }
+
+// Function extend_high : convert Vec4f vector elements [2] and [3] to Vec2d
+static inline Vec2d extend_high(Vec4f const& a) { return _mm_cvtps_pd(_mm_movehl_ps(a, a)); }
+
+// Fused multiply and add functions
+
+// Multiply and add
+static inline Vec2d mul_add(Vec2d const& a, Vec2d const& b, Vec2d const& c)
+{
+#ifdef __FMA__
+  return _mm_fmadd_pd(a, b, c);
+#elif defined(__FMA4__)
+  return _mm_macc_pd(a, b, c);
+#else
+  return a * b + c;
+#endif
+}
+
+// Multiply and subtract
+static inline Vec2d mul_sub(Vec2d const& a, Vec2d const& b, Vec2d const& c)
+{
+#ifdef __FMA__
+  return _mm_fmsub_pd(a, b, c);
+#elif defined(__FMA4__)
+  return _mm_msub_pd(a, b, c);
+#else
+  return a * b - c;
+#endif
+}
+
+// Multiply and inverse subtract
+static inline Vec2d nmul_add(Vec2d const& a, Vec2d const& b, Vec2d const& c)
+{
+#ifdef __FMA__
+  return _mm_fnmadd_pd(a, b, c);
+#elif defined(__FMA4__)
+  return _mm_nmacc_pd(a, b, c);
+#else
+  return c - a * b;
+#endif
+}
+
+// Multiply and subtract with extra precision on the intermediate calculations,
+// even if FMA instructions not supported, using Veltkamp-Dekker split
+static inline Vec2d mul_sub_x(Vec2d const& a, Vec2d const& b, Vec2d const& c)
+{
+#ifdef __FMA__
+  return _mm_fmsub_pd(a, b, c);
+#elif defined(__FMA4__)
+  return _mm_msub_pd(a, b, c);
+#else
+  // calculate a * b - c with extra precision
+  Vec2q upper_mask = -(1LL << 27);                             // mask to remove lower 27 bits
+  Vec2d a_high     = a & Vec2d(_mm_castsi128_pd(upper_mask));  // split into high and low parts
+  Vec2d b_high     = b & Vec2d(_mm_castsi128_pd(upper_mask));
+  Vec2d a_low      = a - a_high;
+  Vec2d b_low      = b - b_high;
+  Vec2d r1         = a_high * b_high;                                         // this product is exact
+  Vec2d r2         = r1 - c;                                                  // subtract c from high product
+  Vec2d r3         = r2 + (a_high * b_low + b_high * a_low) + a_low * b_low;  // add rest of product
+  return r3;                                                                  // + ((r2 - r1) + c);
+#endif
+}
+
+// Math functions using fast bit manipulation
+
+// Extract the exponent as an integer
+// exponent(a) = floor(log2(abs(a)));
+// exponent(1.0) = 0, exponent(0.0) = -1023, exponent(INF) = +1024, exponent(NAN) = +1024
+static inline Vec2q exponent(Vec2d const& a)
+{
+  Vec2uq t1 = _mm_castpd_si128(a);  // reinterpret as 64-bit integer
+  Vec2uq t2 = t1 << 1;              // shift out sign bit
+  Vec2uq t3 = t2 >> 53;             // shift down logical to position 0
+  Vec2q t4  = Vec2q(t3) - 0x3FF;    // subtract bias from exponent
+  return t4;
+}
+
+// Extract the fraction part of a floating point number
+// a = 2^exponent(a) * fraction(a), except for a = 0
+// fraction(1.0) = 1.0, fraction(5.0) = 1.25
+// NOTE: The name fraction clashes with an ENUM in MAC XCode CarbonCore script.h !
+static inline Vec2d fraction(Vec2d const& a)
+{
+  Vec2uq t1 = _mm_castpd_si128(a);                                         // reinterpret as 64-bit integer
+  Vec2uq t2 = Vec2uq((t1 & 0x000FFFFFFFFFFFFFll) | 0x3FF0000000000000ll);  // set exponent to 0 + bias
+  return _mm_castsi128_pd(t2);
+}
+
+// Fast calculation of pow(2,n) with n integer
+// n  =     0 gives 1.0
+// n >=  1024 gives +INF
+// n <= -1023 gives 0.0
+// This function will never produce denormals, and never raise exceptions
+static inline Vec2d exp2(Vec2q const& n)
+{
+  Vec2q t1 = max(n, -0x3FF);  // limit to allowed range
+  Vec2q t2 = min(t1, 0x400);
+  Vec2q t3 = t2 + 0x3FF;        // add bias
+  Vec2q t4 = t3 << 52;          // put exponent into position 52
+  return _mm_castsi128_pd(t4);  // reinterpret as double
+}
+// static Vec2d exp2(Vec2d const & x); // defined in vectormath_exp.h
+
+// Categorization functions
+
+// Function sign_bit: gives true for elements that have the sign bit set
+// even for -0.0, -INF and -NAN
+// Note that sign_bit(Vec2d(-0.0)) gives true, while Vec2d(-0.0) < Vec2d(0.0) gives false
+static inline Vec2db sign_bit(Vec2d const& a)
+{
+  Vec2q t1 = _mm_castpd_si128(a);  // reinterpret as 64-bit integer
+  Vec2q t2 = t1 >> 63;             // extend sign bit
+  return _mm_castsi128_pd(t2);     // reinterpret as 64-bit Boolean
+}
+
+// Function sign_combine: changes the sign of a when b has the sign bit set
+// same as select(sign_bit(b), -a, a)
+static inline Vec2d sign_combine(Vec2d const& a, Vec2d const& b)
+{
+  Vec2d signmask = _mm_castsi128_pd(constant4ui<0, 0x80000000, 0, 0x80000000>());  // -0.0
+  return a ^ (b & signmask);
+}
+
+// Function is_finite: gives true for elements that are normal, denormal or zero,
+// false for INF and NAN
+static inline Vec2db is_finite(Vec2d const& a)
+{
+  Vec2q t1  = _mm_castpd_si128(a);   // reinterpret as integer
+  Vec2q t2  = t1 << 1;               // shift out sign bit
+  Vec2q t3  = 0xFFE0000000000000ll;  // exponent mask
+  Vec2qb t4 = Vec2q(t2 & t3) != t3;  // exponent field is not all 1s
+  return t4;
+}
+
+// Function is_inf: gives true for elements that are +INF or -INF
+// false for finite numbers and NAN
+static inline Vec2db is_inf(Vec2d const& a)
+{
+  Vec2q t1 = _mm_castpd_si128(a);     // reinterpret as integer
+  Vec2q t2 = t1 << 1;                 // shift out sign bit
+  return t2 == 0xFFE0000000000000ll;  // exponent is all 1s, fraction is 0
+}
+
+// Function is_nan: gives true for elements that are +NAN or -NAN
+// false for finite numbers and +/-INF
+static inline Vec2db is_nan(Vec2d const& a)
+{
+  Vec2q t1 = _mm_castpd_si128(a);         // reinterpret as integer
+  Vec2q t2 = t1 << 1;                     // shift out sign bit
+  Vec2q t3 = 0xFFE0000000000000ll;        // exponent mask
+  Vec2q t4 = t2 & t3;                     // exponent
+  Vec2q t5 = _mm_andnot_si128(t3, t2);    // fraction
+  return Vec2qb((t4 == t3) & (t5 != 0));  // exponent = all 1s and fraction != 0
+}
+
+// Function is_subnormal: gives true for elements that are subnormal (denormal)
+// false for finite numbers, zero, NAN and INF
+static inline Vec2db is_subnormal(Vec2d const& a)
+{
+  Vec2q t1 = _mm_castpd_si128(a);        // reinterpret as 64-bit integer
+  Vec2q t2 = t1 << 1;                    // shift out sign bit
+  Vec2q t3 = 0xFFE0000000000000ll;       // exponent mask
+  Vec2q t4 = t2 & t3;                    // exponent
+  Vec2q t5 = _mm_andnot_si128(t3, t2);   // fraction
+  return Vec2qb((t4 == 0) & (t5 != 0));  // exponent = 0 and fraction != 0
+}
+
+// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
+// false for finite numbers, NAN and INF
+static inline Vec2db is_zero_or_subnormal(Vec2d const& a)
+{
+  Vec2q t = _mm_castpd_si128(a);  // reinterpret as 64-bit integer
+  t &= 0x7FF0000000000000ll;      // isolate exponent
+  return t == 0;                  // exponent = 0
+}
+
+// Function infinite2d: returns a vector where all elements are +INF
+static inline Vec2d infinite2d() { return _mm_castsi128_pd(_mm_setr_epi32(0, 0x7FF00000, 0, 0x7FF00000)); }
+
+// Function nan2d: returns a vector where all elements are +NAN (quiet)
+static inline Vec2d nan2d(int n = 0x10) { return _mm_castsi128_pd(_mm_setr_epi32(n, 0x7FF80000, n, 0x7FF80000)); }
+
+/*****************************************************************************
+ *
+ *          Functions for reinterpretation between vector types
+ *
+ *****************************************************************************/
+
+static inline __m128i reinterpret_i(__m128i const& x) { return x; }
+
+static inline __m128i reinterpret_i(__m128 const& x) { return _mm_castps_si128(x); }
+
+static inline __m128i reinterpret_i(__m128d const& x) { return _mm_castpd_si128(x); }
+
+static inline __m128 reinterpret_f(__m128i const& x) { return _mm_castsi128_ps(x); }
+
+static inline __m128 reinterpret_f(__m128 const& x) { return x; }
+
+static inline __m128 reinterpret_f(__m128d const& x) { return _mm_castpd_ps(x); }
+
+static inline __m128d reinterpret_d(__m128i const& x) { return _mm_castsi128_pd(x); }
+
+static inline __m128d reinterpret_d(__m128 const& x) { return _mm_castps_pd(x); }
+
+static inline __m128d reinterpret_d(__m128d const& x) { return x; }
+
+/*****************************************************************************
+ *
+ *          Vector permute and blend functions
+ *
+ ******************************************************************************
+ *
+ * The permute function can reorder the elements of a vector and optionally
+ * set some elements to zero.
+ *
+ * The indexes are inserted as template parameters in <>. These indexes must be
+ * constants. Each template parameter is an index to the element you want to
+ * select. An index of -1 will generate zero. An index of -256 means don't care.
+ *
+ * Example:
+ * Vec2d a(10., 11.);              // a is (10, 11)
+ * Vec2d b, c;
+ * b = permute2d<1,1>(a);          // b is (11, 11)
+ * c = permute2d<-1,0>(a);         // c is ( 0, 10)
+ *
+ *
+ * The blend function can mix elements from two different vectors and
+ * optionally set some elements to zero.
+ *
+ * The indexes are inserted as template parameters in <>. These indexes must be
+ * constants. Each template parameter is an index to the element you want to
+ * select, where indexes 0 - 1 indicate an element from the first source
+ * vector and indexes 2 - 3 indicate an element from the second source vector.
+ * An index of -1 will generate zero.
+ *
+ *
+ * Example:
+ * Vec2d a(10., 11.);              // a is (10, 11)
+ * Vec2d b(20., 21.);              // b is (20, 21)
+ * Vec2d c;
+ * c = blend2d<0,3> (a,b);         // c is (10, 21)
+ *
+ * A lot of the code here is metaprogramming aiming to find the instructions
+ * that best fit the template parameters and instruction set. The metacode
+ * will be reduced out to leave only a few vector instructions in release
+ * mode with optimization on.
+ *****************************************************************************/
+
+// permute vector Vec2d
+template <int i0, int i1>
+static inline Vec2d permute2d(Vec2d const& a)
+{
+  // is shuffling needed
+  const bool do_shuffle = (i0 > 0) || (i1 != 1 && i1 >= 0);
+  // is zeroing needed
+  const bool do_zero = ((i0 | i1) < 0 && (i0 | i1) & 0x80);
+
+  if(do_zero && !do_shuffle)
+    {  // zeroing, not shuffling
+      if((i0 & i1) < 0)
+        return _mm_setzero_pd();  // zero everything
+      // zero some elements
+      __m128i mask1 = constant4i<-int(i0 >= 0), -int(i0 >= 0), -int(i1 >= 0), -int(i1 >= 0)>();
+      return _mm_and_pd(a, _mm_castsi128_pd(mask1));  // zero with AND mask
+    }
+  else if(do_shuffle && !do_zero)
+    {  // shuffling, not zeroing
+      return _mm_shuffle_pd(a, a, (i0 & 1) | (i1 & 1) << 1);
+    }
+  else if(do_shuffle && do_zero)
+    {  // shuffling and zeroing
+      // both shuffle and zero
+      if(i0 < 0 && i1 >= 0)
+        {  // zero low half, shuffle high half
+          return _mm_shuffle_pd(_mm_setzero_pd(), a, (i1 & 1) << 1);
+        }
+      if(i0 >= 0 && i1 < 0)
+        {  // shuffle low half, zero high half
+          return _mm_shuffle_pd(a, _mm_setzero_pd(), i0 & 1);
+        }
+    }
+  return a;  // trivial case: do nothing
+}
+
+// blend vectors Vec2d
+template <int i0, int i1>
+static inline Vec2d blend2d(Vec2d const& a, Vec2d const& b)
+{
+  // Combine all the indexes into a single bitfield, with 8 bits for each
+  const int m1 = (i0 & 3) | (i1 & 3) << 8;
+
+  // Mask to zero out negative indexes
+  const int m2 = (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF) << 8;
+
+  if((m1 & 0x0202 & m2) == 0)
+    {
+      // no elements from b, only elements from a and possibly zero
+      return permute2d<i0, i1>(a);
+    }
+  if(((m1 ^ 0x0202) & 0x0202 & m2) == 0)
+    {
+      // no elements from a, only elements from b and possibly zero
+      return permute2d<i0 & ~2, i1 & ~2>(b);
+    }
+  // selecting from both a and b without zeroing
+  if((i0 & 2) == 0)
+    {  // first element from a, second element from b
+      return _mm_shuffle_pd(a, b, (i0 & 1) | (i1 & 1) << 1);
+    }
+  else
+    {  // first element from b, second element from a
+      return _mm_shuffle_pd(b, a, (i0 & 1) | (i1 & 1) << 1);
+    }
+}
+
+// change signs on vectors Vec2d
+// Each index i0 - i1 is 1 for changing sign on the corresponding element, 0 for no change
+template <int i0, int i1>
+static inline Vec2d change_sign(Vec2d const& a)
+{
+  if((i0 | i1) == 0)
+    return a;
+  __m128i mask = constant4ui < 0, i0 ? 0x80000000 : 0, 0, i1 ? 0x80000000 : 0 > ();
+  return _mm_xor_pd(a, _mm_castsi128_pd(mask));  // flip sign bits
+}
+
+/*****************************************************************************
+ *
+ *          Vector lookup functions
+ *
+ ******************************************************************************
+ *
+ * These functions use vector elements as indexes into a table.
+ * The table is given as one or more vectors or as an array.
+ *
+ * This can be used for several purposes:
+ *  - table lookup
+ *  - permute or blend with variable indexes
+ *  - blend from more than two sources
+ *  - gather non-contiguous data
+ *
+ * An index out of range may produce any value - the actual value produced is
+ * implementation dependent and may be different for different instruction
+ * sets. An index out of range does not produce an error message or exception.
+ *
+ * Example:
+ * Vec4i a(2,0,0,3);               // index  a is (  2,   0,   0,   3)
+ * Vec4f b(1.0f,1.1f,1.2f,1.3f);   // table  b is (1.0, 1.1, 1.2, 1.3)
+ * Vec4f c;
+ * c = lookup4 (a,b);              // result c is (1.2, 1.0, 1.0, 1.3)
+ *
+ *****************************************************************************/
+
+static inline Vec4f lookup4(Vec4i const& index, Vec4f const& table)
+{
+#if INSTRSET >= 7  // AVX
+  return _mm_permutevar_ps(table, index);
+#else
+  int32_t ii[4];
+  float tt[6];
+  table.store(tt);
+  (index & 3).store(ii);
+  __m128 r01 = _mm_loadh_pi(_mm_load_ss(&tt[ii[0]]), (const __m64*)&tt[ii[1]]);
+  __m128 r23 = _mm_loadh_pi(_mm_load_ss(&tt[ii[2]]), (const __m64*)&tt[ii[3]]);
+  return _mm_shuffle_ps(r01, r23, 0x88);
+#endif
+}
+
+static inline Vec4f lookup8(Vec4i const& index, Vec4f const& table0, Vec4f const& table1)
+{
+#if INSTRSET >= 8                                                               // AVX2
+  __m256 tt = _mm256_insertf128_ps(_mm256_castps128_ps256(table0), table1, 1);  // combine tables
+
+#if defined(_MSC_VER) && _MSC_VER < 1700 && !defined(__INTEL_COMPILER)
+  // bug in MS VS 11 beta: operands in wrong order
+  __m128 r =
+      _mm256_castps256_ps128(_mm256_permutevar8x32_ps(_mm256_castsi256_ps(_mm256_castsi128_si256(index)), _mm256_castps_si256(tt)));
+  r = _mm_and_ps(r, r);  // fix another bug in VS 11 beta (would store r as 256 bits aligned by 16)
+#elif defined(GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__)
+  // Gcc 4.7.0 has wrong parameter type and operands in wrong order
+  __m128 r = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(_mm256_castsi256_ps(_mm256_castsi128_si256(index)), tt));
+#else
+  // no bug version
+  __m128 r = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(tt, _mm256_castsi128_si256(index)));
+#endif
+  return r;
+
+#elif INSTRSET >= 7  // AVX
+  __m128 r0 = _mm_permutevar_ps(table0, index);
+  __m128 r1 = _mm_permutevar_ps(table1, index);
+  __m128i i4 = _mm_slli_epi32(index, 29);
+  return _mm_blendv_ps(r0, r1, _mm_castsi128_ps(i4));
+
+#elif INSTRSET >= 5  // SSE4.1
+  Vec4f r0   = lookup4(index, table0);
+  Vec4f r1   = lookup4(index, table1);
+  __m128i i4 = _mm_slli_epi32(index, 29);
+  return _mm_blendv_ps(r0, r1, _mm_castsi128_ps(i4));
+
+#else  // SSE2
+  Vec4f r0   = lookup4(index, table0);
+  Vec4f r1   = lookup4(index, table1);
+  __m128i i4 = _mm_srai_epi32(_mm_slli_epi32(index, 29), 31);
+  return selectf(_mm_castsi128_ps(i4), r1, r0);
+#endif
+}
+
+template <int n>
+static inline Vec4f lookup(Vec4i const& index, float const* table)
+{
+  if(n <= 0)
+    return 0.0f;
+  if(n <= 4)
+    return lookup4(index, Vec4f().load(table));
+  if(n <= 8)
+    {
+#if INSTRSET >= 8  // AVX2
+      __m256 tt = _mm256_loadu_ps(table);
+#if defined(_MSC_VER) && _MSC_VER < 1700 && !defined(__INTEL_COMPILER)
+      // bug in MS VS 11 beta: operands in wrong order
+      __m128 r = _mm256_castps256_ps128(
+          _mm256_permutevar8x32_ps(_mm256_castsi256_ps(_mm256_castsi128_si256(index)), _mm256_castps_si256(tt)));
+      r = _mm_and_ps(r, r);  // fix another bug in VS 11 beta (would store r as 256 bits aligned by 16)
+#elif defined(GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__)
+      // Gcc 4.7.0 has wrong parameter type and operands in wrong order
+      __m128 r = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(_mm256_castsi256_ps(_mm256_castsi128_si256(index)), tt));
+#else
+      // version for compilers that do not have these bugs
+      __m128 r = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(tt, _mm256_castsi128_si256(index)));
+#endif
+      return r;
+#else   // not AVX2
+      return lookup8(index, Vec4f().load(table), Vec4f().load(table + 4));
+#endif  // INSTRSET
+    }
+  // n > 8. Limit index
+  Vec4ui index1;
+  if((n & (n - 1)) == 0)
+    {
+      // n is a power of 2, make index modulo n
+      index1 = Vec4ui(index) & (n - 1);
+    }
+  else
+    {
+      // n is not a power of 2, limit to n-1
+      index1 = min(Vec4ui(index), n - 1);
+    }
+#if INSTRSET >= 8  // AVX2
+  return _mm_i32gather_ps(table, index1, 4);
+#else
+  uint32_t ii[4];
+  index1.store(ii);
+  return Vec4f(table[ii[0]], table[ii[1]], table[ii[2]], table[ii[3]]);
+#endif
+}
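+
+// Usage sketch (added for illustration, not part of the upstream header; "tab" is a
+// caller-provided table):
+//   float tab[16] = { /* ... */ };
+//   Vec4i idx(3, 0, 15, 7);
+//   Vec4f v = lookup<16>(idx, tab);   // v = (tab[3], tab[0], tab[15], tab[7])
+// Because 16 is a power of 2, out-of-range indexes wrap modulo 16 instead of being clamped to 15.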
+
+static inline Vec2d lookup2(Vec2q const& index, Vec2d const& table)
+{
+#if INSTRSET >= 7  // AVX
+  return _mm_permutevar_pd(table, index + index);
+#else
+  int32_t ii[4];
+  double tt[2];
+  table.store(tt);
+  (index & 1).store(ii);
+  return Vec2d(tt[ii[0]], tt[ii[2]]);
+#endif
+}
+
+static inline Vec2d lookup4(Vec2q const& index, Vec2d const& table0, Vec2d const& table1)
+{
+#if INSTRSET >= 7                // AVX
+  Vec2q index2 = index + index;  // index << 1
+  __m128d r0   = _mm_permutevar_pd(table0, index2);
+  __m128d r1   = _mm_permutevar_pd(table1, index2);
+  __m128i i4   = _mm_slli_epi64(index, 62);
+  return _mm_blendv_pd(r0, r1, _mm_castsi128_pd(i4));
+#else
+  int32_t ii[4];
+  double tt[4];
+  table0.store(tt);
+  table1.store(tt + 2);
+  (index & 3).store(ii);
+  return Vec2d(tt[ii[0]], tt[ii[2]]);
+#endif
+}
+
+template <int n>
+static inline Vec2d lookup(Vec2q const& index, double const* table)
+{
+  if(n <= 0)
+    return 0.0;
+  if(n <= 2)
+    return lookup2(index, Vec2d().load(table));
+#if INSTRSET < 8  // not AVX2
+  if(n <= 4)
+    return lookup4(index, Vec2d().load(table), Vec2d().load(table + 2));
+#endif
+  // Limit index
+  Vec2uq index1;
+  if((n & (n - 1)) == 0)
+    {
+      // n is a power of 2, make index modulo n
+      index1 = Vec2uq(index) & (n - 1);
+    }
+  else
+    {
+      // n is not a power of 2, limit to n-1
+      index1 = min(Vec2uq(index), n - 1);
+    }
+#if INSTRSET >= 8  // AVX2
+  return _mm_i64gather_pd(table, index1, 8);
+#else
+  uint32_t ii[4];
+  index1.store(ii);
+  return Vec2d(table[ii[0]], table[ii[2]]);
+#endif
+}
+
+/*****************************************************************************
+ *
+ *          Gather functions with fixed indexes
+ *
+ *****************************************************************************/
+// Load elements from array a with indices i0, i1, i2, i3
+template <int i0, int i1, int i2, int i3>
+static inline Vec4f gather4f(void const* a)
+{
+  return reinterpret_f(gather4i<i0, i1, i2, i3>(a));
+}
+
+// Load elements from array a with indices i0, i1
+template <int i0, int i1>
+static inline Vec2d gather2d(void const* a)
+{
+  return reinterpret_d(gather2q<i0, i1>(a));
+}
+
+/*****************************************************************************
+ *
+ *          Vector scatter functions
+ *
+ ******************************************************************************
+ *
+ * These functions write the elements of a vector to arbitrary positions in an
+ * array in memory. Each vector element is written to an array position
+ * determined by an index. An element is not written if the corresponding
+ * index is out of range.
+ * The indexes can be specified as constant template parameters or as an
+ * integer vector.
+ *
+ * The scatter functions are useful if the data are distributed in a sparse
+ * manner into the array. If the array is dense, it is more efficient
+ * to permute the data into the right positions and then write the whole
+ * permuted vector into the array.
+ *
+ * Example:
+ * Vec8d a(10,11,12,13,14,15,16,17);
+ * double b[16] = {0};
+ * scatter<0,2,14,10,1,-1,5,9>(a,b);
+ * // Now, b = {10,14,11,0,0,16,0,0,0,17,13,0,0,0,12,0}
+ *
+ *****************************************************************************/
+
+template <int i0, int i1, int i2, int i3>
+static inline void scatter(Vec4f const& data, float* array)
+{
+#if defined(__AVX512VL__)
+  __m128i indx   = constant4i<i0, i1, i2, i3>();
+  __mmask16 mask = uint16_t(i0 >= 0 | (i1 >= 0) << 1 | (i2 >= 0) << 2 | (i3 >= 0) << 3);
+  _mm_mask_i32scatter_ps(array, mask, indx, data, 4);
+#else
+  const int index[4] = {i0, i1, i2, i3};
+  for(int i = 0; i < 4; i++)
+    {
+      if(index[i] >= 0)
+        array[index[i]] = data[i];
+    }
+#endif
+}
+
+template <int i0, int i1>
+static inline void scatter(Vec2d const& data, double* array)
+{
+  if(i0 >= 0)
+    array[i0] = data[0];
+  if(i1 >= 0)
+    array[i1] = data[1];
+}
+
+static inline void scatter(Vec4i const& index, uint32_t limit, Vec4f const& data, float* array)
+{
+#if defined(__AVX512VL__)
+  __mmask16 mask = _mm_cmplt_epu32_mask(index, Vec4ui(limit));
+  _mm_mask_i32scatter_ps(array, mask, index, data, 4);
+#else
+  for(int i = 0; i < 4; i++)
+    {
+      if(uint32_t(index[i]) < limit)
+        array[index[i]] = data[i];
+    }
+#endif
+}
+
+static inline void scatter(Vec2q const& index, uint32_t limit, Vec2d const& data, double* array)
+{
+  if(uint64_t(index[0]) < uint64_t(limit))
+    array[index[0]] = data[0];
+  if(uint64_t(index[1]) < uint64_t(limit))
+    array[index[1]] = data[1];
+}
+
+static inline void scatter(Vec4i const& index, uint32_t limit, Vec2d const& data, double* array)
+{
+  if(uint32_t(index[0]) < limit)
+    array[index[0]] = data[0];
+  if(uint32_t(index[1]) < limit)
+    array[index[1]] = data[1];
+}
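+
+// Illustrative example (not from the upstream header), scattering with a run-time index
+// vector and a limit:
+//   float dst[8] = {0};
+//   Vec4f val(1.f, 2.f, 3.f, 4.f);
+//   Vec4i pos(6, 2, 9, 0);         // 9 is outside the limit and is not written
+//   scatter(pos, 8, val, dst);     // dst becomes {4, 0, 2, 0, 0, 0, 1, 0}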
+
+/*****************************************************************************
+ *
+ *          Horizontal scan functions
+ *
+ *****************************************************************************/
+
+// Get index to the first element that is true. Return -1 if all are false
+static inline int horizontal_find_first(Vec4fb const& x) { return horizontal_find_first(Vec4ib(x)); }
+
+static inline int horizontal_find_first(Vec2db const& x) { return horizontal_find_first(Vec2qb(x)); }
+
+// Count the number of elements that are true
+static inline uint32_t horizontal_count(Vec4fb const& x) { return horizontal_count(Vec4ib(x)); }
+
+static inline uint32_t horizontal_count(Vec2db const& x) { return horizontal_count(Vec2qb(x)); }
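+
+// Example (a sketch added for clarity, not in the upstream source):
+//   Vec4f    x(1.f, -2.f, 3.f, -4.f);
+//   Vec4fb   neg   = x < 0.0f;                    // (false, true, false, true)
+//   int      first = horizontal_find_first(neg);  // 1
+//   uint32_t cnt   = horizontal_count(neg);       // 2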
+
+/*****************************************************************************
+ *
+ *          Boolean <-> bitfield conversion functions
+ *
+ *****************************************************************************/
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec4fb const& x) { return to_bits(Vec4ib(x)); }
+
+// to_Vec4fb: convert integer bitfield to boolean vector
+static inline Vec4fb to_Vec4fb(uint8_t x) { return Vec4fb(to_Vec4ib(x)); }
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec2db const& x) { return to_bits(Vec2qb(x)); }
+
+// to_Vec2db: convert integer bitfield to boolean vector
+static inline Vec2db to_Vec2db(uint8_t x) { return Vec2db(to_Vec2qb(x)); }
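+
+// Example (illustrative, not part of the upstream header); to_bits puts element 0 into bit 0:
+//   Vec4fb  m(true, false, true, true);
+//   uint8_t bits = to_bits(m);       // bits = 0b1101 = 13
+//   Vec4fb  m2   = to_Vec4fb(bits);  // round trip back to a Boolean vector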
+
+#ifdef VCL_NAMESPACE
+}
+#endif
+
+#endif  // VECTORF128_H
diff --git a/src/vectorclass/vectorf256.h b/src/vectorclass/vectorf256.h
new file mode 100644
index 0000000000000000000000000000000000000000..04d462ddc50474ab8f368490d737489b17933d49
--- /dev/null
+++ b/src/vectorclass/vectorf256.h
@@ -0,0 +1,3359 @@
+/****************************  vectorf256.h   *******************************
+ * Author:        Agner Fog
+ * Date created:  2012-05-30
+ * Last modified: 2017-07-27
+ * Version:       1.30
+ * Project:       vector classes
+ * Description:
+ * Header file defining 256-bit floating point vector classes as interface
+ * to intrinsic functions in x86 microprocessors with AVX instruction set.
+ *
+ * Instructions:
+ * Use Gnu, Intel or Microsoft C++ compiler. Compile for the desired
+ * instruction set, which must be at least AVX.
+ *
+ * The following vector classes are defined here:
+ * Vec8f     Vector of 8 single precision floating point numbers
+ * Vec8fb    Vector of 8 Booleans for use with Vec8f
+ * Vec4d     Vector of 4 double precision floating point numbers
+ * Vec4db    Vector of 4 Booleans for use with Vec4d
+ *
+ * Each vector object is represented internally in the CPU as a 256-bit register.
+ * This header file defines operators and functions for these vectors.
+ *
+ * For example:
+ * Vec4d a(1., 2., 3., 4.), b(5., 6., 7., 8.), c;
+ * c = a + b;     // now c contains (6., 8., 10., 12.)
+ *
+ * For detailed instructions, see VectorClass.pdf
+ *
+ * (c) Copyright 2012-2017 GNU General Public License http://www.gnu.org/licenses
+ *****************************************************************************/
+
+// check combination of header files
+#if defined(VECTORF256_H)
+#if VECTORF256_H != 2
+#error Two different versions of vectorf256.h included
+#endif
+#else
+#define VECTORF256_H 2
+
+#if INSTRSET < 7  // AVX required
+#error Please compile for the AVX instruction set or higher
+#endif
+
+#include "vectorf128.h"  // Define 128-bit vectors
+
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE
+{
+#endif
+
+/*****************************************************************************
+ *
+ *          select functions
+ *
+ *****************************************************************************/
+// Select between two __m256 sources, element by element. Used in various functions
+// and operators. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either 0 (false) or 0xFFFFFFFF (true).
+static inline __m256 selectf(__m256 const& s, __m256 const& a, __m256 const& b) { return _mm256_blendv_ps(b, a, s); }
+
+// Same, with two __m256d sources.
+// and operators. Corresponds to this pseudocode:
+// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true). No other
+// values are allowed.
+static inline __m256d selectd(__m256d const& s, __m256d const& a, __m256d const& b) { return _mm256_blendv_pd(b, a, s); }
+
+/*****************************************************************************
+ *
+ *          Generate compile-time constant vector
+ *
+ *****************************************************************************/
+// Generate a constant vector of 8 integers stored in memory,
+// load as __m256
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline __m256 constant8f()
+{
+  static const union
+  {
+    int i[8];
+    __m256 ymm;
+  } u = {{i0, i1, i2, i3, i4, i5, i6, i7}};
+  return u.ymm;
+}
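+
+// Usage sketch (added for illustration, not part of the upstream header):
+//   __m256 half = constant8f<0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000,
+//                            0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000>();  // eight copies of 0.5f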
+
+/*****************************************************************************
+ *
+ *         Join two 128-bit vectors
+ *
+ *****************************************************************************/
+#define set_m128r(lo, hi) _mm256_insertf128_ps(_mm256_castps128_ps256(lo), (hi), 1)
+// _mm256_set_m128(hi,lo); // not defined in all versions of immintrin.h
+
+/*****************************************************************************
+ *
+ *          Vec8fb: Vector of 8 Booleans for use with Vec8f
+ *
+ *****************************************************************************/
+
+class Vec8fb
+{
+ protected:
+  __m256 ymm;  // Float vector
+ public:
+  // Default constructor:
+  Vec8fb() {}
+  // Constructor to build from all elements:
+  Vec8fb(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7)
+  {
+#if INSTRSET >= 8  // AVX2
+    ymm = _mm256_castsi256_ps(_mm256_setr_epi32(-(int)b0, -(int)b1, -(int)b2, -(int)b3, -(int)b4, -(int)b5, -(int)b6, -(int)b7));
+#else
+    __m128 blo = _mm_castsi128_ps(_mm_setr_epi32(-(int)b0, -(int)b1, -(int)b2, -(int)b3));
+    __m128 bhi = _mm_castsi128_ps(_mm_setr_epi32(-(int)b4, -(int)b5, -(int)b6, -(int)b7));
+    ymm        = set_m128r(blo, bhi);
+#endif
+  }
+  // Constructor to build from two Vec4fb:
+  Vec8fb(Vec4fb const& a0, Vec4fb const& a1) { ymm = set_m128r(a0, a1); }
+  // Constructor to convert from type __m256 used in intrinsics:
+  Vec8fb(__m256 const& x) { ymm = x; }
+  // Assignment operator to convert from type __m256 used in intrinsics:
+  Vec8fb& operator=(__m256 const& x)
+  {
+    ymm = x;
+    return *this;
+  }
+  // Constructor to broadcast the same value into all elements:
+  Vec8fb(bool b)
+  {
+#if INSTRSET >= 8  // AVX2
+    ymm = _mm256_castsi256_ps(_mm256_set1_epi32(-(int)b));
+#else
+    __m128 b1  = _mm_castsi128_ps(_mm_set1_epi32(-(int)b));
+    // ymm = _mm256_set_m128(b1,b1);
+    ymm        = set_m128r(b1, b1);
+#endif
+  }
+  // Assignment operator to broadcast scalar value:
+  Vec8fb& operator=(bool b)
+  {
+    *this = Vec8fb(b);
+    return *this;
+  }
+
+ private:  // Prevent constructing from int, etc.
+  Vec8fb(int b);
+  Vec8fb& operator=(int x);
+
+ public:
+  // Type cast operator to convert to __m256 used in intrinsics
+  operator __m256() const { return ymm; }
+#if defined(VECTORI256_H)
+#if VECTORI256_H >= 2  // AVX2 version
+  // Constructor to convert from type Vec8ib used as Boolean for integer vectors
+  Vec8fb(Vec8ib const& x) { ymm = _mm256_castsi256_ps(x); }
+  // Assignment operator to convert from type Vec8ib used as Boolean for integer vectors
+  Vec8fb& operator=(Vec8ib const& x)
+  {
+    ymm = _mm256_castsi256_ps(x);
+    return *this;
+  }
+#ifndef FIX_CLANG_VECTOR_ALIAS_AMBIGUITY
+  // Type cast operator to convert to type Vec8ib used as Boolean for integer vectors
+  operator Vec8ib() const { return _mm256_castps_si256(ymm); }
+#endif
+#else
+  // Constructor to convert from type Vec8ib used as Boolean for integer vectors
+  Vec8fb(Vec8ib const& x) { ymm = set_m128r(_mm_castsi128_ps(x.get_low()), _mm_castsi128_ps(x.get_high())); }
+  // Assignment operator to convert from type Vec8ib used as Boolean for integer vectors
+  Vec8fb& operator=(Vec8ib const& x)
+  {
+    ymm = set_m128r(_mm_castsi128_ps(x.get_low()), _mm_castsi128_ps(x.get_high()));
+    return *this;
+  }
+  // Type cast operator to convert to type Vec8ib used as Boolean for integer vectors
+  operator Vec8ib() const { return Vec8i(_mm_castps_si128(get_low()), _mm_castps_si128(get_high())); }
+#endif
+#endif             // VECTORI256_H
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec8fb const& insert(uint32_t index, bool value)
+  {
+    static const int32_t maskl[16] = {0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0};
+    __m256 mask                    = _mm256_loadu_ps((float const*)(maskl + 8 - (index & 7)));  // mask with FFFFFFFF at index position
+    if(value)
+      {
+        ymm = _mm256_or_ps(ymm, mask);
+      }
+    else
+      {
+        ymm = _mm256_andnot_ps(mask, ymm);
+      }
+    return *this;
+  }
+  // Member function extract a single element from vector
+  bool extract(uint32_t index) const
+  {
+    union
+    {
+      float f[8];
+      int32_t i[8];
+    } u;
+    _mm256_storeu_ps(u.f, ymm);
+    return u.i[index & 7] != 0;
+  }
+  // Extract a single element. Operator [] can only read an element, not write.
+  bool operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec4fb:
+  Vec4fb get_low() const { return _mm256_castps256_ps128(ymm); }
+  Vec4fb get_high() const { return _mm256_extractf128_ps(ymm, 1); }
+  static int size() { return 8; }
+};
+
+/*****************************************************************************
+ *
+ *          Operators for Vec8fb
+ *
+ *****************************************************************************/
+
+// vector operator & : bitwise and
+static inline Vec8fb operator&(Vec8fb const& a, Vec8fb const& b) { return _mm256_and_ps(a, b); }
+static inline Vec8fb operator&&(Vec8fb const& a, Vec8fb const& b) { return a & b; }
+
+// vector operator &= : bitwise and
+static inline Vec8fb& operator&=(Vec8fb& a, Vec8fb const& b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec8fb operator|(Vec8fb const& a, Vec8fb const& b) { return _mm256_or_ps(a, b); }
+static inline Vec8fb operator||(Vec8fb const& a, Vec8fb const& b) { return a | b; }
+
+// vector operator |= : bitwise or
+static inline Vec8fb& operator|=(Vec8fb& a, Vec8fb const& b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec8fb operator^(Vec8fb const& a, Vec8fb const& b) { return _mm256_xor_ps(a, b); }
+
+// vector operator ^= : bitwise xor
+static inline Vec8fb& operator^=(Vec8fb& a, Vec8fb const& b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec8fb operator~(Vec8fb const& a) { return _mm256_xor_ps(a, constant8f<-1, -1, -1, -1, -1, -1, -1, -1>()); }
+
+// vector operator ! : logical not
+// (operator ! is less efficient than operator ~. Use only where not
+// all bits in an element are the same)
+static inline Vec8fb operator!(Vec8fb const& a) { return Vec8fb(!Vec8ib(a)); }
+
+// Functions for Vec8fb
+
+// andnot: a & ~ b
+static inline Vec8fb andnot(Vec8fb const& a, Vec8fb const& b) { return _mm256_andnot_ps(b, a); }
+
+/*****************************************************************************
+ *
+ *          Horizontal Boolean functions
+ *
+ *****************************************************************************/
+
+// horizontal_and. Returns true if all bits are 1
+static inline bool horizontal_and(Vec8fb const& a) { return _mm256_testc_ps(a, constant8f<-1, -1, -1, -1, -1, -1, -1, -1>()) != 0; }
+
+// horizontal_or. Returns true if at least one bit is 1
+static inline bool horizontal_or(Vec8fb const& a) { return !_mm256_testz_ps(a, a); }
+
+/*****************************************************************************
+ *
+ *          Vec4db: Vector of 4 Booleans for use with Vec4d
+ *
+ *****************************************************************************/
+
+class Vec4db
+{
+ protected:
+  __m256d ymm;  // double vector
+ public:
+  // Default constructor:
+  Vec4db() {}
+  // Constructor to build from all elements:
+  Vec4db(bool b0, bool b1, bool b2, bool b3)
+  {
+#if INSTRSET >= 8  // AVX2
+    ymm = _mm256_castsi256_pd(_mm256_setr_epi64x(-(int64_t)b0, -(int64_t)b1, -(int64_t)b2, -(int64_t)b3));
+#else
+    __m128 blo = _mm_castsi128_ps(_mm_setr_epi32(-(int)b0, -(int)b0, -(int)b1, -(int)b1));
+    __m128 bhi = _mm_castsi128_ps(_mm_setr_epi32(-(int)b2, -(int)b2, -(int)b3, -(int)b3));
+    ymm        = _mm256_castps_pd(set_m128r(blo, bhi));
+#endif
+  }
+  // Constructor to build from two Vec2db:
+  Vec4db(Vec2db const& a0, Vec2db const& a1)
+  {
+    ymm = _mm256_castps_pd(set_m128r(_mm_castpd_ps(a0), _mm_castpd_ps(a1)));
+    // ymm = _mm256_set_m128d(a1, a0);
+  }
+  // Constructor to convert from type __m256d used in intrinsics:
+  Vec4db(__m256d const& x) { ymm = x; }
+  // Assignment operator to convert from type __m256d used in intrinsics:
+  Vec4db& operator=(__m256d const& x)
+  {
+    ymm = x;
+    return *this;
+  }
+  // Constructor to broadcast the same value into all elements:
+  Vec4db(bool b)
+  {
+#if INSTRSET >= 8  // AVX2
+    ymm = _mm256_castsi256_pd(_mm256_set1_epi64x(-(int64_t)b));
+#else
+    __m128 b1  = _mm_castsi128_ps(_mm_set1_epi32(-(int)b));
+    ymm        = _mm256_castps_pd(set_m128r(b1, b1));
+#endif
+  }
+  // Assignment operator to broadcast scalar value:
+  Vec4db& operator=(bool b)
+  {
+    ymm = _mm256_castsi256_pd(_mm256_set1_epi32(-int32_t(b)));
+    return *this;
+  }
+
+ private:  // Prevent constructing from int, etc.
+  Vec4db(int b);
+  Vec4db& operator=(int x);
+
+ public:
+  // Type cast operator to convert to __m256d used in intrinsics
+  operator __m256d() const { return ymm; }
+#ifdef VECTORI256_H
+#if VECTORI256_H == 2  // 256 bit integer vectors are available, AVX2
+  // Constructor to convert from type Vec4qb used as Boolean for integer vectors
+  Vec4db(Vec4qb const& x) { ymm = _mm256_castsi256_pd(x); }
+  // Assignment operator to convert from type Vec4qb used as Boolean for integer vectors
+  Vec4db& operator=(Vec4qb const& x)
+  {
+    ymm = _mm256_castsi256_pd(x);
+    return *this;
+  }
+#ifndef FIX_CLANG_VECTOR_ALIAS_AMBIGUITY
+  // Type cast operator to convert to type Vec4qb used as Boolean for integer vectors
+  operator Vec4qb() const { return _mm256_castpd_si256(ymm); }
+#endif
+#else  // 256 bit integer vectors emulated without AVX2
+  // Constructor to convert from type Vec4qb used as Boolean for integer vectors
+  Vec4db(Vec4qb const& x) { *this = Vec4db(_mm_castsi128_pd(x.get_low()), _mm_castsi128_pd(x.get_high())); }
+  // Assignment operator to convert from type Vec4qb used as Boolean for integer vectors
+  Vec4db& operator=(Vec4qb const& x)
+  {
+    *this = Vec4db(_mm_castsi128_pd(x.get_low()), _mm_castsi128_pd(x.get_high()));
+    return *this;
+  }
+  // Type cast operator to convert to type Vec4qb used as Boolean for integer vectors
+  operator Vec4qb() const { return Vec4q(_mm_castpd_si128(get_low()), _mm_castpd_si128(get_high())); }
+#endif
+#endif                                         // VECTORI256_H
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec4db const& insert(uint32_t index, bool value)
+  {
+    static const int32_t maskl[16] = {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0};
+    __m256d mask = _mm256_loadu_pd((double const*)(maskl + 8 - (index & 3) * 2));  // mask with FFFFFFFFFFFFFFFF at index position
+    if(value)
+      {
+        ymm = _mm256_or_pd(ymm, mask);
+      }
+    else
+      {
+        ymm = _mm256_andnot_pd(mask, ymm);
+      }
+    return *this;
+  }
+  // Member function extract a single element from vector
+  bool extract(uint32_t index) const
+  {
+    union
+    {
+      double f[8];
+      int32_t i[16];
+    } u;
+    _mm256_storeu_pd(u.f, ymm);
+    return u.i[(index & 3) * 2 + 1] != 0;
+  }
+  // Extract a single element. Operator [] can only read an element, not write.
+  bool operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec2db:
+  Vec2db get_low() const { return _mm256_castpd256_pd128(ymm); }
+  Vec2db get_high() const { return _mm256_extractf128_pd(ymm, 1); }
+  static int size() { return 4; }
+};
+
+/*****************************************************************************
+ *
+ *          Operators for Vec4db
+ *
+ *****************************************************************************/
+
+// vector operator & : bitwise and
+static inline Vec4db operator&(Vec4db const& a, Vec4db const& b) { return _mm256_and_pd(a, b); }
+static inline Vec4db operator&&(Vec4db const& a, Vec4db const& b) { return a & b; }
+
+// vector operator &= : bitwise and
+static inline Vec4db& operator&=(Vec4db& a, Vec4db const& b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec4db operator|(Vec4db const& a, Vec4db const& b) { return _mm256_or_pd(a, b); }
+static inline Vec4db operator||(Vec4db const& a, Vec4db const& b) { return a | b; }
+
+// vector operator |= : bitwise or
+static inline Vec4db& operator|=(Vec4db& a, Vec4db const& b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec4db operator^(Vec4db const& a, Vec4db const& b) { return _mm256_xor_pd(a, b); }
+
+// vector operator ^= : bitwise xor
+static inline Vec4db& operator^=(Vec4db& a, Vec4db const& b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec4db operator~(Vec4db const& a)
+{
+  return _mm256_xor_pd(a, _mm256_castps_pd(constant8f<-1, -1, -1, -1, -1, -1, -1, -1>()));
+}
+
+// vector operator ! : logical not
+// (operator ! is less efficient than operator ~. Use only where not
+// all bits in an element are the same)
+static inline Vec4db operator!(Vec4db const& a) { return Vec4db(!Vec4qb(a)); }
+
+// Functions for Vec4db
+
+// andnot: a & ~ b
+static inline Vec4db andnot(Vec4db const& a, Vec4db const& b) { return _mm256_andnot_pd(b, a); }
+
+/*****************************************************************************
+ *
+ *          Horizontal Boolean functions
+ *
+ *****************************************************************************/
+
+// horizontal_and. Returns true if all bits are 1
+static inline bool horizontal_and(Vec4db const& a)
+{
+#if defined(VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+  return horizontal_and(Vec256b(_mm256_castpd_si256(a)));
+#else                                          // split into 128 bit vectors
+  return horizontal_and(a.get_low() & a.get_high());
+#endif
+}
+
+// horizontal_or. Returns true if at least one bit is 1
+static inline bool horizontal_or(Vec4db const& a)
+{
+#if defined(VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+  return horizontal_or(Vec256b(_mm256_castpd_si256(a)));
+#else                                          // split into 128 bit vectors
+  return horizontal_or(a.get_low() | a.get_high());
+#endif
+}
+
+/*****************************************************************************
+ *
+ *          Vec8f: Vector of 8 single precision floating point values
+ *
+ *****************************************************************************/
+
+class Vec8f
+{
+ protected:
+  __m256 ymm;  // Float vector
+ public:
+  // Default constructor:
+  Vec8f() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec8f(float f) { ymm = _mm256_set1_ps(f); }
+  // Constructor to build from all elements:
+  Vec8f(float f0, float f1, float f2, float f3, float f4, float f5, float f6, float f7)
+  {
+    ymm = _mm256_setr_ps(f0, f1, f2, f3, f4, f5, f6, f7);
+  }
+  // Constructor to build from two Vec4f:
+  Vec8f(Vec4f const& a0, Vec4f const& a1)
+  {
+    ymm = set_m128r(a0, a1);
+    // ymm = _mm256_set_m128(a1, a0);
+  }
+  // Constructor to convert from type __m256 used in intrinsics:
+  Vec8f(__m256 const& x) { ymm = x; }
+  // Assignment operator to convert from type __m256 used in intrinsics:
+  Vec8f& operator=(__m256 const& x)
+  {
+    ymm = x;
+    return *this;
+  }
+  // Type cast operator to convert to __m256 used in intrinsics
+  operator __m256() const { return ymm; }
+  // Member function to load from array (unaligned)
+  Vec8f& load(float const* p)
+  {
+    ymm = _mm256_loadu_ps(p);
+    return *this;
+  }
+  // Member function to load from array, aligned by 32
+  // You may use load_a instead of load if you are certain that p points to an address
+  // divisible by 32.
+  Vec8f& load_a(float const* p)
+  {
+    ymm = _mm256_load_ps(p);
+    return *this;
+  }
+  // Member function to store into array (unaligned)
+  void store(float* p) const { _mm256_storeu_ps(p, ymm); }
+  // Member function to store into array, aligned by 32
+  // You may use store_a instead of store if you are certain that p points to an address
+  // divisible by 32.
+  void store_a(float* p) const { _mm256_store_ps(p, ymm); }
+  // Partial load. Load n elements and set the rest to 0
+  Vec8f& load_partial(int n, float const* p)
+  {
+    if(n > 0 && n <= 4)
+      {
+        *this = Vec8f(Vec4f().load_partial(n, p), _mm_setzero_ps());
+        // ymm = _mm256_castps128_ps256(Vec4f().load_partial<n>(p)); (this doesn't work on MS compiler due to sloppy definition of the
+        // cast)
+      }
+    else if(n > 4 && n <= 8)
+      {
+        *this = Vec8f(Vec4f().load(p), Vec4f().load_partial(n - 4, p + 4));
+      }
+    else
+      {
+        ymm = _mm256_setzero_ps();
+      }
+    return *this;
+  }
+  // Partial store. Store n elements
+  void store_partial(int n, float* p) const
+  {
+    if(n <= 4)
+      {
+        get_low().store_partial(n, p);
+      }
+    else if(n <= 8)
+      {
+        get_low().store(p);
+        get_high().store_partial(n - 4, p + 4);
+      }
+  }
+  // cut off vector to n elements. The last 8-n elements are set to zero
+  Vec8f& cutoff(int n)
+  {
+    if(uint32_t(n) >= 8)
+      return *this;
+    static const union
+    {
+      int32_t i[16];
+      float f[16];
+    } mask = {{-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}};
+    *this  = Vec8fb(*this) & Vec8fb(Vec8f().load(mask.f + 8 - n));
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec8f const& insert(uint32_t index, float value)
+  {
+    __m256 v0 = _mm256_broadcast_ss(&value);
+    switch(index)
+      {
+        case 0:
+          ymm = _mm256_blend_ps(ymm, v0, 1);
+          break;
+        case 1:
+          ymm = _mm256_blend_ps(ymm, v0, 2);
+          break;
+        case 2:
+          ymm = _mm256_blend_ps(ymm, v0, 4);
+          break;
+        case 3:
+          ymm = _mm256_blend_ps(ymm, v0, 8);
+          break;
+        case 4:
+          ymm = _mm256_blend_ps(ymm, v0, 0x10);
+          break;
+        case 5:
+          ymm = _mm256_blend_ps(ymm, v0, 0x20);
+          break;
+        case 6:
+          ymm = _mm256_blend_ps(ymm, v0, 0x40);
+          break;
+        default:
+          ymm = _mm256_blend_ps(ymm, v0, 0x80);
+          break;
+      }
+    return *this;
+  }
+  // Member function extract a single element from vector
+  float extract(uint32_t index) const
+  {
+    float x[8];
+    store(x);
+    return x[index & 7];
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  float operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec4f:
+  Vec4f get_low() const { return _mm256_castps256_ps128(ymm); }
+  Vec4f get_high() const { return _mm256_extractf128_ps(ymm, 1); }
+  static int size() { return 8; }
+};
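+
+// Usage sketch (illustrative, not part of the upstream header):
+//   float buf[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+//   Vec8f v;
+//   v.load(buf);               // unaligned load of all 8 elements
+//   v.insert(2, 99.f);         // v = (0, 1, 99, 3, 4, 5, 6, 7)
+//   float e = v[5];            // e = 5
+//   v.store_partial(3, buf);   // writes only buf[0..2]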
+
+/*****************************************************************************
+ *
+ *          Operators for Vec8f
+ *
+ *****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec8f operator+(Vec8f const& a, Vec8f const& b) { return _mm256_add_ps(a, b); }
+
+// vector operator + : add vector and scalar
+static inline Vec8f operator+(Vec8f const& a, float b) { return a + Vec8f(b); }
+static inline Vec8f operator+(float a, Vec8f const& b) { return Vec8f(a) + b; }
+
+// vector operator += : add
+static inline Vec8f& operator+=(Vec8f& a, Vec8f const& b)
+{
+  a = a + b;
+  return a;
+}
+
+// postfix operator ++
+static inline Vec8f operator++(Vec8f& a, int)
+{
+  Vec8f a0 = a;
+  a        = a + 1.0f;
+  return a0;
+}
+
+// prefix operator ++
+static inline Vec8f& operator++(Vec8f& a)
+{
+  a = a + 1.0f;
+  return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec8f operator-(Vec8f const& a, Vec8f const& b) { return _mm256_sub_ps(a, b); }
+
+// vector operator - : subtract vector and scalar
+static inline Vec8f operator-(Vec8f const& a, float b) { return a - Vec8f(b); }
+static inline Vec8f operator-(float a, Vec8f const& b) { return Vec8f(a) - b; }
+
+// vector operator - : unary minus
+// Change sign bit, even for 0, INF and NAN
+static inline Vec8f operator-(Vec8f const& a)
+{
+  return _mm256_xor_ps(a, constant8f<(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000,
+                                     (int)0x80000000, (int)0x80000000, (int)0x80000000>());
+}
+
+// vector operator -= : subtract
+static inline Vec8f& operator-=(Vec8f& a, Vec8f const& b)
+{
+  a = a - b;
+  return a;
+}
+
+// postfix operator --
+static inline Vec8f operator--(Vec8f& a, int)
+{
+  Vec8f a0 = a;
+  a        = a - 1.0f;
+  return a0;
+}
+
+// prefix operator --
+static inline Vec8f& operator--(Vec8f& a)
+{
+  a = a - 1.0f;
+  return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec8f operator*(Vec8f const& a, Vec8f const& b) { return _mm256_mul_ps(a, b); }
+
+// vector operator * : multiply vector and scalar
+static inline Vec8f operator*(Vec8f const& a, float b) { return a * Vec8f(b); }
+static inline Vec8f operator*(float a, Vec8f const& b) { return Vec8f(a) * b; }
+
+// vector operator *= : multiply
+static inline Vec8f& operator*=(Vec8f& a, Vec8f const& b)
+{
+  a = a * b;
+  return a;
+}
+
+// vector operator / : divide element by element
+static inline Vec8f operator/(Vec8f const& a, Vec8f const& b) { return _mm256_div_ps(a, b); }
+
+// vector operator / : divide vector and scalar
+static inline Vec8f operator/(Vec8f const& a, float b) { return a / Vec8f(b); }
+static inline Vec8f operator/(float a, Vec8f const& b) { return Vec8f(a) / b; }
+
+// vector operator /= : divide
+static inline Vec8f& operator/=(Vec8f& a, Vec8f const& b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec8fb operator==(Vec8f const& a, Vec8f const& b) { return _mm256_cmp_ps(a, b, 0); }
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec8fb operator!=(Vec8f const& a, Vec8f const& b) { return _mm256_cmp_ps(a, b, 4); }
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec8fb operator<(Vec8f const& a, Vec8f const& b) { return _mm256_cmp_ps(a, b, 1); }
+
+// vector operator <= : returns true for elements for which a <= b
+static inline Vec8fb operator<=(Vec8f const& a, Vec8f const& b) { return _mm256_cmp_ps(a, b, 2); }
+
+// vector operator > : returns true for elements for which a > b
+static inline Vec8fb operator>(Vec8f const& a, Vec8f const& b) { return b < a; }
+
+// vector operator >= : returns true for elements for which a >= b
+static inline Vec8fb operator>=(Vec8f const& a, Vec8f const& b) { return b <= a; }
+
+// Bitwise logical operators
+
+// vector operator & : bitwise and
+static inline Vec8f operator&(Vec8f const& a, Vec8f const& b) { return _mm256_and_ps(a, b); }
+
+// vector operator &= : bitwise and
+static inline Vec8f& operator&=(Vec8f& a, Vec8f const& b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator & : bitwise and of Vec8f and Vec8fb
+static inline Vec8f operator&(Vec8f const& a, Vec8fb const& b) { return _mm256_and_ps(a, b); }
+static inline Vec8f operator&(Vec8fb const& a, Vec8f const& b) { return _mm256_and_ps(a, b); }
+
+// vector operator | : bitwise or
+static inline Vec8f operator|(Vec8f const& a, Vec8f const& b) { return _mm256_or_ps(a, b); }
+
+// vector operator |= : bitwise or
+static inline Vec8f& operator|=(Vec8f& a, Vec8f const& b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec8f operator^(Vec8f const& a, Vec8f const& b) { return _mm256_xor_ps(a, b); }
+
+// vector operator ^= : bitwise xor
+static inline Vec8f& operator^=(Vec8f& a, Vec8f const& b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ! : logical not. Returns Boolean vector
+static inline Vec8fb operator!(Vec8f const& a) { return a == Vec8f(0.0f); }
+
+/*****************************************************************************
+ *
+ *          Functions for Vec8f
+ *
+ *****************************************************************************/
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either 0 (false) or 0xFFFFFFFF (true). No other values are allowed.
+static inline Vec8f select(Vec8fb const& s, Vec8f const& a, Vec8f const& b) { return _mm256_blendv_ps(b, a, s); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec8f if_add(Vec8fb const& f, Vec8f const& a, Vec8f const& b) { return a + (Vec8f(f) & b); }
+
+// Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i]
+static inline Vec8f if_mul(Vec8fb const& f, Vec8f const& a, Vec8f const& b) { return a * select(f, b, 1.f); }
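+
+// Example (a sketch added for clarity, not in the upstream source):
+//   Vec8f x(-2, -1, 0, 1, 2, 3, 4, 5);
+//   Vec8f r = if_add(x < 0.0f, x, Vec8f(10.f));  // add 10 only where x < 0:
+//                                                // r = (8, 9, 0, 1, 2, 3, 4, 5)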
+
+// General arithmetic functions, etc.
+
+// Horizontal add: Calculates the sum of all vector elements.
+static inline float horizontal_add(Vec8f const& a)
+{
+  __m256 t1 = _mm256_hadd_ps(a, a);
+  __m256 t2 = _mm256_hadd_ps(t1, t1);
+  __m128 t3 = _mm256_extractf128_ps(t2, 1);
+  __m128 t4 = _mm_add_ss(_mm256_castps256_ps128(t2), t3);
+  return _mm_cvtss_f32(t4);
+}
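+
+// Example (illustrative, not part of the upstream header):
+//   Vec8f v(1, 2, 3, 4, 5, 6, 7, 8);
+//   float s = horizontal_add(v);   // s = 36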
+
+// function max: a > b ? a : b
+static inline Vec8f max(Vec8f const& a, Vec8f const& b) { return _mm256_max_ps(a, b); }
+
+// function min: a < b ? a : b
+static inline Vec8f min(Vec8f const& a, Vec8f const& b) { return _mm256_min_ps(a, b); }
+
+// function abs: absolute value
+// Removes sign bit, even for -0.0f, -INF and -NAN
+static inline Vec8f abs(Vec8f const& a)
+{
+  __m256 mask = constant8f<0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF>();
+  return _mm256_and_ps(a, mask);
+}
+
+// function sqrt: square root
+static inline Vec8f sqrt(Vec8f const& a) { return _mm256_sqrt_ps(a); }
+
+// function square: a * a
+static inline Vec8f square(Vec8f const& a) { return a * a; }
+
+// pow(Vec8f, int):
+template <typename TT>
+static Vec8f pow(Vec8f const& a, TT const& n);
+
+// Raise floating point numbers to integer power n
+template <>
+inline Vec8f pow<int>(Vec8f const& x0, int const& n)
+{
+  return pow_template_i<Vec8f>(x0, n);
+}
+
+// allow conversion from unsigned int
+template <>
+inline Vec8f pow<uint32_t>(Vec8f const& x0, uint32_t const& n)
+{
+  return pow_template_i<Vec8f>(x0, (int)n);
+}
+
+// Raise floating point numbers to integer power n, where n is a compile-time constant
+template <int n>
+static inline Vec8f pow_n(Vec8f const& a)
+{
+  if(n < 0)
+    return Vec8f(1.0f) / pow_n<-n>(a);
+  if(n == 0)
+    return Vec8f(1.0f);
+  if(n >= 256)
+    return pow(a, n);
+  Vec8f x = a;                           // a^(2^i)
+  Vec8f y;                               // accumulator
+  const int lowest = n - (n & (n - 1));  // lowest set bit in n
+  if(n & 1)
+    y = x;
+  if(n < 2)
+    return y;
+  x = x * x;  // x^2
+  if(n & 2)
+    {
+      if(lowest == 2)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 4)
+    return y;
+  x = x * x;  // x^4
+  if(n & 4)
+    {
+      if(lowest == 4)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 8)
+    return y;
+  x = x * x;  // x^8
+  if(n & 8)
+    {
+      if(lowest == 8)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 16)
+    return y;
+  x = x * x;  // x^16
+  if(n & 16)
+    {
+      if(lowest == 16)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 32)
+    return y;
+  x = x * x;  // x^32
+  if(n & 32)
+    {
+      if(lowest == 32)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 64)
+    return y;
+  x = x * x;  // x^64
+  if(n & 64)
+    {
+      if(lowest == 64)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 128)
+    return y;
+  x = x * x;  // x^128
+  if(n & 128)
+    {
+      if(lowest == 128)
+        y = x;
+      else
+        y *= x;
+    }
+  return y;
+}
+
+template <int n>
+static inline Vec8f pow(Vec8f const& a, Const_int_t<n>)
+{
+  return pow_n<n>(a);
+}
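+
+// Usage sketch (added for illustration, not part of the upstream header):
+//   Vec8f x(2.0f);
+//   Vec8f a = pow(x, 5);     // run-time integer exponent: all elements 32.0f
+//   Vec8f b = pow_n<5>(x);   // compile-time exponent, same result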
+
+// function round: round to nearest integer (even). (result as float vector)
+static inline Vec8f round(Vec8f const& a) { return _mm256_round_ps(a, 0 + 8); }
+
+// function truncate: round towards zero. (result as float vector)
+static inline Vec8f truncate(Vec8f const& a) { return _mm256_round_ps(a, 3 + 8); }
+
+// function floor: round towards minus infinity. (result as float vector)
+static inline Vec8f floor(Vec8f const& a) { return _mm256_round_ps(a, 1 + 8); }
+
+// function ceil: round towards plus infinity. (result as float vector)
+static inline Vec8f ceil(Vec8f const& a) { return _mm256_round_ps(a, 2 + 8); }
+
+#ifdef VECTORI256_H   // 256 bit integer vectors are available
+#if VECTORI256_H > 1  // AVX2
+// function round_to_int: round to nearest integer (even). (result as integer vector)
+static inline Vec8i round_to_int(Vec8f const& a)
+{
+  // Note: assume MXCSR control register is set to rounding
+  return _mm256_cvtps_epi32(a);
+}
+
+// function truncate_to_int: round towards zero. (result as integer vector)
+static inline Vec8i truncate_to_int(Vec8f const& a) { return _mm256_cvttps_epi32(a); }
+
+// function to_float: convert integer vector to float vector
+static inline Vec8f to_float(Vec8i const& a) { return _mm256_cvtepi32_ps(a); }
+
+// function to_float: convert unsigned integer vector to float vector
+static inline Vec8f to_float(Vec8ui const& a)
+{
+#ifdef __AVX512VL__
+  return _mm256_cvtepu32_ps(a);
+#else
+  Vec8f b = to_float(Vec8i(a & 0x7FFFFFFF));                      // 31 bits
+  Vec8i c = Vec8i(a) >> 31;                                       // generate mask from highest bit
+  Vec8f d = Vec8f(2147483648.f) & Vec8f(_mm256_castsi256_ps(c));  // mask floating point constant 2^31
+  return b + d;
+#endif
+}
+
+#else  // no AVX2
+
+// function round_to_int: round to nearest integer (even). (result as integer vector)
+static inline Vec8i round_to_int(Vec8f const& a)
+{
+  // Note: assume MXCSR control register is set to rounding
+  return Vec8i(_mm_cvtps_epi32(a.get_low()), _mm_cvtps_epi32(a.get_high()));
+}
+
+// function truncate_to_int: round towards zero. (result as integer vector)
+static inline Vec8i truncate_to_int(Vec8f const& a) { return Vec8i(_mm_cvttps_epi32(a.get_low()), _mm_cvttps_epi32(a.get_high())); }
+
+// function to_float: convert integer vector to float vector
+static inline Vec8f to_float(Vec8i const& a) { return Vec8f(_mm_cvtepi32_ps(a.get_low()), _mm_cvtepi32_ps(a.get_high())); }
+
+// function to_float: convert unsigned integer vector to float vector
+static inline Vec8f to_float(Vec8ui const& a) { return Vec8f(to_float(a.get_low()), to_float(a.get_high())); }
+#endif
+#endif  // VECTORI256_H
+
+// Fused multiply and add functions
+
+// Multiply and add
+static inline Vec8f mul_add(Vec8f const& a, Vec8f const& b, Vec8f const& c)
+{
+#ifdef __FMA__
+  return _mm256_fmadd_ps(a, b, c);
+#elif defined(__FMA4__)
+  return _mm256_macc_ps(a, b, c);
+#else
+  return a * b + c;
+#endif
+}
+
+// Multiply and subtract
+static inline Vec8f mul_sub(Vec8f const& a, Vec8f const& b, Vec8f const& c)
+{
+#ifdef __FMA__
+  return _mm256_fmsub_ps(a, b, c);
+#elif defined(__FMA4__)
+  return _mm256_msub_ps(a, b, c);
+#else
+  return a * b - c;
+#endif
+}
+
+// Multiply and inverse subtract
+static inline Vec8f nmul_add(Vec8f const& a, Vec8f const& b, Vec8f const& c)
+{
+#ifdef __FMA__
+  return _mm256_fnmadd_ps(a, b, c);
+#elif defined(__FMA4__)
+  return _mm256_nmacc_ps(a, b, c);
+#else
+  return c - a * b;
+#endif
+}
+
+// Multiply and subtract with extra precision on the intermediate calculations,
+// even if FMA instructions not supported, using Veltkamp-Dekker split
+static inline Vec8f mul_sub_x(Vec8f const& a, Vec8f const& b, Vec8f const& c)
+{
+#ifdef __FMA__
+  return _mm256_fmsub_ps(a, b, c);
+#elif defined(__FMA4__)
+  return _mm256_msub_ps(a, b, c);
+#else
+  // calculate a * b - c with extra precision
+  const int b12    = -(1 << 12);  // mask to remove lower 12 bits
+  Vec8f upper_mask = constant8f<b12, b12, b12, b12, b12, b12, b12, b12>();
+  Vec8f a_high     = a & upper_mask;  // split into high and low parts
+  Vec8f b_high     = b & upper_mask;
+  Vec8f a_low      = a - a_high;
+  Vec8f b_low      = b - b_high;
+  Vec8f r1         = a_high * b_high;                                         // this product is exact
+  Vec8f r2         = r1 - c;                                                  // subtract c from high product
+  Vec8f r3         = r2 + (a_high * b_low + b_high * a_low) + a_low * b_low;  // add rest of product
+  return r3;                                                                  // + ((r2 - r1) + c);
+#endif
+}
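+
+// Usage sketch (added for illustration, not part of the upstream header; a and b are
+// caller-provided Vec8f values): mul_sub_x is intended for cases where a*b and c nearly
+// cancel, e.g. computing the residual of a division:
+//   Vec8f q = a / b;
+//   Vec8f r = mul_sub_x(q, b, a);   // r ~ q*b - a, kept accurate despite the cancellation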
+
+// Approximate math functions
+
+// approximate reciprocal (Faster than 1.f / a. relative accuracy better than 2^-11)
+static inline Vec8f approx_recipr(Vec8f const& a)
+{
+#if INSTRSET >= 9           // use more accurate version if available. (none of these will raise exceptions on zero)
+#ifdef __AVX512ER__         // AVX512ER: full precision
+  // todo: if future processors have both AVX512ER and AVX512VL: _mm256_rcp28_round_ps(a, _MM_FROUND_NO_EXC);
+  return _mm512_castps512_ps256(_mm512_rcp28_round_ps(_mm512_castps256_ps512(a), _MM_FROUND_NO_EXC));
+#elif defined __AVX512VL__  // AVX512VL: 14 bit precision
+  return _mm256_rcp14_ps(a);
+#else                       // AVX512F: 14 bit precision
+  return _mm512_castps512_ps256(_mm512_rcp14_ps(_mm512_castps256_ps512(a)));
+#endif
+#else  // AVX: 11 bit precision
+  return _mm256_rcp_ps(a);
+#endif
+}
+
+// approximate reciprocal squareroot (Faster than 1.f / sqrt(a). Relative accuracy better than 2^-11)
+static inline Vec8f approx_rsqrt(Vec8f const& a)
+{
+#if INSTRSET >= 9           // use more accurate version if available. (none of these will raise exceptions on zero)
+#ifdef __AVX512ER__         // AVX512ER: full precision
+  // todo: if future processors have both AVX512ER and AVX512VL: _mm256_rsqrt28_round_ps(a, _MM_FROUND_NO_EXC);
+  return _mm512_castps512_ps256(_mm512_rsqrt28_round_ps(_mm512_castps256_ps512(a), _MM_FROUND_NO_EXC));
+#elif defined __AVX512VL__  // AVX512VL: 14 bit precision
+  return _mm256_rsqrt14_ps(a);
+#else                       // AVX512F: 14 bit precision
+  return _mm512_castps512_ps256(_mm512_rsqrt14_ps(_mm512_castps256_ps512(a)));
+#endif
+#else  // AVX: 11 bit precision
+  return _mm256_rsqrt_ps(a);
+#endif
+}
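+
+// Example (a sketch added for clarity, not in the upstream source; x is a caller-provided
+// Vec8f): one Newton-Raphson step roughly doubles the number of correct bits of the estimate:
+//   Vec8f y = approx_rsqrt(x);           // ~11 bit estimate of 1/sqrt(x)
+//   y = y * (1.5f - 0.5f * x * y * y);   // refined estimate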
+
+// Math functions using fast bit manipulation
+
+#ifdef VECTORI256_H   // 256 bit integer vectors are available, AVX2
+// Extract the exponent as an integer
+// exponent(a) = floor(log2(abs(a)));
+// exponent(1.0f) = 0, exponent(0.0f) = -127, exponent(INF) = +128, exponent(NAN) = +128
+static inline Vec8i exponent(Vec8f const& a)
+{
+#if VECTORI256_H > 1  // AVX2
+  Vec8ui t1 = _mm256_castps_si256(a);  // reinterpret as 32-bit integer
+  Vec8ui t2 = t1 << 1;                 // shift out sign bit
+  Vec8ui t3 = t2 >> 24;                // shift down logical to position 0
+  Vec8i t4  = Vec8i(t3) - 0x7F;        // subtract bias from exponent
+  return t4;
+#else                 // no AVX2
+  return Vec8i(exponent(a.get_low()), exponent(a.get_high()));
+#endif
+}
+#endif
+
+// Extract the fraction part of a floating point number
+// a = 2^exponent(a) * fraction(a), except for a = 0
+// fraction(1.0f) = 1.0f, fraction(5.0f) = 1.25f
+static inline Vec8f fraction(Vec8f const& a)
+{
+#if defined(VECTORI256_H) && VECTORI256_H > 2  // 256 bit integer vectors are available, AVX2
+  Vec8ui t1 = _mm256_castps_si256(a);          // reinterpret as 32-bit integer
+  Vec8ui t2 = (t1 & 0x007FFFFF) | 0x3F800000;  // set exponent to 0 + bias
+  return _mm256_castsi256_ps(t2);
+#else
+  return Vec8f(fraction(a.get_low()), fraction(a.get_high()));
+#endif
+}
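+
+// Example (illustrative, not part of the upstream header): exponent() and fraction()
+// decompose a value as a = 2^exponent(a) * fraction(a):
+//   Vec8f x(5.0f);
+//   Vec8i e = exponent(x);   // all elements 2
+//   Vec8f m = fraction(x);   // all elements 1.25f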
+
+#ifdef VECTORI256_H   // 256 bit integer vectors are available, AVX2
+// Fast calculation of pow(2,n) with n integer
+// n  =    0 gives 1.0f
+// n >=  128 gives +INF
+// n <= -127 gives 0.0f
+// This function will never produce denormals, and never raise exceptions
+static inline Vec8f exp2(Vec8i const& n)
+{
+#if VECTORI256_H > 1  // AVX2
+  Vec8i t1 = max(n, -0x7F);  // limit to allowed range
+  Vec8i t2 = min(t1, 0x80);
+  Vec8i t3 = t2 + 0x7F;            // add bias
+  Vec8i t4 = t3 << 23;             // put exponent into position 23
+  return _mm256_castsi256_ps(t4);  // reinterpret as float
+#else
+  return Vec8f(exp2(n.get_low()), exp2(n.get_high()));
+#endif
+}
+// static inline Vec8f exp2(Vec8f const & x); // defined in vectormath_exp.h
+
+#endif                                         // VECTORI256_H
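+
+// Example (illustrative, not part of the upstream header):
+//   Vec8i n(-1, 0, 1, 2, 3, 4, 5, 6);
+//   Vec8f p = exp2(n);   // p = (0.5, 1, 2, 4, 8, 16, 32, 64)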
+
+// Categorization functions
+
+// Function sign_bit: gives true for elements that have the sign bit set
+// even for -0.0f, -INF and -NAN
+// Note that sign_bit(Vec8f(-0.0f)) gives true, while Vec8f(-0.0f) < Vec8f(0.0f) gives false
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec8fb sign_bit(Vec8f const& a)
+{
+#if defined(VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+  Vec8i t1 = _mm256_castps_si256(a);  // reinterpret as 32-bit integer
+  Vec8i t2 = t1 >> 31;                // extend sign bit
+  return _mm256_castsi256_ps(t2);     // reinterpret as 32-bit Boolean
+#else
+  return Vec8fb(sign_bit(a.get_low()), sign_bit(a.get_high()));
+#endif
+}
+
+// Function sign_combine: changes the sign of a when b has the sign bit set
+// same as select(sign_bit(b), -a, a)
+static inline Vec8f sign_combine(Vec8f const& a, Vec8f const& b)
+{
+  Vec8f signmask = constant8f<(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000,
+                              (int)0x80000000, (int)0x80000000>();  // -0.0
+  return a ^ (b & signmask);
+}
+
+// Function is_finite: gives true for elements that are normal, denormal or zero,
+// false for INF and NAN
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec8fb is_finite(Vec8f const& a)
+{
+#if defined(VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+  Vec8i t1  = _mm256_castps_si256(a);                // reinterpret as 32-bit integer
+  Vec8i t2  = t1 << 1;                               // shift out sign bit
+  Vec8ib t3 = Vec8i(t2 & 0xFF000000) != 0xFF000000;  // exponent field is not all 1s
+  return t3;
+#else
+  return Vec8fb(is_finite(a.get_low()), is_finite(a.get_high()));
+#endif
+}
+
+// Function is_inf: gives true for elements that are +INF or -INF
+// false for finite numbers and NAN
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec8fb is_inf(Vec8f const& a)
+{
+#if defined(VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+  Vec8i t1 = _mm256_castps_si256(a);  // reinterpret as 32-bit integer
+  Vec8i t2 = t1 << 1;                 // shift out sign bit
+  return t2 == 0xFF000000;            // exponent is all 1s, fraction is 0
+#else
+  return Vec8fb(is_inf(a.get_low()), is_inf(a.get_high()));
+#endif
+}
+
+// Function is_nan: gives true for elements that are +NAN or -NAN
+// false for finite numbers and +/-INF
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec8fb is_nan(Vec8f const& a)
+{
+#if defined(VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+  Vec8i t1 = _mm256_castps_si256(a);       // reinterpret as 32-bit integer
+  Vec8i t2 = t1 << 1;                      // shift out sign bit
+  Vec8i t3 = 0xFF000000;                   // exponent mask
+  Vec8i t4 = t2 & t3;                      // exponent
+  Vec8i t5 = _mm256_andnot_si256(t3, t2);  // fraction
+  return Vec8ib(t4 == t3 && t5 != 0);      // exponent = all 1s and fraction != 0
+#else
+  return Vec8fb(is_nan(a.get_low()), is_nan(a.get_high()));
+#endif
+}
+
+// Function is_subnormal: gives true for elements that are denormal (subnormal)
+// false for finite numbers, zero, NAN and INF
+static inline Vec8fb is_subnormal(Vec8f const& a)
+{
+#if defined(VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+  Vec8i t1 = _mm256_castps_si256(a);       // reinterpret as 32-bit integer
+  Vec8i t2 = t1 << 1;                      // shift out sign bit
+  Vec8i t3 = 0xFF000000;                   // exponent mask
+  Vec8i t4 = t2 & t3;                      // exponent
+  Vec8i t5 = _mm256_andnot_si256(t3, t2);  // fraction
+  return Vec8ib(t4 == 0 && t5 != 0);       // exponent = 0 and fraction != 0
+#else
+  return Vec8fb(is_subnormal(a.get_low()), is_subnormal(a.get_high()));
+#endif
+}
+
+// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
+// false for finite numbers, NAN and INF
+static inline Vec8fb is_zero_or_subnormal(Vec8f const& a)
+{
+#if defined(VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+  Vec8i t = _mm256_castps_si256(a);  // reinterpret as 32-bit integer
+  t &= 0x7F800000;                   // isolate exponent
+  return t == 0;                     // exponent = 0
+#else
+  return Vec8fb(is_zero_or_subnormal(a.get_low()), is_zero_or_subnormal(a.get_high()));
+#endif
+}
+
+// Function infinite8f: returns a vector where all elements are +INF
+static inline Vec8f infinite8f()
+{
+  return constant8f<0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000>();
+}
+
+// Function nan8f: returns a vector where all elements are +NAN (quiet)
+static inline Vec8f nan8f(int n = 0x10) { return _mm256_castsi256_ps(_mm256_set1_epi32(0x7FC00000 + n)); }
+
+// change signs on vectors Vec8f
+// Each index i0 - i7 is 1 for changing sign on the corresponding element, 0 for no change
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8f change_sign(Vec8f const& a)
+{
+  if((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) == 0)
+    return a;
+  __m256 mask = constant8f < i0 ? (int)0x80000000 : 0, i1 ? (int)0x80000000 : 0, i2 ? (int)0x80000000 : 0, i3 ? (int)0x80000000 : 0,
+         i4 ? (int)0x80000000 : 0, i5 ? (int)0x80000000 : 0, i6 ? (int)0x80000000 : 0, i7 ? (int)0x80000000 : 0 > ();
+  return _mm256_xor_ps(a, mask);
+}
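+
+// Example (a sketch added for clarity, not in the upstream source):
+//   Vec8f v(1, 2, 3, 4, 5, 6, 7, 8);
+//   Vec8f w = change_sign<1, 0, 1, 0, 0, 0, 0, 0>(v);   // w = (-1, 2, -3, 4, 5, 6, 7, 8)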
+
+/*****************************************************************************
+ *
+ *          Vec4d: Vector of 4 double precision floating point values
+ *
+ *****************************************************************************/
+
+class Vec4d
+{
+ protected:
+  __m256d ymm;  // double vector
+ public:
+  // Default constructor:
+  Vec4d() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec4d(double d) { ymm = _mm256_set1_pd(d); }
+  // Constructor to build from all elements:
+  Vec4d(double d0, double d1, double d2, double d3) { ymm = _mm256_setr_pd(d0, d1, d2, d3); }
+  // Constructor to build from two Vec2d:
+  Vec4d(Vec2d const& a0, Vec2d const& a1)
+  {
+    ymm = _mm256_castps_pd(set_m128r(_mm_castpd_ps(a0), _mm_castpd_ps(a1)));
+    // ymm = _mm256_set_m128d(a1, a0);
+  }
+  // Constructor to convert from type __m256d used in intrinsics:
+  Vec4d(__m256d const& x) { ymm = x; }
+  // Assignment operator to convert from type __m256d used in intrinsics:
+  Vec4d& operator=(__m256d const& x)
+  {
+    ymm = x;
+    return *this;
+  }
+  // Type cast operator to convert to __m256d used in intrinsics
+  operator __m256d() const { return ymm; }
+  // Member function to load from array (unaligned)
+  Vec4d& load(double const* p)
+  {
+    ymm = _mm256_loadu_pd(p);
+    return *this;
+  }
+  // Member function to load from array, aligned by 32
+  // You may use load_a instead of load if you are certain that p points to an address
+  // divisible by 32
+  Vec4d& load_a(double const* p)
+  {
+    ymm = _mm256_load_pd(p);
+    return *this;
+  }
+  // Member function to store into array (unaligned)
+  void store(double* p) const { _mm256_storeu_pd(p, ymm); }
+  // Member function to store into array, aligned by 32
+  // You may use store_a instead of store if you are certain that p points to an address
+  // divisible by 32
+  void store_a(double* p) const { _mm256_store_pd(p, ymm); }
+  // Partial load. Load n elements and set the rest to 0
+  Vec4d& load_partial(int n, double const* p)
+  {
+    if(n > 0 && n <= 2)
+      {
+        *this = Vec4d(Vec2d().load_partial(n, p), _mm_setzero_pd());
+      }
+    else if(n > 2 && n <= 4)
+      {
+        *this = Vec4d(Vec2d().load(p), Vec2d().load_partial(n - 2, p + 2));
+      }
+    else
+      {
+        ymm = _mm256_setzero_pd();
+      }
+    return *this;
+  }
+  // Partial store. Store n elements
+  void store_partial(int n, double* p) const
+  {
+    if(n <= 2)
+      {
+        get_low().store_partial(n, p);
+      }
+    else if(n <= 4)
+      {
+        get_low().store(p);
+        get_high().store_partial(n - 2, p + 2);
+      }
+  }
+  // cut off vector to n elements. The last 4-n elements are set to zero
+  Vec4d& cutoff(int n)
+  {
+    ymm = _mm256_castps_pd(Vec8f(_mm256_castpd_ps(ymm)).cutoff(n * 2));
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec4d const& insert(uint32_t index, double value)
+  {
+    __m256d v0 = _mm256_broadcast_sd(&value);
+    switch(index)
+      {
+        case 0:
+          ymm = _mm256_blend_pd(ymm, v0, 1);
+          break;
+        case 1:
+          ymm = _mm256_blend_pd(ymm, v0, 2);
+          break;
+        case 2:
+          ymm = _mm256_blend_pd(ymm, v0, 4);
+          break;
+        default:
+          ymm = _mm256_blend_pd(ymm, v0, 8);
+          break;
+      }
+    return *this;
+  }
+  // Member function to extract a single element from vector
+  double extract(uint32_t index) const
+  {
+    double x[4];
+    store(x);
+    return x[index & 3];
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  double operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec2d:
+  Vec2d get_low() const { return _mm256_castpd256_pd128(ymm); }
+  Vec2d get_high() const { return _mm256_extractf128_pd(ymm, 1); }
+  static int size() { return 4; }
+};
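+
+// Example (illustrative usage, added for clarity; not part of the original source):
+//   double buf[4] = {1., 2., 3., 4.};
+//   Vec4d v;  v.load(buf);       // v is (1, 2, 3, 4)
+//   v.insert(2, 30.);            // v is (1, 2, 30, 4)
+//   v.store(buf);                // buf is now {1, 2, 30, 4}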
+
+/*****************************************************************************
+ *
+ *          Operators for Vec4d
+ *
+ *****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec4d operator+(Vec4d const& a, Vec4d const& b) { return _mm256_add_pd(a, b); }
+
+// vector operator + : add vector and scalar
+static inline Vec4d operator+(Vec4d const& a, double b) { return a + Vec4d(b); }
+static inline Vec4d operator+(double a, Vec4d const& b) { return Vec4d(a) + b; }
+
+// vector operator += : add
+static inline Vec4d& operator+=(Vec4d& a, Vec4d const& b)
+{
+  a = a + b;
+  return a;
+}
+
+// postfix operator ++
+static inline Vec4d operator++(Vec4d& a, int)
+{
+  Vec4d a0 = a;
+  a        = a + 1.0;
+  return a0;
+}
+
+// prefix operator ++
+static inline Vec4d& operator++(Vec4d& a)
+{
+  a = a + 1.0;
+  return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec4d operator-(Vec4d const& a, Vec4d const& b) { return _mm256_sub_pd(a, b); }
+
+// vector operator - : subtract vector and scalar
+static inline Vec4d operator-(Vec4d const& a, double b) { return a - Vec4d(b); }
+static inline Vec4d operator-(double a, Vec4d const& b) { return Vec4d(a) - b; }
+
+// vector operator - : unary minus
+// Change sign bit, even for 0, INF and NAN
+static inline Vec4d operator-(Vec4d const& a)
+{
+  return _mm256_xor_pd(a,
+                       _mm256_castps_pd(constant8f<0, (int)0x80000000, 0, (int)0x80000000, 0, (int)0x80000000, 0, (int)0x80000000>()));
+}
+
+// vector operator -= : subtract
+static inline Vec4d& operator-=(Vec4d& a, Vec4d const& b)
+{
+  a = a - b;
+  return a;
+}
+
+// postfix operator --
+static inline Vec4d operator--(Vec4d& a, int)
+{
+  Vec4d a0 = a;
+  a        = a - 1.0;
+  return a0;
+}
+
+// prefix operator --
+static inline Vec4d& operator--(Vec4d& a)
+{
+  a = a - 1.0;
+  return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec4d operator*(Vec4d const& a, Vec4d const& b) { return _mm256_mul_pd(a, b); }
+
+// vector operator * : multiply vector and scalar
+static inline Vec4d operator*(Vec4d const& a, double b) { return a * Vec4d(b); }
+static inline Vec4d operator*(double a, Vec4d const& b) { return Vec4d(a) * b; }
+
+// vector operator *= : multiply
+static inline Vec4d& operator*=(Vec4d& a, Vec4d const& b)
+{
+  a = a * b;
+  return a;
+}
+
+// vector operator / : divide element by element
+static inline Vec4d operator/(Vec4d const& a, Vec4d const& b) { return _mm256_div_pd(a, b); }
+
+// vector operator / : divide vector and scalar
+static inline Vec4d operator/(Vec4d const& a, double b) { return a / Vec4d(b); }
+static inline Vec4d operator/(double a, Vec4d const& b) { return Vec4d(a) / b; }
+
+// vector operator /= : divide
+static inline Vec4d& operator/=(Vec4d& a, Vec4d const& b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec4db operator==(Vec4d const& a, Vec4d const& b) { return _mm256_cmp_pd(a, b, 0); }
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec4db operator!=(Vec4d const& a, Vec4d const& b) { return _mm256_cmp_pd(a, b, 4); }
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec4db operator<(Vec4d const& a, Vec4d const& b) { return _mm256_cmp_pd(a, b, 1); }
+
+// vector operator <= : returns true for elements for which a <= b
+static inline Vec4db operator<=(Vec4d const& a, Vec4d const& b) { return _mm256_cmp_pd(a, b, 2); }
+
+// vector operator > : returns true for elements for which a > b
+static inline Vec4db operator>(Vec4d const& a, Vec4d const& b) { return b < a; }
+
+// vector operator >= : returns true for elements for which a >= b
+static inline Vec4db operator>=(Vec4d const& a, Vec4d const& b) { return b <= a; }
+
+// Bitwise logical operators
+
+// vector operator & : bitwise and
+static inline Vec4d operator&(Vec4d const& a, Vec4d const& b) { return _mm256_and_pd(a, b); }
+
+// vector operator &= : bitwise and
+static inline Vec4d& operator&=(Vec4d& a, Vec4d const& b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator & : bitwise and of Vec4d and Vec4db
+static inline Vec4d operator&(Vec4d const& a, Vec4db const& b) { return _mm256_and_pd(a, b); }
+static inline Vec4d operator&(Vec4db const& a, Vec4d const& b) { return _mm256_and_pd(a, b); }
+
+// vector operator | : bitwise or
+static inline Vec4d operator|(Vec4d const& a, Vec4d const& b) { return _mm256_or_pd(a, b); }
+
+// vector operator |= : bitwise or
+static inline Vec4d& operator|=(Vec4d& a, Vec4d const& b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec4d operator^(Vec4d const& a, Vec4d const& b) { return _mm256_xor_pd(a, b); }
+
+// vector operator ^= : bitwise xor
+static inline Vec4d& operator^=(Vec4d& a, Vec4d const& b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ! : logical not. Returns Boolean vector
+static inline Vec4db operator!(Vec4d const& a) { return a == Vec4d(0.0); }
+
+/*****************************************************************************
+ *
+ *          Functions for Vec4d
+ *
+ *****************************************************************************/
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true).
+// No other values are allowed.
+static inline Vec4d select(Vec4db const& s, Vec4d const& a, Vec4d const& b) { return _mm256_blendv_pd(b, a, s); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec4d if_add(Vec4db const& f, Vec4d const& a, Vec4d const& b) { return a + (Vec4d(f) & b); }
+
+// Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i]
+static inline Vec4d if_mul(Vec4db const& f, Vec4d const& a, Vec4d const& b) { return a * select(f, b, 1.); }
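+
+// Example (illustrative usage of select / if_add, added for clarity):
+//   Vec4d a(1., 2., 3., 4.), b(10., 10., 10., 10.);
+//   Vec4db m = a > Vec4d(2.5);     // m is (false, false, true, true)
+//   select(m, b, a)                // is (1, 2, 10, 10)
+//   if_add(m, a, b)                // is (1, 2, 13, 14)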
+
+// General arithmetic functions, etc.
+
+// Horizontal add: Calculates the sum of all vector elements.
+static inline double horizontal_add(Vec4d const& a)
+{
+  __m256d t1 = _mm256_hadd_pd(a, a);
+  __m128d t2 = _mm256_extractf128_pd(t1, 1);
+  __m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1), t2);
+  return _mm_cvtsd_f64(t3);
+}
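+
+// Example (illustrative): horizontal_add(Vec4d(1., 2., 3., 4.)) returns 10.0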
+
+// function max: a > b ? a : b
+static inline Vec4d max(Vec4d const& a, Vec4d const& b) { return _mm256_max_pd(a, b); }
+
+// function min: a < b ? a : b
+static inline Vec4d min(Vec4d const& a, Vec4d const& b) { return _mm256_min_pd(a, b); }
+
+// function abs: absolute value
+// Removes sign bit, even for -0.0, -INF and -NAN
+static inline Vec4d abs(Vec4d const& a)
+{
+  __m256d mask = _mm256_castps_pd(constant8f<-1, 0x7FFFFFFF, -1, 0x7FFFFFFF, -1, 0x7FFFFFFF, -1, 0x7FFFFFFF>());
+  return _mm256_and_pd(a, mask);
+}
+
+// function sqrt: square root
+static inline Vec4d sqrt(Vec4d const& a) { return _mm256_sqrt_pd(a); }
+
+// function square: a * a
+static inline Vec4d square(Vec4d const& a) { return a * a; }
+
+// pow(Vec4d, int):
+template <typename TT>
+static Vec4d pow(Vec4d const& a, TT const& n);
+
+// Raise floating point numbers to integer power n
+template <>
+inline Vec4d pow<int>(Vec4d const& x0, int const& n)
+{
+  return pow_template_i<Vec4d>(x0, n);
+}
+
+// allow conversion from unsigned int
+template <>
+inline Vec4d pow<uint32_t>(Vec4d const& x0, uint32_t const& n)
+{
+  return pow_template_i<Vec4d>(x0, (int)n);
+}
+
+// Raise floating point numbers to integer power n, where n is a compile-time constant
+template <int n>
+static inline Vec4d pow_n(Vec4d const& a)
+{
+  if(n < 0)
+    return Vec4d(1.0) / pow_n<-n>(a);
+  if(n == 0)
+    return Vec4d(1.0);
+  if(n >= 256)
+    return pow(a, n);
+  Vec4d x = a;                           // a^(2^i)
+  Vec4d y;                               // accumulator
+  const int lowest = n - (n & (n - 1));  // lowest set bit in n
+  if(n & 1)
+    y = x;
+  if(n < 2)
+    return y;
+  x = x * x;  // x^2
+  if(n & 2)
+    {
+      if(lowest == 2)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 4)
+    return y;
+  x = x * x;  // x^4
+  if(n & 4)
+    {
+      if(lowest == 4)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 8)
+    return y;
+  x = x * x;  // x^8
+  if(n & 8)
+    {
+      if(lowest == 8)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 16)
+    return y;
+  x = x * x;  // x^16
+  if(n & 16)
+    {
+      if(lowest == 16)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 32)
+    return y;
+  x = x * x;  // x^32
+  if(n & 32)
+    {
+      if(lowest == 32)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 64)
+    return y;
+  x = x * x;  // x^64
+  if(n & 64)
+    {
+      if(lowest == 64)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 128)
+    return y;
+  x = x * x;  // x^128
+  if(n & 128)
+    {
+      if(lowest == 128)
+        y = x;
+      else
+        y *= x;
+    }
+  return y;
+}
+
+template <int n>
+static inline Vec4d pow(Vec4d const& a, Const_int_t<n>)
+{
+  return pow_n<n>(a);
+}
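+
+// Example (illustrative):
+//   pow_n<3>(Vec4d(2.))   // is (8, 8, 8, 8), computed by repeated squaring (x*x, then y*x)
+//   pow(Vec4d(2.), 3)     // same result, with the exponent evaluated at run time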
+
+// function round: round to nearest integer (even). (result as double vector)
+static inline Vec4d round(Vec4d const& a) { return _mm256_round_pd(a, 0 + 8); }
+
+// function truncate: round towards zero. (result as double vector)
+static inline Vec4d truncate(Vec4d const& a) { return _mm256_round_pd(a, 3 + 8); }
+
+// function floor: round towards minus infinity. (result as double vector)
+static inline Vec4d floor(Vec4d const& a) { return _mm256_round_pd(a, 1 + 8); }
+
+// function ceil: round towards plus infinity. (result as double vector)
+static inline Vec4d ceil(Vec4d const& a) { return _mm256_round_pd(a, 2 + 8); }
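+
+// Example (illustrative), for a = Vec4d(2.5, -2.5, 1.7, -1.7):
+//   round(a)    is ( 2, -2,  2, -2)   // ties go to the nearest even integer
+//   truncate(a) is ( 2, -2,  1, -1)
+//   floor(a)    is ( 2, -3,  1, -2)
+//   ceil(a)     is ( 3, -2,  2, -1)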
+
+// function round_to_int: round to nearest integer (even). (result as integer vector)
+static inline Vec4i round_to_int(Vec4d const& a)
+{
+  // Note: assume MXCSR control register is set to the default rounding mode (round to nearest)
+  return _mm256_cvtpd_epi32(a);
+}
+
+// function truncate_to_int: round towards zero. (result as integer vector)
+static inline Vec4i truncate_to_int(Vec4d const& a) { return _mm256_cvttpd_epi32(a); }
+
+#ifdef VECTORI256_H  // 256 bit integer vectors are available
+
+// function truncate_to_int64: round towards zero. (inefficient)
+static inline Vec4q truncate_to_int64(Vec4d const& a)
+{
+#if defined(__AVX512DQ__) && defined(__AVX512VL__)
+  // return _mm256_maskz_cvttpd_epi64( __mmask8(0xFF), a);
+  return _mm256_cvttpd_epi64(a);
+#else
+  double aa[4];
+  a.store(aa);
+  return Vec4q(int64_t(aa[0]), int64_t(aa[1]), int64_t(aa[2]), int64_t(aa[3]));
+#endif
+}
+
+// function truncate_to_int64_limited: round towards zero.
+// result as 64-bit integer vector, but with limited range. Deprecated!
+static inline Vec4q truncate_to_int64_limited(Vec4d const& a)
+{
+#if defined(__AVX512DQ__) && defined(__AVX512VL__)
+  return truncate_to_int64(a);
+#elif VECTORI256_H > 1
+  // Note: assume MXCSR control register is set to the default rounding mode (round to nearest)
+  Vec2q b = _mm256_cvttpd_epi32(a);                      // round to 32-bit integers
+  __m256i c = permute4q<0, -256, 1, -256>(Vec4q(b, b));  // get bits 64-127 to position 128-191
+  __m256i s = _mm256_srai_epi32(c, 31);                  // sign extension bits
+  return _mm256_unpacklo_epi32(c, s);                    // interleave with sign extensions
+#else
+  return Vec4q(truncate_to_int64_limited(a.get_low()), truncate_to_int64_limited(a.get_high()));
+#endif
+}
+
+// function round_to_int64: round to nearest or even. (inefficient)
+static inline Vec4q round_to_int64(Vec4d const& a)
+{
+#if defined(__AVX512DQ__) && defined(__AVX512VL__)
+  return _mm256_cvtpd_epi64(a);
+#else
+  return truncate_to_int64(round(a));
+#endif
+}
+
+// function round_to_int64_limited: round to nearest integer (even)
+// result as 64-bit integer vector, but with limited range. Deprecated!
+static inline Vec4q round_to_int64_limited(Vec4d const& a)
+{
+#if defined(__AVX512DQ__) && defined(__AVX512VL__)
+  return round_to_int64(a);
+#elif VECTORI256_H > 1
+  // Note: assume MXCSR control register is set to the default rounding mode (round to nearest)
+  Vec2q b = _mm256_cvtpd_epi32(a);                       // round to 32-bit integers
+  __m256i c = permute4q<0, -256, 1, -256>(Vec4q(b, b));  // get bits 64-127 to position 128-191
+  __m256i s = _mm256_srai_epi32(c, 31);                  // sign extension bits
+  return _mm256_unpacklo_epi32(c, s);                    // interleave with sign extensions
+#else
+  return Vec4q(round_to_int64_limited(a.get_low()), round_to_int64_limited(a.get_high()));
+#endif
+}
+
+// function to_double: convert integer vector elements to double vector (inefficient)
+static inline Vec4d to_double(Vec4q const& a)
+{
+#if defined(__AVX512DQ__) && defined(__AVX512VL__)
+  return _mm256_maskz_cvtepi64_pd(__mmask16(0xFF), a);
+#else
+  int64_t aa[4];
+  a.store(aa);
+  return Vec4d(double(aa[0]), double(aa[1]), double(aa[2]), double(aa[3]));
+#endif
+}
+
+// function to_double_limited: convert integer vector elements to double vector
+// limited to abs(x) < 2^31. Deprecated!
+static inline Vec4d to_double_limited(Vec4q const& x)
+{
+#if defined(__AVX512DQ__) && defined(__AVX512VL__)
+  return to_double(x);
+#else
+  Vec8i compressed = permute8i<0, 2, 4, 6, -256, -256, -256, -256>(Vec8i(x));
+  return _mm256_cvtepi32_pd(compressed.get_low());  // AVX
+#endif
+}
+
+#endif  // VECTORI256_H
+
+// function to_double: convert integer vector to double vector
+static inline Vec4d to_double(Vec4i const& a) { return _mm256_cvtepi32_pd(a); }
+
+// function compress: convert two Vec4d to one Vec8f
+static inline Vec8f compress(Vec4d const& low, Vec4d const& high)
+{
+  __m128 t1 = _mm256_cvtpd_ps(low);
+  __m128 t2 = _mm256_cvtpd_ps(high);
+  return Vec8f(t1, t2);
+}
+
+// Function extend_low : convert Vec8f vector elements 0 - 3 to Vec4d
+static inline Vec4d extend_low(Vec8f const& a) { return _mm256_cvtps_pd(_mm256_castps256_ps128(a)); }
+
+// Function extend_high : convert Vec8f vector elements 4 - 7 to Vec4d
+static inline Vec4d extend_high(Vec8f const& a) { return _mm256_cvtps_pd(_mm256_extractf128_ps(a, 1)); }
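+
+// Example (illustrative):
+//   compress(Vec4d(1., 2., 3., 4.), Vec4d(5., 6., 7., 8.))  is Vec8f(1, 2, 3, 4, 5, 6, 7, 8);
+//   extend_low / extend_high reverse this split, apart from the float rounding in compress.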
+
+// Fused multiply and add functions
+
+// Multiply and add
+static inline Vec4d mul_add(Vec4d const& a, Vec4d const& b, Vec4d const& c)
+{
+#ifdef __FMA__
+  return _mm256_fmadd_pd(a, b, c);
+#elif defined(__FMA4__)
+  return _mm256_macc_pd(a, b, c);
+#else
+  return a * b + c;
+#endif
+}
+
+// Multiply and subtract
+static inline Vec4d mul_sub(Vec4d const& a, Vec4d const& b, Vec4d const& c)
+{
+#ifdef __FMA__
+  return _mm256_fmsub_pd(a, b, c);
+#elif defined(__FMA4__)
+  return _mm256_msub_pd(a, b, c);
+#else
+  return a * b - c;
+#endif
+}
+
+// Multiply and inverse subtract: computes -(a * b) + c
+static inline Vec4d nmul_add(Vec4d const& a, Vec4d const& b, Vec4d const& c)
+{
+#ifdef __FMA__
+  return _mm256_fnmadd_pd(a, b, c);
+#elif defined(__FMA4__)
+  return _mm256_nmacc_pd(a, b, c);
+#else
+  return c - a * b;
+#endif
+}
+
+// Multiply and subtract with extra precision on the intermediate calculations,
+// even if FMA instructions not supported, using Veltkamp-Dekker split
+static inline Vec4d mul_sub_x(Vec4d const& a, Vec4d const& b, Vec4d const& c)
+{
+#ifdef __FMA__
+  return _mm256_fmsub_pd(a, b, c);
+#elif defined(__FMA4__)
+  return _mm256_msub_pd(a, b, c);
+#else
+  // calculate a * b - c with extra precision
+  // mask to remove lower 27 bits
+  Vec4d upper_mask =
+      _mm256_castps_pd(constant8f<(int)0xF8000000, -1, (int)0xF8000000, -1, (int)0xF8000000, -1, (int)0xF8000000, -1>());
+  Vec4d a_high = a & upper_mask;  // split into high and low parts
+  Vec4d b_high = b & upper_mask;
+  Vec4d a_low  = a - a_high;
+  Vec4d b_low  = b - b_high;
+  Vec4d r1     = a_high * b_high;                                         // this product is exact
+  Vec4d r2     = r1 - c;                                                  // subtract c from high product
+  Vec4d r3     = r2 + (a_high * b_low + b_high * a_low) + a_low * b_low;  // add rest of product
+  return r3;                                                              // + ((r2 - r1) + c);
+#endif
+}
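+
+// One possible use (illustrative, not from the original source): recovering the
+// rounding error of a product, as in double-double style arithmetic:
+//   Vec4d hi = a * b;                 // rounded product
+//   Vec4d lo = mul_sub_x(a, b, hi);   // a*b - hi with extra precision, i.e. (approximately)
+//                                     // the rounding error of hi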
+
+// Math functions using fast bit manipulation
+
+#ifdef VECTORI256_H   // 256 bit integer vectors are available
+// Extract the exponent as an integer
+// exponent(a) = floor(log2(abs(a)));
+// exponent(1.0) = 0, exponent(0.0) = -1023, exponent(INF) = +1024, exponent(NAN) = +1024
+static inline Vec4q exponent(Vec4d const& a)
+{
+#if VECTORI256_H > 1  // AVX2
+  Vec4uq t1 = _mm256_castpd_si256(a);  // reinterpret as 64-bit integer
+  Vec4uq t2 = t1 << 1;                 // shift out sign bit
+  Vec4uq t3 = t2 >> 53;                // shift down logical to position 0
+  Vec4q t4  = Vec4q(t3) - 0x3FF;       // subtract bias from exponent
+  return t4;
+#else
+  return Vec4q(exponent(a.get_low()), exponent(a.get_high()));
+#endif
+}
+
+// Extract the fraction part of a floating point number
+// a = 2^exponent(a) * fraction(a), except for a = 0
+// fraction(1.0) = 1.0, fraction(5.0) = 1.25
+static inline Vec4d fraction(Vec4d const& a)
+{
+#if VECTORI256_H > 1  // AVX2
+  Vec4uq t1 = _mm256_castpd_si256(a);                                  // reinterpret as 64-bit integer
+  Vec4uq t2 = Vec4uq((t1 & 0x000FFFFFFFFFFFFF) | 0x3FF0000000000000);  // set exponent to 0 + bias
+  return _mm256_castsi256_pd(t2);
+#else
+  return Vec4d(fraction(a.get_low()), fraction(a.get_high()));
+#endif
+}
+
+// Fast calculation of pow(2,n) with n integer
+// n  =     0 gives 1.0
+// n >=  1024 gives +INF
+// n <= -1023 gives 0.0
+// This function will never produce denormals, and never raise exceptions
+static inline Vec4d exp2(Vec4q const& n)
+{
+#if VECTORI256_H > 1  // AVX2
+  Vec4q t1 = max(n, -0x3FF);  // limit to allowed range
+  Vec4q t2 = min(t1, 0x400);
+  Vec4q t3 = t2 + 0x3FF;           // add bias
+  Vec4q t4 = t3 << 52;             // put exponent into position 52
+  return _mm256_castsi256_pd(t4);  // reinterpret as double
+#else
+  return Vec4d(exp2(n.get_low()), exp2(n.get_high()));
+#endif
+}
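+
+// Example (illustrative): exp2(Vec4q(-1, 0, 3, 10)) gives (0.5, 1, 8, 1024)
+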
+// static inline Vec4d exp2(Vec4d const & x); // defined in vectormath_exp.h
+#endif
+
+// Categorization functions
+
+// Function sign_bit: gives true for elements that have the sign bit set
+// even for -0.0, -INF and -NAN
+// Note that sign_bit(Vec4d(-0.0)) gives true, while Vec4d(-0.0) < Vec4d(0.0) gives false
+static inline Vec4db sign_bit(Vec4d const& a)
+{
+#if defined(VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+  Vec4q t1 = _mm256_castpd_si256(a);  // reinterpret as 64-bit integer
+  Vec4q t2 = t1 >> 63;                // extend sign bit
+  return _mm256_castsi256_pd(t2);     // reinterpret as 64-bit Boolean
+#else
+  return Vec4db(sign_bit(a.get_low()), sign_bit(a.get_high()));
+#endif
+}
+
+// Function sign_combine: changes the sign of a when b has the sign bit set
+// same as select(sign_bit(b), -a, a)
+static inline Vec4d sign_combine(Vec4d const& a, Vec4d const& b)
+{
+  Vec4d signmask =
+      _mm256_castps_pd(constant8f<0, (int)0x80000000, 0, (int)0x80000000, 0, (int)0x80000000, 0, (int)0x80000000>());  // -0.0
+  return a ^ (b & signmask);
+}
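+
+// Example (illustrative):
+//   sign_combine(Vec4d(2., -2., 2., -2.), Vec4d(1., 1., -1., -1.))  gives (2, -2, -2, 2)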
+
+// Function is_finite: gives true for elements that are normal, denormal or zero,
+// false for INF and NAN
+static inline Vec4db is_finite(Vec4d const& a)
+{
+#if defined(VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+  Vec4q t1  = _mm256_castpd_si256(a);  // reinterpret as 64-bit integer
+  Vec4q t2  = t1 << 1;                 // shift out sign bit
+  Vec4q t3  = 0xFFE0000000000000;      // exponent mask
+  Vec4qb t4 = Vec4q(t2 & t3) != t3;    // exponent field is not all 1s
+  return t4;
+#else
+  return Vec4db(is_finite(a.get_low()), is_finite(a.get_high()));
+#endif
+}
+
+// Function is_inf: gives true for elements that are +INF or -INF
+// false for finite numbers and NAN
+static inline Vec4db is_inf(Vec4d const& a)
+{
+#if defined(VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+  Vec4q t1 = _mm256_castpd_si256(a);  // reinterpret as 64-bit integer
+  Vec4q t2 = t1 << 1;                 // shift out sign bit
+  return t2 == 0xFFE0000000000000;    // exponent is all 1s, fraction is 0
+#else
+  return Vec4db(is_inf(a.get_low()), is_inf(a.get_high()));
+#endif
+}
+
+// Function is_nan: gives true for elements that are +NAN or -NAN
+// false for finite numbers and +/-INF
+static inline Vec4db is_nan(Vec4d const& a)
+{
+#if defined(VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+  Vec4q t1 = _mm256_castpd_si256(a);       // reinterpret as 64-bit integer
+  Vec4q t2 = t1 << 1;                      // shift out sign bit
+  Vec4q t3 = 0xFFE0000000000000;           // exponent mask
+  Vec4q t4 = t2 & t3;                      // exponent
+  Vec4q t5 = _mm256_andnot_si256(t3, t2);  // fraction
+  return Vec4qb(t4 == t3 && t5 != 0);      // exponent = all 1s and fraction != 0
+#else
+  return Vec4db(is_nan(a.get_low()), is_nan(a.get_high()));
+#endif
+}
+
+// Function is_subnormal: gives true for elements that are denormal (subnormal)
+// false for finite numbers, zero, NAN and INF
+static inline Vec4db is_subnormal(Vec4d const& a)
+{
+#if defined(VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+  Vec4q t1 = _mm256_castpd_si256(a);       // reinterpret as 64-bit integer
+  Vec4q t2 = t1 << 1;                      // shift out sign bit
+  Vec4q t3 = 0xFFE0000000000000;           // exponent mask
+  Vec4q t4 = t2 & t3;                      // exponent
+  Vec4q t5 = _mm256_andnot_si256(t3, t2);  // fraction
+  return Vec4qb(t4 == 0 && t5 != 0);       // exponent = 0 and fraction != 0
+#else
+  return Vec4db(is_subnormal(a.get_low()), is_subnormal(a.get_high()));
+#endif
+}
+
+// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
+// false for finite numbers, NAN and INF
+static inline Vec4db is_zero_or_subnormal(Vec4d const& a)
+{
+#if defined(VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+  Vec4q t = _mm256_castpd_si256(a);  // reinterpret as 64-bit integer
+  t &= 0x7FF0000000000000ll;         // isolate exponent
+  return t == 0;                     // exponent = 0
+#else
+  return Vec4db(is_zero_or_subnormal(a.get_low()), is_zero_or_subnormal(a.get_high()));
+#endif
+}
+
+// Function infinite4d: returns a vector where all elements are +INF
+static inline Vec4d infinite4d() { return _mm256_castps_pd(constant8f<0, 0x7FF00000, 0, 0x7FF00000, 0, 0x7FF00000, 0, 0x7FF00000>()); }
+
+// Function nan4d: returns a vector where all elements are +NAN (quiet)
+static inline Vec4d nan4d(int n = 0x10)
+{
+#if defined(VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+  return _mm256_castsi256_pd(Vec4q(0x7FF8000000000000 + n));
+#else
+  return Vec4d(nan2d(n), nan2d(n));
+#endif
+}
+
+// change signs on vectors Vec4d
+// Each index i0 - i3 is 1 for changing sign on the corresponding element, 0 for no change
+template <int i0, int i1, int i2, int i3>
+static inline Vec4d change_sign(Vec4d const& a)
+{
+  if((i0 | i1 | i2 | i3) == 0)
+    return a;
+  __m256 mask = constant8f < 0, i0 ? (int)0x80000000 : 0, 0, i1 ? (int)0x80000000 : 0, 0, i2 ? (int)0x80000000 : 0, 0,
+         i3 ? (int)0x80000000 : 0 > ();
+  return _mm256_xor_pd(a, _mm256_castps_pd(mask));
+}
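+
+// Example (illustrative): change_sign<0, 1, 0, 1>(Vec4d(1., 2., 3., 4.)) gives (1, -2, 3, -4)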
+
+/*****************************************************************************
+ *
+ *          Functions for reinterpretation between vector types
+ *
+ *****************************************************************************/
+
+#if defined(VECTORI256_H) && VECTORI256_H >= 2
+// AVX2 vectors defined
+
+// ABI version 4 or later needed on Gcc for correct mangling of 256-bit intrinsic vectors.
+// It is recommended to compile with -fabi-version=0 to get the latest abi version
+#if !defined(GCC_VERSION) || (defined(__GXX_ABI_VERSION) && __GXX_ABI_VERSION >= 1004)
+static inline __m256i reinterpret_i(__m256i const& x) { return x; }
+
+static inline __m256i reinterpret_i(__m256 const& x) { return _mm256_castps_si256(x); }
+
+static inline __m256i reinterpret_i(__m256d const& x) { return _mm256_castpd_si256(x); }
+
+static inline __m256 reinterpret_f(__m256i const& x) { return _mm256_castsi256_ps(x); }
+
+static inline __m256 reinterpret_f(__m256 const& x) { return x; }
+
+static inline __m256 reinterpret_f(__m256d const& x) { return _mm256_castpd_ps(x); }
+
+static inline __m256d reinterpret_d(__m256i const& x) { return _mm256_castsi256_pd(x); }
+
+static inline __m256d reinterpret_d(__m256 const& x) { return _mm256_castps_pd(x); }
+
+static inline __m256d reinterpret_d(__m256d const& x) { return x; }
+
+#else  // __GXX_ABI_VERSION < 1004
+
+static inline __m256i reinterpret_i(Vec32c const& x) { return x; }
+
+static inline __m256i reinterpret_i(Vec16s const& x) { return x; }
+
+static inline __m256i reinterpret_i(Vec8i const& x) { return x; }
+
+static inline __m256i reinterpret_i(Vec4q const& x) { return x; }
+
+static inline __m256i reinterpret_i(Vec8f const& x) { return _mm256_castps_si256(x); }
+
+static inline __m256i reinterpret_i(Vec4d const& x) { return _mm256_castpd_si256(x); }
+
+static inline __m256 reinterpret_f(Vec32c const& x) { return _mm256_castsi256_ps(x); }
+
+static inline __m256 reinterpret_f(Vec16s const& x) { return _mm256_castsi256_ps(x); }
+
+static inline __m256 reinterpret_f(Vec8i const& x) { return _mm256_castsi256_ps(x); }
+
+static inline __m256 reinterpret_f(Vec4q const& x) { return _mm256_castsi256_ps(x); }
+
+static inline __m256 reinterpret_f(Vec8f const& x) { return x; }
+
+static inline __m256 reinterpret_f(Vec4d const& x) { return _mm256_castpd_ps(x); }
+
+static inline __m256d reinterpret_d(Vec32c const& x) { return _mm256_castsi256_pd(x); }
+
+static inline __m256d reinterpret_d(Vec16s const& x) { return _mm256_castsi256_pd(x); }
+
+static inline __m256d reinterpret_d(Vec8i const& x) { return _mm256_castsi256_pd(x); }
+
+static inline __m256d reinterpret_d(Vec4q const& x) { return _mm256_castsi256_pd(x); }
+
+static inline __m256d reinterpret_d(Vec8f const& x) { return _mm256_castps_pd(x); }
+
+static inline __m256d reinterpret_d(Vec4d const& x) { return x; }
+
+#endif  // __GXX_ABI_VERSION
+
+#else
+// AVX2 emulated in vectori256e.h, AVX supported
+
+// ABI version 4 or later needed on Gcc for correct mangling of 256-bit intrinsic vectors.
+// It is recommended to compile with -fabi-version=0 to get the latest abi version
+#if !defined(GCC_VERSION) || (defined(__GXX_ABI_VERSION) && __GXX_ABI_VERSION >= 1004)
+
+static inline Vec256ie reinterpret_i(__m256 const& x)
+{
+  Vec8f xx(x);
+  return Vec256ie(reinterpret_i(xx.get_low()), reinterpret_i(xx.get_high()));
+}
+
+static inline Vec256ie reinterpret_i(__m256d const& x)
+{
+  Vec4d xx(x);
+  return Vec256ie(reinterpret_i(xx.get_low()), reinterpret_i(xx.get_high()));
+}
+
+static inline __m256 reinterpret_f(__m256 const& x) { return x; }
+
+static inline __m256 reinterpret_f(__m256d const& x) { return _mm256_castpd_ps(x); }
+
+static inline __m256d reinterpret_d(__m256 const& x) { return _mm256_castps_pd(x); }
+
+static inline __m256d reinterpret_d(__m256d const& x) { return x; }
+
+#else  // __GXX_ABI_VERSION < 1004
+
+static inline Vec256ie reinterpret_i(Vec8f const& x)
+{
+  Vec8f xx(x);
+  return Vec256ie(reinterpret_i(xx.get_low()), reinterpret_i(xx.get_high()));
+}
+
+static inline Vec256ie reinterpret_i(Vec4d const& x)
+{
+  Vec4d xx(x);
+  return Vec256ie(reinterpret_i(xx.get_low()), reinterpret_i(xx.get_high()));
+}
+
+static inline __m256 reinterpret_f(Vec8f const& x) { return x; }
+
+static inline __m256 reinterpret_f(Vec4d const& x) { return _mm256_castpd_ps(x); }
+
+static inline __m256d reinterpret_d(Vec8f const& x) { return _mm256_castps_pd(x); }
+
+static inline __m256d reinterpret_d(Vec4d const& x) { return x; }
+
+#endif  // __GXX_ABI_VERSION
+
+static inline Vec256ie reinterpret_i(Vec256ie const& x) { return x; }
+
+static inline __m256 reinterpret_f(Vec256ie const& x)
+{
+  return Vec8f(Vec4f(reinterpret_f(x.get_low())), Vec4f(reinterpret_f(x.get_high())));
+}
+
+static inline __m256d reinterpret_d(Vec256ie const& x)
+{
+  return Vec4d(Vec2d(reinterpret_d(x.get_low())), Vec2d(reinterpret_d(x.get_high())));
+}
+
+#endif             // VECTORI256_H
+
+/*****************************************************************************
+ *
+ *          Vector permute and blend functions
+ *
+ ******************************************************************************
+ *
+ * The permute function can reorder the elements of a vector and optionally
+ * set some elements to zero.
+ *
+ * The indexes are inserted as template parameters in <>. These indexes must be
+ * constants. Each template parameter is an index to the element you want to
+ * select. An index of -1 will generate zero. An index of -256 means don't care.
+ *
+ * Example:
+ * Vec4d a(10., 11., 12., 13.);    // a is (10, 11, 12, 13)
+ * Vec4d b;
+ * b = permute4d<1,0,-1,3>(a);     // b is (11, 10,  0, 13)
+ *
+ *
+ * The blend function can mix elements from two different vectors and
+ * optionally set some elements to zero.
+ *
+ * The indexes are inserted as template parameters in <>. These indexes must be
+ * constants. Each template parameter is an index to the element you want to
+ * select, where indexes 0 - 3 indicate an element from the first source
+ * vector and indexes 4 - 7 indicate an element from the second source vector.
+ * A negative index will generate zero.
+ *
+ *
+ * Example:
+ * Vec4d a(10., 11., 12., 13.);    // a is (10, 11, 12, 13)
+ * Vec4d b(20., 21., 22., 23.);    // b is (20, 21, 22, 23)
+ * Vec4d c;
+ * c = blend4d<4,3,7,-1> (a,b);    // c is (20, 13, 23,  0)
+ *
+ * A lot of the code here is metaprogramming aiming to find the instructions
+ * that best fit the template parameters and instruction set. The metacode
+ * will be reduced out to leave only a few vector instructions in release
+ * mode with optimization on.
+ *****************************************************************************/
+
+// permute vector Vec4d
+template <int i0, int i1, int i2, int i3>
+static inline Vec4d permute4d(Vec4d const& a)
+{
+  const int ior = i0 | i1 | i2 | i3;  // OR indexes
+
+  // is zeroing needed
+  const bool do_zero = ior < 0 && (ior & 0x80);  // at least one index is negative, and not -0x100
+
+  // is shuffling needed
+  const bool do_shuffle = (i0 > 0) || (i1 != 1 && i1 >= 0) || (i2 != 2 && i2 >= 0) || (i3 != 3 && i3 >= 0);
+
+  if(!do_shuffle)
+    {  // no shuffling needed
+      if(do_zero)
+        {  // zeroing
+          if((i0 & i1 & i2 & i3) < 0)
+            {
+              return _mm256_setzero_pd();  // zero everything
+            }
+          // zero some elements
+          __m256d const mask = _mm256_castps_pd(constant8f<-int(i0 >= 0), -int(i0 >= 0), -int(i1 >= 0), -int(i1 >= 0), -int(i2 >= 0),
+                                                           -int(i2 >= 0), -int(i3 >= 0), -int(i3 >= 0)>());
+          return _mm256_and_pd(a, mask);  // zero with AND mask
+        }
+      else
+        {
+          return a;  // do nothing
+        }
+    }
+#if INSTRSET >= 8  // AVX2: use VPERMPD
+  __m256d x = _mm256_permute4x64_pd(a, (i0 & 3) | (i1 & 3) << 2 | (i2 & 3) << 4 | (i3 & 3) << 6);
+  if(do_zero)
+    {  // zeroing
+      // zero some elements
+      __m256d const mask2 = _mm256_castps_pd(constant8f<-int(i0 >= 0), -int(i0 >= 0), -int(i1 >= 0), -int(i1 >= 0), -int(i2 >= 0),
+                                                        -int(i2 >= 0), -int(i3 >= 0), -int(i3 >= 0)>());
+      x                   = _mm256_and_pd(x, mask2);  // zero with AND mask
+    }
+  return x;
+#else              // AVX
+
+  // Needed contents of low/high part of each source register in VSHUFPD
+  // 0: a.low, 1: a.high, 3: zero
+  const int s1 = (i0 < 0 ? 3 : (i0 & 2) >> 1) | (i2 < 0 ? 0x30 : (i2 & 2) << 3);
+  const int s2 = (i1 < 0 ? 3 : (i1 & 2) >> 1) | (i3 < 0 ? 0x30 : (i3 & 2) << 3);
+  // permute mask
+  const int sm = (i0 < 0 ? 0 : (i0 & 1)) | (i1 < 0 ? 1 : (i1 & 1)) << 1 | (i2 < 0 ? 0 : (i2 & 1)) << 2 | (i3 < 0 ? 1 : (i3 & 1)) << 3;
+
+  if(s1 == 0x01 || s1 == 0x11 || s2 == 0x01 || s2 == 0x11)
+    {
+      // too expensive to use 256 bit permute, split into two 128 bit permutes
+      Vec2d alo = a.get_low();
+      Vec2d ahi = a.get_high();
+      Vec2d rlo = blend2d<i0, i1>(alo, ahi);
+      Vec2d rhi = blend2d<i2, i3>(alo, ahi);
+      return Vec4d(rlo, rhi);
+    }
+
+  // make operands for VSHUFPD
+  __m256d r1, r2;
+
+  switch(s1)
+    {
+      case 0x00:  // LL
+        r1 = _mm256_insertf128_pd(a, _mm256_castpd256_pd128(a), 1);
+        break;
+      case 0x03:  // ZL
+        r1 = _mm256_insertf128_pd(do_zero ? _mm256_setzero_pd() : __m256d(a), _mm256_castpd256_pd128(a), 1);
+        break;
+      case 0x10:  // LH
+        r1 = a;
+        break;
+      case 0x13:  // ZH
+        r1 = do_zero ? _mm256_and_pd(a, _mm256_castps_pd(constant8f<0, 0, 0, 0, -1, -1, -1, -1>())) : __m256d(a);
+        break;
+      case 0x30:  // LZ
+        if(do_zero)
+          {
+            __m128d t = _mm256_castpd256_pd128(a);
+            t = _mm_and_pd(t, t);
+            r1 = _mm256_castpd128_pd256(t);
+          }
+        else
+          r1 = a;
+        break;
+      case 0x31:  // HZ
+        r1 = _mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1));
+        break;
+      case 0x33:  // ZZ
+        r1 = do_zero ? _mm256_setzero_pd() : __m256d(a);
+        break;
+      default:;  // Not needed. Avoid warning in Clang
+    }
+
+  if(s2 == s1)
+    {
+      if(sm == 0x0A)
+        return r1;
+      r2 = r1;
+    }
+  else
+    {
+      switch(s2)
+        {
+          case 0x00:  // LL
+            r2 = _mm256_insertf128_pd(a, _mm256_castpd256_pd128(a), 1);
+            break;
+          case 0x03:  // ZL
+            r2 = _mm256_insertf128_pd(do_zero ? _mm256_setzero_pd() : __m256d(a), _mm256_castpd256_pd128(a), 1);
+            break;
+          case 0x10:  // LH
+            r2 = a;
+            break;
+          case 0x13:  // ZH
+            r2 = do_zero ? _mm256_and_pd(a, _mm256_castps_pd(constant8f<0, 0, 0, 0, -1, -1, -1, -1>())) : __m256d(a);
+            break;
+          case 0x30:  // LZ
+            if(do_zero)
+              {
+                __m128d t = _mm256_castpd256_pd128(a);
+                t = _mm_and_pd(t, t);
+                r2 = _mm256_castpd128_pd256(t);
+              }
+            else
+              r2 = a;
+            break;
+          case 0x31:  // HZ
+            r2 = _mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1));
+            break;
+          case 0x33:  // ZZ
+            r2 = do_zero ? _mm256_setzero_pd() : __m256d(a);
+            break;
+          default:;  // Not needed. Avoid warning in Clang
+        }
+    }
+  return _mm256_shuffle_pd(r1, r2, sm);
+
+#endif  // INSTRSET >= 8
+}
+
+// blend vectors Vec4d
+template <int i0, int i1, int i2, int i3>
+static inline Vec4d blend4d(Vec4d const& a, Vec4d const& b)
+{
+  // Combine all the indexes into a single bitfield, with 8 bits for each
+  const int m1 = (i0 & 7) | (i1 & 7) << 8 | (i2 & 7) << 16 | (i3 & 7) << 24;
+
+  // Mask to zero out negative indexes
+  const uint32_t mz = (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF) << 8 | (i2 < 0 ? 0 : 0xFF) << 16 | (i3 < 0 ? 0 : 0xFF) << 24;
+
+  if(mz == 0)
+    return _mm256_setzero_pd();  // all zero
+
+  __m256d t1;
+  if((((m1 & 0xFEFEFEFE) ^ 0x06020400) & mz) == 0)
+    {
+      // fits VSHUFPD(a,b)
+      t1 = _mm256_shuffle_pd(a, b, (i0 & 1) | (i1 & 1) << 1 | (i2 & 1) << 2 | (i3 & 1) << 3);
+      if(mz == 0xFFFFFFFF)
+        return t1;
+      return permute4d < i0 < 0 ? -1 : 0, i1 < 0 ? -1 : 1, i2 < 0 ? -1 : 2, i3 < 0 ? -1 : 3 > (t1);
+    }
+  if((((m1 & 0xFEFEFEFE) ^ 0x02060004) & mz) == 0)
+    {
+      // fits VSHUFPD(b,a)
+      t1 = _mm256_shuffle_pd(b, a, (i0 & 1) | (i1 & 1) << 1 | (i2 & 1) << 2 | (i3 & 1) << 3);
+      if(mz == 0xFFFFFFFF)
+        return t1;
+      return permute4d < i0 < 0 ? -1 : 0, i1 < 0 ? -1 : 1, i2 < 0 ? -1 : 2, i3 < 0 ? -1 : 3 > (t1);
+    }
+  if((((m1 & 0x03030303) ^ 0x03020100) & mz) == 0)
+    {
+      // blend and zero, no permute
+      if((m1 & 0x04040404 & mz) == 0)
+        {
+          t1 = a;
+        }
+      else if(((m1 ^ 0x04040404) & 0x04040404 & mz) == 0)
+        {
+          t1 = b;
+        }
+      else
+        {
+          t1 = _mm256_blend_pd(a, b, (i0 & 4) >> 2 | (i1 & 4) >> 1 | (i2 & 4) | (i3 & 4) << 1);
+        }
+      if(mz == 0xFFFFFFFF)
+        return t1;
+      return permute4d < i0 < 0 ? -1 : 0, i1 < 0 ? -1 : 1, i2 < 0 ? -1 : 2, i3 < 0 ? -1 : 3 > (t1);
+    }
+  if((m1 & 0x04040404 & mz) == 0)
+    {
+      // all from a
+      return permute4d<i0, i1, i2, i3>(a);
+    }
+  if(((m1 ^ 0x04040404) & 0x04040404 & mz) == 0)
+    {
+      // all from b
+      return permute4d<i0 ^ 4, i1 ^ 4, i2 ^ 4, i3 ^ 4>(b);
+    }
+  // check if we can do 128-bit blend/permute
+  if(((m1 ^ 0x01000100) & 0x01010101 & mz) == 0)
+    {
+      const uint32_t j0 = uint32_t((i0 >= 0 ? i0 : i1 >= 0 ? i1 : -1) >> 1);
+      const uint32_t j1 = uint32_t((i2 >= 0 ? i2 : i3 >= 0 ? i3 : -1) >> 1);
+      if(((m1 ^ ((j0 & 3) * 0x00000202 | (j1 & 3) * 0x02020000)) & 0x06060606 & mz) == 0)
+        {
+          t1                     = _mm256_permute2f128_pd(a, b, (j0 & 0x0F) | (j1 & 0x0F) << 4);
+          const bool partialzero = (((i0 | i1) ^ j0) & 0x80) != 0 || (((i2 | i3) ^ j1) & 0x80) != 0;
+          if(partialzero)
+            {
+              // zero some elements
+              __m256d mask = _mm256_castps_pd(constant8f < i0 < 0 ? 0 : -1, i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i1 < 0 ? 0 : -1,
+                                              i2 < 0 ? 0 : -1, i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, i3 < 0 ? 0 : -1 > ());
+              return _mm256_and_pd(t1, mask);
+            }
+          else
+            return t1;
+        }
+    }
+  // general case. combine two permutes
+  Vec4d a1 = permute4d < (uint32_t)i0 < 4 ? i0 : -0x100, (uint32_t)i1 < 4 ? i1 : -0x100, (uint32_t)i2 < 4 ? i2 : -0x100,
+        (uint32_t)i3 < 4 ? i3 : -0x100 > (a);
+  Vec4d b1 = permute4d < (uint32_t)(i0 ^ 4) < 4 ? (i0 ^ 4) : -0x100, (uint32_t)(i1 ^ 4) < 4 ? (i1 ^ 4) : -0x100,
+        (uint32_t)(i2 ^ 4) < 4 ? (i2 ^ 4) : -0x100, (uint32_t)(i3 ^ 4) < 4 ? (i3 ^ 4) : -0x100 > (b);
+  t1 = _mm256_blend_pd(a1, b1, (i0 & 4) >> 2 | (i1 & 4) >> 1 | (i2 & 4) | (i3 & 4) << 1);
+  if(mz == 0xFFFFFFFF)
+    return t1;
+  return permute4d < i0 < 0 ? -1 : 0, i1 < 0 ? -1 : 1, i2 < 0 ? -1 : 2, i3 < 0 ? -1 : 3 > (t1);
+}
+
+/*****************************************************************************
+ *
+ *          Vector Vec8f permute and blend functions
+ *
+ *****************************************************************************/
+
+// permute vector Vec8f
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8f permute8f(Vec8f const& a)
+{
+  __m256 t1, mask;
+
+  const int ior = i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7;  // OR indexes
+
+  // is zeroing needed
+  const bool do_zero = ior < 0 && (ior & 0x80);  // at least one index is negative, and not -0x100
+
+  // is shuffling needed
+  const bool do_shuffle = (i0 > 0) || (i1 != 1 && i1 >= 0) || (i2 != 2 && i2 >= 0) || (i3 != 3 && i3 >= 0) || (i4 != 4 && i4 >= 0) ||
+                          (i5 != 5 && i5 >= 0) || (i6 != 6 && i6 >= 0) || (i7 != 7 && i7 >= 0);
+
+  if(!do_shuffle)
+    {  // no shuffling needed
+      if(do_zero)
+        {  // zeroing
+          if((i0 & i1 & i2 & i3 & i4 & i5 & i6 & i7) < 0)
+            {
+              return _mm256_setzero_ps();  // zero everything
+            }
+          // zero some elements
+          mask = constant8f<-int(i0 >= 0), -int(i1 >= 0), -int(i2 >= 0), -int(i3 >= 0), -int(i4 >= 0), -int(i5 >= 0), -int(i6 >= 0),
+                            -int(i7 >= 0)>();
+          return _mm256_and_ps(a, mask);  // zero with AND mask
+        }
+      else
+        {
+          return a;  // do nothing
+        }
+    }
+
+#if INSTRSET >= 8  // AVX2: use VPERMPS
+  if(do_shuffle)
+    {  // shuffling
+      mask = constant8f<i0 & 7, i1 & 7, i2 & 7, i3 & 7, i4 & 7, i5 & 7, i6 & 7, i7 & 7>();
+#if defined(_MSC_VER) && _MSC_VER < 1700 && !defined(__INTEL_COMPILER)
+      // bug in MS VS 11 beta: operands in wrong order. fixed in 11.0
+      t1 = _mm256_permutevar8x32_ps(mask, _mm256_castps_si256(a));  //  problem in immintrin.h
+#elif defined(GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__)
+      // Gcc 4.7.0 has wrong parameter type and operands in wrong order. fixed in version 4.7.1
+      t1 = _mm256_permutevar8x32_ps(mask, a);
+#else  // no bug version
+      t1 = _mm256_permutevar8x32_ps(a, _mm256_castps_si256(mask));
+#endif
+    }
+  else
+    {
+      t1 = a;  // no shuffling
+    }
+  if(do_zero)
+    {  // zeroing
+      if((i0 & i1 & i2 & i3 & i4 & i5 & i6 & i7) < 0)
+        {
+          return _mm256_setzero_ps();  // zero everything
+        }
+      // zero some elements
+      mask = constant8f<-int(i0 >= 0), -int(i1 >= 0), -int(i2 >= 0), -int(i3 >= 0), -int(i4 >= 0), -int(i5 >= 0), -int(i6 >= 0),
+                        -int(i7 >= 0)>();
+      t1   = _mm256_and_ps(t1, mask);  // zero with AND mask
+    }
+  return t1;
+#else  // AVX
+
+  // Combine all the indexes into a single bitfield, with 4 bits for each
+  const int m1 =
+      (i0 & 7) | (i1 & 7) << 4 | (i2 & 7) << 8 | (i3 & 7) << 12 | (i4 & 7) << 16 | (i5 & 7) << 20 | (i6 & 7) << 24 | (i7 & 7) << 28;
+
+  // Mask to zero out negative indexes
+  const int m2 = (i0 < 0 ? 0 : 0xF) | (i1 < 0 ? 0 : 0xF) << 4 | (i2 < 0 ? 0 : 0xF) << 8 | (i3 < 0 ? 0 : 0xF) << 12 |
+                 (i4 < 0 ? 0 : 0xF) << 16 | (i5 < 0 ? 0 : 0xF) << 20 | (i6 < 0 ? 0 : 0xF) << 24 | (i7 < 0 ? 0 : 0xF) << 28;
+
+  // Check if it is possible to use VSHUFPS. Index n must match index n+4 on bit 0-1,
+  // and even index n must match odd index n+1 on bit 2
+  const bool sps = ((m1 ^ (m1 >> 16)) & 0x3333 & m2 & (m2 >> 16)) == 0 && ((m1 ^ (m1 >> 4)) & 0x04040404 & m2 & m2 >> 4) == 0;
+
+  if(sps)
+    {  // can use VSHUFPS
+
+      // Index of each pair (i[n],i[n+1])
+      const int j0 = i0 >= 0 ? i0 : i1;
+      const int j1 = i2 >= 0 ? i2 : i3;
+      const int j2 = i4 >= 0 ? i4 : i5;
+      const int j3 = i6 >= 0 ? i6 : i7;
+
+      // Index of each pair (i[n],i[n+4])
+      const int k0 = i0 >= 0 ? i0 : i4;
+      const int k1 = i1 >= 0 ? i1 : i5;
+      const int k2 = i2 >= 0 ? i2 : i6;
+      const int k3 = i3 >= 0 ? i3 : i7;
+
+      // Needed contents of low/high part of each source register in VSHUFPS
+      // 0: a.low, 1: a.high, 3: zero or don't care
+      const int s1 = (j0 < 0 ? 3 : (j0 & 4) >> 2) | (j2 < 0 ? 0x30 : (j2 & 4) << 2);
+      const int s2 = (j1 < 0 ? 3 : (j1 & 4) >> 2) | (j3 < 0 ? 0x30 : (j3 & 4) << 2);
+
+      // calculate cost of using VSHUFPS
+      const int cost1 = (s1 == 0x01 || s1 == 0x11) ? 2 : (s1 == 0x00 || s1 == 0x03 || s1 == 0x31) ? 1 : 0;
+      const int cost2 = (s2 == s1)
+                            ? 0
+                            : (s2 == 0x01 || s2 == 0x11)
+                                  ? 2
+                                  : (s2 == 0x00 || (s2 == 0x03 && (s1 & 0xF0) != 0x00) || (s2 == 0x31 && (s1 & 0x0F) != 0x01)) ? 1 : 0;
+
+      if(cost1 + cost2 <= 3)
+        {
+          // permute mask
+          const int sm =
+              (k0 < 0 ? 0 : (k0 & 3)) | (k1 < 0 ? 1 : (k1 & 3)) << 2 | (k2 < 0 ? 2 : (k2 & 3)) << 4 | (k3 < 0 ? 3 : (k3 & 3)) << 6;
+
+          // make operands for VSHUFPS
+          __m256 r1, r2;
+
+          switch(s1)
+            {
+              case 0x00:  // LL
+              case 0x03:  // ZL
+                r1 = _mm256_insertf128_ps(a, _mm256_castps256_ps128(a), 1);
+                break;
+              case 0x01:  // HL
+                r1 = _mm256_castps128_ps256(_mm256_extractf128_ps(a, 1));
+                r1 = _mm256_insertf128_ps(r1, _mm256_castps256_ps128(a), 1);
+                break;
+              case 0x10:  // LH
+              case 0x13:  // ZH
+              case 0x30:  // LZ
+              case 0x33:  // ZZ
+                r1 = a;
+                break;
+              case 0x11:  // HH
+                r1 = _mm256_castps128_ps256(_mm256_extractf128_ps(a, 1));
+                r1 = _mm256_insertf128_ps(r1, _mm256_castps256_ps128(r1), 1);
+                break;
+              case 0x31:  // HZ
+                r1 = _mm256_castps128_ps256(_mm256_extractf128_ps(a, 1));
+                break;
+            }
+
+          if(s2 == s1)
+            {
+              if(sm == 0xE4)
+                return r1;
+              r2 = r1;
+            }
+          else
+            {
+              switch(s2)
+                {
+                  case 0x00:  // LL
+                    r2 = _mm256_insertf128_ps(a, _mm256_castps256_ps128(a), 1);
+                    break;
+                  case 0x03:  // ZL
+                    if((s1 & 0xF0) == 0x00)
+                      r2 = r1;
+                    else
+                      {
+                        r2 = _mm256_insertf128_ps(a, _mm256_castps256_ps128(a), 1);
+                      }
+                    break;
+                  case 0x01:  // HL
+                    r2 = _mm256_castps128_ps256(_mm256_extractf128_ps(a, 1));
+                    r2 = _mm256_insertf128_ps(r2, _mm256_castps256_ps128(a), 1);
+                    break;
+                  case 0x10:  // LH
+                  case 0x13:  // ZH
+                  case 0x30:  // LZ
+                  case 0x33:  // ZZ
+                    r2 = a;
+                    break;
+                  case 0x11:  // HH
+                    r2 = _mm256_castps128_ps256(_mm256_extractf128_ps(a, 1));
+                    r2 = _mm256_insertf128_ps(r2, _mm256_castps256_ps128(r2), 1);
+                    break;
+                  case 0x31:  // HZ
+                    if((s1 & 0x0F) == 0x01)
+                      r2 = r1;
+                    else
+                      {
+                        r2 = _mm256_castps128_ps256(_mm256_extractf128_ps(a, 1));
+                      }
+                    break;
+                }
+            }
+
+          // now the permute instruction
+          t1 = _mm256_shuffle_ps(r1, r2, sm);
+
+          if(do_zero)
+            {
+              // zero some elements
+              mask = constant8f<-int(i0 >= 0), -int(i1 >= 0), -int(i2 >= 0), -int(i3 >= 0), -int(i4 >= 0), -int(i5 >= 0),
+                                -int(i6 >= 0), -int(i7 >= 0)>();
+              t1 = _mm256_and_ps(t1, mask);  // zero with AND mask
+            }
+          return t1;
+        }
+    }
+  // not using VSHUFPS. Split into low and high part
+  Vec4f alo = a.get_low();
+  Vec4f ahi = a.get_high();
+  Vec4f rlo = blend4f<i0, i1, i2, i3>(alo, ahi);
+  Vec4f rhi = blend4f<i4, i5, i6, i7>(alo, ahi);
+  return Vec8f(rlo, rhi);
+#endif
+}
+
+// blend vectors Vec8f
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8f blend8f(Vec8f const& a, Vec8f const& b)
+{
+  const int ior = i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7;  // OR indexes
+
+  // is zeroing needed
+  const bool do_zero = ior < 0 && (ior & 0x80);  // at least one index is negative, and not -0x100
+
+  // Combine all the indexes into a single bitfield, with 4 bits for each
+  const int m1 = (i0 & 0xF) | (i1 & 0xF) << 4 | (i2 & 0xF) << 8 | (i3 & 0xF) << 12 | (i4 & 0xF) << 16 | (i5 & 0xF) << 20 |
+                 (i6 & 0xF) << 24 | (i7 & 0xF) << 28;
+
+  // Mask to zero out negative indexes
+  const int mz = (i0 < 0 ? 0 : 0xF) | (i1 < 0 ? 0 : 0xF) << 4 | (i2 < 0 ? 0 : 0xF) << 8 | (i3 < 0 ? 0 : 0xF) << 12 |
+                 (i4 < 0 ? 0 : 0xF) << 16 | (i5 < 0 ? 0 : 0xF) << 20 | (i6 < 0 ? 0 : 0xF) << 24 | (i7 < 0 ? 0 : 0xF) << 28;
+
+  __m256 t1, mask;
+
+  if(mz == 0)
+    return _mm256_setzero_ps();  // all zero
+
+  if((m1 & 0x88888888 & mz) == 0)
+    {
+      // all from a
+      return permute8f<i0, i1, i2, i3, i4, i5, i6, i7>(a);
+    }
+
+  if(((m1 ^ 0x88888888) & 0x88888888 & mz) == 0)
+    {
+      // all from b
+      return permute8f<i0 & ~8, i1 & ~8, i2 & ~8, i3 & ~8, i4 & ~8, i5 & ~8, i6 & ~8, i7 & ~8>(b);
+    }
+
+  if((((m1 & 0x77777777) ^ 0x76543210) & mz) == 0)
+    {
+      // blend and zero, no permute
+      mask = constant8f < (i0 & 8) ? 0 : -1, (i1 & 8) ? 0 : -1, (i2 & 8) ? 0 : -1, (i3 & 8) ? 0 : -1, (i4 & 8) ? 0 : -1,
+      (i5 & 8) ? 0 : -1, (i6 & 8) ? 0 : -1, (i7 & 8) ? 0 : -1 > ();
+      t1 = select(mask, a, b);
+      if(!do_zero)
+        return t1;
+      // zero some elements
+      mask = constant8f < (i0 < 0 && (i0 & 8)) ? 0 : -1, (i1 < 0 && (i1 & 8)) ? 0 : -1, (i2 < 0 && (i2 & 8)) ? 0 : -1,
+      (i3 < 0 && (i3 & 8)) ? 0 : -1, (i4 < 0 && (i4 & 8)) ? 0 : -1, (i5 < 0 && (i5 & 8)) ? 0 : -1, (i6 < 0 && (i6 & 8)) ? 0 : -1,
+      (i7 < 0 && (i7 & 8)) ? 0 : -1 > ();
+      return _mm256_and_ps(t1, mask);
+    }
+
+  // check if we can do 128-bit blend/permute
+  if(((m1 ^ 0x32103210) & 0x33333333 & mz) == 0)
+    {
+      const uint32_t j0 = (i0 >= 0 ? i0 : i1 >= 0 ? i1 : i2 >= 0 ? i2 : i3 >= 0 ? i3 : -1) >> 2;
+      const uint32_t j1 = (i4 >= 0 ? i4 : i5 >= 0 ? i5 : i6 >= 0 ? i6 : i7 >= 0 ? i7 : -1) >> 2;
+      if(((m1 ^ ((j0 & 3) * 0x00004444 | (j1 & 3) * 0x44440000)) & 0xCCCCCCCC & mz) == 0)
+        {
+          t1                     = _mm256_permute2f128_ps(a, b, (j0 & 0x0F) | (j1 & 0x0F) << 4);
+          const bool partialzero = (((i0 | i1 | i2 | i3) ^ j0) & 0x80) != 0 || (((i4 | i5 | i6 | i7) ^ j1) & 0x80) != 0;
+          if(partialzero)
+            {
+              // zero some elements
+              mask = constant8f < i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, i4 < 0 ? 0 : -1, i5 < 0 ? 0 : -1,
+              i6 < 0 ? 0 : -1, i7 < 0 ? 0 : -1 > ();
+              return _mm256_and_ps(t1, mask);
+            }
+          else
+            return t1;
+        }
+    }
+  // Not checking special cases for vunpckhps, vunpcklps: they are too rare
+
+  // Check if it is possible to use VSHUFPS.
+  // Index n must match index n+4 on bit 0-1, and even index n must match odd index n+1 on bit 2-3
+  const bool sps = ((m1 ^ (m1 >> 16)) & 0x3333 & mz & (mz >> 16)) == 0 && ((m1 ^ (m1 >> 4)) & 0x0C0C0C0C & mz & mz >> 4) == 0;
+
+  if(sps)
+    {  // can use VSHUFPS
+
+      // Index of each pair (i[n],i[n+1])
+      const int j0 = i0 >= 0 ? i0 : i1;
+      const int j1 = i2 >= 0 ? i2 : i3;
+      const int j2 = i4 >= 0 ? i4 : i5;
+      const int j3 = i6 >= 0 ? i6 : i7;
+
+      // Index of each pair (i[n],i[n+4])
+      const int k0 = i0 >= 0 ? i0 : i4;
+      const int k1 = i1 >= 0 ? i1 : i5;
+      const int k2 = i2 >= 0 ? i2 : i6;
+      const int k3 = i3 >= 0 ? i3 : i7;
+
+      // Needed contents of low/high part of each source register in VSHUFPS
+      // 0: a.low, 1: a.high, 2: b.low, 3: b.high, 4: zero or don't care
+      const int s1 = (j0 < 0 ? 4 : (j0 & 0xC) >> 2) | (j2 < 0 ? 0x30 : (j2 & 0xC) << 2);
+      const int s2 = (j1 < 0 ? 3 : (j1 & 0xC) >> 2) | (j3 < 0 ? 0x30 : (j3 & 0xC) << 2);
+
+      // permute mask
+      const int sm =
+          (k0 < 0 ? 0 : (k0 & 3)) | (k1 < 0 ? 1 : (k1 & 3)) << 2 | (k2 < 0 ? 2 : (k2 & 3)) << 4 | (k3 < 0 ? 3 : (k3 & 3)) << 6;
+
+      __m256 r1, r2;
+      __m128 ahi = _mm256_extractf128_ps(a, 1);  // 1
+      __m128 bhi = _mm256_extractf128_ps(b, 1);  // 3
+
+      switch(s1)
+        {
+          case 0x00:
+          case 0x04:
+            r1 = _mm256_insertf128_ps(a, _mm256_castps256_ps128(a), 1);
+            break;
+          case 0x01:
+          case 0x41:
+            r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi), _mm256_castps256_ps128(a), 1);
+            break;
+          case 0x02:
+            r1 = _mm256_insertf128_ps(b, _mm256_castps256_ps128(a), 1);
+            break;
+          case 0x03:
+            r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi), _mm256_castps256_ps128(a), 1);
+            break;
+          case 0x10:
+          case 0x14:
+          case 0x40:
+          case 0x44:
+            r1 = a;
+            break;
+          case 0x11:
+            r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi), ahi, 1);
+            break;
+          case 0x12:
+            r1 = _mm256_insertf128_ps(b, ahi, 1);
+            break;
+          case 0x13:
+            r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi), ahi, 1);
+            break;
+          case 0x20:
+            r1 = _mm256_insertf128_ps(a, _mm256_castps256_ps128(b), 1);
+            break;
+          case 0x21:
+            r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi), _mm256_castps256_ps128(b), 1);
+            break;
+          case 0x22:
+          case 0x24:
+          case 0x42:
+            r1 = _mm256_insertf128_ps(b, _mm256_castps256_ps128(b), 1);
+            break;
+          case 0x23:
+          case 0x43:
+            r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi), _mm256_castps256_ps128(b), 1);
+            break;
+          case 0x30:
+            r1 = _mm256_insertf128_ps(a, bhi, 1);
+            break;
+          case 0x31:
+            r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi), bhi, 1);
+            break;
+          case 0x32:
+          case 0x34:
+            r1 = b;
+            break;
+          case 0x33:
+            r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi), bhi, 1);
+            break;
+        }
+      if(s2 == s1 || ((s2 & 0x04) && ((s1 ^ s2) & 0xF0) == 0) || ((s2 & 0x40) && ((s1 ^ s2) & 0x0F) == 0))
+        {
+          // can use r2 = r1
+          if(sm == 0xE4)
+            return r1;  // no shuffling needed
+          r2 = r1;
+        }
+      else
+        {
+          switch(s2)
+            {
+              case 0x00:
+              case 0x04:
+                r2 = _mm256_insertf128_ps(a, _mm256_castps256_ps128(a), 1);
+                break;
+              case 0x01:
+              case 0x41:
+                r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi), _mm256_castps256_ps128(a), 1);
+                break;
+              case 0x02:
+                r2 = _mm256_insertf128_ps(b, _mm256_castps256_ps128(a), 1);
+                break;
+              case 0x03:
+                r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi), _mm256_castps256_ps128(a), 1);
+                break;
+              case 0x10:
+              case 0x14:
+              case 0x40:
+              case 0x44:
+                r2 = a;
+                break;
+              case 0x11:
+                r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi), ahi, 1);
+                break;
+              case 0x12:
+                r2 = _mm256_insertf128_ps(b, ahi, 1);
+                break;
+              case 0x13:
+                r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi), ahi, 1);
+                break;
+              case 0x20:
+                r2 = _mm256_insertf128_ps(a, _mm256_castps256_ps128(b), 1);
+                break;
+              case 0x21:
+                r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi), _mm256_castps256_ps128(b), 1);
+                break;
+              case 0x22:
+              case 0x24:
+              case 0x42:
+                r2 = _mm256_insertf128_ps(b, _mm256_castps256_ps128(b), 1);
+                break;
+              case 0x23:
+              case 0x43:
+                r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi), _mm256_castps256_ps128(b), 1);
+                break;
+              case 0x30:
+                r2 = _mm256_insertf128_ps(a, bhi, 1);
+                break;
+              case 0x31:
+                r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi), bhi, 1);
+                break;
+              case 0x32:
+              case 0x34:
+                r2 = b;
+                break;
+              case 0x33:
+                r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi), bhi, 1);
+                break;
+            }
+        }
+
+      // now the shuffle instruction
+      t1 = _mm256_shuffle_ps(r1, r2, sm);
+
+      if(do_zero)
+        {
+          // zero some elements
+          mask = constant8f<-int(i0 >= 0), -int(i1 >= 0), -int(i2 >= 0), -int(i3 >= 0), -int(i4 >= 0), -int(i5 >= 0), -int(i6 >= 0),
+                            -int(i7 >= 0)>();
+          t1   = _mm256_and_ps(t1, mask);  // zero with AND mask
+        }
+      return t1;
+    }
+
+  // Check if we can use a 64-bit blend: indexes at even positions must be even values, and each
+  // index at an odd position must equal the preceding index + 1, except for negative indexes.
+  if(((m1 ^ 0x10101010) & 0x11111111 & mz) == 0 && ((m1 ^ m1 >> 4) & 0x0E0E0E0E & mz & mz >> 4) == 0)
+    {
+      const bool partialzero = int((i0 ^ i1) | (i2 ^ i3) | (i4 ^ i5) | (i6 ^ i7)) < 0;  // part of a 64-bit block is zeroed
+      const int blank1       = partialzero ? -0x100 : -1;                               // ignore or zero
+      const int n0           = i0 > 0 ? i0 / 2 : i1 > 0 ? i1 / 2 : blank1;              // indexes for 64 bit blend
+      const int n1           = i2 > 0 ? i2 / 2 : i3 > 0 ? i3 / 2 : blank1;
+      const int n2           = i4 > 0 ? i4 / 2 : i5 > 0 ? i5 / 2 : blank1;
+      const int n3           = i6 > 0 ? i6 / 2 : i7 > 0 ? i7 / 2 : blank1;
+      t1                     = _mm256_castpd_ps(blend4d<n0, n1, n2, n3>(_mm256_castps_pd(a), _mm256_castps_pd(b)));
+      if(blank1 == -1 || !do_zero)
+        {
+          return t1;
+        }
+      // need more zeroing
+      mask = constant8f<-int(i0 >= 0), -int(i1 >= 0), -int(i2 >= 0), -int(i3 >= 0), -int(i4 >= 0), -int(i5 >= 0), -int(i6 >= 0),
+                        -int(i7 >= 0)>();
+      return _mm256_and_ps(t1, mask);  // zero with AND mask
+    }
+
+  // general case: permute and blend and possible zero
+  const int blank2 = do_zero ? -1 : -0x100;  // ignore or zero
+
+  Vec8f ta = permute8f < (uint32_t)i0 < 8 ? i0 : blank2, (uint32_t)i1 < 8 ? i1 : blank2, (uint32_t)i2 < 8 ? i2 : blank2,
+        (uint32_t)i3 < 8 ? i3 : blank2, (uint32_t)i4 < 8 ? i4 : blank2, (uint32_t)i5 < 8 ? i5 : blank2, (uint32_t)i6 < 8 ? i6 : blank2,
+        (uint32_t)i7 < 8 ? i7 : blank2 > (a);
+  Vec8f tb = permute8f < (uint32_t)(i0 ^ 8) < 8 ? (i0 ^ 8) : blank2, (uint32_t)(i1 ^ 8) < 8 ? (i1 ^ 8) : blank2,
+        (uint32_t)(i2 ^ 8) < 8 ? (i2 ^ 8) : blank2, (uint32_t)(i3 ^ 8) < 8 ? (i3 ^ 8) : blank2,
+        (uint32_t)(i4 ^ 8) < 8 ? (i4 ^ 8) : blank2, (uint32_t)(i5 ^ 8) < 8 ? (i5 ^ 8) : blank2,
+        (uint32_t)(i6 ^ 8) < 8 ? (i6 ^ 8) : blank2, (uint32_t)(i7 ^ 8) < 8 ? (i7 ^ 8) : blank2 > (b);
+
+  if(blank2 == -1)
+    {
+      return _mm256_or_ps(ta, tb);
+    }
+  // no zeroing, need to blend
+  const int maskb = ((i0 >> 3) & 1) | ((i1 >> 2) & 2) | ((i2 >> 1) & 4) | (i3 & 8) | ((i4 << 1) & 0x10) | ((i5 << 2) & 0x20) |
+                    ((i6 << 3) & 0x40) | ((i7 << 4) & 0x80);
+  return _mm256_blend_ps(ta, tb, maskb);  // blend
+}
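+
+// Usage sketch for blend8f (illustrative values, not from the library documentation):
+// indexes 0-7 select from a, 8-15 select from b, and -1 gives a zero element.
+//   Vec8f a( 0,  1,  2,  3,  4,  5,  6,  7);
+//   Vec8f b(10, 11, 12, 13, 14, 15, 16, 17);
+//   Vec8f c = blend8f<0, 9, 2, 11, -1, 13, 6, 15>(a, b);
+//   // c = (0, 11, 2, 13, 0, 15, 6, 17)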
+
+/*****************************************************************************
+ *
+ *          Vector lookup functions
+ *
+ ******************************************************************************
+ *
+ * These functions use vector elements as indexes into a table.
+ * The table is given as one or more vectors or as an array.
+ *
+ * This can be used for several purposes:
+ *  - table lookup
+ *  - permute or blend with variable indexes
+ *  - blend from more than two sources
+ *  - gather non-contiguous data
+ *
+ * An index out of range may produce any value - the actual value produced is
+ * implementation dependent and may be different for different instruction
+ * sets. An index out of range does not produce an error message or exception.
+ *
+ * Example:
+ * Vec4i a(2,0,0,3);               // index  a is (  2,   0,   0,   3)
+ * Vec4f b(1.0f,1.1f,1.2f,1.3f);   // table  b is (1.0, 1.1, 1.2, 1.3)
+ * Vec4f c;
+ * c = lookup4 (a,b);              // result c is (1.2, 1.0, 1.0, 1.3)
+ *
+ *****************************************************************************/
+
+#ifdef VECTORI256_H                    // Vec8i and Vec4q must be defined
+
+static inline Vec8f lookup8(Vec8i const& index, Vec8f const& table)
+{
+#if INSTRSET >= 8 && VECTORI256_H > 1  // AVX2
+#if defined(_MSC_VER) && _MSC_VER < 1700 && !defined(__INTEL_COMPILER)
+  // bug in MS VS 11 beta: operands in wrong order. fixed in 11.0
+  return _mm256_permutevar8x32_ps(_mm256_castsi256_ps(index), _mm256_castps_si256(table));
+#elif defined(GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__)
+  // Gcc 4.7.0 has wrong parameter type and operands in wrong order. fixed in version 4.7.1
+  return _mm256_permutevar8x32_ps(_mm256_castsi256_ps(index), table);
+#else
+  // no bug version
+  return _mm256_permutevar8x32_ps(table, index);
+#endif
+
+#else  // AVX
+  // swap low and high part of table
+  __m256 t1 = _mm256_castps128_ps256(_mm256_extractf128_ps(table, 1));
+  __m256 t2 = _mm256_insertf128_ps(t1, _mm256_castps256_ps128(table), 1);
+  // join index parts
+  __m256i index2 = _mm256_insertf128_si256(_mm256_castsi128_si256(index.get_low()), index.get_high(), 1);
+  // permute within each 128-bit part
+  __m256 r0 = _mm256_permutevar_ps(table, index2);
+  __m256 r1 = _mm256_permutevar_ps(t2, index2);
+  // high index bit for blend
+  __m128i k1 = _mm_slli_epi32(index.get_high() ^ 4, 29);
+  __m128i k0 = _mm_slli_epi32(index.get_low(), 29);
+  __m256 kk = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_castsi128_ps(k0)), _mm_castsi128_ps(k1), 1);
+  // blend the two permutes
+  return _mm256_blendv_ps(r0, r1, kk);
+#endif
+}
+
+template <int n>
+static inline Vec8f lookup(Vec8i const& index, float const* table)
+{
+  if(n <= 0)
+    return 0;
+  if(n <= 4)
+    {
+      Vec4f table1 = Vec4f().load(table);
+      return Vec8f(lookup4(index.get_low(), table1), lookup4(index.get_high(), table1));
+    }
+#if INSTRSET < 8  // not AVX2
+  if(n <= 8)
+    {
+      return lookup8(index, Vec8f().load(table));
+    }
+#endif
+  // Limit index
+  Vec8ui index1;
+  if((n & (n - 1)) == 0)
+    {
+      // n is a power of 2, make index modulo n
+      index1 = Vec8ui(index) & (n - 1);
+    }
+  else
+    {
+      // n is not a power of 2, limit to n-1
+      index1 = min(Vec8ui(index), n - 1);
+    }
+#if INSTRSET >= 8 && VECTORI256_H > 1  // AVX2
+  return _mm256_i32gather_ps(table, index1, 4);
+#else                                  // AVX
+  return Vec8f(table[index1[0]], table[index1[1]], table[index1[2]], table[index1[3]], table[index1[4]], table[index1[5]],
+               table[index1[6]], table[index1[7]]);
+#endif
+}
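+
+// Usage sketch for lookup (illustrative, assuming a 16-entry table); the template
+// parameter n is the table size used to limit the indexes:
+//   float table[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+//   Vec8i idx(0, 3, 5, 7, 9, 11, 13, 15);
+//   Vec8f v = lookup<16>(idx, table);   // v = (0, 3, 5, 7, 9, 11, 13, 15)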
+
+static inline Vec4d lookup4(Vec4q const& index, Vec4d const& table)
+{
+#if INSTRSET >= 8 && VECTORI256_H > 1  // AVX2
+  // We can't use VPERMPD because it has constant indexes.
+  // Convert the index to fit VPERMPS
+  Vec8i index1 = permute8i<0, 0, 2, 2, 4, 4, 6, 6>(Vec8i(index + index));
+  Vec8i index2 = index1 + Vec8i(constant8i<0, 1, 0, 1, 0, 1, 0, 1>());
+#if defined(_MSC_VER) && _MSC_VER < 1700 && !defined(__INTEL_COMPILER)
+  // bug in MS VS 11 beta: operands in wrong order. fixed in 11.0
+  return _mm256_castps_pd(_mm256_permutevar8x32_ps(_mm256_castsi256_ps(index2), _mm256_castpd_si256(table)));
+#elif defined(GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__)
+  // Gcc 4.7.0 has wrong parameter type and operands in wrong order
+  return _mm256_castps_pd(_mm256_permutevar8x32_ps(_mm256_castsi256_ps(index2), _mm256_castpd_ps(table)));
+#else
+  // no bug version
+  return _mm256_castps_pd(_mm256_permutevar8x32_ps(_mm256_castpd_ps(table), index2));
+#endif
+
+#else  // AVX
+  // swap low and high part of table
+  __m256d t1 = _mm256_castpd128_pd256(_mm256_extractf128_pd(table, 1));
+  __m256d t2 = _mm256_insertf128_pd(t1, _mm256_castpd256_pd128(table), 1);
+  // index << 1
+  __m128i index2lo = index.get_low() + index.get_low();
+  __m128i index2hi = index.get_high() + index.get_high();
+  // join index parts
+  __m256i index3 = _mm256_insertf128_si256(_mm256_castsi128_si256(index2lo), index2hi, 1);
+  // permute within each 128-bit part
+  __m256d r0 = _mm256_permutevar_pd(table, index3);
+  __m256d r1 = _mm256_permutevar_pd(t2, index3);
+  // high index bit for blend
+  __m128i k1 = _mm_slli_epi64(index.get_high() ^ 2, 62);
+  __m128i k0 = _mm_slli_epi64(index.get_low(), 62);
+  __m256d kk = _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_castsi128_pd(k0)), _mm_castsi128_pd(k1), 1);
+  // blend the two permutes
+  return _mm256_blendv_pd(r0, r1, kk);
+#endif
+}
+
+template <int n>
+static inline Vec4d lookup(Vec4q const& index, double const* table)
+{
+  if(n <= 0)
+    return 0;
+  if(n <= 2)
+    {
+      Vec2d table1 = Vec2d().load(table);
+      return Vec4d(lookup2(index.get_low(), table1), lookup2(index.get_high(), table1));
+    }
+#if INSTRSET < 8  // not AVX2
+  if(n <= 4)
+    {
+      return lookup4(index, Vec4d().load(table));
+    }
+#endif
+  // Limit index
+  Vec8ui index1;
+  if((n & (n - 1)) == 0)
+    {
+      // n is a power of 2, make index modulo n
+      index1 = Vec8ui(index) & constant8i<n - 1, 0, n - 1, 0, n - 1, 0, n - 1, 0>();
+    }
+  else
+    {
+      // n is not a power of 2, limit to n-1
+      index1 = min(Vec8ui(index), constant8i<n - 1, 0, n - 1, 0, n - 1, 0, n - 1, 0>());
+    }
+#if INSTRSET >= 8 && VECTORI256_H > 1  // AVX2
+  return _mm256_i64gather_pd(table, index1, 8);
+#else                                  // AVX
+  Vec4q index2 = Vec4q(index1);
+  return Vec4d(table[index2[0]], table[index2[1]], table[index2[2]], table[index2[3]]);
+#endif
+}
+#endif  // VECTORI256_H
+
+/*****************************************************************************
+ *
+ *          Gather functions with fixed indexes
+ *
+ *****************************************************************************/
+// Load elements from array a with indices i0, i1, i2, i3, ..
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8f gather8f(void const* a)
+{
+  return reinterpret_f(gather8i<i0, i1, i2, i3, i4, i5, i6, i7>(a));
+}
+
+// Load elements from array a with indices i0, i1, i2, i3
+template <int i0, int i1, int i2, int i3>
+static inline Vec4d gather4d(void const* a)
+{
+  return reinterpret_d(gather4q<i0, i1, i2, i3>(a));
+}
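+
+// Usage sketch for the fixed-index gathers (illustrative values):
+//   double src[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+//   Vec4d g = gather4d<6, 4, 2, 0>(src);   // g = (6, 4, 2, 0)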
+
+/*****************************************************************************
+ *
+ *          Vector scatter functions
+ *
+ ******************************************************************************
+ *
+ * These functions write the elements of a vector to arbitrary positions in an
+ * array in memory. Each vector element is written to an array position
+ * determined by an index. An element is not written if the corresponding
+ * index is out of range.
+ * The indexes can be specified as constant template parameters or as an
+ * integer vector.
+ *
+ * The scatter functions are useful if the data are distributed in a sparse
+ * manner into the array. If the array is dense then it is more efficient
+ * to permute the data into the right positions and then write the whole
+ * permuted vector into the array.
+ *
+ * Example:
+ * Vec8f a(10,11,12,13,14,15,16,17);
+ * float b[16] = {0};
+ * scatter<0,2,14,10,1,-1,5,9>(a,b);
+ * // Now, b = {10,14,11,0,0,16,0,0,0,17,13,0,0,0,12,0}
+ *
+ *****************************************************************************/
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline void scatter(Vec8f const& data, float* array)
+{
+#if defined(__AVX512VL__)
+  __m256i indx   = constant8i<i0, i1, i2, i3, i4, i5, i6, i7>();
+  __mmask16 mask = uint16_t(i0 >= 0 | (i1 >= 0) << 1 | (i2 >= 0) << 2 | (i3 >= 0) << 3 | (i4 >= 0) << 4 | (i5 >= 0) << 5 |
+                            (i6 >= 0) << 6 | (i7 >= 0) << 7);
+  _mm256_mask_i32scatter_ps(array, mask, indx, data, 4);
+#elif defined(__AVX512F__)
+  __m512i indx = _mm512_castsi256_si512(constant8i<i0, i1, i2, i3, i4, i5, i6, i7>());
+  __mmask16 mask = uint16_t(i0 >= 0 | (i1 >= 0) << 1 | (i2 >= 0) << 2 | (i3 >= 0) << 3 | (i4 >= 0) << 4 | (i5 >= 0) << 5 |
+                            (i6 >= 0) << 6 | (i7 >= 0) << 7);
+  _mm512_mask_i32scatter_ps(array, mask, indx, _mm512_castps256_ps512(data), 4);
+#else
+  const int index[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
+  for(int i = 0; i < 8; i++)
+    {
+      if(index[i] >= 0)
+        array[index[i]] = data[i];
+    }
+#endif
+}
+
+template <int i0, int i1, int i2, int i3>
+static inline void scatter(Vec4d const& data, double* array)
+{
+#if defined(__AVX512VL__)
+  __m128i indx   = constant4i<i0, i1, i2, i3>();
+  __mmask16 mask = uint16_t(i0 >= 0 | (i1 >= 0) << 1 | (i2 >= 0) << 2 | (i3 >= 0) << 3);
+  _mm256_mask_i32scatter_pd(array, mask, indx, data, 8);
+#elif defined(__AVX512F__)
+  __m256i indx = _mm256_castsi128_si256(constant4i<i0, i1, i2, i3>());
+  __mmask16 mask = uint16_t(i0 >= 0 | (i1 >= 0) << 1 | (i2 >= 0) << 2 | (i3 >= 0) << 3);
+  _mm512_mask_i32scatter_pd(array, mask, indx, _mm512_castpd256_pd512(data), 8);
+#else
+  const int index[4] = {i0, i1, i2, i3};
+  for(int i = 0; i < 4; i++)
+    {
+      if(index[i] >= 0)
+        array[index[i]] = data[i];
+    }
+#endif
+}
+
+static inline void scatter(Vec8i const& index, uint32_t limit, Vec8f const& data, float* array)
+{
+#if defined(__AVX512VL__)
+  __mmask16 mask = _mm256_cmplt_epu32_mask(index, Vec8ui(limit));
+  _mm256_mask_i32scatter_ps(array, mask, index, data, 4);
+#elif defined(__AVX512F__)
+  // 16 bit mask. upper 8 bits are (0<0) = false
+  __mmask16 mask = _mm512_cmplt_epu32_mask(_mm512_castsi256_si512(index), _mm512_castsi256_si512(Vec8ui(limit)));
+  _mm512_mask_i32scatter_ps(array, mask, _mm512_castsi256_si512(index), _mm512_castps256_ps512(data), 4);
+#else
+  for(int i = 0; i < 8; i++)
+    {
+      if(uint32_t(index[i]) < limit)
+        array[index[i]] = data[i];
+    }
+#endif
+}
+
+static inline void scatter(Vec4q const& index, uint32_t limit, Vec4d const& data, double* array)
+{
+#if defined(__AVX512VL__)
+  __mmask16 mask = _mm256_cmplt_epu64_mask(index, Vec4uq(uint64_t(limit)));
+  _mm256_mask_i64scatter_pd(array, mask, index, data, 8);
+#elif defined(__AVX512F__)
+  // 16 bit mask. upper 8 bits are (0<0) = false
+  __mmask16 mask = _mm512_cmplt_epu64_mask(_mm512_castsi256_si512(index), _mm512_castsi256_si512(Vec4uq(uint64_t(limit))));
+  _mm512_mask_i64scatter_pd(array, mask, _mm512_castsi256_si512(index), _mm512_castpd256_pd512(data), 8);
+#else
+  for(int i = 0; i < 4; i++)
+    {
+      if(uint64_t(index[i]) < uint64_t(limit))
+        array[index[i]] = data[i];
+    }
+#endif
+}
+
+static inline void scatter(Vec4i const& index, uint32_t limit, Vec4d const& data, double* array)
+{
+#if defined(__AVX512VL__)
+  __mmask16 mask = _mm_cmplt_epu32_mask(index, Vec4ui(limit));
+  _mm256_mask_i32scatter_pd(array, mask, index, data, 8);
+#elif defined(__AVX512F__)
+  // 16 bit mask. upper 12 bits are (0<0) = false
+  __mmask16 mask = _mm512_cmplt_epu32_mask(_mm512_castsi128_si512(index), _mm512_castsi128_si512(Vec4ui(limit)));
+  _mm512_mask_i32scatter_pd(array, mask, _mm256_castsi128_si256(index), _mm512_castpd256_pd512(data), 8);
+#else
+  for(int i = 0; i < 4; i++)
+    {
+      if(uint32_t(index[i]) < limit)
+        array[index[i]] = data[i];
+    }
+#endif
+}
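+
+// Usage sketch for the variable-index scatters (illustrative values); elements whose
+// index is >= limit are simply not written:
+//   double out[8] = {0};
+//   Vec4q idx(0, 5, 9, 2);        // index 9 is out of range for limit = 8
+//   Vec4d v(1.5, 2.5, 3.5, 4.5);
+//   scatter(idx, 8, v, out);      // out = {1.5, 0, 4.5, 0, 0, 2.5, 0, 0}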
+
+/*****************************************************************************
+ *
+ *          Horizontal scan functions
+ *
+ *****************************************************************************/
+
+// Get index to the first element that is true. Return -1 if all are false
+static inline int horizontal_find_first(Vec8fb const& x) { return horizontal_find_first(Vec8ib(x)); }
+
+static inline int horizontal_find_first(Vec4db const& x) { return horizontal_find_first(Vec4qb(x)); }
+
+// Count the number of elements that are true
+static inline uint32_t horizontal_count(Vec8fb const& x) { return horizontal_count(Vec8ib(x)); }
+
+static inline uint32_t horizontal_count(Vec4db const& x) { return horizontal_count(Vec4qb(x)); }
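+
+// Usage sketch (illustrative values):
+//   Vec8f x(1, -2, 3, -4, 5, -6, 7, -8);
+//   Vec8fb neg = x < 0.0f;
+//   int first    = horizontal_find_first(neg);   // first == 1
+//   uint32_t num = horizontal_count(neg);         // num == 4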
+
+/*****************************************************************************
+ *
+ *          Boolean <-> bitfield conversion functions
+ *
+ *****************************************************************************/
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec8fb const& x) { return to_bits(Vec8ib(x)); }
+
+// to_Vec8fb: convert integer bitfield to boolean vector
+static inline Vec8fb to_Vec8fb(uint8_t x) { return Vec8fb(to_Vec8ib(x)); }
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec4db const& x) { return to_bits(Vec4qb(x)); }
+
+// to_Vec4db: convert integer bitfield to boolean vector
+static inline Vec4db to_Vec4db(uint8_t x) { return Vec4db(to_Vec4qb(x)); }
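+
+// Usage sketch (illustrative): bit i of the bitfield corresponds to vector element i.
+//   Vec8fb m = to_Vec8fb(0xA1);   // elements 0, 5 and 7 are true
+//   uint8_t bits = to_bits(m);    // bits == 0xA1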
+
+#ifdef VCL_NAMESPACE
+}
+#endif
+
+#endif  // VECTORF256_H
diff --git a/src/vectorclass/vectorf256e.h b/src/vectorclass/vectorf256e.h
new file mode 100644
index 0000000000000000000000000000000000000000..cf82eea0b461f6cc5649f452fa2791b1b5c6b0c4
--- /dev/null
+++ b/src/vectorclass/vectorf256e.h
@@ -0,0 +1,2079 @@
+/****************************  vectorf256e.h   *******************************
+ * Author:        Agner Fog
+ * Date created:  2012-05-30
+ * Last modified: 2017-02-19
+ * Version:       1.27
+ * Project:       vector classes
+ * Description:
+ * Header file defining 256-bit floating point vector classes as interface
+ * to intrinsic functions. Emulated for processors without AVX instruction set.
+ *
+ * The following vector classes are defined here:
+ * Vec8f     Vector of 8 single precision floating point numbers
+ * Vec8fb    Vector of 8 Booleans for use with Vec8f
+ * Vec4d     Vector of 4 double precision floating point numbers
+ * Vec4db    Vector of 4 Booleans for use with Vec4d
+ *
+ * For detailed instructions, see VectorClass.pdf
+ *
+ * (c) Copyright 2012-2017 GNU General Public License http://www.gnu.org/licenses
+ *****************************************************************************/
+
+// check combination of header files
+#ifdef VECTORF256_H
+#if VECTORF256_H != 1
+#error Two different versions of vectorf256.h included
+#endif
+#else
+#define VECTORF256_H 1
+
+#if defined(VECTORI256_H) && VECTORI256_H >= 2
+#error wrong combination of header files. Use vectorf256.h instead of vectorf256e.h if you have AVX2
+#endif
+
+#include "vectorf128.h"  // Define 128-bit vectors
+
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE
+{
+#endif
+
+/*****************************************************************************
+ *
+ *          base class Vec256fe and Vec256de
+ *
+ *****************************************************************************/
+// base class to replace __m256 when AVX is not supported
+class Vec256fe
+{
+ protected:
+  __m128 y0;  // low half
+  __m128 y1;  // high half
+ public:
+  Vec256fe(void){};  // default constructor
+  Vec256fe(__m128 x0, __m128 x1)
+  {  // constructor to build from two __m128
+    y0 = x0;
+    y1 = x1;
+  }
+  __m128 get_low() const
+  {  // get low half
+    return y0;
+  }
+  __m128 get_high() const
+  {  // get high half
+    return y1;
+  }
+};
+
+// base class to replace __m256d when AVX is not supported
+class Vec256de
+{
+ public:
+  Vec256de(){};  // default constructor
+  Vec256de(__m128d x0, __m128d x1)
+  {  // constructor to build from two __m128d
+    y0 = x0;
+    y1 = x1;
+  }
+  __m128d get_low() const
+  {  // get low half
+    return y0;
+  }
+  __m128d get_high() const
+  {  // get high half
+    return y1;
+  }
+
+ protected:
+  __m128d y0;  // low half
+  __m128d y1;  // high half
+};
+
+/*****************************************************************************
+ *
+ *          select functions
+ *
+ *****************************************************************************/
+// Select between two Vec256fe sources, element by element. Used in various functions
+// and operators. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either 0 (false) or 0xFFFFFFFF (true).
+static inline Vec256fe selectf(Vec256fe const& s, Vec256fe const& a, Vec256fe const& b)
+{
+  return Vec256fe(selectf(s.get_low(), a.get_low(), b.get_low()), selectf(s.get_high(), a.get_high(), b.get_high()));
+}
+
+// Same, with two Vec256de sources.
+// and operators. Corresponds to this pseudocode:
+// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true). No other
+// values are allowed.
+static inline Vec256de selectd(Vec256de const& s, Vec256de const& a, Vec256de const& b)
+{
+  return Vec256de(selectd(s.get_low(), a.get_low(), b.get_low()), selectd(s.get_high(), a.get_high(), b.get_high()));
+}
+
+/*****************************************************************************
+ *
+ *          Generate compile-time constant vector
+ *
+ *****************************************************************************/
+// Generate a constant vector of 8 integers stored in memory,
+// load as __m256
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec256fe constant8f()
+{
+  static const union
+  {
+    int i[8];
+    __m128 y[2];
+  } u = {{i0, i1, i2, i3, i4, i5, i6, i7}};
+  return Vec256fe(u.y[0], u.y[1]);
+}
+
+/*****************************************************************************
+ *
+ *          Vec8fb: Vector of 8 Booleans for use with Vec8f
+ *
+ *****************************************************************************/
+
+class Vec8fb : public Vec256fe
+{
+ public:
+  // Default constructor:
+  Vec8fb() {}
+  // Constructor to build from all elements:
+  Vec8fb(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7)
+  {
+    y0 = Vec4fb(b0, b1, b2, b3);
+    y1 = Vec4fb(b4, b5, b6, b7);
+  }
+  // Constructor to build from two Vec4fb:
+  Vec8fb(Vec4fb const& a0, Vec4fb const& a1)
+  {
+    y0 = a0;
+    y1 = a1;
+  }
+  // Constructor to convert from type Vec256fe
+  Vec8fb(Vec256fe const& x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+  }
+  // Assignment operator to convert from type Vec256fe
+  Vec8fb& operator=(Vec256fe const& x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+    return *this;
+  }
+#ifdef VECTORI256_H  // 256 bit integer vectors are available
+  // Constructor to convert from type Vec8ib used as Boolean for integer vectors
+  Vec8fb(Vec8ib const& x)
+  {
+    y0 = _mm_castsi128_ps(Vec8i(x).get_low());
+    y1 = _mm_castsi128_ps(Vec8i(x).get_high());
+  }
+  // Assignment operator to convert from type Vec8ib used as Boolean for integer vectors
+  Vec8fb& operator=(Vec8ib const& x)
+  {
+    y0 = _mm_castsi128_ps(Vec8i(x).get_low());
+    y1 = _mm_castsi128_ps(Vec8i(x).get_high());
+    return *this;
+  }
+  // Constructor to broadcast the same value into all elements:
+  Vec8fb(bool b) { y1 = y0 = Vec4fb(b); }
+  // Assignment operator to broadcast scalar value:
+  Vec8fb& operator=(bool b)
+  {
+    y0 = y1 = Vec4fb(b);
+    return *this;
+  }
+
+ private:  // Prevent constructing from int, etc.
+  Vec8fb(int b);
+  Vec8fb& operator=(int x);
+
+ public:
+  // Type cast operator to convert to type Vec8ib used as Boolean for integer vectors
+  operator Vec8ib() const { return Vec8i(_mm_castps_si128(y0), _mm_castps_si128(y1)); }
+#endif               // VECTORI256_H
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec8fb const& insert(uint32_t index, bool value)
+  {
+    if(index < 4)
+      {
+        y0 = Vec4fb(y0).insert(index, value);
+      }
+    else
+      {
+        y1 = Vec4fb(y1).insert(index - 4, value);
+      }
+    return *this;
+  }
+  // Member function extract a single element from vector
+  // Note: This function is inefficient. Use store function if extracting more than one element
+  bool extract(uint32_t index) const
+  {
+    if(index < 4)
+      {
+        return Vec4fb(y0).extract(index);
+      }
+    else
+      {
+        return Vec4fb(y1).extract(index - 4);
+      }
+  }
+  // Extract a single element. Operator [] can only read an element, not write.
+  bool operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec4fb:
+  Vec4fb get_low() const { return y0; }
+  Vec4fb get_high() const { return y1; }
+  static int size() { return 8; }
+};
+
+/*****************************************************************************
+ *
+ *          Operators for Vec8fb
+ *
+ *****************************************************************************/
+
+// vector operator & : bitwise and
+static inline Vec8fb operator&(Vec8fb const& a, Vec8fb const& b)
+{
+  return Vec8fb(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+
+static inline Vec8fb operator&&(Vec8fb const& a, Vec8fb const& b) { return a & b; }
+
+// vector operator &= : bitwise and
+static inline Vec8fb& operator&=(Vec8fb& a, Vec8fb const& b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec8fb operator|(Vec8fb const& a, Vec8fb const& b)
+{
+  return Vec8fb(a.get_low() | b.get_low(), a.get_high() | b.get_high());
+}
+static inline Vec8fb operator||(Vec8fb const& a, Vec8fb const& b) { return a | b; }
+
+// vector operator |= : bitwise or
+static inline Vec8fb& operator|=(Vec8fb& a, Vec8fb const& b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec8fb operator^(Vec8fb const& a, Vec8fb const& b)
+{
+  return Vec8fb(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
+}
+
+// vector operator ^= : bitwise xor
+static inline Vec8fb& operator^=(Vec8fb& a, Vec8fb const& b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec8fb operator~(Vec8fb const& a) { return Vec8fb(~a.get_low(), ~a.get_high()); }
+
+// vector operator ! : logical not
+// (operator ! is less efficient than operator ~. Use only where not
+// all bits in an element are the same)
+static inline Vec8fb operator!(Vec8fb const& a) { return Vec8fb(!a.get_low(), !a.get_high()); }
+
+// Functions for Vec8fb
+
+// andnot: a & ~ b
+static inline Vec8fb andnot(Vec8fb const& a, Vec8fb const& b)
+{
+  return Vec8fb(andnot(a.get_low(), b.get_low()), andnot(a.get_high(), b.get_high()));
+}
+
+/*****************************************************************************
+ *
+ *          Horizontal Boolean functions
+ *
+ *****************************************************************************/
+
+// horizontal_and. Returns true if all bits are 1
+static inline bool horizontal_and(Vec8fb const& a) { return horizontal_and(a.get_low() & a.get_high()); }
+
+// horizontal_or. Returns true if at least one bit is 1
+static inline bool horizontal_or(Vec8fb const& a) { return horizontal_or(a.get_low() | a.get_high()); }
+
+/*****************************************************************************
+ *
+ *          Vec4db: Vector of 4 Booleans for use with Vec4d
+ *
+ *****************************************************************************/
+
+class Vec4db : public Vec256de
+{
+ public:
+  // Default constructor:
+  Vec4db() {}
+  // Constructor to build from all elements:
+  Vec4db(bool b0, bool b1, bool b2, bool b3)
+  {
+    y0 = Vec2db(b0, b1);
+    y1 = Vec2db(b2, b3);
+  }
+  // Constructor to build from two Vec2db:
+  Vec4db(Vec2db const& a0, Vec2db const& a1)
+  {
+    y0 = a0;
+    y1 = a1;
+  }
+  // Constructor to convert from type Vec256de
+  Vec4db(Vec256de const& x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+  }
+  // Assignment operator to convert from type Vec256de
+  Vec4db& operator=(Vec256de const& x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+    return *this;
+  }
+#ifdef VECTORI256_H  // 256 bit integer vectors are available
+  // Constructor to convert from type Vec4qb used as Boolean for integer vectors
+  Vec4db(Vec4qb const& x)
+  {
+    y0 = _mm_castsi128_pd(Vec4q(x).get_low());
+    y1 = _mm_castsi128_pd(Vec4q(x).get_high());
+  }
+  // Assignment operator to convert from type Vec4qb used as Boolean for integer vectors
+  Vec4db& operator=(Vec4qb const& x)
+  {
+    y0 = _mm_castsi128_pd(Vec4q(x).get_low());
+    y1 = _mm_castsi128_pd(Vec4q(x).get_high());
+    return *this;
+  }
+  // Constructor to broadcast the same value into all elements:
+  Vec4db(bool b) { y1 = y0 = Vec2db(b); }
+  // Assignment operator to broadcast scalar value:
+  Vec4db& operator=(bool b)
+  {
+    y0 = y1 = Vec2db(b);
+    return *this;
+  }
+
+ private:  // Prevent constructing from int, etc.
+  Vec4db(int b);
+  Vec4db& operator=(int x);
+
+ public:
+  // Type cast operator to convert to type Vec4qb used as Boolean for integer vectors
+  operator Vec4qb() const { return Vec4q(_mm_castpd_si128(y0), _mm_castpd_si128(y1)); }
+#endif               // VECTORI256_H
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec4db const& insert(uint32_t index, bool value)
+  {
+    if(index < 2)
+      {
+        y0 = Vec2db(y0).insert(index, value);
+      }
+    else
+      {
+        y1 = Vec2db(y1).insert(index - 2, value);
+      }
+    return *this;
+  }
+  // Member function extract a single element from vector
+  // Note: This function is inefficient. Use store function if extracting more than one element
+  bool extract(uint32_t index) const
+  {
+    if(index < 2)
+      {
+        return Vec2db(y0).extract(index);
+      }
+    else
+      {
+        return Vec2db(y1).extract(index - 2);
+      }
+  }
+  // Extract a single element. Operator [] can only read an element, not write.
+  bool operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec2db:
+  Vec2db get_low() const { return y0; }
+  Vec2db get_high() const { return y1; }
+  static int size() { return 4; }
+};
+
+/*****************************************************************************
+ *
+ *          Operators for Vec4db
+ *
+ *****************************************************************************/
+
+// vector operator & : bitwise and
+static inline Vec4db operator&(Vec4db const& a, Vec4db const& b)
+{
+  return Vec4db(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+static inline Vec4db operator&&(Vec4db const& a, Vec4db const& b) { return a & b; }
+
+// vector operator &= : bitwise and
+static inline Vec4db& operator&=(Vec4db& a, Vec4db const& b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec4db operator|(Vec4db const& a, Vec4db const& b)
+{
+  return Vec4db(a.get_low() | b.get_low(), a.get_high() | b.get_high());
+}
+static inline Vec4db operator||(Vec4db const& a, Vec4db const& b) { return a | b; }
+
+// vector operator |= : bitwise or
+static inline Vec4db& operator|=(Vec4db& a, Vec4db const& b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec4db operator^(Vec4db const& a, Vec4db const& b)
+{
+  return Vec4db(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
+}
+
+// vector operator ^= : bitwise xor
+static inline Vec4db& operator^=(Vec4db& a, Vec4db const& b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec4db operator~(Vec4db const& a) { return Vec4db(~a.get_low(), ~a.get_high()); }
+
+// vector operator ! : logical not
+// (operator ! is less efficient than operator ~. Use only where not
+// all bits in an element are the same)
+static inline Vec4db operator!(Vec4db const& a) { return Vec4db(!a.get_low(), !a.get_high()); }
+
+// Functions for Vec4db
+
+// andnot: a & ~ b
+static inline Vec4db andnot(Vec4db const& a, Vec4db const& b)
+{
+  return Vec4db(andnot(a.get_low(), b.get_low()), andnot(a.get_high(), b.get_high()));
+}
+
+/*****************************************************************************
+ *
+ *          Horizontal Boolean functions
+ *
+ *****************************************************************************/
+
+// horizontal_and. Returns true if all bits are 1
+static inline bool horizontal_and(Vec4db const& a) { return horizontal_and(a.get_low() & a.get_high()); }
+
+// horizontal_or. Returns true if at least one bit is 1
+static inline bool horizontal_or(Vec4db const& a) { return horizontal_or(a.get_low() | a.get_high()); }
+
+/*****************************************************************************
+ *
+ *          Vec8f: Vector of 8 single precision floating point values
+ *
+ *****************************************************************************/
+
+class Vec8f : public Vec256fe
+{
+ public:
+  // Default constructor:
+  Vec8f() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec8f(float f) { y1 = y0 = _mm_set1_ps(f); }
+  // Constructor to build from all elements:
+  Vec8f(float f0, float f1, float f2, float f3, float f4, float f5, float f6, float f7)
+  {
+    y0 = _mm_setr_ps(f0, f1, f2, f3);
+    y1 = _mm_setr_ps(f4, f5, f6, f7);
+  }
+  // Constructor to build from two Vec4f:
+  Vec8f(Vec4f const& a0, Vec4f const& a1)
+  {
+    y0 = a0;
+    y1 = a1;
+  }
+  // Constructor to convert from type Vec256fe
+  Vec8f(Vec256fe const& x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+  }
+  // Assignment operator to convert from type Vec256fe
+  Vec8f& operator=(Vec256fe const& x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+    return *this;
+  }
+  // Member function to load from array (unaligned)
+  Vec8f& load(float const* p)
+  {
+    y0 = _mm_loadu_ps(p);
+    y1 = _mm_loadu_ps(p + 4);
+    return *this;
+  }
+  // Member function to load from array, aligned by 32
+  // You may use load_a instead of load if you are certain that p points to an address
+  // divisible by 32.
+  Vec8f& load_a(float const* p)
+  {
+    y0 = _mm_load_ps(p);
+    y1 = _mm_load_ps(p + 4);
+    return *this;
+  }
+  // Member function to store into array (unaligned)
+  void store(float* p) const
+  {
+    _mm_storeu_ps(p, y0);
+    _mm_storeu_ps(p + 4, y1);
+  }
+  // Member function to store into array, aligned by 32
+  // You may use store_a instead of store if you are certain that p points to an address
+  // divisible by 32.
+  void store_a(float* p) const
+  {
+    _mm_store_ps(p, y0);
+    _mm_store_ps(p + 4, y1);
+  }
+  // Partial load. Load n elements and set the rest to 0
+  Vec8f& load_partial(int n, float const* p)
+  {
+    if(n > 0 && n <= 4)
+      {
+        *this = Vec8f(Vec4f().load_partial(n, p), _mm_setzero_ps());
+      }
+    else if(n > 4 && n <= 8)
+      {
+        *this = Vec8f(Vec4f().load(p), Vec4f().load_partial(n - 4, p + 4));
+      }
+    else
+      {
+        y1 = y0 = _mm_setzero_ps();
+      }
+    return *this;
+  }
+  // Partial store. Store n elements
+  void store_partial(int n, float* p) const
+  {
+    if(n <= 4)
+      {
+        get_low().store_partial(n, p);
+      }
+    else if(n <= 8)
+      {
+        get_low().store(p);
+        get_high().store_partial(n - 4, p + 4);
+      }
+  }
+  // cut off vector to n elements. The last 8-n elements are set to zero
+  Vec8f& cutoff(int n)
+  {
+    if(uint32_t(n) >= 8)
+      return *this;
+    else if(n >= 4)
+      {
+        y1 = Vec4f(y1).cutoff(n - 4);
+      }
+    else
+      {
+        y0 = Vec4f(y0).cutoff(n);
+        y1 = Vec4f(0.0f);
+      }
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec8f const& insert(uint32_t index, float value)
+  {
+    if(index < 4)
+      {
+        y0 = Vec4f(y0).insert(index, value);
+      }
+    else
+      {
+        y1 = Vec4f(y1).insert(index - 4, value);
+      }
+    return *this;
+  }
+  // Member function extract a single element from vector
+  // Note: This function is inefficient. Use store function if extracting more than one element
+  float extract(uint32_t index) const
+  {
+    if(index < 4)
+      {
+        return Vec4f(y0).extract(index);
+      }
+    else
+      {
+        return Vec4f(y1).extract(index - 4);
+      }
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  float operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec4f:
+  Vec4f get_low() const { return y0; }
+  Vec4f get_high() const { return y1; }
+  static int size() { return 8; }
+};
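+
+// Usage sketch for partial load/store (illustrative values), useful when the array
+// length is not a multiple of 8:
+//   float src[5] = {1, 2, 3, 4, 5};
+//   Vec8f v;
+//   v.load_partial(5, src);    // v = (1, 2, 3, 4, 5, 0, 0, 0)
+//   float dst[5];
+//   v.store_partial(5, dst);   // writes only the first 5 elements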
+
+/*****************************************************************************
+ *
+ *          Operators for Vec8f
+ *
+ *****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec8f operator+(Vec8f const& a, Vec8f const& b) { return Vec8f(a.get_low() + b.get_low(), a.get_high() + b.get_high()); }
+
+// vector operator + : add vector and scalar
+static inline Vec8f operator+(Vec8f const& a, float b) { return a + Vec8f(b); }
+static inline Vec8f operator+(float a, Vec8f const& b) { return Vec8f(a) + b; }
+
+// vector operator += : add
+static inline Vec8f& operator+=(Vec8f& a, Vec8f const& b)
+{
+  a = a + b;
+  return a;
+}
+
+// postfix operator ++
+static inline Vec8f operator++(Vec8f& a, int)
+{
+  Vec8f a0 = a;
+  a        = a + 1.0f;
+  return a0;
+}
+
+// prefix operator ++
+static inline Vec8f& operator++(Vec8f& a)
+{
+  a = a + 1.0f;
+  return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec8f operator-(Vec8f const& a, Vec8f const& b) { return Vec8f(a.get_low() - b.get_low(), a.get_high() - b.get_high()); }
+
+// vector operator - : subtract vector and scalar
+static inline Vec8f operator-(Vec8f const& a, float b) { return a - Vec8f(b); }
+static inline Vec8f operator-(float a, Vec8f const& b) { return Vec8f(a) - b; }
+
+// vector operator - : unary minus
+// Change sign bit, even for 0, INF and NAN
+static inline Vec8f operator-(Vec8f const& a) { return Vec8f(-a.get_low(), -a.get_high()); }
+
+// vector operator -= : subtract
+static inline Vec8f& operator-=(Vec8f& a, Vec8f const& b)
+{
+  a = a - b;
+  return a;
+}
+
+// postfix operator --
+static inline Vec8f operator--(Vec8f& a, int)
+{
+  Vec8f a0 = a;
+  a        = a - 1.0f;
+  return a0;
+}
+
+// prefix operator --
+static inline Vec8f& operator--(Vec8f& a)
+{
+  a = a - 1.0f;
+  return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec8f operator*(Vec8f const& a, Vec8f const& b) { return Vec8f(a.get_low() * b.get_low(), a.get_high() * b.get_high()); }
+
+// vector operator * : multiply vector and scalar
+static inline Vec8f operator*(Vec8f const& a, float b) { return a * Vec8f(b); }
+static inline Vec8f operator*(float a, Vec8f const& b) { return Vec8f(a) * b; }
+
+// vector operator *= : multiply
+static inline Vec8f& operator*=(Vec8f& a, Vec8f const& b)
+{
+  a = a * b;
+  return a;
+}
+
+// vector operator / : divide element by element
+static inline Vec8f operator/(Vec8f const& a, Vec8f const& b) { return Vec8f(a.get_low() / b.get_low(), a.get_high() / b.get_high()); }
+
+// vector operator / : divide vector and scalar
+static inline Vec8f operator/(Vec8f const& a, float b) { return a / Vec8f(b); }
+static inline Vec8f operator/(float a, Vec8f const& b) { return Vec8f(a) / b; }
+
+// vector operator /= : divide
+static inline Vec8f& operator/=(Vec8f& a, Vec8f const& b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec8fb operator==(Vec8f const& a, Vec8f const& b)
+{
+  return Vec8fb(a.get_low() == b.get_low(), a.get_high() == b.get_high());
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec8fb operator!=(Vec8f const& a, Vec8f const& b)
+{
+  return Vec8fb(a.get_low() != b.get_low(), a.get_high() != b.get_high());
+}
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec8fb operator<(Vec8f const& a, Vec8f const& b)
+{
+  return Vec8fb(a.get_low() < b.get_low(), a.get_high() < b.get_high());
+}
+
+// vector operator <= : returns true for elements for which a <= b
+static inline Vec8fb operator<=(Vec8f const& a, Vec8f const& b)
+{
+  return Vec8fb(a.get_low() <= b.get_low(), a.get_high() <= b.get_high());
+}
+
+// vector operator > : returns true for elements for which a > b
+static inline Vec8fb operator>(Vec8f const& a, Vec8f const& b)
+{
+  return Vec8fb(a.get_low() > b.get_low(), a.get_high() > b.get_high());
+}
+
+// vector operator >= : returns true for elements for which a >= b
+static inline Vec8fb operator>=(Vec8f const& a, Vec8f const& b)
+{
+  return Vec8fb(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
+}
+
+// Bitwise logical operators
+
+// vector operator & : bitwise and
+static inline Vec8f operator&(Vec8f const& a, Vec8f const& b) { return Vec8f(a.get_low() & b.get_low(), a.get_high() & b.get_high()); }
+
+// vector operator &= : bitwise and
+static inline Vec8f& operator&=(Vec8f& a, Vec8f const& b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator & : bitwise and of Vec8f and Vec8fb
+static inline Vec8f operator&(Vec8f const& a, Vec8fb const& b)
+{
+  return Vec8f(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+static inline Vec8f operator&(Vec8fb const& a, Vec8f const& b)
+{
+  return Vec8f(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+
+// vector operator | : bitwise or
+static inline Vec8f operator|(Vec8f const& a, Vec8f const& b) { return Vec8f(a.get_low() | b.get_low(), a.get_high() | b.get_high()); }
+
+// vector operator |= : bitwise or
+static inline Vec8f& operator|=(Vec8f& a, Vec8f const& b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec8f operator^(Vec8f const& a, Vec8f const& b) { return Vec8f(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); }
+
+// vector operator ^= : bitwise xor
+static inline Vec8f& operator^=(Vec8f& a, Vec8f const& b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ! : logical not. Returns Boolean vector
+static inline Vec8fb operator!(Vec8f const& a) { return Vec8fb(!a.get_low(), !a.get_high()); }
+
+/*****************************************************************************
+ *
+ *          Functions for Vec8f
+ *
+ *****************************************************************************/
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either 0 (false) or 0xFFFFFFFF (true). No other values are allowed.
+static inline Vec8f select(Vec8fb const& s, Vec8f const& a, Vec8f const& b)
+{
+  return Vec8f(select(s.get_low(), a.get_low(), b.get_low()), select(s.get_high(), a.get_high(), b.get_high()));
+}
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec8f if_add(Vec8fb const& f, Vec8f const& a, Vec8f const& b) { return a + (Vec8f(f) & b); }
+
+// Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i]
+static inline Vec8f if_mul(Vec8fb const& f, Vec8f const& a, Vec8f const& b) { return a * select(f, b, 1.f); }
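+
+// Usage sketch for the conditional operations (illustrative values):
+//   Vec8f x(1, 2, 3, 4, 5, 6, 7, 8);
+//   Vec8f y = if_add(x > 4.0f, x, Vec8f(10.0f));
+//   // y = (1, 2, 3, 4, 15, 16, 17, 18)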
+
+// General arithmetic functions, etc.
+
+// Horizontal add: Calculates the sum of all vector elements.
+static inline float horizontal_add(Vec8f const& a) { return horizontal_add(a.get_low() + a.get_high()); }
+
+// function max: a > b ? a : b
+static inline Vec8f max(Vec8f const& a, Vec8f const& b)
+{
+  return Vec8f(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high()));
+}
+
+// function min: a < b ? a : b
+static inline Vec8f min(Vec8f const& a, Vec8f const& b)
+{
+  return Vec8f(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high()));
+}
+
+// function abs: absolute value
+// Removes sign bit, even for -0.0f, -INF and -NAN
+static inline Vec8f abs(Vec8f const& a) { return Vec8f(abs(a.get_low()), abs(a.get_high())); }
+
+// function sqrt: square root
+static inline Vec8f sqrt(Vec8f const& a) { return Vec8f(sqrt(a.get_low()), sqrt(a.get_high())); }
+
+// function square: a * a
+static inline Vec8f square(Vec8f const& a) { return Vec8f(square(a.get_low()), square(a.get_high())); }
+
+// pow(Vec8f, int):
+template <typename TT>
+static Vec8f pow(Vec8f const& a, TT const& n);
+
+// Raise floating point numbers to integer power n
+template <>
+inline Vec8f pow<int>(Vec8f const& x0, int const& n)
+{
+  return pow_template_i<Vec8f>(x0, n);
+}
+
+// allow conversion from unsigned int
+template <>
+inline Vec8f pow<uint32_t>(Vec8f const& x0, uint32_t const& n)
+{
+  return pow_template_i<Vec8f>(x0, (int)n);
+}
+
+// Raise floating point numbers to integer power n, where n is a compile-time constant
+template <int n>
+static inline Vec8f pow_n(Vec8f const& a)
+{
+  return Vec8f(pow_n<n>(a.get_low()), pow_n<n>(a.get_high()));
+}
+
+template <int n>
+static inline Vec8f pow(Vec8f const& a, Const_int_t<n>)
+{
+  return pow_n<n>(a);
+}
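+
+// Usage sketch (illustrative values): the exponent is a compile-time constant, so the
+// multiplication sequence is generated at compile time.
+//   Vec8f x(2.0f);
+//   Vec8f y = pow_n<3>(x);   // all elements equal 8.0f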
+
+// function round: round to nearest integer (even). (result as float vector)
+static inline Vec8f round(Vec8f const& a) { return Vec8f(round(a.get_low()), round(a.get_high())); }
+
+// function truncate: round towards zero. (result as float vector)
+static inline Vec8f truncate(Vec8f const& a) { return Vec8f(truncate(a.get_low()), truncate(a.get_high())); }
+
+// function floor: round towards minus infinity. (result as float vector)
+static inline Vec8f floor(Vec8f const& a) { return Vec8f(floor(a.get_low()), floor(a.get_high())); }
+
+// function ceil: round towards plus infinity. (result as float vector)
+static inline Vec8f ceil(Vec8f const& a) { return Vec8f(ceil(a.get_low()), ceil(a.get_high())); }
+
+#ifdef VECTORI256_H  // 256 bit integer vectors are available
+// function round_to_int: round to nearest integer (even). (result as integer vector)
+static inline Vec8i round_to_int(Vec8f const& a)
+{
+  // Note: assume MXCSR control register is set to rounding
+  return Vec8i(round_to_int(a.get_low()), round_to_int(a.get_high()));
+}
+
+// function truncate_to_int: round towards zero. (result as integer vector)
+static inline Vec8i truncate_to_int(Vec8f const& a) { return Vec8i(truncate_to_int(a.get_low()), truncate_to_int(a.get_high())); }
+
+// function to_float: convert integer vector to float vector
+static inline Vec8f to_float(Vec8i const& a) { return Vec8f(to_float(a.get_low()), to_float(a.get_high())); }
+
+// function to_float: convert unsigned integer vector to float vector
+static inline Vec8f to_float(Vec8ui const& a) { return Vec8f(to_float(a.get_low()), to_float(a.get_high())); }
+
+#endif  // VECTORI256_H
+
+// Approximate math functions
+
+// approximate reciprocal (Faster than 1.f / a. relative accuracy better than 2^-11)
+static inline Vec8f approx_recipr(Vec8f const& a) { return Vec8f(approx_recipr(a.get_low()), approx_recipr(a.get_high())); }
+
+// approximate reciprocal squareroot (Faster than 1.f / sqrt(a). Relative accuracy better than 2^-11)
+static inline Vec8f approx_rsqrt(Vec8f const& a) { return Vec8f(approx_rsqrt(a.get_low()), approx_rsqrt(a.get_high())); }
+
+// Fused multiply and add functions
+
+// Multiply and add
+static inline Vec8f mul_add(Vec8f const& a, Vec8f const& b, Vec8f const& c)
+{
+  return Vec8f(mul_add(a.get_low(), b.get_low(), c.get_low()), mul_add(a.get_high(), b.get_high(), c.get_high()));
+}
+
+// Multiply and subtract
+static inline Vec8f mul_sub(Vec8f const& a, Vec8f const& b, Vec8f const& c)
+{
+  return Vec8f(mul_sub(a.get_low(), b.get_low(), c.get_low()), mul_sub(a.get_high(), b.get_high(), c.get_high()));
+}
+
+// Multiply and inverse subtract
+static inline Vec8f nmul_add(Vec8f const& a, Vec8f const& b, Vec8f const& c)
+{
+  return Vec8f(nmul_add(a.get_low(), b.get_low(), c.get_low()), nmul_add(a.get_high(), b.get_high(), c.get_high()));
+}
+
+// Multiply and subtract with extra precision on the intermediate calculations,
+// even if FMA instructions are not supported, using the Veltkamp-Dekker split
+static inline Vec8f mul_sub_x(Vec8f const& a, Vec8f const& b, Vec8f const& c)
+{
+  return Vec8f(mul_sub_x(a.get_low(), b.get_low(), c.get_low()), mul_sub_x(a.get_high(), b.get_high(), c.get_high()));
+}
+
+// Math functions using fast bit manipulation
+
+#ifdef VECTORI256_H  // 256 bit integer vectors are available
+// Extract the exponent as an integer
+// exponent(a) = floor(log2(abs(a)));
+// exponent(1.0f) = 0, exponent(0.0f) = -127, exponent(INF) = +128, exponent(NAN) = +128
+static inline Vec8i exponent(Vec8f const& a) { return Vec8i(exponent(a.get_low()), exponent(a.get_high())); }
+#endif
+
+// Extract the fraction part of a floating point number
+// a = 2^exponent(a) * fraction(a), except for a = 0
+// fraction(1.0f) = 1.0f, fraction(5.0f) = 1.25f
+static inline Vec8f fraction(Vec8f const& a) { return Vec8f(fraction(a.get_low()), fraction(a.get_high())); }
+
+#ifdef VECTORI256_H  // 256 bit integer vectors are available
+// Fast calculation of pow(2,n) with n integer
+// n  =    0 gives 1.0f
+// n >=  128 gives +INF
+// n <= -127 gives 0.0f
+// This function will never produce denormals, and never raise exceptions
+static inline Vec8f exp2(Vec8i const& a) { return Vec8f(exp2(a.get_low()), exp2(a.get_high())); }
+// static Vec8f exp2(Vec8f const & x); // defined in vectormath_exp.h
+#endif               // VECTORI256_H
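+
+// Usage sketch (illustrative values): a number splits as x = fraction(x) * 2^exponent(x).
+//   Vec8f x(6.0f);
+//   Vec8i e = exponent(x);   // all elements equal 2
+//   Vec8f f = fraction(x);   // all elements equal 1.5f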
+
+// Categorization functions
+
+// Function sign_bit: gives true for elements that have the sign bit set
+// even for -0.0f, -INF and -NAN
+// Note that sign_bit(Vec8f(-0.0f)) gives true, while Vec8f(-0.0f) < Vec8f(0.0f) gives false
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec8fb sign_bit(Vec8f const& a) { return Vec8fb(sign_bit(a.get_low()), sign_bit(a.get_high())); }
+
+// Function sign_combine: changes the sign of a when b has the sign bit set
+// same as select(sign_bit(b), -a, a)
+static inline Vec8f sign_combine(Vec8f const& a, Vec8f const& b)
+{
+  return Vec8f(sign_combine(a.get_low(), b.get_low()), sign_combine(a.get_high(), b.get_high()));
+}
+
+// Function is_finite: gives true for elements that are normal, denormal or zero,
+// false for INF and NAN
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec8fb is_finite(Vec8f const& a) { return Vec8fb(is_finite(a.get_low()), is_finite(a.get_high())); }
+
+// Function is_inf: gives true for elements that are +INF or -INF
+// false for finite numbers and NAN
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec8fb is_inf(Vec8f const& a) { return Vec8fb(is_inf(a.get_low()), is_inf(a.get_high())); }
+
+// Function is_nan: gives true for elements that are +NAN or -NAN
+// false for finite numbers and +/-INF
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec8fb is_nan(Vec8f const& a) { return Vec8fb(is_nan(a.get_low()), is_nan(a.get_high())); }
+
+// Function is_subnormal: gives true for elements that are denormal (subnormal)
+// false for finite numbers, zero, NAN and INF
+static inline Vec8fb is_subnormal(Vec8f const& a) { return Vec8fb(is_subnormal(a.get_low()), is_subnormal(a.get_high())); }
+
+// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
+// false for finite numbers, NAN and INF
+static inline Vec8fb is_zero_or_subnormal(Vec8f const& a)
+{
+  return Vec8fb(is_zero_or_subnormal(a.get_low()), is_zero_or_subnormal(a.get_high()));
+}
+
+// Function infinite8f: returns a vector where all elements are +INF
+static inline Vec8f infinite8f()
+{
+  return constant8f<0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000>();
+}
+
+// Function nan8f: returns a vector where all elements are +NAN (quiet)
+static inline Vec8f nan8f(int n = 0x10) { return Vec8f(nan4f(n), nan4f(n)); }
+
+// change signs on vectors Vec8f
+// Each index i0 - i7 is 1 for changing sign on the corresponding element, 0 for no change
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8f change_sign(Vec8f const& a)
+{
+  if((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) == 0)
+    return a;
+  Vec4f lo = change_sign<i0, i1, i2, i3>(a.get_low());
+  Vec4f hi = change_sign<i4, i5, i6, i7>(a.get_high());
+  return Vec8f(lo, hi);
+}
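+
+// Usage sketch (illustrative values):
+//   Vec8f x(1, 2, 3, 4, 5, 6, 7, 8);
+//   Vec8f y = change_sign<1, 0, 0, 1, 0, 0, 0, 0>(x);   // y = (-1, 2, 3, -4, 5, 6, 7, 8)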
+
+/*****************************************************************************
+ *
+ *          Vec4d: Vector of 4 double precision floating point values
+ *
+ *****************************************************************************/
+
+class Vec4d : public Vec256de
+{
+ public:
+  // Default constructor:
+  Vec4d() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec4d(double d) { y1 = y0 = _mm_set1_pd(d); }
+  // Constructor to build from all elements:
+  Vec4d(double d0, double d1, double d2, double d3)
+  {
+    y0 = _mm_setr_pd(d0, d1);
+    y1 = _mm_setr_pd(d2, d3);
+  }
+  // Constructor to build from two Vec2d:
+  Vec4d(Vec2d const& a0, Vec2d const& a1)
+  {
+    y0 = a0;
+    y1 = a1;
+  }
+  // Constructor to convert from type Vec256de
+  Vec4d(Vec256de const& x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+  }
+  // Assignment operator to convert from type Vec256de
+  Vec4d& operator=(Vec256de const& x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+    return *this;
+  }
+  // Member function to load from array (unaligned)
+  Vec4d& load(double const* p)
+  {
+    y0 = _mm_loadu_pd(p);
+    y1 = _mm_loadu_pd(p + 2);
+    return *this;
+  }
+  // Member function to load from array, aligned by 32
+  // You may use load_a instead of load if you are certain that p points to an address
+  // divisible by 32
+  Vec4d& load_a(double const* p)
+  {
+    y0 = _mm_load_pd(p);
+    y1 = _mm_load_pd(p + 2);
+    return *this;
+  }
+  // Member function to store into array (unaligned)
+  void store(double* p) const
+  {
+    _mm_storeu_pd(p, y0);
+    _mm_storeu_pd(p + 2, y1);
+  }
+  // Member function to store into array, aligned by 32
+  // You may use store_a instead of store if you are certain that p points to an address
+  // divisible by 32
+  void store_a(double* p) const
+  {
+    _mm_store_pd(p, y0);
+    _mm_store_pd(p + 2, y1);
+  }
+  // Partial load. Load n elements and set the rest to 0
+  Vec4d& load_partial(int n, double const* p)
+  {
+    if(n > 0 && n <= 2)
+      {
+        *this = Vec4d(Vec2d().load_partial(n, p), _mm_setzero_pd());
+      }
+    else if(n > 2 && n <= 4)
+      {
+        *this = Vec4d(Vec2d().load(p), Vec2d().load_partial(n - 2, p + 2));
+      }
+    else
+      {
+        y1 = y0 = _mm_setzero_pd();
+      }
+    return *this;
+  }
+  // Partial store. Store n elements
+  void store_partial(int n, double* p) const
+  {
+    if(n <= 2)
+      {
+        get_low().store_partial(n, p);
+      }
+    else if(n <= 4)
+      {
+        get_low().store(p);
+        get_high().store_partial(n - 2, p + 2);
+      }
+  }
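+  // cut off vector to n elements. The last 4-n elements are set to zero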
+  Vec4d& cutoff(int n)
+  {
+    if(uint32_t(n) >= 4)
+      return *this;
+    else if(n >= 2)
+      {
+        y1 = Vec2d(y1).cutoff(n - 2);
+      }
+    else
+      {
+        y0 = Vec2d(y0).cutoff(n);
+        y1 = Vec2d(0.0);
+      }
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec4d const& insert(uint32_t index, double value)
+  {
+    if(index < 2)
+      {
+        y0 = Vec2d(y0).insert(index, value);
+      }
+    else
+      {
+        y1 = Vec2d(y1).insert(index - 2, value);
+      }
+    return *this;
+  }
+  // Member function extract a single element from vector
+  // Note: This function is inefficient. Use store function if extracting more than one element
+  double extract(uint32_t index) const
+  {
+    if(index < 2)
+      {
+        return Vec2d(y0).extract(index);
+      }
+    else
+      {
+        return Vec2d(y1).extract(index - 2);
+      }
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  double operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec2d:
+  Vec2d get_low() const { return y0; }
+  Vec2d get_high() const { return y1; }
+  static int size() { return 4; }
+};
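+
+// Illustrative usage example for Vec4d (a sketch; the values follow from the member functions above):
+//   Vec4d v(1.0, 2.0, 3.0, 4.0);      // build from four doubles
+//   double buf[3] = {5.0, 6.0, 7.0};
+//   Vec4d w;
+//   w.load_partial(3, buf);           // w is (5, 6, 7, 0)
+//   double x = v[2];                  // x is 3.0 (read-only element access)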
+
+/*****************************************************************************
+ *
+ *          Operators for Vec4d
+ *
+ *****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec4d operator+(Vec4d const& a, Vec4d const& b) { return Vec4d(a.get_low() + b.get_low(), a.get_high() + b.get_high()); }
+
+// vector operator + : add vector and scalar
+static inline Vec4d operator+(Vec4d const& a, double b) { return a + Vec4d(b); }
+static inline Vec4d operator+(double a, Vec4d const& b) { return Vec4d(a) + b; }
+
+// vector operator += : add
+static inline Vec4d& operator+=(Vec4d& a, Vec4d const& b)
+{
+  a = a + b;
+  return a;
+}
+
+// postfix operator ++
+static inline Vec4d operator++(Vec4d& a, int)
+{
+  Vec4d a0 = a;
+  a        = a + 1.0;
+  return a0;
+}
+
+// prefix operator ++
+static inline Vec4d& operator++(Vec4d& a)
+{
+  a = a + 1.0;
+  return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec4d operator-(Vec4d const& a, Vec4d const& b) { return Vec4d(a.get_low() - b.get_low(), a.get_high() - b.get_high()); }
+
+// vector operator - : subtract vector and scalar
+static inline Vec4d operator-(Vec4d const& a, double b) { return a - Vec4d(b); }
+static inline Vec4d operator-(double a, Vec4d const& b) { return Vec4d(a) - b; }
+
+// vector operator - : unary minus
+// Change sign bit, even for 0, INF and NAN
+static inline Vec4d operator-(Vec4d const& a) { return Vec4d(-a.get_low(), -a.get_high()); }
+
+// vector operator -= : subtract
+static inline Vec4d& operator-=(Vec4d& a, Vec4d const& b)
+{
+  a = a - b;
+  return a;
+}
+
+// postfix operator --
+static inline Vec4d operator--(Vec4d& a, int)
+{
+  Vec4d a0 = a;
+  a        = a - 1.0;
+  return a0;
+}
+
+// prefix operator --
+static inline Vec4d& operator--(Vec4d& a)
+{
+  a = a - 1.0;
+  return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec4d operator*(Vec4d const& a, Vec4d const& b) { return Vec4d(a.get_low() * b.get_low(), a.get_high() * b.get_high()); }
+
+// vector operator * : multiply vector and scalar
+static inline Vec4d operator*(Vec4d const& a, double b) { return a * Vec4d(b); }
+static inline Vec4d operator*(double a, Vec4d const& b) { return Vec4d(a) * b; }
+
+// vector operator *= : multiply
+static inline Vec4d& operator*=(Vec4d& a, Vec4d const& b)
+{
+  a = a * b;
+  return a;
+}
+
+// vector operator / : divide element by element
+static inline Vec4d operator/(Vec4d const& a, Vec4d const& b) { return Vec4d(a.get_low() / b.get_low(), a.get_high() / b.get_high()); }
+
+// vector operator / : divide vector and scalar
+static inline Vec4d operator/(Vec4d const& a, double b) { return a / Vec4d(b); }
+static inline Vec4d operator/(double a, Vec4d const& b) { return Vec4d(a) / b; }
+
+// vector operator /= : divide
+static inline Vec4d& operator/=(Vec4d& a, Vec4d const& b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec4db operator==(Vec4d const& a, Vec4d const& b)
+{
+  return Vec4db(a.get_low() == b.get_low(), a.get_high() == b.get_high());
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec4db operator!=(Vec4d const& a, Vec4d const& b)
+{
+  return Vec4db(a.get_low() != b.get_low(), a.get_high() != b.get_high());
+}
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec4db operator<(Vec4d const& a, Vec4d const& b)
+{
+  return Vec4db(a.get_low() < b.get_low(), a.get_high() < b.get_high());
+}
+
+// vector operator <= : returns true for elements for which a <= b
+static inline Vec4db operator<=(Vec4d const& a, Vec4d const& b)
+{
+  return Vec4db(a.get_low() <= b.get_low(), a.get_high() <= b.get_high());
+}
+
+// vector operator > : returns true for elements for which a > b
+static inline Vec4db operator>(Vec4d const& a, Vec4d const& b)
+{
+  return Vec4db(a.get_low() > b.get_low(), a.get_high() > b.get_high());
+}
+
+// vector operator >= : returns true for elements for which a >= b
+static inline Vec4db operator>=(Vec4d const& a, Vec4d const& b)
+{
+  return Vec4db(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
+}
+
+// Bitwise logical operators
+
+// vector operator & : bitwise and
+static inline Vec4d operator&(Vec4d const& a, Vec4d const& b) { return Vec4d(a.get_low() & b.get_low(), a.get_high() & b.get_high()); }
+
+// vector operator &= : bitwise and
+static inline Vec4d& operator&=(Vec4d& a, Vec4d const& b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator & : bitwise and of Vec4d and Vec4db
+static inline Vec4d operator&(Vec4d const& a, Vec4db const& b)
+{
+  return Vec4d(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+static inline Vec4d operator&(Vec4db const& a, Vec4d const& b)
+{
+  return Vec4d(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+
+// vector operator | : bitwise or
+static inline Vec4d operator|(Vec4d const& a, Vec4d const& b) { return Vec4d(a.get_low() | b.get_low(), a.get_high() | b.get_high()); }
+
+// vector operator |= : bitwise or
+static inline Vec4d& operator|=(Vec4d& a, Vec4d const& b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec4d operator^(Vec4d const& a, Vec4d const& b) { return Vec4d(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); }
+
+// vector operator ^= : bitwise xor
+static inline Vec4d& operator^=(Vec4d& a, Vec4d const& b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ! : logical not. Returns Boolean vector
+static inline Vec4db operator!(Vec4d const& a) { return Vec4db(!a.get_low(), !a.get_high()); }
+
+/*****************************************************************************
+ *
+ *          Functions for Vec4d
+ *
+ *****************************************************************************/
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true).
+// No other values are allowed.
+static inline Vec4d select(Vec4db const& s, Vec4d const& a, Vec4d const& b)
+{
+  return Vec4d(select(s.get_low(), a.get_low(), b.get_low()), select(s.get_high(), a.get_high(), b.get_high()));
+}
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec4d if_add(Vec4db const& f, Vec4d const& a, Vec4d const& b) { return a + (Vec4d(f) & b); }
+
+// Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i]
+static inline Vec4d if_mul(Vec4db const& f, Vec4d const& a, Vec4d const& b) { return a * select(f, b, 1.0); }
+
+// General arithmetic functions, etc.
+
+// Horizontal add: Calculates the sum of all vector elements.
+static inline double horizontal_add(Vec4d const& a) { return horizontal_add(a.get_low() + a.get_high()); }
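+
+// Illustrative example for select, if_add and horizontal_add (results follow from the definitions above):
+//   Vec4d  a(1.0, 2.0, 3.0, 4.0);
+//   Vec4d  b(10.0, 10.0, 10.0, 10.0);
+//   Vec4db f = a > Vec4d(2.5);        // f is (false, false, true, true)
+//   Vec4d  r = if_add(f, a, b);       // r is (1, 2, 13, 14)
+//   double s = horizontal_add(a);     // s is 10.0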
+
+// function max: a > b ? a : b
+static inline Vec4d max(Vec4d const& a, Vec4d const& b)
+{
+  return Vec4d(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high()));
+}
+
+// function min: a < b ? a : b
+static inline Vec4d min(Vec4d const& a, Vec4d const& b)
+{
+  return Vec4d(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high()));
+}
+
+// function abs: absolute value
+// Removes sign bit, even for -0.0f, -INF and -NAN
+static inline Vec4d abs(Vec4d const& a) { return Vec4d(abs(a.get_low()), abs(a.get_high())); }
+
+// function sqrt: square root
+static inline Vec4d sqrt(Vec4d const& a) { return Vec4d(sqrt(a.get_low()), sqrt(a.get_high())); }
+
+// function square: a * a
+static inline Vec4d square(Vec4d const& a) { return Vec4d(square(a.get_low()), square(a.get_high())); }
+
+// pow(Vec4d, int):
+// Raise floating point numbers to integer power n
+template <typename TT>
+static Vec4d pow(Vec4d const& a, TT const& n);
+
+// Raise floating point numbers to integer power n
+template <>
+inline Vec4d pow<int>(Vec4d const& x0, int const& n)
+{
+  return pow_template_i<Vec4d>(x0, n);
+}
+
+// allow conversion from unsigned int
+template <>
+inline Vec4d pow<uint32_t>(Vec4d const& x0, uint32_t const& n)
+{
+  return pow_template_i<Vec4d>(x0, (int)n);
+}
+
+// Raise floating point numbers to integer power n, where n is a compile-time constant
+template <int n>
+static inline Vec4d pow_n(Vec4d const& a)
+{
+  return Vec4d(pow_n<n>(a.get_low()), pow_n<n>(a.get_high()));
+}
+
+template <int n>
+static inline Vec4d pow(Vec4d const& a, Const_int_t<n>)
+{
+  return pow_n<n>(a);
+}
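+
+// Illustrative example for the pow variants above:
+//   Vec4d x(2.0, 3.0, 4.0, 5.0);
+//   Vec4d y = pow(x, 3);                 // run-time integer power: (8, 27, 64, 125)
+//   Vec4d z = pow(x, Const_int_t<2>());  // compile-time power via pow_n: (4, 9, 16, 25)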
+
+// function round: round to nearest integer (even). (result as double vector)
+static inline Vec4d round(Vec4d const& a) { return Vec4d(round(a.get_low()), round(a.get_high())); }
+
+// function truncate: round towards zero. (result as double vector)
+static inline Vec4d truncate(Vec4d const& a) { return Vec4d(truncate(a.get_low()), truncate(a.get_high())); }
+
+// function floor: round towards minus infinity. (result as double vector)
+static inline Vec4d floor(Vec4d const& a) { return Vec4d(floor(a.get_low()), floor(a.get_high())); }
+
+// function ceil: round towards plus infinity. (result as double vector)
+static inline Vec4d ceil(Vec4d const& a) { return Vec4d(ceil(a.get_low()), ceil(a.get_high())); }
+
+// function round_to_int: round to nearest integer (even). (result as integer vector)
+static inline Vec4i round_to_int(Vec4d const& a)
+{
+  // Note: assumes the MXCSR control register is set to rounding to nearest (the default)
+  return round_to_int(a.get_low(), a.get_high());
+}
+
+// function truncate_to_int: round towards zero. (result as integer vector)
+static inline Vec4i truncate_to_int(Vec4d const& a) { return truncate_to_int(a.get_low(), a.get_high()); }
+
+#ifdef VECTORI256_H  // 256 bit integer vectors are available
+
+// function truncate_to_int64: round towards zero. (inefficient)
+static inline Vec4q truncate_to_int64(Vec4d const& a)
+{
+  double aa[4];
+  a.store(aa);
+  return Vec4q(int64_t(aa[0]), int64_t(aa[1]), int64_t(aa[2]), int64_t(aa[3]));
+}
+
+// function truncate_to_int64_limited: round towards zero.
+// result as 64-bit integer vector, but with limited range
+static inline Vec4q truncate_to_int64_limited(Vec4d const& a)
+{
+  return Vec4q(truncate_to_int64_limited(a.get_low()), truncate_to_int64_limited(a.get_high()));
+}
+
+// function round_to_int64: round to nearest or even. (inefficient)
+static inline Vec4q round_to_int64(Vec4d const& a) { return truncate_to_int64(round(a)); }
+
+// function round_to_int64_limited: round to nearest integer
+// result as 64-bit integer vector, but with limited range
+static inline Vec4q round_to_int64_limited(Vec4d const& a)
+{
+  return Vec4q(round_to_int64_limited(a.get_low()), round_to_int64_limited(a.get_high()));
+}
+
+// function to_double: convert integer vector elements to double vector (inefficient)
+static inline Vec4d to_double(Vec4q const& a)
+{
+  int64_t aa[4];
+  a.store(aa);
+  return Vec4d(double(aa[0]), double(aa[1]), double(aa[2]), double(aa[3]));
+}
+
+// function to_double_limited: convert integer vector elements to double vector
+// limited to abs(x) < 2^31
+static inline Vec4d to_double_limited(Vec4q const& x)
+{
+  return Vec4d(to_double_limited(x.get_low()), to_double_limited(x.get_high()));
+}
+
+#endif  // VECTORI256_H
+
+// function to_double: convert integer vector to double vector
+static inline Vec4d to_double(Vec4i const& a) { return Vec4d(to_double_low(a), to_double_high(a)); }
+
+// function compress: convert two Vec4d to one Vec8f
+static inline Vec8f compress(Vec4d const& low, Vec4d const& high)
+{
+  return Vec8f(compress(low.get_low(), low.get_high()), compress(high.get_low(), high.get_high()));
+}
+
+// Function extend_low : convert Vec8f vector elements 0 - 3 to Vec4d
+static inline Vec4d extend_low(Vec8f const& a) { return Vec4d(extend_low(a.get_low()), extend_high(a.get_low())); }
+
+// Function extend_high : convert Vec8f vector elements 4 - 7 to Vec4d
+static inline Vec4d extend_high(Vec8f const& a) { return Vec4d(extend_low(a.get_high()), extend_high(a.get_high())); }
+
+// Fused multiply and add functions
+
+// Multiply and add
+static inline Vec4d mul_add(Vec4d const& a, Vec4d const& b, Vec4d const& c)
+{
+  return Vec4d(mul_add(a.get_low(), b.get_low(), c.get_low()), mul_add(a.get_high(), b.get_high(), c.get_high()));
+}
+
+// Multiply and subtract
+static inline Vec4d mul_sub(Vec4d const& a, Vec4d const& b, Vec4d const& c)
+{
+  return Vec4d(mul_sub(a.get_low(), b.get_low(), c.get_low()), mul_sub(a.get_high(), b.get_high(), c.get_high()));
+}
+
+// Multiply and inverse subtract
+static inline Vec4d nmul_add(Vec4d const& a, Vec4d const& b, Vec4d const& c)
+{
+  return Vec4d(nmul_add(a.get_low(), b.get_low(), c.get_low()), nmul_add(a.get_high(), b.get_high(), c.get_high()));
+}
+
+// Multiply and subtract with extra precision on the intermediate calculations,
+// even if FMA instructions not supported, using Veltkamp-Dekker split
+static inline Vec4d mul_sub_x(Vec4d const& a, Vec4d const& b, Vec4d const& c)
+{
+  return Vec4d(mul_sub_x(a.get_low(), b.get_low(), c.get_low()), mul_sub_x(a.get_high(), b.get_high(), c.get_high()));
+}
+
+// Math functions using fast bit manipulation
+
+#ifdef VECTORI256_H  // 256 bit integer vectors are available, AVX2
+// Extract the exponent as an integer
+// exponent(a) = floor(log2(abs(a)));
+// exponent(1.0) = 0, exponent(0.0) = -1023, exponent(INF) = +1024, exponent(NAN) = +1024
+static inline Vec4q exponent(Vec4d const& a) { return Vec4q(exponent(a.get_low()), exponent(a.get_high())); }
+
+// Extract the fraction part of a floating point number
+// a = 2^exponent(a) * fraction(a), except for a = 0
+// fraction(1.0) = 1.0, fraction(5.0) = 1.25
+static inline Vec4d fraction(Vec4d const& a) { return Vec4d(fraction(a.get_low()), fraction(a.get_high())); }
+
+// Fast calculation of pow(2,n) with n integer
+// n  =     0 gives 1.0
+// n >=  1024 gives +INF
+// n <= -1023 gives 0.0
+// This function will never produce denormals, and never raise exceptions
+static inline Vec4d exp2(Vec4q const& a) { return Vec4d(exp2(a.get_low()), exp2(a.get_high())); }
+// static Vec4d exp2(Vec4d const & x); // defined in vectormath_exp.h
+#endif
+
+// Categorization functions
+
+// Function sign_bit: gives true for elements that have the sign bit set
+// even for -0.0, -INF and -NAN
+// Note that sign_bit(Vec4d(-0.0)) gives true, while Vec4d(-0.0) < Vec4d(0.0) gives false
+static inline Vec4db sign_bit(Vec4d const& a) { return Vec4db(sign_bit(a.get_low()), sign_bit(a.get_high())); }
+
+// Function sign_combine: changes the sign of a when b has the sign bit set
+// same as select(sign_bit(b), -a, a)
+static inline Vec4d sign_combine(Vec4d const& a, Vec4d const& b)
+{
+  return Vec4d(sign_combine(a.get_low(), b.get_low()), sign_combine(a.get_high(), b.get_high()));
+}
+
+// Function is_finite: gives true for elements that are normal, denormal or zero,
+// false for INF and NAN
+static inline Vec4db is_finite(Vec4d const& a) { return Vec4db(is_finite(a.get_low()), is_finite(a.get_high())); }
+
+// Function is_inf: gives true for elements that are +INF or -INF
+// false for finite numbers and NAN
+static inline Vec4db is_inf(Vec4d const& a) { return Vec4db(is_inf(a.get_low()), is_inf(a.get_high())); }
+
+// Function is_nan: gives true for elements that are +NAN or -NAN
+// false for finite numbers and +/-INF
+static inline Vec4db is_nan(Vec4d const& a) { return Vec4db(is_nan(a.get_low()), is_nan(a.get_high())); }
+
+// Function is_subnormal: gives true for elements that are denormal (subnormal)
+// false for finite numbers, zero, NAN and INF
+static inline Vec4db is_subnormal(Vec4d const& a) { return Vec4db(is_subnormal(a.get_low()), is_subnormal(a.get_high())); }
+
+// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
+// false for finite numbers, NAN and INF
+static inline Vec4db is_zero_or_subnormal(Vec4d const& a)
+{
+  return Vec4db(is_zero_or_subnormal(a.get_low()), is_zero_or_subnormal(a.get_high()));
+}
+
+// Function infinite4d: returns a vector where all elements are +INF
+static inline Vec4d infinite4d() { return Vec4d(infinite2d(), infinite2d()); }
+
+// Function nan4d: returns a vector where all elements are +NAN (quiet)
+static inline Vec4d nan4d(int n = 0x10) { return Vec4d(nan2d(n), nan2d(n)); }
+
+// change signs on vectors Vec4d
+// Each index i0 - i3 is 1 for changing sign on the corresponding element, 0 for no change
+template <int i0, int i1, int i2, int i3>
+static inline Vec4d change_sign(Vec4d const& a)
+{
+  if((i0 | i1 | i2 | i3) == 0)
+    return a;
+  Vec2d lo = change_sign<i0, i1>(a.get_low());
+  Vec2d hi = change_sign<i2, i3>(a.get_high());
+  return Vec4d(lo, hi);
+}
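+
+// Illustrative example: flip the sign of the second and fourth element
+//   Vec4d a(1.0, 2.0, 3.0, 4.0);
+//   Vec4d b = change_sign<0, 1, 0, 1>(a);   // b is (1, -2, 3, -4)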
+
+/*****************************************************************************
+ *
+ *          Functions for reinterpretation between vector types
+ *
+ *****************************************************************************/
+
+static inline Vec256ie reinterpret_i(Vec256ie const& x) { return x; }
+
+static inline Vec256ie reinterpret_i(Vec256fe const& x) { return Vec256ie(reinterpret_i(x.get_low()), reinterpret_i(x.get_high())); }
+
+static inline Vec256ie reinterpret_i(Vec256de const& x) { return Vec256ie(reinterpret_i(x.get_low()), reinterpret_i(x.get_high())); }
+
+static inline Vec256fe reinterpret_f(Vec256ie const& x) { return Vec256fe(reinterpret_f(x.get_low()), reinterpret_f(x.get_high())); }
+
+static inline Vec256fe reinterpret_f(Vec256fe const& x) { return x; }
+
+static inline Vec256fe reinterpret_f(Vec256de const& x) { return Vec256fe(reinterpret_f(x.get_low()), reinterpret_f(x.get_high())); }
+
+static inline Vec256de reinterpret_d(Vec256ie const& x) { return Vec256de(reinterpret_d(x.get_low()), reinterpret_d(x.get_high())); }
+
+static inline Vec256de reinterpret_d(Vec256fe const& x) { return Vec256de(reinterpret_d(x.get_low()), reinterpret_d(x.get_high())); }
+
+static inline Vec256de reinterpret_d(Vec256de const& x) { return x; }
+
+/*****************************************************************************
+ *
+ *          Vector permute and blend functions
+ *
+ ******************************************************************************
+ *
+ * The permute function can reorder the elements of a vector and optionally
+ * set some elements to zero.
+ *
+ * The indexes are inserted as template parameters in <>. These indexes must be
+ * constants. Each template parameter is an index to the element you want to
+ * select. An index of -1 will generate zero. An index of -256 means don't care.
+ *
+ * Example:
+ * Vec4d a(10., 11., 12., 13.);    // a is (10, 11, 12, 13)
+ * Vec4d b;
+ * b = permute4d<1,0,-1,3>(a);     // b is (11, 10,  0, 13)
+ *
+ *
+ * The blend function can mix elements from two different vectors and
+ * optionally set some elements to zero.
+ *
+ * The indexes are inserted as template parameters in <>. These indexes must be
+ * constants. Each template parameter is an index to the element you want to
+ * select, where indexes 0 - 3 indicate an element from the first source
+ * vector and indexes 4 - 7 indicate an element from the second source vector.
+ * A negative index will generate zero.
+ *
+ *
+ * Example:
+ * Vec4d a(10., 11., 12., 13.);    // a is (10, 11, 12, 13)
+ * Vec4d b(20., 21., 22., 23.);    // b is (20, 21, 22, 23)
+ * Vec4d c;
+ * c = blend4d<4,3,7,-1> (a,b);    // c is (20, 13, 23,  0)
+ *****************************************************************************/
+
+// permute vector Vec4d
+template <int i0, int i1, int i2, int i3>
+static inline Vec4d permute4d(Vec4d const& a)
+{
+  return Vec4d(blend2d<i0, i1>(a.get_low(), a.get_high()), blend2d<i2, i3>(a.get_low(), a.get_high()));
+}
+
+// helper function used below
+template <int n>
+static inline Vec2d select4(Vec4d const& a, Vec4d const& b)
+{
+  switch(n)
+    {
+      case 0:
+        return a.get_low();
+      case 1:
+        return a.get_high();
+      case 2:
+        return b.get_low();
+      case 3:
+        return b.get_high();
+    }
+  return _mm_setzero_pd();
+}
+
+// blend vectors Vec4d
+template <int i0, int i1, int i2, int i3>
+static inline Vec4d blend4d(Vec4d const& a, Vec4d const& b)
+{
+  const int j0 = i0 >= 0 ? i0 / 2 : i0;
+  const int j1 = i1 >= 0 ? i1 / 2 : i1;
+  const int j2 = i2 >= 0 ? i2 / 2 : i2;
+  const int j3 = i3 >= 0 ? i3 / 2 : i3;
+  Vec2d x0, x1;
+
+  if(j0 == j1 || i0 < 0 || i1 < 0)
+    {  // both from same
+      const int k0 = j0 >= 0 ? j0 : j1;
+      x0           = permute2d<i0 & -7, i1 & -7>(select4<k0>(a, b));
+    }
+  else
+    {
+      x0 = blend2d<i0 & -7, (i1 & -7) | 2>(select4<j0>(a, b), select4<j1>(a, b));
+    }
+  if(j2 == j3 || i2 < 0 || i3 < 0)
+    {  // both from same
+      const int k1 = j2 >= 0 ? j2 : j3;
+      x1           = permute2d<i2 & -7, i3 & -7>(select4<k1>(a, b));
+    }
+  else
+    {
+      x1 = blend2d<i2 & -7, (i3 & -7) | 2>(select4<j2>(a, b), select4<j3>(a, b));
+    }
+  return Vec4d(x0, x1);
+}
+
+/*****************************************************************************
+ *
+ *          Vector Vec8f permute and blend functions
+ *
+ *****************************************************************************/
+
+// permute vector Vec8f
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8f permute8f(Vec8f const& a)
+{
+  return Vec8f(blend4f<i0, i1, i2, i3>(a.get_low(), a.get_high()), blend4f<i4, i5, i6, i7>(a.get_low(), a.get_high()));
+}
+
+// helper function used below
+template <int n>
+static inline Vec4f select4(Vec8f const& a, Vec8f const& b)
+{
+  switch(n)
+    {
+      case 0:
+        return a.get_low();
+      case 1:
+        return a.get_high();
+      case 2:
+        return b.get_low();
+      case 3:
+        return b.get_high();
+    }
+  return _mm_setzero_ps();
+}
+
+// blend vectors Vec8f
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8f blend8f(Vec8f const& a, Vec8f const& b)
+{
+  const int j0 = i0 >= 0 ? i0 / 4 : i0;
+  const int j1 = i1 >= 0 ? i1 / 4 : i1;
+  const int j2 = i2 >= 0 ? i2 / 4 : i2;
+  const int j3 = i3 >= 0 ? i3 / 4 : i3;
+  const int j4 = i4 >= 0 ? i4 / 4 : i4;
+  const int j5 = i5 >= 0 ? i5 / 4 : i5;
+  const int j6 = i6 >= 0 ? i6 / 4 : i6;
+  const int j7 = i7 >= 0 ? i7 / 4 : i7;
+  Vec4f x0, x1;
+
+  const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2 >= 0 ? j2 : j3;
+  const int r1 = j4 >= 0 ? j4 : j5 >= 0 ? j5 : j6 >= 0 ? j6 : j7;
+  const int s0 = (j1 >= 0 && j1 != r0) ? j1 : (j2 >= 0 && j2 != r0) ? j2 : j3;
+  const int s1 = (j5 >= 0 && j5 != r1) ? j5 : (j6 >= 0 && j6 != r1) ? j6 : j7;
+
+  // Combine all the indexes into a single bitfield, with 4 bits for each
+  const int m1 = (i0 & 0xF) | (i1 & 0xF) << 4 | (i2 & 0xF) << 8 | (i3 & 0xF) << 12 | (i4 & 0xF) << 16 | (i5 & 0xF) << 20 |
+                 (i6 & 0xF) << 24 | (i7 & 0xF) << 28;
+
+  // Mask to zero out negative indexes
+  const int mz = (i0 < 0 ? 0 : 0xF) | (i1 < 0 ? 0 : 0xF) << 4 | (i2 < 0 ? 0 : 0xF) << 8 | (i3 < 0 ? 0 : 0xF) << 12 |
+                 (i4 < 0 ? 0 : 0xF) << 16 | (i5 < 0 ? 0 : 0xF) << 20 | (i6 < 0 ? 0 : 0xF) << 24 | (i7 < 0 ? 0 : 0xF) << 28;
+
+  if(r0 < 0)
+    {
+      x0 = _mm_setzero_ps();
+    }
+  else if(((m1 ^ r0 * 0x4444) & 0xCCCC & mz) == 0)
+    {
+      // i0 - i3 all from same source
+      x0 = permute4f<i0 & -13, i1 & -13, i2 & -13, i3 & -13>(select4<r0>(a, b));
+    }
+  else if((j2 < 0 || j2 == r0 || j2 == s0) && (j3 < 0 || j3 == r0 || j3 == s0))
+    {
+      // i0 - i3 all from two sources
+      const int k0 = i0 >= 0 ? i0 & 3 : i0;
+      const int k1 = (i1 >= 0 ? i1 & 3 : i1) | (j1 == s0 ? 4 : 0);
+      const int k2 = (i2 >= 0 ? i2 & 3 : i2) | (j2 == s0 ? 4 : 0);
+      const int k3 = (i3 >= 0 ? i3 & 3 : i3) | (j3 == s0 ? 4 : 0);
+      x0           = blend4f<k0, k1, k2, k3>(select4<r0>(a, b), select4<s0>(a, b));
+    }
+  else
+    {
+      // i0 - i3 from three or four different sources
+      x0 = blend4f<0, 1, 6, 7>(blend4f<i0 & -13, (i1 & -13) | 4, -0x100, -0x100>(select4<j0>(a, b), select4<j1>(a, b)),
+                               blend4f<-0x100, -0x100, i2 & -13, (i3 & -13) | 4>(select4<j2>(a, b), select4<j3>(a, b)));
+    }
+
+  if(r1 < 0)
+    {
+      x1 = _mm_setzero_ps();
+    }
+  else if(((m1 ^ r1 * 0x44440000u) & 0xCCCC0000 & mz) == 0)
+    {
+      // i4 - i7 all from same source
+      x1 = permute4f<i4 & -13, i5 & -13, i6 & -13, i7 & -13>(select4<r1>(a, b));
+    }
+  else if((j6 < 0 || j6 == r1 || j6 == s1) && (j7 < 0 || j7 == r1 || j7 == s1))
+    {
+      // i4 - i7 all from two sources
+      const int k4 = i4 >= 0 ? i4 & 3 : i4;
+      const int k5 = (i5 >= 0 ? i5 & 3 : i5) | (j5 == s1 ? 4 : 0);
+      const int k6 = (i6 >= 0 ? i6 & 3 : i6) | (j6 == s1 ? 4 : 0);
+      const int k7 = (i7 >= 0 ? i7 & 3 : i7) | (j7 == s1 ? 4 : 0);
+      x1           = blend4f<k4, k5, k6, k7>(select4<r1>(a, b), select4<s1>(a, b));
+    }
+  else
+    {
+      // i4 - i7 from three or four different sources
+      x1 = blend4f<0, 1, 6, 7>(blend4f<i4 & -13, (i5 & -13) | 4, -0x100, -0x100>(select4<j4>(a, b), select4<j5>(a, b)),
+                               blend4f<-0x100, -0x100, i6 & -13, (i7 & -13) | 4>(select4<j6>(a, b), select4<j7>(a, b)));
+    }
+
+  return Vec8f(x0, x1);
+}
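+
+// Illustrative example for the Vec8f permute and blend functions (same index rules as documented
+// above, but with 8 elements per source vector):
+//   Vec8f a(0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f);
+//   Vec8f b(10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f);
+//   Vec8f c = permute8f<7,6,5,4,3,2,1,0>(a);        // c is (7, 6, 5, 4, 3, 2, 1, 0)
+//   Vec8f d = blend8f<0,8,1,9,2,10,3,11>(a, b);     // d is (0, 10, 1, 11, 2, 12, 3, 13)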
+
+/*****************************************************************************
+ *
+ *          Vector lookup functions
+ *
+ ******************************************************************************
+ *
+ * These functions use vector elements as indexes into a table.
+ * The table is given as one or more vectors or as an array.
+ *
+ * This can be used for several purposes:
+ *  - table lookup
+ *  - permute or blend with variable indexes
+ *  - blend from more than two sources
+ *  - gather non-contiguous data
+ *
+ * An index out of range may produce any value - the actual value produced is
+ * implementation dependent and may be different for different instruction
+ * sets. An index out of range does not produce an error message or exception.
+ *
+ * Example:
+ * Vec4i a(2,0,0,3);               // index  a is (  2,   0,   0,   3)
+ * Vec4f b(1.0f,1.1f,1.2f,1.3f);   // table  b is (1.0, 1.1, 1.2, 1.3)
+ * Vec4f c;
+ * c = lookup4 (a,b);              // result c is (1.2, 1.0, 1.0, 1.3)
+ *
+ *****************************************************************************/
+
+#ifdef VECTORI256_H  // Vec8i and Vec4q must be defined
+
+static inline Vec8f lookup8(Vec8i const& index, Vec8f const& table)
+{
+  Vec4f r0 = lookup8(index.get_low(), table.get_low(), table.get_high());
+  Vec4f r1 = lookup8(index.get_high(), table.get_low(), table.get_high());
+  return Vec8f(r0, r1);
+}
+
+template <int n>
+static inline Vec8f lookup(Vec8i const& index, float const* table)
+{
+  if(n <= 0)
+    return 0;
+  if(n <= 4)
+    {
+      Vec4f table1 = Vec4f().load(table);
+      return Vec8f(lookup4(index.get_low(), table1), lookup4(index.get_high(), table1));
+    }
+  if(n <= 8)
+    {
+      return lookup8(index, Vec8f().load(table));
+    }
+  // Limit index
+  Vec8ui index1;
+  if((n & (n - 1)) == 0)
+    {
+      // n is a power of 2, make index modulo n
+      index1 = Vec8ui(index) & (n - 1);
+    }
+  else
+    {
+      // n is not a power of 2, limit to n-1
+      index1 = min(Vec8ui(index), n - 1);
+    }
+  return Vec8f(table[index1[0]], table[index1[1]], table[index1[2]], table[index1[3]], table[index1[4]], table[index1[5]],
+               table[index1[6]], table[index1[7]]);
+}
+
+static inline Vec4d lookup4(Vec4q const& index, Vec4d const& table)
+{
+  Vec2d r0 = lookup4(index.get_low(), table.get_low(), table.get_high());
+  Vec2d r1 = lookup4(index.get_high(), table.get_low(), table.get_high());
+  return Vec4d(r0, r1);
+}
+
+template <int n>
+static inline Vec4d lookup(Vec4q const& index, double const* table)
+{
+  if(n <= 0)
+    return 0;
+  if(n <= 2)
+    {
+      Vec2d table1 = Vec2d().load(table);
+      return Vec4d(lookup2(index.get_low(), table1), lookup2(index.get_high(), table1));
+    }
+  // Limit index
+  Vec8ui index1;
+  if((n & (n - 1)) == 0)
+    {
+      // n is a power of 2, make index modulo n
+      index1 = Vec8ui(index) & constant8i<n - 1, 0, n - 1, 0, n - 1, 0, n - 1, 0>();
+    }
+  else
+    {
+      // n is not a power of 2, limit to n-1
+      index1 = min(Vec8ui(index), constant8i<n - 1, 0, n - 1, 0, n - 1, 0, n - 1, 0>());
+    }
+  Vec4q index2 = Vec4q(index1);
+  return Vec4d(table[index2[0]], table[index2[1]], table[index2[2]], table[index2[3]]);
+}
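+
+// Illustrative example for the array lookup template above:
+//   double table[8] = {0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7};
+//   Vec4q  idx(3, 0, 7, 2);
+//   Vec4d  r = lookup<8>(idx, table);               // r is (0.3, 0.0, 0.7, 0.2)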
+#endif               // VECTORI256_H
+
+/*****************************************************************************
+ *
+ *          Vector scatter functions
+ *
+ ******************************************************************************
+ *
+ * These functions write the elements of a vector to arbitrary positions in an
+ * array in memory. Each vector element is written to an array position
+ * determined by an index. An element is not written if the corresponding
+ * index is out of range.
+ * The indexes can be specified as constant template parameters or as an
+ * integer vector.
+ *
+ * The scatter functions are useful if the data are distributed in a sparse
+ * manner into the array. If the array is dense then it is more efficient
+ * to permute the data into the right positions and then write the whole
+ * permuted vector into the array.
+ *
+ * Example:
+ * Vec8d a(10,11,12,13,14,15,16,17);
+ * double b[16] = {0};
+ * scatter<0,2,14,10,1,-1,5,9>(a,b);
+ * // Now, b = {10,14,11,0,0,16,0,0,0,17,13,0,0,0,12,0}
+ *
+ *****************************************************************************/
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline void scatter(Vec8f const& data, float* array)
+{
+  const int index[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
+  for(int i = 0; i < 8; i++)
+    {
+      if(index[i] >= 0)
+        array[index[i]] = data[i];
+    }
+}
+
+template <int i0, int i1, int i2, int i3>
+static inline void scatter(Vec4d const& data, double* array)
+{
+  const int index[4] = {i0, i1, i2, i3};
+  for(int i = 0; i < 4; i++)
+    {
+      if(index[i] >= 0)
+        array[index[i]] = data[i];
+    }
+}
+
+static inline void scatter(Vec8i const& index, uint32_t limit, Vec8f const& data, float* array)
+{
+  for(int i = 0; i < 8; i++)
+    {
+      if(uint32_t(index[i]) < limit)
+        array[index[i]] = data[i];
+    }
+}
+
+static inline void scatter(Vec4q const& index, uint32_t limit, Vec4d const& data, double* array)
+{
+  for(int i = 0; i < 4; i++)
+    {
+      if(uint64_t(index[i]) < uint64_t(limit))
+        array[index[i]] = data[i];
+    }
+}
+
+static inline void scatter(Vec4i const& index, uint32_t limit, Vec4d const& data, double* array)
+{
+  for(int i = 0; i < 4; i++)
+    {
+      if(uint32_t(index[i]) < limit)
+        array[index[i]] = data[i];
+    }
+}
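+
+// Illustrative example for the index-vector scatter form (indexes >= limit are skipped):
+//   double out[8] = {0};
+//   Vec4d  d(1.5, 2.5, 3.5, 4.5);
+//   Vec4q  idx(6, 2, 9, 0);
+//   scatter(idx, 8, d, out);   // out is {4.5, 0, 2.5, 0, 0, 0, 1.5, 0}; index 9 is out of range and ignored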
+
+/*****************************************************************************
+ *
+ *          Horizontal scan functions
+ *
+ *****************************************************************************/
+
+// Get index to the first element that is true. Return -1 if all are false
+
+static inline int horizontal_find_first(Vec8fb const& x) { return horizontal_find_first(Vec8ib(x)); }
+
+static inline int horizontal_find_first(Vec4db const& x) { return horizontal_find_first(Vec4qb(x)); }
+
+// Count the number of elements that are true
+static inline uint32_t horizontal_count(Vec8fb const& x) { return horizontal_count(Vec8ib(x)); }
+
+static inline uint32_t horizontal_count(Vec4db const& x) { return horizontal_count(Vec4qb(x)); }
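+
+// Illustrative example for the horizontal scan functions:
+//   Vec4d    a(1.0, 2.0, 3.0, 4.0);
+//   Vec4db   m = a > Vec4d(2.5);                 // m is (false, false, true, true)
+//   int      first = horizontal_find_first(m);  // first is 2
+//   uint32_t cnt   = horizontal_count(m);       // cnt is 2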
+
+/*****************************************************************************
+ *
+ *          Boolean <-> bitfield conversion functions
+ *
+ *****************************************************************************/
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec8fb const& x) { return to_bits(Vec8ib(x)); }
+
+// to_Vec8fb: convert integer bitfield to boolean vector
+static inline Vec8fb to_Vec8fb(uint8_t x) { return Vec8fb(to_Vec8ib(x)); }
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec4db const& x) { return to_bits(Vec4qb(x)); }
+
+// to_Vec4db: convert integer bitfield to boolean vector
+static inline Vec4db to_Vec4db(uint8_t x) { return Vec4db(to_Vec4qb(x)); }
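+
+// Illustrative example (element 0 maps to the least significant bit):
+//   Vec4db  m(true, false, true, true);
+//   uint8_t bits = to_bits(m);               // bits is 0b1101 = 13
+//   Vec4db  m2   = to_Vec4db(0x05);          // m2 is (true, false, true, false)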
+
+#ifdef VCL_NAMESPACE
+}
+#endif
+
+#endif  // VECTORF256_H
diff --git a/src/vectorclass/vectorf512.h b/src/vectorclass/vectorf512.h
new file mode 100644
index 0000000000000000000000000000000000000000..29a4ccfc145e3cd460675c94a8c1019e8291d27a
--- /dev/null
+++ b/src/vectorclass/vectorf512.h
@@ -0,0 +1,2513 @@
+/****************************  vectorf512.h   *******************************
+ * Author:        Agner Fog
+ * Date created:  2014-07-23
+ * Last modified: 2017-02-19
+ * Version:       1.27
+ * Project:       vector classes
+ * Description:
+ * Header file defining floating point vector classes as interface to intrinsic
+ * functions in x86 microprocessors with AVX512 and later instruction sets.
+ *
+ * Instructions:
+ * Use Gnu, Intel or Microsoft C++ compiler. Compile for the desired
+ * instruction set, which must be at least AVX512F.
+ *
+ * The following vector classes are defined here:
+ * Vec16f    Vector of  16  single precision floating point numbers
+ * Vec16fb   Vector of  16  Booleans for use with Vec16f
+ * Vec8d     Vector of   8  double precision floating point numbers
+ * Vec8db    Vector of   8  Booleans for use with Vec8d
+ *
+ * Each vector object is represented internally in the CPU as a 512-bit register.
+ * This header file defines operators and functions for these vectors.
+ *
+ * For detailed instructions, see VectorClass.pdf
+ *
+ * (c) Copyright 2014-2017 GNU General Public License http://www.gnu.org/licenses
+ *****************************************************************************/
+
+// check combination of header files
+#if defined(VECTORF512_H)
+#if VECTORF512_H != 2
+#error Two different versions of vectorf512.h included
+#endif
+#else
+#define VECTORF512_H 2
+
+#include "vectori512.h"
+
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE
+{
+#endif
+
+// Define missing intrinsic functions
+#if defined(GCC_VERSION) && GCC_VERSION < 41102 && !defined(__INTEL_COMPILER) && !defined(__clang__)
+
+static inline __m512 _mm512_castpd_ps(__m512d x)
+{
+  union
+  {
+    __m512d a;
+    __m512 b;
+  } u;
+  u.a = x;
+  return u.b;
+}
+
+static inline __m512d _mm512_castps_pd(__m512 x)
+{
+  union
+  {
+    __m512 a;
+    __m512d b;
+  } u;
+  u.a = x;
+  return u.b;
+}
+
+static inline __m512i _mm512_castps_si512(__m512 x)
+{
+  union
+  {
+    __m512 a;
+    __m512i b;
+  } u;
+  u.a = x;
+  return u.b;
+}
+
+static inline __m512 _mm512_castsi512_ps(__m512i x)
+{
+  union
+  {
+    __m512i a;
+    __m512 b;
+  } u;
+  u.a = x;
+  return u.b;
+}
+
+static inline __m512i _mm512_castpd_si512(__m512d x)
+{
+  union
+  {
+    __m512d a;
+    __m512i b;
+  } u;
+  u.a = x;
+  return u.b;
+}
+
+static inline __m512d _mm512_castsi512_pd(__m512i x)
+{
+  union
+  {
+    __m512i a;
+    __m512d b;
+  } u;
+  u.a = x;
+  return u.b;
+}
+
+static inline __m512 _mm512_castps256_ps512(__m256 x)
+{
+  union
+  {
+    __m256 a;
+    __m512 b;
+  } u;
+  u.a = x;
+  return u.b;
+}
+
+static inline __m256 _mm512_castps512_ps256(__m512 x)
+{
+  union
+  {
+    __m512 a;
+    __m256 b;
+  } u;
+  u.a = x;
+  return u.b;
+}
+
+static inline __m512d _mm512_castpd256_pd512(__m256d x)
+{
+  union
+  {
+    __m256d a;
+    __m512d b;
+  } u;
+  u.a = x;
+  return u.b;
+}
+
+static inline __m256d _mm512_castpd512_pd256(__m512d x)
+{
+  union
+  {
+    __m512d a;
+    __m256d b;
+  } u;
+  u.a = x;
+  return u.b;
+}
+
+#endif
+
+/*****************************************************************************
+ *
+ *          Vec16fb: Vector of 16 Booleans for use with Vec16f
+ *
+ *****************************************************************************/
+class Vec16fb : public Vec16b
+{
+ public:
+  // Default constructor:
+  Vec16fb() {}
+  Vec16fb(Vec16b x) { m16 = x; }
+  // Constructor to build from all elements:
+  Vec16fb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7, bool x8, bool x9, bool x10, bool x11, bool x12,
+          bool x13, bool x14, bool x15)
+      : Vec16b(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15)
+  {
+  }
+  // Constructor to convert from type __mmask16 used in intrinsics:
+  Vec16fb(__mmask16 x) { m16 = x; }
+  // Constructor to broadcast single value:
+  Vec16fb(bool b) : Vec16b(b) {}
+
+ private:  // Prevent constructing from int, etc.
+  Vec16fb(int b);
+
+ public:
+  // Constructor to make from two halves
+  Vec16fb(Vec8fb const &x0, Vec8fb const &x1) { m16 = Vec16b(Vec8ib(x0), Vec8ib(x1)); }
+  // Assignment operator to convert from type __mmask16 used in intrinsics:
+  Vec16fb &operator=(__mmask16 x)
+  {
+    m16 = x;
+    return *this;
+  }
+  // Assignment operator to broadcast scalar value:
+  Vec16fb &operator=(bool b)
+  {
+    m16 = Vec16b(b);
+    return *this;
+  }
+
+ private:  // Prevent assigning int because of ambiguity
+  Vec16fb &operator=(int x);
+
+ public:
+};
+
+// Define operators for Vec16fb
+
+// vector operator & : bitwise and
+static inline Vec16fb operator&(Vec16fb a, Vec16fb b) { return Vec16b(a) & Vec16b(b); }
+static inline Vec16fb operator&&(Vec16fb a, Vec16fb b) { return a & b; }
+
+// vector operator | : bitwise or
+static inline Vec16fb operator|(Vec16fb a, Vec16fb b) { return Vec16b(a) | Vec16b(b); }
+static inline Vec16fb operator||(Vec16fb a, Vec16fb b) { return a | b; }
+
+// vector operator ^ : bitwise xor
+static inline Vec16fb operator^(Vec16fb a, Vec16fb b) { return Vec16b(a) ^ Vec16b(b); }
+
+// vector operator ~ : bitwise not
+static inline Vec16fb operator~(Vec16fb a) { return ~Vec16b(a); }
+
+// vector operator ! : element not
+static inline Vec16fb operator!(Vec16fb a) { return ~a; }
+
+// vector operator &= : bitwise and
+static inline Vec16fb &operator&=(Vec16fb &a, Vec16fb b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator |= : bitwise or
+static inline Vec16fb &operator|=(Vec16fb &a, Vec16fb b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^= : bitwise xor
+static inline Vec16fb &operator^=(Vec16fb &a, Vec16fb b)
+{
+  a = a ^ b;
+  return a;
+}
+
+/*****************************************************************************
+ *
+ *          Vec8db: Vector of 8 Booleans for use with Vec8d
+ *
+ *****************************************************************************/
+
+class Vec8db : public Vec8b
+{
+ public:
+  // Default constructor:
+  Vec8db() {}
+  Vec8db(Vec16b x) { m16 = x; }
+  // Constructor to build from all elements:
+  Vec8db(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7) : Vec8b(x0, x1, x2, x3, x4, x5, x6, x7) {}
+  // Constructor to convert from type __mmask8 used in intrinsics:
+  Vec8db(__mmask8 x) { m16 = x; }
+  // Constructor to convert from type __mmask16 used in intrinsics:
+  Vec8db(__mmask16 x) { m16 = x; }
+  // Constructor to build from two halves
+  Vec8db(Vec4db const &x0, Vec4db const &x1) { m16 = Vec8qb(Vec4qb(x0), Vec4qb(x1)); }
+  // Assignment operator to convert from type __mmask8 used in intrinsics:
+  Vec8db &operator=(__mmask8 x)
+  {
+    m16 = (__mmask16)x;
+    return *this;
+  }
+  // Assignment operator to convert from type __mmask16 used in intrinsics:
+  Vec8db &operator=(__mmask16 x)
+  {
+    m16 = x;
+    return *this;
+  }
+  // Constructor to broadcast single value:
+  Vec8db(bool b) : Vec8b(b) {}
+  // Assignment operator to broadcast scalar:
+  Vec8db &operator=(bool b)
+  {
+    m16 = Vec8b(b);
+    return *this;
+  }
+
+ private:  // Prevent constructing from int, etc.
+  Vec8db(int b);
+  Vec8db &operator=(int x);
+
+ public:
+  static int size() { return 8; }
+};
+
+// Define operators for Vec8db
+
+// vector operator & : bitwise and
+static inline Vec8db operator&(Vec8db a, Vec8db b) { return Vec16b(a) & Vec16b(b); }
+static inline Vec8db operator&&(Vec8db a, Vec8db b) { return a & b; }
+
+// vector operator | : bitwise or
+static inline Vec8db operator|(Vec8db a, Vec8db b) { return Vec16b(a) | Vec16b(b); }
+static inline Vec8db operator||(Vec8db a, Vec8db b) { return a | b; }
+
+// vector operator ^ : bitwise xor
+static inline Vec8db operator^(Vec8db a, Vec8db b) { return Vec16b(a) ^ Vec16b(b); }
+
+// vector operator ~ : bitwise not
+static inline Vec8db operator~(Vec8db a) { return ~Vec16b(a); }
+
+// vector operator ! : element not
+static inline Vec8db operator!(Vec8db a) { return ~a; }
+
+// vector operator &= : bitwise and
+static inline Vec8db &operator&=(Vec8db &a, Vec8db b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator |= : bitwise or
+static inline Vec8db &operator|=(Vec8db &a, Vec8db b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^= : bitwise xor
+static inline Vec8db &operator^=(Vec8db &a, Vec8db b)
+{
+  a = a ^ b;
+  return a;
+}
+
+/*****************************************************************************
+ *
+ *          Vec16f: Vector of 16 single precision floating point values
+ *
+ *****************************************************************************/
+
+class Vec16f
+{
+ protected:
+  __m512 zmm;  // Float vector
+ public:
+  // Default constructor:
+  Vec16f() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec16f(float f) { zmm = _mm512_set1_ps(f); }
+  // Constructor to build from all elements:
+  Vec16f(float f0, float f1, float f2, float f3, float f4, float f5, float f6, float f7, float f8, float f9, float f10, float f11,
+         float f12, float f13, float f14, float f15)
+  {
+    zmm = _mm512_setr_ps(f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15);
+  }
+  // Constructor to build from two Vec8f:
+  Vec16f(Vec8f const &a0, Vec8f const &a1)
+  {
+    zmm = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(_mm512_castps256_ps512(a0)), _mm256_castps_pd(a1), 1));
+  }
+  // Constructor to convert from type __m512 used in intrinsics:
+  Vec16f(__m512 const &x) { zmm = x; }
+  // Assignment operator to convert from type __m512 used in intrinsics:
+  Vec16f &operator=(__m512 const &x)
+  {
+    zmm = x;
+    return *this;
+  }
+  // Type cast operator to convert to __m512 used in intrinsics
+  operator __m512() const { return zmm; }
+  // Member function to load from array (unaligned)
+  Vec16f &load(float const *p)
+  {
+    zmm = _mm512_loadu_ps(p);
+    return *this;
+  }
+  // Member function to load from array, aligned by 64
+  // You may use load_a instead of load if you are certain that p points to an address
+  // divisible by 64.
+  Vec16f &load_a(float const *p)
+  {
+    zmm = _mm512_load_ps(p);
+    return *this;
+  }
+  // Member function to store into array (unaligned)
+  void store(float *p) const { _mm512_storeu_ps(p, zmm); }
+  // Member function to store into array, aligned by 64
+  // You may use store_a instead of store if you are certain that p points to an address
+  // divisible by 64.
+  void store_a(float *p) const { _mm512_store_ps(p, zmm); }
+  // Partial load. Load n elements and set the rest to 0
+  Vec16f &load_partial(int n, float const *p)
+  {
+    zmm = _mm512_maskz_loadu_ps(__mmask16((1 << n) - 1), p);
+    return *this;
+  }
+  // Partial store. Store n elements
+  void store_partial(int n, float *p) const { _mm512_mask_storeu_ps(p, __mmask16((1 << n) - 1), zmm); }
+  // cut off vector to n elements. The last 16-n elements are set to zero
+  Vec16f &cutoff(int n)
+  {
+    zmm = _mm512_maskz_mov_ps(__mmask16((1 << n) - 1), zmm);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  Vec16f const &insert(uint32_t index, float value)
+  {
+    // zmm = _mm512_mask_set1_ps(zmm, __mmask16(1 << index), value);  // this intrinsic function does not exist (yet?)
+    zmm = _mm512_castsi512_ps(
+        _mm512_mask_set1_epi32(_mm512_castps_si512(zmm), __mmask16(1 << index), *(int32_t *)&value));  // ignore warning
+    return *this;
+  }
+  // Member function extract a single element from vector
+  float extract(uint32_t index) const
+  {
+    float a[16];
+    store(a);
+    return a[index & 15];
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  float operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec8f:
+  Vec8f get_low() const { return _mm512_castps512_ps256(zmm); }
+  Vec8f get_high() const { return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(zmm), 1)); }
+  static int size() { return 16; }
+};
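+
+// Illustrative usage example for Vec16f (assumes the code is compiled for AVX512F):
+//   float buf[16];
+//   Vec16f v(1.0f);            // broadcast 1.0f into all 16 elements
+//   v.insert(5, 2.5f);         // element 5 becomes 2.5f
+//   v.store(buf);              // unaligned store of all 16 elements
+//   Vec8f lo = v.get_low();    // elements 0 - 7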
+
+/*****************************************************************************
+ *
+ *          Operators for Vec16f
+ *
+ *****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec16f operator+(Vec16f const &a, Vec16f const &b) { return _mm512_add_ps(a, b); }
+
+// vector operator + : add vector and scalar
+static inline Vec16f operator+(Vec16f const &a, float b) { return a + Vec16f(b); }
+static inline Vec16f operator+(float a, Vec16f const &b) { return Vec16f(a) + b; }
+
+// vector operator += : add
+static inline Vec16f &operator+=(Vec16f &a, Vec16f const &b)
+{
+  a = a + b;
+  return a;
+}
+
+// postfix operator ++
+static inline Vec16f operator++(Vec16f &a, int)
+{
+  Vec16f a0 = a;
+  a         = a + 1.0f;
+  return a0;
+}
+
+// prefix operator ++
+static inline Vec16f &operator++(Vec16f &a)
+{
+  a = a + 1.0f;
+  return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec16f operator-(Vec16f const &a, Vec16f const &b) { return _mm512_sub_ps(a, b); }
+
+// vector operator - : subtract vector and scalar
+static inline Vec16f operator-(Vec16f const &a, float b) { return a - Vec16f(b); }
+static inline Vec16f operator-(float a, Vec16f const &b) { return Vec16f(a) - b; }
+
+// vector operator - : unary minus
+// Change sign bit, even for 0, INF and NAN
+static inline Vec16f operator-(Vec16f const &a) { return _mm512_castsi512_ps(Vec16i(_mm512_castps_si512(a)) ^ 0x80000000); }
+
+// vector operator -= : subtract
+static inline Vec16f &operator-=(Vec16f &a, Vec16f const &b)
+{
+  a = a - b;
+  return a;
+}
+
+// postfix operator --
+static inline Vec16f operator--(Vec16f &a, int)
+{
+  Vec16f a0 = a;
+  a         = a - 1.0f;
+  return a0;
+}
+
+// prefix operator --
+static inline Vec16f &operator--(Vec16f &a)
+{
+  a = a - 1.0f;
+  return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec16f operator*(Vec16f const &a, Vec16f const &b) { return _mm512_mul_ps(a, b); }
+
+// vector operator * : multiply vector and scalar
+static inline Vec16f operator*(Vec16f const &a, float b) { return a * Vec16f(b); }
+static inline Vec16f operator*(float a, Vec16f const &b) { return Vec16f(a) * b; }
+
+// vector operator *= : multiply
+static inline Vec16f &operator*=(Vec16f &a, Vec16f const &b)
+{
+  a = a * b;
+  return a;
+}
+
+// vector operator / : divide element by element
+static inline Vec16f operator/(Vec16f const &a, Vec16f const &b) { return _mm512_div_ps(a, b); }
+
+// vector operator / : divide vector and scalar
+static inline Vec16f operator/(Vec16f const &a, float b) { return a / Vec16f(b); }
+static inline Vec16f operator/(float a, Vec16f const &b) { return Vec16f(a) / b; }
+
+// vector operator /= : divide
+static inline Vec16f &operator/=(Vec16f &a, Vec16f const &b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec16fb operator==(Vec16f const &a, Vec16f const &b)
+{
+  //    return _mm512_cmpeq_ps_mask(a, b);
+  return _mm512_cmp_ps_mask(a, b, 0);
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec16fb operator!=(Vec16f const &a, Vec16f const &b)
+{
+  //    return _mm512_cmpneq_ps_mask(a, b);
+  return _mm512_cmp_ps_mask(a, b, 4);
+}
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec16fb operator<(Vec16f const &a, Vec16f const &b)
+{
+  //    return _mm512_cmplt_ps_mask(a, b);
+  return _mm512_cmp_ps_mask(a, b, 1);
+}
+
+// vector operator <= : returns true for elements for which a <= b
+static inline Vec16fb operator<=(Vec16f const &a, Vec16f const &b)
+{
+  //    return _mm512_cmple_ps_mask(a, b);
+  return _mm512_cmp_ps_mask(a, b, 2);
+}
+
+// vector operator > : returns true for elements for which a > b
+static inline Vec16fb operator>(Vec16f const &a, Vec16f const &b) { return b < a; }
+
+// vector operator >= : returns true for elements for which a >= b
+static inline Vec16fb operator>=(Vec16f const &a, Vec16f const &b) { return b <= a; }
+
+// Bitwise logical operators
+
+// vector operator & : bitwise and
+static inline Vec16f operator&(Vec16f const &a, Vec16f const &b)
+{
+  return _mm512_castsi512_ps(Vec16i(_mm512_castps_si512(a)) & Vec16i(_mm512_castps_si512(b)));
+}
+
+// vector operator &= : bitwise and
+static inline Vec16f &operator&=(Vec16f &a, Vec16f const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator & : bitwise and of Vec16f and Vec16fb
+static inline Vec16f operator&(Vec16f const &a, Vec16fb const &b) { return _mm512_maskz_mov_ps(b, a); }
+static inline Vec16f operator&(Vec16fb const &a, Vec16f const &b) { return b & a; }
+
+// vector operator | : bitwise or
+static inline Vec16f operator|(Vec16f const &a, Vec16f const &b)
+{
+  return _mm512_castsi512_ps(Vec16i(_mm512_castps_si512(a)) | Vec16i(_mm512_castps_si512(b)));
+}
+
+// vector operator |= : bitwise or
+static inline Vec16f &operator|=(Vec16f &a, Vec16f const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec16f operator^(Vec16f const &a, Vec16f const &b)
+{
+  return _mm512_castsi512_ps(Vec16i(_mm512_castps_si512(a)) ^ Vec16i(_mm512_castps_si512(b)));
+}
+
+// vector operator ^= : bitwise xor
+static inline Vec16f &operator^=(Vec16f &a, Vec16f const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ! : logical not. Returns Boolean vector
+static inline Vec16fb operator!(Vec16f const &a) { return a == Vec16f(0.0f); }
+
+/*****************************************************************************
+ *
+ *          Functions for Vec16f
+ *
+ *****************************************************************************/
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either true or false (s is a per-element Boolean mask).
+static inline Vec16f select(Vec16fb const &s, Vec16f const &a, Vec16f const &b) { return _mm512_mask_mov_ps(b, s, a); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec16f if_add(Vec16fb const &f, Vec16f const &a, Vec16f const &b) { return _mm512_mask_add_ps(a, f, a, b); }
+
+// Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i]
+static inline Vec16f if_mul(Vec16fb const &f, Vec16f const &a, Vec16f const &b) { return _mm512_mask_mul_ps(a, f, a, b); }
+
+// Horizontal add: Calculates the sum of all vector elements.
+static inline float horizontal_add(Vec16f const &a)
+{
+#if defined(__INTEL_COMPILER)
+  return _mm512_reduce_add_ps(a);
+#else
+  return horizontal_add(a.get_low() + a.get_high());
+#endif
+}
+
+// function max: a > b ? a : b
+static inline Vec16f max(Vec16f const &a, Vec16f const &b) { return _mm512_max_ps(a, b); }
+
+// function min: a < b ? a : b
+static inline Vec16f min(Vec16f const &a, Vec16f const &b) { return _mm512_min_ps(a, b); }
+
+// function abs: absolute value
+// Removes sign bit, even for -0.0f, -INF and -NAN
+static inline Vec16f abs(Vec16f const &a)
+{
+  union
+  {
+    int32_t i;
+    float f;
+  } u = {0x7FFFFFFF};
+  return a & Vec16f(u.f);
+}
+
+// function sqrt: square root
+static inline Vec16f sqrt(Vec16f const &a) { return _mm512_sqrt_ps(a); }
+
+// function square: a * a
+static inline Vec16f square(Vec16f const &a) { return a * a; }
+
+// pow(Vec16f, int):
+template <typename TT>
+static Vec16f pow(Vec16f const &a, TT const &n);
+
+// Raise floating point numbers to integer power n
+template <>
+inline Vec16f pow<int>(Vec16f const &x0, int const &n)
+{
+  return pow_template_i<Vec16f>(x0, n);
+}
+
+// allow conversion from unsigned int
+template <>
+inline Vec16f pow<uint32_t>(Vec16f const &x0, uint32_t const &n)
+{
+  return pow_template_i<Vec16f>(x0, (int)n);
+}
+
+// Raise floating point numbers to integer power n, where n is a compile-time constant
+template <int n>
+static inline Vec16f pow_n(Vec16f const &a)
+{
+  if(n < 0)
+    return Vec16f(1.0f) / pow_n<-n>(a);
+  if(n == 0)
+    return Vec16f(1.0f);
+  if(n >= 256)
+    return pow(a, n);
+  Vec16f x = a;                          // a^(2^i)
+  Vec16f y;                              // accumulator
+  const int lowest = n - (n & (n - 1));  // lowest set bit in n
+  if(n & 1)
+    y = x;
+  if(n < 2)
+    return y;
+  x = x * x;  // x^2
+  if(n & 2)
+    {
+      if(lowest == 2)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 4)
+    return y;
+  x = x * x;  // x^4
+  if(n & 4)
+    {
+      if(lowest == 4)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 8)
+    return y;
+  x = x * x;  // x^8
+  if(n & 8)
+    {
+      if(lowest == 8)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 16)
+    return y;
+  x = x * x;  // x^16
+  if(n & 16)
+    {
+      if(lowest == 16)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 32)
+    return y;
+  x = x * x;  // x^32
+  if(n & 32)
+    {
+      if(lowest == 32)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 64)
+    return y;
+  x = x * x;  // x^64
+  if(n & 64)
+    {
+      if(lowest == 64)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 128)
+    return y;
+  x = x * x;  // x^128
+  if(n & 128)
+    {
+      if(lowest == 128)
+        y = x;
+      else
+        y *= x;
+    }
+  return y;
+}
+
+template <int n>
+static inline Vec16f pow(Vec16f const &a, Const_int_t<n>)
+{
+  return pow_n<n>(a);
+}
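+
+// Note on pow_n: it uses exponentiation by squaring. For example n = 13 = 0b1101 is computed as
+// a^1 * a^4 * a^8, which needs only 5 multiplications instead of 12.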
+
+// function round: round to nearest integer (even). (result as float vector)
+static inline Vec16f round(Vec16f const &a) { return _mm512_roundscale_ps(a, 0 + 8); }
+
+// function truncate: round towards zero. (result as float vector)
+static inline Vec16f truncate(Vec16f const &a) { return _mm512_roundscale_ps(a, 3 + 8); }
+
+// function floor: round towards minus infinity. (result as float vector)
+static inline Vec16f floor(Vec16f const &a) { return _mm512_roundscale_ps(a, 1 + 8); }
+
+// function ceil: round towards plus infinity. (result as float vector)
+static inline Vec16f ceil(Vec16f const &a) { return _mm512_roundscale_ps(a, 2 + 8); }
+
+// function round_to_int: round to nearest integer (even). (result as integer vector)
+static inline Vec16i round_to_int(Vec16f const &a) { return _mm512_cvt_roundps_epi32(a, 0 + 8 /*_MM_FROUND_NO_EXC*/); }
+
+// function truncate_to_int: round towards zero. (result as integer vector)
+static inline Vec16i truncate_to_int(Vec16f const &a) { return _mm512_cvtt_roundps_epi32(a, 0 + 8 /*_MM_FROUND_NO_EXC*/); }
+
+// function to_float: convert integer vector to float vector
+static inline Vec16f to_float(Vec16i const &a) { return _mm512_cvtepi32_ps(a); }
+
+// function to_float: convert unsigned integer vector to float vector
+static inline Vec16f to_float(Vec16ui const &a) { return _mm512_cvtepu32_ps(a); }
+
+// Approximate math functions
+
+// approximate reciprocal (faster than 1.f / a;
+// relative accuracy better than 2^-11 without AVX512, 2^-14 with AVX512F, full precision with AVX512ER)
+static inline Vec16f approx_recipr(Vec16f const &a)
+{
+#ifdef __AVX512ER__  // AVX512ER instruction set includes fast reciprocal with better precision
+  return _mm512_rcp28_round_ps(a, _MM_FROUND_NO_EXC);
+#else
+  return _mm512_rcp14_ps(a);
+#endif
+}
+
+// approximate reciprocal square root (faster than 1.f / sqrt(a);
+// relative accuracy better than 2^-11 without AVX512, 2^-14 with AVX512F, full precision with AVX512ER)
+static inline Vec16f approx_rsqrt(Vec16f const &a)
+{
+#ifdef __AVX512ER__  // AVX512ER instruction set includes fast reciprocal squareroot with better precision
+  return _mm512_rsqrt28_round_ps(a, _MM_FROUND_NO_EXC);
+#else
+  return _mm512_rsqrt14_ps(a);
+#endif
+}
+
+// Fused multiply and add functions
+
+// Multiply and add
+static inline Vec16f mul_add(Vec16f const &a, Vec16f const &b, Vec16f const &c) { return _mm512_fmadd_ps(a, b, c); }
+
+// Multiply and subtract
+static inline Vec16f mul_sub(Vec16f const &a, Vec16f const &b, Vec16f const &c) { return _mm512_fmsub_ps(a, b, c); }
+
+// Multiply and inverse subtract
+static inline Vec16f nmul_add(Vec16f const &a, Vec16f const &b, Vec16f const &c) { return _mm512_fnmadd_ps(a, b, c); }
+
+// Multiply and subtract with extra precision on the intermediate calculations (same as mul_sub when FMA is available)
+static inline Vec16f mul_sub_x(Vec16f const &a, Vec16f const &b, Vec16f const &c) { return _mm512_fmsub_ps(a, b, c); }
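+
+// Illustrative usage sketch (not part of the original vector class header): one
+// Newton-Raphson refinement step for approx_rsqrt, y' = y * (1.5 - 0.5*x*y*y),
+// which roughly doubles the number of accurate bits of the initial estimate.
+static inline Vec16f rsqrt_refined_example(Vec16f const &x)
+{
+  Vec16f y = approx_rsqrt(x);                                  // initial estimate
+  Vec16f t = nmul_add(x * y, y * Vec16f(0.5f), Vec16f(1.5f));  // 1.5 - 0.5*x*y*y
+  return y * t;                                                // refined estimate
+}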
+
+// Math functions using fast bit manipulation
+
+// Extract the exponent as an integer
+// exponent(a) = floor(log2(abs(a)));
+// exponent(1.0f) = 0, exponent(0.0f) = -127, exponent(INF) = +128, exponent(NAN) = +128
+static inline Vec16i exponent(Vec16f const &a)
+{
+  // return round_to_int(Vec16i(_mm512_getexp_ps(a)));
+  Vec16ui t1 = _mm512_castps_si512(a);  // reinterpret as 32-bit integers
+  Vec16ui t2 = t1 << 1;                 // shift out sign bit
+  Vec16ui t3 = t2 >> 24;                // shift down logical to position 0
+  Vec16i t4  = Vec16i(t3) - 0x7F;       // subtract bias from exponent
+  return t4;
+}
+
+// Extract the fraction part of a floating point number
+// a = 2^exponent(a) * fraction(a), except for a = 0
+// fraction(1.0f) = 1.0f, fraction(5.0f) = 1.25f
+static inline Vec16f fraction(Vec16f const &a)
+{
+#if 1
+  return _mm512_getmant_ps(a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero);
+#else
+  Vec16ui t1 = _mm512_castps_si512(a);          // reinterpret as 32-bit integers
+  Vec16ui t2 = (t1 & 0x007FFFFF) | 0x3F800000;  // set exponent to 0 + bias
+  return _mm512_castsi512_ps(t2);
+#endif
+}
+
+// Fast calculation of pow(2,n) with n integer
+// n  =    0 gives 1.0f
+// n >=  128 gives +INF
+// n <= -127 gives 0.0f
+// This function will never produce denormals, and never raise exceptions
+static inline Vec16f exp2(Vec16i const &n)
+{
+  Vec16i t1 = max(n, -0x7F);  // limit to allowed range
+  Vec16i t2 = min(t1, 0x80);
+  Vec16i t3 = t2 + 0x7F;           // add bias
+  Vec16i t4 = t3 << 23;            // put exponent into position 23
+  return _mm512_castsi512_ps(t4);  // reinterpret as float
+}
+// static Vec16f exp2(Vec16f const & x); // defined in vectormath_exp.h
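+
+// Illustrative usage sketch (not part of the original vector class header): for
+// positive, normal inputs the three functions above recombine to the original
+// value, a == fraction(a) * exp2(exponent(a)).
+static inline Vec16f recombine_example(Vec16f const &a)
+{
+  Vec16f scale = exp2(exponent(a));  // 2^exponent(a) as a float vector
+  return fraction(a) * scale;        // mantissa in [1,2) times its power of two
+}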
+
+// Categorization functions
+
+// Function sign_bit: gives true for elements that have the sign bit set
+// even for -0.0f, -INF and -NAN
+// Note that sign_bit(Vec16f(-0.0f)) gives true, while Vec16f(-0.0f) < Vec16f(0.0f) gives false
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec16fb sign_bit(Vec16f const &a)
+{
+  Vec16i t1 = _mm512_castps_si512(a);  // reinterpret as 32-bit integer
+  return Vec16fb(t1 < 0);
+}
+
+// Function sign_combine: changes the sign of a when b has the sign bit set
+// same as select(sign_bit(b), -a, a)
+static inline Vec16f sign_combine(Vec16f const &a, Vec16f const &b)
+{
+  union
+  {
+    uint32_t i;
+    float f;
+  } signmask = {0x80000000};
+  return a ^ (b & Vec16f(signmask.f));
+}
+
+// Function is_finite: gives true for elements that are normal, denormal or zero,
+// false for INF and NAN
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec16fb is_finite(Vec16f const &a)
+{
+#ifdef __AVX512DQ__
+  __mmask16 f = _mm512_fpclass_ps_mask(a, 0x99);
+  return _mm512_knot(f);
+#else
+  Vec16i t1  = _mm512_castps_si512(a);                 // reinterpret as 32-bit integer
+  Vec16i t2  = t1 << 1;                                // shift out sign bit
+  Vec16ib t3 = Vec16i(t2 & 0xFF000000) != 0xFF000000;  // exponent field is not all 1s
+  return Vec16fb(t3);
+#endif
+}
+
+// Function is_inf: gives true for elements that are +INF or -INF
+// false for finite numbers and NAN
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec16fb is_inf(Vec16f const &a)
+{
+  Vec16i t1 = _mm512_castps_si512(a);  // reinterpret as 32-bit integer
+  Vec16i t2 = t1 << 1;                 // shift out sign bit
+  return Vec16fb(t2 == 0xFF000000);    // exponent is all 1s, fraction is 0
+}
+
+// Function is_nan: gives true for elements that are +NAN or -NAN
+// false for finite numbers and +/-INF
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec16fb is_nan(Vec16f const &a)
+{
+  Vec16i t1 = _mm512_castps_si512(a);       // reinterpret as 32-bit integer
+  Vec16i t2 = t1 << 1;                      // shift out sign bit
+  Vec16i t3 = 0xFF000000;                   // exponent mask
+  Vec16i t4 = t2 & t3;                      // exponent
+  Vec16i t5 = _mm512_andnot_si512(t3, t2);  // fraction
+  return Vec16fb(t4 == t3 && t5 != 0);      // exponent = all 1s and fraction != 0
+}
+
+// Function is_subnormal: gives true for elements that are denormal (subnormal)
+// false for finite numbers, zero, NAN and INF
+static inline Vec16fb is_subnormal(Vec16f const &a)
+{
+  Vec16i t1 = _mm512_castps_si512(a);       // reinterpret as 32-bit integer
+  Vec16i t2 = t1 << 1;                      // shift out sign bit
+  Vec16i t3 = 0xFF000000;                   // exponent mask
+  Vec16i t4 = t2 & t3;                      // exponent
+  Vec16i t5 = _mm512_andnot_si512(t3, t2);  // fraction
+  return Vec16fb(t4 == 0 && t5 != 0);       // exponent = 0 and fraction != 0
+}
+
+// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
+// false for finite numbers, NAN and INF
+static inline Vec16fb is_zero_or_subnormal(Vec16f const &a)
+{
+  Vec16i t = _mm512_castps_si512(a);  // reinterpret as 32-bit integer
+  t &= 0x7F800000;                    // isolate exponent
+  return Vec16fb(t == 0);             // exponent = 0
+}
+
+// Function infinite16f: returns a vector where all elements are +INF
+static inline Vec16f infinite16f()
+{
+  union
+  {
+    int32_t i;
+    float f;
+  } inf = {0x7F800000};
+  return Vec16f(inf.f);
+}
+
+// Function nan16f: returns a vector where all elements are +NAN (quiet)
+static inline Vec16f nan16f(int n = 0x10)
+{
+  union
+  {
+    int32_t i;
+    float f;
+  } nanf = {0x7FC00000 + n};
+  return Vec16f(nanf.f);
+}
+
+// change signs on vectors Vec16f
+// Each index i0 - i15 is 1 for changing sign on the corresponding element, 0 for no change
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16f change_sign(Vec16f const &a)
+{
+  const __mmask16 m = __mmask16((i0 & 1) | (i1 & 1) << 1 | (i2 & 1) << 2 | (i3 & 1) << 3 | (i4 & 1) << 4 | (i5 & 1) << 5 |
+                                (i6 & 1) << 6 | (i7 & 1) << 7 | (i8 & 1) << 8 | (i9 & 1) << 9 | (i10 & 1) << 10 | (i11 & 1) << 11 |
+                                (i12 & 1) << 12 | (i13 & 1) << 13 | (i14 & 1) << 14 | (i15 & 1) << 15);
+  if((uint16_t)m == 0)
+    return a;
+  __m512 s = _mm512_castsi512_ps(_mm512_maskz_set1_epi32(m, 0x80000000));
+  return a ^ s;
+}
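+
+// Illustrative usage sketch (not part of the original vector class header):
+// negate the odd-indexed elements of a Vec16f with a single XOR.
+static inline Vec16f negate_odd_example(Vec16f const &a)
+{
+  return change_sign<0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1>(a);
+}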
+
+/*****************************************************************************
+ *
+ *          Vec8d: Vector of 8 double precision floating point values
+ *
+ *****************************************************************************/
+
+class Vec8d
+{
+ protected:
+  __m512d zmm;  // double vector
+ public:
+  // Default constructor:
+  Vec8d() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec8d(double d) { zmm = _mm512_set1_pd(d); }
+  // Constructor to build from all elements:
+  Vec8d(double d0, double d1, double d2, double d3, double d4, double d5, double d6, double d7)
+  {
+    zmm = _mm512_setr_pd(d0, d1, d2, d3, d4, d5, d6, d7);
+  }
+  // Constructor to build from two Vec4d:
+  Vec8d(Vec4d const &a0, Vec4d const &a1) { zmm = _mm512_insertf64x4(_mm512_castpd256_pd512(a0), a1, 1); }
+  // Constructor to convert from type __m512d used in intrinsics:
+  Vec8d(__m512d const &x) { zmm = x; }
+  // Assignment operator to convert from type __m512d used in intrinsics:
+  Vec8d &operator=(__m512d const &x)
+  {
+    zmm = x;
+    return *this;
+  }
+  // Type cast operator to convert to __m512d used in intrinsics
+  operator __m512d() const { return zmm; }
+  // Member function to load from array (unaligned)
+  Vec8d &load(double const *p)
+  {
+    zmm = _mm512_loadu_pd(p);
+    return *this;
+  }
+  // Member function to load from array, aligned by 64
+  // You may use load_a instead of load if you are certain that p points to an address
+  // divisible by 64
+  Vec8d &load_a(double const *p)
+  {
+    zmm = _mm512_load_pd(p);
+    return *this;
+  }
+  // Member function to store into array (unaligned)
+  void store(double *p) const { _mm512_storeu_pd(p, zmm); }
+  // Member function to store into array, aligned by 64
+  // You may use store_a instead of store if you are certain that p points to an address
+  // divisible by 64
+  void store_a(double *p) const { _mm512_store_pd(p, zmm); }
+  // Partial load. Load n elements and set the rest to 0
+  Vec8d &load_partial(int n, double const *p)
+  {
+    zmm = _mm512_maskz_loadu_pd(__mmask16((1 << n) - 1), p);
+    return *this;
+  }
+  // Partial store. Store n elements
+  void store_partial(int n, double *p) const { _mm512_mask_storeu_pd(p, __mmask16((1 << n) - 1), zmm); }
+  // cut off vector to n elements. The last 8-n elements are set to zero
+  Vec8d &cutoff(int n)
+  {
+    zmm = _mm512_maskz_mov_pd(__mmask16((1 << n) - 1), zmm);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec8d const &insert(uint32_t index, double value)
+  {
+    // zmm = _mm512_mask_set1_pd(zmm, __mmask16(1 << index), value);  // this intrinsic function does not exist (yet?)
+    zmm = _mm512_castsi512_pd(
+        _mm512_mask_set1_epi64(_mm512_castpd_si512(zmm), __mmask16(1 << index), *(int64_t *)&value));  // ignore warning
+    return *this;
+  }
+  // Member function extract a single element from vector
+  double extract(uint32_t index) const
+  {
+    double a[8];
+    store(a);
+    return a[index & 7];
+  }
+
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  double operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec4d:
+  Vec4d get_low() const { return _mm512_castpd512_pd256(zmm); }
+  Vec4d get_high() const { return _mm512_extractf64x4_pd(zmm, 1); }
+  static int size() { return 8; }
+};
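+
+// Illustrative usage sketch (not part of the original vector class header):
+// element-wise construction, splitting into Vec4d halves, and a partial store.
+static inline void vec8d_basics_example(double *out3)
+{
+  Vec8d b(0., 1., 2., 3., 4., 5., 6., 7.);  // explicit element values
+  Vec8d c(b.get_low(), b.get_high());       // rebuild from the two Vec4d halves
+  c.store_partial(3, out3);                 // write only the first three elements
+}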
+
+/*****************************************************************************
+ *
+ *          Operators for Vec8d
+ *
+ *****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec8d operator+(Vec8d const &a, Vec8d const &b) { return _mm512_add_pd(a, b); }
+
+// vector operator + : add vector and scalar
+static inline Vec8d operator+(Vec8d const &a, double b) { return a + Vec8d(b); }
+static inline Vec8d operator+(double a, Vec8d const &b) { return Vec8d(a) + b; }
+
+// vector operator += : add
+static inline Vec8d &operator+=(Vec8d &a, Vec8d const &b)
+{
+  a = a + b;
+  return a;
+}
+
+// postfix operator ++
+static inline Vec8d operator++(Vec8d &a, int)
+{
+  Vec8d a0 = a;
+  a        = a + 1.0;
+  return a0;
+}
+
+// prefix operator ++
+static inline Vec8d &operator++(Vec8d &a)
+{
+  a = a + 1.0;
+  return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec8d operator-(Vec8d const &a, Vec8d const &b) { return _mm512_sub_pd(a, b); }
+
+// vector operator - : subtract vector and scalar
+static inline Vec8d operator-(Vec8d const &a, double b) { return a - Vec8d(b); }
+static inline Vec8d operator-(double a, Vec8d const &b) { return Vec8d(a) - b; }
+
+// vector operator - : unary minus
+// Change sign bit, even for 0, INF and NAN
+static inline Vec8d operator-(Vec8d const &a)
+{
+  return _mm512_castsi512_pd(Vec8q(_mm512_castpd_si512(a)) ^ Vec8q(0x8000000000000000));
+}
+
+// vector operator -= : subtract
+static inline Vec8d &operator-=(Vec8d &a, Vec8d const &b)
+{
+  a = a - b;
+  return a;
+}
+
+// postfix operator --
+static inline Vec8d operator--(Vec8d &a, int)
+{
+  Vec8d a0 = a;
+  a        = a - 1.0;
+  return a0;
+}
+
+// prefix operator --
+static inline Vec8d &operator--(Vec8d &a)
+{
+  a = a - 1.0;
+  return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec8d operator*(Vec8d const &a, Vec8d const &b) { return _mm512_mul_pd(a, b); }
+
+// vector operator * : multiply vector and scalar
+static inline Vec8d operator*(Vec8d const &a, double b) { return a * Vec8d(b); }
+static inline Vec8d operator*(double a, Vec8d const &b) { return Vec8d(a) * b; }
+
+// vector operator *= : multiply
+static inline Vec8d &operator*=(Vec8d &a, Vec8d const &b)
+{
+  a = a * b;
+  return a;
+}
+
+// vector operator / : divide element by element
+static inline Vec8d operator/(Vec8d const &a, Vec8d const &b) { return _mm512_div_pd(a, b); }
+
+// vector operator / : divide vector and scalar
+static inline Vec8d operator/(Vec8d const &a, double b) { return a / Vec8d(b); }
+static inline Vec8d operator/(double a, Vec8d const &b) { return Vec8d(a) / b; }
+
+// vector operator /= : divide
+static inline Vec8d &operator/=(Vec8d &a, Vec8d const &b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec8db operator==(Vec8d const &a, Vec8d const &b) { return _mm512_cmp_pd_mask(a, b, 0); }
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec8db operator!=(Vec8d const &a, Vec8d const &b) { return _mm512_cmp_pd_mask(a, b, 4); }
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec8db operator<(Vec8d const &a, Vec8d const &b) { return _mm512_cmp_pd_mask(a, b, 1); }
+
+// vector operator <= : returns true for elements for which a <= b
+static inline Vec8db operator<=(Vec8d const &a, Vec8d const &b) { return _mm512_cmp_pd_mask(a, b, 2); }
+
+// vector operator > : returns true for elements for which a > b
+static inline Vec8db operator>(Vec8d const &a, Vec8d const &b) { return b < a; }
+
+// vector operator >= : returns true for elements for which a >= b
+static inline Vec8db operator>=(Vec8d const &a, Vec8d const &b) { return b <= a; }
+
+// Bitwise logical operators
+
+// vector operator & : bitwise and
+static inline Vec8d operator&(Vec8d const &a, Vec8d const &b)
+{
+  return _mm512_castsi512_pd(Vec8q(_mm512_castpd_si512(a)) & Vec8q(_mm512_castpd_si512(b)));
+}
+
+// vector operator &= : bitwise and
+static inline Vec8d &operator&=(Vec8d &a, Vec8d const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator & : bitwise and of Vec8d and Vec8db
+static inline Vec8d operator&(Vec8d const &a, Vec8db const &b) { return _mm512_maskz_mov_pd(b, a); }
+
+static inline Vec8d operator&(Vec8db const &a, Vec8d const &b) { return b & a; }
+
+// vector operator | : bitwise or
+static inline Vec8d operator|(Vec8d const &a, Vec8d const &b)
+{
+  return _mm512_castsi512_pd(Vec8q(_mm512_castpd_si512(a)) | Vec8q(_mm512_castpd_si512(b)));
+}
+
+// vector operator |= : bitwise or
+static inline Vec8d &operator|=(Vec8d &a, Vec8d const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec8d operator^(Vec8d const &a, Vec8d const &b)
+{
+  return _mm512_castsi512_pd(Vec8q(_mm512_castpd_si512(a)) ^ Vec8q(_mm512_castpd_si512(b)));
+}
+
+// vector operator ^= : bitwise xor
+static inline Vec8d &operator^=(Vec8d &a, Vec8d const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ! : logical not. Returns Boolean vector
+static inline Vec8db operator!(Vec8d const &a) { return a == Vec8d(0.0); }
+
+/*****************************************************************************
+ *
+ *          Functions for Vec8d
+ *
+ *****************************************************************************/
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+static inline Vec8d select(Vec8db const &s, Vec8d const &a, Vec8d const &b) { return _mm512_mask_mov_pd(b, s, a); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec8d if_add(Vec8db const &f, Vec8d const &a, Vec8d const &b) { return _mm512_mask_add_pd(a, f, a, b); }
+
+// Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i]
+static inline Vec8d if_mul(Vec8db const &f, Vec8d const &a, Vec8d const &b) { return _mm512_mask_mul_pd(a, f, a, b); }
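+
+// Illustrative usage sketch (not part of the original vector class header): a
+// branch-free "safe" reciprocal that returns 0 where the input is 0. Note that
+// the division is still performed on all elements; select only picks the result.
+static inline Vec8d safe_recip_example(Vec8d const &a)
+{
+  Vec8db is_zero = (a == Vec8d(0.0));           // per-element mask
+  return select(is_zero, Vec8d(0.0), 1.0 / a);  // blend the two candidate results
+}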
+
+// General arithmetic functions, etc.
+
+// Horizontal add: Calculates the sum of all vector elements.
+static inline double horizontal_add(Vec8d const &a)
+{
+#if defined(__INTEL_COMPILER)
+  return _mm512_reduce_add_pd(a);
+#else
+  return horizontal_add(a.get_low() + a.get_high());
+#endif
+}
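+
+// Illustrative usage sketch (not part of the original vector class header): dot
+// product of two double arrays with Vec8d; assumes len is a multiple of 8.
+static inline double dot_product_example(const double *x, const double *y, int len)
+{
+  Vec8d acc(0.0);
+  for(int i = 0; i + 8 <= len; i += 8)
+    {
+      Vec8d a, b;
+      a.load(x + i);  // unaligned loads of 8 doubles each
+      b.load(y + i);
+      acc += a * b;   // element-wise multiply, accumulate
+    }
+  return horizontal_add(acc);  // sum of the 8 partial sums
+}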
+
+// function max: a > b ? a : b
+static inline Vec8d max(Vec8d const &a, Vec8d const &b) { return _mm512_max_pd(a, b); }
+
+// function min: a < b ? a : b
+static inline Vec8d min(Vec8d const &a, Vec8d const &b) { return _mm512_min_pd(a, b); }
+
+// function abs: absolute value
+// Removes sign bit, even for -0.0, -INF and -NAN
+static inline Vec8d abs(Vec8d const &a) { return _mm512_castsi512_pd(Vec8q(_mm512_castpd_si512(a)) & Vec8q(0x7FFFFFFFFFFFFFFF)); }
+
+// function sqrt: square root
+static inline Vec8d sqrt(Vec8d const &a) { return _mm512_sqrt_pd(a); }
+
+// function square: a * a
+static inline Vec8d square(Vec8d const &a) { return a * a; }
+
+// pow(Vec8d, int):
+template <typename TT>
+static Vec8d pow(Vec8d const &a, TT const &n);
+
+// Raise floating point numbers to integer power n
+template <>
+inline Vec8d pow<int>(Vec8d const &x0, int const &n)
+{
+  return pow_template_i<Vec8d>(x0, n);
+}
+
+// allow conversion from unsigned int
+template <>
+inline Vec8d pow<uint32_t>(Vec8d const &x0, uint32_t const &n)
+{
+  return pow_template_i<Vec8d>(x0, (int)n);
+}
+
+// Raise floating point numbers to integer power n, where n is a compile-time constant
+template <int n>
+static inline Vec8d pow_n(Vec8d const &a)
+{
+  if(n < 0)
+    return Vec8d(1.0) / pow_n<-n>(a);
+  if(n == 0)
+    return Vec8d(1.0);
+  if(n >= 256)
+    return pow(a, n);
+  Vec8d x = a;                           // a^(2^i)
+  Vec8d y;                               // accumulator
+  const int lowest = n - (n & (n - 1));  // lowest set bit in n
+  if(n & 1)
+    y = x;
+  if(n < 2)
+    return y;
+  x = x * x;  // x^2
+  if(n & 2)
+    {
+      if(lowest == 2)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 4)
+    return y;
+  x = x * x;  // x^4
+  if(n & 4)
+    {
+      if(lowest == 4)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 8)
+    return y;
+  x = x * x;  // x^8
+  if(n & 8)
+    {
+      if(lowest == 8)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 16)
+    return y;
+  x = x * x;  // x^16
+  if(n & 16)
+    {
+      if(lowest == 16)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 32)
+    return y;
+  x = x * x;  // x^32
+  if(n & 32)
+    {
+      if(lowest == 32)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 64)
+    return y;
+  x = x * x;  // x^64
+  if(n & 64)
+    {
+      if(lowest == 64)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 128)
+    return y;
+  x = x * x;  // x^128
+  if(n & 128)
+    {
+      if(lowest == 128)
+        y = x;
+      else
+        y *= x;
+    }
+  return y;
+}
+
+template <int n>
+static inline Vec8d pow(Vec8d const &a, Const_int_t<n>)
+{
+  return pow_n<n>(a);
+}
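+
+// Illustrative usage sketch (not part of the original vector class header):
+// raising a vector to a fixed power with the compile-time routine above.
+static inline Vec8d cube_example(Vec8d const &a)
+{
+  return pow_n<3>(a);  // computes a^3 with two multiplications, no run-time exponent tests
+}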
+
+// function round: round to nearest integer (even). (result as double vector)
+static inline Vec8d round(Vec8d const &a) { return _mm512_roundscale_pd(a, 0); }
+
+// function truncate: round towards zero. (result as double vector)
+static inline Vec8d truncate(Vec8d const &a) { return _mm512_roundscale_pd(a, 3); }
+
+// function floor: round towards minus infinity. (result as double vector)
+static inline Vec8d floor(Vec8d const &a) { return _mm512_roundscale_pd(a, 1); }
+
+// function ceil: round towards plus infinity. (result as double vector)
+static inline Vec8d ceil(Vec8d const &a) { return _mm512_roundscale_pd(a, 2); }
+
+// function round_to_int: round to nearest integer (even). (result as integer vector)
+static inline Vec8i round_to_int(Vec8d const &a)
+{
+  // return _mm512_cvtpd_epi32(a);
+  return _mm512_cvt_roundpd_epi32(a, 0 + 8);
+}
+
+// function truncate_to_int: round towards zero. (result as integer vector)
+static inline Vec8i truncate_to_int(Vec8d const &a) { return _mm512_cvttpd_epi32(a); }
+
+// function truncate_to_int64: round towards zero. (inefficient)
+static inline Vec8q truncate_to_int64(Vec8d const &a)
+{
+#ifdef __AVX512DQ__
+  return _mm512_cvttpd_epi64(a);
+#else
+  double aa[8];
+  a.store(aa);
+  return Vec8q(int64_t(aa[0]), int64_t(aa[1]), int64_t(aa[2]), int64_t(aa[3]), int64_t(aa[4]), int64_t(aa[5]), int64_t(aa[6]),
+               int64_t(aa[7]));
+#endif
+}
+
+// function truncate_to_int64_limited: round towards zero.
+// result as 64-bit integer vector, but with limited range. Deprecated!
+static inline Vec8q truncate_to_int64_limited(Vec8d const &a)
+{
+#ifdef __AVX512DQ__
+  return truncate_to_int64(a);
+#else
+  // Note: assume MXCSR control register is set to rounding
+  Vec4q b   = _mm512_cvttpd_epi32(a);                                      // round to 32-bit integers
+  __m512i c = permute8q<0, -256, 1, -256, 2, -256, 3, -256>(Vec8q(b, b));  // get bits 64-127 to position 128-191, etc.
+  __m512i s = _mm512_srai_epi32(c, 31);                                    // sign extension bits
+  return _mm512_unpacklo_epi32(c, s);                                      // interleave with sign extensions
+#endif
+}
+
+// function round_to_int64: round to nearest or even. (inefficient)
+static inline Vec8q round_to_int64(Vec8d const &a)
+{
+#ifdef __AVX512DQ__
+  return _mm512_cvtpd_epi64(a);
+#else
+  return truncate_to_int64(round(a));
+#endif
+}
+
+// function round_to_int64_limited: round to nearest integer (even)
+// result as 64-bit integer vector, but with limited range. Deprecated!
+static inline Vec8q round_to_int64_limited(Vec8d const &a)
+{
+#ifdef __AVX512DQ__
+  return round_to_int64(a);
+#else
+  Vec4q b   = _mm512_cvt_roundpd_epi32(a, 0 + 8);                          // round to 32-bit integers
+  __m512i c = permute8q<0, -256, 1, -256, 2, -256, 3, -256>(Vec8q(b, b));  // get bits 64-127 to position 128-191, etc.
+  __m512i s = _mm512_srai_epi32(c, 31);                                    // sign extension bits
+  return _mm512_unpacklo_epi32(c, s);                                      // interleave with sign extensions
+#endif
+}
+
+// function to_double: convert integer vector elements to double vector (inefficient)
+static inline Vec8d to_double(Vec8q const &a)
+{
+#if defined(__AVX512DQ__)
+  return _mm512_cvtepi64_pd(a);
+#else
+  int64_t aa[8];
+  a.store(aa);
+  return Vec8d(double(aa[0]), double(aa[1]), double(aa[2]), double(aa[3]), double(aa[4]), double(aa[5]), double(aa[6]), double(aa[7]));
+#endif
+}
+
+// function to_double_limited: convert integer vector elements to double vector
+// limited to abs(x) < 2^31. Deprecated!
+static inline Vec8d to_double_limited(Vec8q const &x)
+{
+#if defined(__AVX512DQ__)
+  return to_double(x);
+#else
+  Vec16i compressed = permute16i<0, 2, 4, 6, 8, 10, 12, 14, -256, -256, -256, -256, -256, -256, -256, -256>(Vec16i(x));
+  return _mm512_cvtepi32_pd(compressed.get_low());
+#endif
+}
+
+// function to_double: convert integer vector to double vector
+static inline Vec8d to_double(Vec8i const &a) { return _mm512_cvtepi32_pd(a); }
+
+// function compress: convert two Vec8d to one Vec16f
+static inline Vec16f compress(Vec8d const &low, Vec8d const &high)
+{
+  __m256 t1 = _mm512_cvtpd_ps(low);
+  __m256 t2 = _mm512_cvtpd_ps(high);
+  return Vec16f(t1, t2);
+}
+
+// Function extend_low : convert Vec16f vector elements 0 - 7 to Vec8d
+static inline Vec8d extend_low(Vec16f const &a) { return _mm512_cvtps_pd(_mm512_castps512_ps256(a)); }
+
+// Function extend_high : convert Vec16f vector elements 8 - 15 to Vec8d
+static inline Vec8d extend_high(Vec16f const &a) { return _mm512_cvtps_pd(a.get_high()); }
+
+// Fused multiply and add functions
+
+// Multiply and add
+static inline Vec8d mul_add(Vec8d const &a, Vec8d const &b, Vec8d const &c) { return _mm512_fmadd_pd(a, b, c); }
+
+// Multiply and subtract
+static inline Vec8d mul_sub(Vec8d const &a, Vec8d const &b, Vec8d const &c) { return _mm512_fmsub_pd(a, b, c); }
+
+// Multiply and inverse subtract
+static inline Vec8d nmul_add(Vec8d const &a, Vec8d const &b, Vec8d const &c) { return _mm512_fnmadd_pd(a, b, c); }
+
+// Multiply and subtract with extra precision on the intermediate calculations (same as mul_sub when FMA is available)
+static inline Vec8d mul_sub_x(Vec8d const &a, Vec8d const &b, Vec8d const &c) { return _mm512_fmsub_pd(a, b, c); }
+
+// Math functions using fast bit manipulation
+
+// Extract the exponent as an integer
+// exponent(a) = floor(log2(abs(a)));
+// exponent(1.0) = 0, exponent(0.0) = -1023, exponent(INF) = +1024, exponent(NAN) = +1024
+static inline Vec8q exponent(Vec8d const &a)
+{
+  Vec8uq t1 = _mm512_castpd_si512(a);  // reinterpret as 64-bit integer
+  Vec8uq t2 = t1 << 1;                 // shift out sign bit
+  Vec8uq t3 = t2 >> 53;                // shift down logical to position 0
+  Vec8q t4  = Vec8q(t3) - 0x3FF;       // subtract bias from exponent
+  return t4;
+}
+
+// Extract the fraction part of a floating point number
+// a = 2^exponent(a) * fraction(a), except for a = 0
+// fraction(1.0) = 1.0, fraction(5.0) = 1.25
+static inline Vec8d fraction(Vec8d const &a) { return _mm512_getmant_pd(a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero); }
+
+// Fast calculation of pow(2,n) with n integer
+// n  =     0 gives 1.0
+// n >=  1024 gives +INF
+// n <= -1023 gives 0.0
+// This function will never produce denormals, and never raise exceptions
+static inline Vec8d exp2(Vec8q const &n)
+{
+  Vec8q t1 = max(n, -0x3FF);  // limit to allowed range
+  Vec8q t2 = min(t1, 0x400);
+  Vec8q t3 = t2 + 0x3FF;           // add bias
+  Vec8q t4 = t3 << 52;             // put exponent into position 52
+  return _mm512_castsi512_pd(t4);  // reinterpret as double
+}
+// static Vec8d exp2(Vec8d const & x); // defined in vectormath_exp.h
+
+// Categorization functions
+
+// Function sign_bit: gives true for elements that have the sign bit set
+// even for -0.0, -INF and -NAN
+// Note that sign_bit(Vec8d(-0.0)) gives true, while Vec8d(-0.0) < Vec8d(0.0) gives false
+static inline Vec8db sign_bit(Vec8d const &a)
+{
+  Vec8q t1 = _mm512_castpd_si512(a);  // reinterpret as 64-bit integer
+  return Vec8db(t1 < 0);
+}
+
+// Function sign_combine: changes the sign of a when b has the sign bit set
+// same as select(sign_bit(b), -a, a)
+static inline Vec8d sign_combine(Vec8d const &a, Vec8d const &b)
+{
+  union
+  {
+    uint64_t i;
+    double f;
+  } u = {0x8000000000000000};  // mask for sign bit
+  return a ^ (b & Vec8d(u.f));
+}
+
+// Function is_finite: gives true for elements that are normal, denormal or zero,
+// false for INF and NAN
+static inline Vec8db is_finite(Vec8d const &a)
+{
+#ifdef __AVX512DQ__
+  __mmask8 f = _mm512_fpclass_pd_mask(a, 0x99);
+  return _mm512_knot(f);
+#else
+  Vec8q t1  = _mm512_castpd_si512(a);  // reinterpret as 64-bit integer
+  Vec8q t2  = t1 << 1;                 // shift out sign bit
+  Vec8q t3  = 0xFFE0000000000000ll;    // exponent mask
+  Vec8qb t4 = Vec8q(t2 & t3) != t3;    // exponent field is not all 1s
+  return Vec8db(t4);
+#endif
+}
+
+// Function is_inf: gives true for elements that are +INF or -INF
+// false for finite numbers and NAN
+static inline Vec8db is_inf(Vec8d const &a)
+{
+  Vec8q t1 = _mm512_castpd_si512(a);          // reinterpret as 64-bit integer
+  Vec8q t2 = t1 << 1;                         // shift out sign bit
+  return Vec8db(t2 == 0xFFE0000000000000ll);  // exponent is all 1s, fraction is 0
+}
+
+// Function is_nan: gives true for elements that are +NAN or -NAN
+// false for finite numbers and +/-INF
+static inline Vec8db is_nan(Vec8d const &a)
+{
+  Vec8q t1 = _mm512_castpd_si512(a);       // reinterpret as 64-bit integer
+  Vec8q t2 = t1 << 1;                      // shift out sign bit
+  Vec8q t3 = 0xFFE0000000000000ll;         // exponent mask
+  Vec8q t4 = t2 & t3;                      // exponent
+  Vec8q t5 = _mm512_andnot_si512(t3, t2);  // fraction
+  return Vec8db(t4 == t3 && t5 != 0);      // exponent = all 1s and fraction != 0
+}
+
+// Function is_subnormal: gives true for elements that are denormal (subnormal)
+// false for finite numbers, zero, NAN and INF
+static inline Vec8db is_subnormal(Vec8d const &a)
+{
+  Vec8q t1 = _mm512_castpd_si512(a);       // reinterpret as 64-bit integer
+  Vec8q t2 = t1 << 1;                      // shift out sign bit
+  Vec8q t3 = 0xFFE0000000000000ll;         // exponent mask
+  Vec8q t4 = t2 & t3;                      // exponent
+  Vec8q t5 = _mm512_andnot_si512(t3, t2);  // fraction
+  return Vec8db(t4 == 0 && t5 != 0);       // exponent = 0 and fraction != 0
+}
+
+// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
+// false for finite numbers, NAN and INF
+static inline Vec8db is_zero_or_subnormal(Vec8d const &a)
+{
+  Vec8q t = _mm512_castpd_si512(a);  // reinterpret as 64-bit integer
+  t &= 0x7FF0000000000000ll;         // isolate exponent
+  return Vec8db(t == 0);             // exponent = 0
+}
+
+// Function infinite8d: returns a vector where all elements are +INF
+static inline Vec8d infinite8d()
+{
+  union
+  {
+    uint64_t i;
+    double f;
+  } u = {0x7FF0000000000000};
+  return Vec8d(u.f);
+}
+
+// Function nan8d: returns a vector where all elements are +NAN (quiet NAN)
+static inline Vec8d nan8d(int n = 0x10)
+{
+  union
+  {
+    uint64_t i;
+    double f;
+  } u = {0x7FF8000000000000 + uint64_t(n)};
+  return Vec8d(u.f);
+}
+
+// change signs on vectors Vec8d
+// Each index i0 - i7 is 1 for changing sign on the corresponding element, 0 for no change
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8d change_sign(Vec8d const &a)
+{
+  const __mmask16 m = __mmask16((i0 & 1) | (i1 & 1) << 1 | (i2 & 1) << 2 | (i3 & 1) << 3 | (i4 & 1) << 4 | (i5 & 1) << 5 |
+                                (i6 & 1) << 6 | (i7 & 1) << 7);
+  if((uint8_t)m == 0)
+    return a;
+  __m512d s = _mm512_castsi512_pd(_mm512_maskz_set1_epi64(m, 0x8000000000000000));
+  return a ^ s;
+}
+
+/*****************************************************************************
+ *
+ *          Functions for reinterpretation between vector types
+ *
+ *****************************************************************************/
+
+// AVX512 requires gcc version 4.9 or higher. Apparently the problem with mangling intrinsic vector types no longer exists in gcc 4.x
+
+static inline __m512i reinterpret_i(__m512i const &x) { return x; }
+
+static inline __m512i reinterpret_i(__m512 const &x) { return _mm512_castps_si512(x); }
+
+static inline __m512i reinterpret_i(__m512d const &x) { return _mm512_castpd_si512(x); }
+
+static inline __m512 reinterpret_f(__m512i const &x) { return _mm512_castsi512_ps(x); }
+
+static inline __m512 reinterpret_f(__m512 const &x) { return x; }
+
+static inline __m512 reinterpret_f(__m512d const &x) { return _mm512_castpd_ps(x); }
+
+static inline __m512d reinterpret_d(__m512i const &x) { return _mm512_castsi512_pd(x); }
+
+static inline __m512d reinterpret_d(__m512 const &x) { return _mm512_castps_pd(x); }
+
+static inline __m512d reinterpret_d(__m512d const &x) { return x; }
+
+/*****************************************************************************
+ *
+ *          Vector permute functions
+ *
+ ******************************************************************************
+ *
+ * These permute functions can reorder the elements of a vector and optionally
+ * set some elements to zero.
+ *
+ * The indexes are inserted as template parameters in <>. These indexes must be
+ * constants. Each template parameter is an index to the element you want to select.
+ * An index of -1 will generate zero. An index of -256 means don't care.
+ *
+ * Example:
+ * Vec8d a(10,11,12,13,14,15,16,17);      // a is (10,11,12,13,14,15,16,17)
+ * Vec8d b;
+ * b = permute8d<0,2,7,7,-1,-1,1,1>(a);   // b is (10,12,17,17, 0, 0,11,11)
+ *
+ * A lot of the code here is metaprogramming aiming to find the instructions
+ * that best fit the template parameters and instruction set. The metacode
+ * will be reduced out to leave only a few vector instructions in release
+ * mode with optimization on.
+ *****************************************************************************/
+
+// Permute vector of 8 double-precision floating point values.
+// Index -1 gives 0, index -256 means don't care.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8d permute8d(Vec8d const &a)
+{
+  // Combine indexes into a single bitfield, with 4 bits for each
+  const int m1 =
+      (i0 & 7) | (i1 & 7) << 4 | (i2 & 7) << 8 | (i3 & 7) << 12 | (i4 & 7) << 16 | (i5 & 7) << 20 | (i6 & 7) << 24 | (i7 & 7) << 28;
+
+  // Mask to zero out negative indexes
+  const int mz = (i0 < 0 ? 0 : 0xF) | (i1 < 0 ? 0 : 0xF0) | (i2 < 0 ? 0 : 0xF00) | (i3 < 0 ? 0 : 0xF000) | (i4 < 0 ? 0 : 0xF0000) |
+                 (i5 < 0 ? 0 : 0xF00000) | (i6 < 0 ? 0 : 0xF000000) | (i7 < 0 ? 0 : 0xF0000000);
+  const int m2 = m1 & mz;
+
+  // zeroing needed
+  const bool dozero = ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) & 0x80) != 0;
+
+  // special case: all zero
+  if(mz == 0)
+    return _mm512_setzero_pd();
+
+  // mask for elements not zeroed
+  const __mmask16 z = __mmask16((i0 >= 0) << 0 | (i1 >= 0) << 1 | (i2 >= 0) << 2 | (i3 >= 0) << 3 | (i4 >= 0) << 4 | (i5 >= 0) << 5 |
+                                (i6 >= 0) << 6 | (i7 >= 0) << 7);
+  // same with 2 bits for each element
+  const __mmask16 zz = __mmask16((i0 >= 0 ? 3 : 0) | (i1 >= 0 ? 0xC : 0) | (i2 >= 0 ? 0x30 : 0) | (i3 >= 0 ? 0xC0 : 0) |
+                                 (i4 >= 0 ? 0x300 : 0) | (i5 >= 0 ? 0xC00 : 0) | (i6 >= 0 ? 0x3000 : 0) | (i7 >= 0 ? 0xC000 : 0));
+
+  if(((m1 ^ 0x76543210) & mz) == 0)
+    {
+      // no shuffling
+      if(dozero)
+        {
+          // zero some elements
+          return _mm512_maskz_mov_pd(z, a);
+        }
+      return a;  // do nothing
+    }
+
+  if(((m1 ^ 0x66442200) & 0x66666666 & mz) == 0)
+    {
+      // no exchange of data between the four 128-bit lanes
+      const int pat   = ((m2 | m2 >> 8 | m2 >> 16 | m2 >> 24) & 0x11) * 0x01010101;
+      const int pmask = ((pat & 1) * 10 + 4) | ((((pat >> 4) & 1) * 10 + 4) << 4);
+      if(((m1 ^ pat) & mz & 0x11111111) == 0)
+        {
+          // same permute pattern in all lanes
+          if(dozero)
+            {  // permute within lanes and zero
+              return _mm512_castsi512_pd(_mm512_maskz_shuffle_epi32(zz, _mm512_castpd_si512(a), (_MM_PERM_ENUM)pmask));
+            }
+          else
+            {  // permute within lanes
+              return _mm512_castsi512_pd(_mm512_shuffle_epi32(_mm512_castpd_si512(a), (_MM_PERM_ENUM)pmask));
+            }
+        }
+      // different permute patterns in each lane. It's faster to do a full permute than four masked permutes within lanes
+    }
+  if((((m1 ^ 0x10101010) & 0x11111111 & mz) == 0) && ((m1 ^ (m1 >> 4)) & 0x06060606 & mz & (mz >> 4)) == 0)
+    {
+      // permute lanes only. no permutation within each lane
+      const int m3 = m2 | (m2 >> 4);
+      const int s  = ((m3 >> 1) & 3) | (((m3 >> 9) & 3) << 2) | (((m3 >> 17) & 3) << 4) | (((m3 >> 25) & 3) << 6);
+      if(dozero)
+        {
+          // permute lanes and zero some 64-bit elements
+          return _mm512_maskz_shuffle_f64x2(z, a, a, (_MM_PERM_ENUM)s);
+        }
+      else
+        {
+          // permute lanes
+          return _mm512_shuffle_f64x2(a, a, (_MM_PERM_ENUM)s);
+        }
+    }
+  // full permute needed
+  const __m512i pmask = constant16i<i0 & 7, 0, i1 & 7, 0, i2 & 7, 0, i3 & 7, 0, i4 & 7, 0, i5 & 7, 0, i6 & 7, 0, i7 & 7, 0>();
+  if(dozero)
+    {
+      // full permute and zeroing
+      return _mm512_maskz_permutexvar_pd(z, pmask, a);
+    }
+  else
+    {
+      return _mm512_permutexvar_pd(pmask, a);
+    }
+}
+
+// Permute vector of 16 single-precision floating point values.
+// Index -1 gives 0, index -256 means don't care.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16f permute16f(Vec16f const &a)
+{
+  // Combine indexes into a single bitfield, with 4 bits for each
+  const uint64_t m1 = (i0 & 15) | (i1 & 15) << 4 | (i2 & 15) << 8 | (i3 & 15) << 12 | (i4 & 15) << 16 | (i5 & 15) << 20 |
+                      (i6 & 15) << 24 | (i7 & 15LL) << 28  // 15LL avoids sign extension of (int32_t | int64_t)
+                      | (i8 & 15LL) << 32 | (i9 & 15LL) << 36 | (i10 & 15LL) << 40 | (i11 & 15LL) << 44 | (i12 & 15LL) << 48 |
+                      (i13 & 15LL) << 52 | (i14 & 15LL) << 56 | (i15 & 15LL) << 60;
+
+  // Mask to zero out negative indexes
+  const uint64_t mz = (i0 < 0 ? 0 : 0xF) | (i1 < 0 ? 0 : 0xF0) | (i2 < 0 ? 0 : 0xF00) | (i3 < 0 ? 0 : 0xF000) |
+                      (i4 < 0 ? 0 : 0xF0000) | (i5 < 0 ? 0 : 0xF00000) | (i6 < 0 ? 0 : 0xF000000) | (i7 < 0 ? 0 : 0xF0000000ULL) |
+                      (i8 < 0 ? 0 : 0xF00000000) | (i9 < 0 ? 0 : 0xF000000000) | (i10 < 0 ? 0 : 0xF0000000000) |
+                      (i11 < 0 ? 0 : 0xF00000000000) | (i12 < 0 ? 0 : 0xF000000000000) | (i13 < 0 ? 0 : 0xF0000000000000) |
+                      (i14 < 0 ? 0 : 0xF00000000000000) | (i15 < 0 ? 0 : 0xF000000000000000);
+
+  const uint64_t m2 = m1 & mz;
+
+  // zeroing needed
+  const bool dozero = ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7 | i8 | i9 | i10 | i11 | i12 | i13 | i14 | i15) & 0x80) != 0;
+
+  // special case: all zero
+  if(mz == 0)
+    return _mm512_setzero_ps();
+
+  // mask for elements not zeroed
+  const __mmask16 z = __mmask16((i0 >= 0) << 0 | (i1 >= 0) << 1 | (i2 >= 0) << 2 | (i3 >= 0) << 3 | (i4 >= 0) << 4 | (i5 >= 0) << 5 |
+                                (i6 >= 0) << 6 | (i7 >= 0) << 7 | (i8 >= 0) << 8 | (i9 >= 0) << 9 | (i10 >= 0) << 10 |
+                                (i11 >= 0) << 11 | (i12 >= 0) << 12 | (i13 >= 0) << 13 | (i14 >= 0) << 14 | (i15 >= 0) << 15);
+
+  if(((m1 ^ 0xFEDCBA9876543210) & mz) == 0)
+    {
+      // no shuffling
+      if(dozero)
+        {
+          // zero some elements
+          return _mm512_maskz_mov_ps(z, a);
+        }
+      return a;  // do nothing
+    }
+
+  if(((m1 ^ 0xCCCC888844440000) & 0xCCCCCCCCCCCCCCCC & mz) == 0)
+    {
+      // no exchange of data between the four 128-bit lanes
+      const uint64_t pat = ((m2 | (m2 >> 16) | (m2 >> 32) | (m2 >> 48)) & 0x3333) * 0x0001000100010001;
+      const int pmask    = (pat & 3) | (((pat >> 4) & 3) << 2) | (((pat >> 8) & 3) << 4) | (((pat >> 12) & 3) << 6);
+      if(((m1 ^ pat) & 0x3333333333333333 & mz) == 0)
+        {
+          // same permute pattern in all lanes
+          if(dozero)
+            {  // permute within lanes and zero
+              return _mm512_castsi512_ps(_mm512_maskz_shuffle_epi32(z, _mm512_castps_si512(a), (_MM_PERM_ENUM)pmask));
+            }
+          else
+            {  // permute within lanes
+              return _mm512_castsi512_ps(_mm512_shuffle_epi32(_mm512_castps_si512(a), (_MM_PERM_ENUM)pmask));
+            }
+        }
+      // different permute patterns in each lane. It's faster to do a full permute than four masked permutes within lanes
+    }
+  const uint64_t lane = (m2 | m2 >> 4 | m2 >> 8 | m2 >> 12) & 0x000C000C000C000C;
+  if((((m1 ^ 0x3210321032103210) & 0x3333333333333333 & mz) == 0) && ((m1 ^ (lane * 0x1111)) & 0xCCCCCCCCCCCCCCCC & mz) == 0)
+    {
+      // permute lanes only. no permutation within each lane
+      const uint64_t s = ((lane >> 2) & 3) | (((lane >> 18) & 3) << 2) | (((lane >> 34) & 3) << 4) | (((lane >> 50) & 3) << 6);
+      if(dozero)
+        {
+          // permute lanes and zero some 64-bit elements
+          return _mm512_maskz_shuffle_f32x4(z, a, a, (_MM_PERM_ENUM)s);
+        }
+      else
+        {
+          // permute lanes
+          return _mm512_shuffle_f32x4(a, a, (_MM_PERM_ENUM)s);
+        }
+    }
+  // full permute needed
+  const __m512i pmask = constant16i<i0 & 15, i1 & 15, i2 & 15, i3 & 15, i4 & 15, i5 & 15, i6 & 15, i7 & 15, i8 & 15, i9 & 15, i10 & 15,
+                                    i11 & 15, i12 & 15, i13 & 15, i14 & 15, i15 & 15>();
+  if(dozero)
+    {
+      // full permute and zeroing
+      return _mm512_maskz_permutexvar_ps(z, pmask, a);
+    }
+  else
+    {
+      return _mm512_permutexvar_ps(pmask, a);
+    }
+}
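+
+// Illustrative usage sketch (not part of the original vector class header):
+// reverse the element order of a Vec16f with a single full permute.
+static inline Vec16f reverse_example(Vec16f const &a)
+{
+  return permute16f<15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0>(a);
+}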
+
+/*****************************************************************************
+ *
+ *          Vector blend functions
+ *
+ ******************************************************************************
+ *
+ * These blend functions can mix elements from two different vectors and
+ * optionally set some elements to zero.
+ *
+ * The indexes are inserted as template parameters in <>. These indexes must be
+ * constants. Each template parameter is an index to the element you want to
+ * select, where higher indexes indicate an element from the second source
+ * vector. For example, if each vector has 8 elements, then indexes 0 - 7
+ * will select an element from the first vector and indexes 8 - 15 will select
+ * an element from the second vector. A negative index will generate zero.
+ *
+ * Example:
+ * Vec8d a(100,101,102,103,104,105,106,107); // a is (100, 101, 102, 103, 104, 105, 106, 107)
+ * Vec8d b(200,201,202,203,204,205,206,207); // b is (200, 201, 202, 203, 204, 205, 206, 207)
+ * Vec8d c;
+ * c = blend8d<1,0,9,8,7,-1,15,15> (a,b);    // c is (101, 100, 201, 200, 107,   0, 207, 207)
+ *
+ * A lot of the code here is metaprogramming aiming to find the instructions
+ * that best fit the template parameters and instruction set. The metacode
+ * will be reduced out to leave only a few vector instructions in release
+ * mode with optimization on.
+ *****************************************************************************/
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8d blend8d(Vec8d const &a, Vec8d const &b)
+{
+  // Combine indexes into a single bitfield, with 4 bits for each
+  const int m1 = (i0 & 0xF) | (i1 & 0xF) << 4 | (i2 & 0xF) << 8 | (i3 & 0xF) << 12 | (i4 & 0xF) << 16 | (i5 & 0xF) << 20 |
+                 (i6 & 0xF) << 24 | (i7 & 0xF) << 28;
+
+  // Mask to zero out negative indexes
+  const int mz = (i0 < 0 ? 0 : 0xF) | (i1 < 0 ? 0 : 0xF0) | (i2 < 0 ? 0 : 0xF00) | (i3 < 0 ? 0 : 0xF000) | (i4 < 0 ? 0 : 0xF0000) |
+                 (i5 < 0 ? 0 : 0xF00000) | (i6 < 0 ? 0 : 0xF000000) | (i7 < 0 ? 0 : 0xF0000000);
+  const int m2 = m1 & mz;
+
+  // zeroing needed
+  const bool dozero = ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) & 0x80) != 0;
+
+  // mask for elements not zeroed
+  const __mmask16 z = __mmask16((i0 >= 0) << 0 | (i1 >= 0) << 1 | (i2 >= 0) << 2 | (i3 >= 0) << 3 | (i4 >= 0) << 4 | (i5 >= 0) << 5 |
+                                (i6 >= 0) << 6 | (i7 >= 0) << 7);
+
+  // special case: all zero
+  if(mz == 0)
+    return _mm512_setzero_pd();
+
+  // special case: all from a
+  if((m1 & 0x88888888 & mz) == 0)
+    {
+      return permute8d<i0, i1, i2, i3, i4, i5, i6, i7>(a);
+    }
+
+  // special case: all from b
+  if((~m1 & 0x88888888 & mz) == 0)
+    {
+      return permute8d<i0 ^ 8, i1 ^ 8, i2 ^ 8, i3 ^ 8, i4 ^ 8, i5 ^ 8, i6 ^ 8, i7 ^ 8>(b);
+    }
+
+  // special case: blend without permute
+  if(((m1 ^ 0x76543210) & 0x77777777 & mz) == 0)
+    {
+      __mmask16 blendmask = __mmask16((i0 & 8) >> 3 | (i1 & 8) >> 2 | (i2 & 8) >> 1 | (i3 & 8) >> 0 | (i4 & 8) << 1 | (i5 & 8) << 2 |
+                                      (i6 & 8) << 3 | (i7 & 8) << 4);
+      __m512d t           = _mm512_mask_blend_pd(blendmask, a, b);
+      if(dozero)
+        {
+          t = _mm512_maskz_mov_pd(z, t);
+        }
+      return t;
+    }
+  // special case: all data stay within their lane
+  if(((m1 ^ 0x66442200) & 0x66666666 & mz) == 0)
+    {
+      // mask for elements from a and b
+      const uint32_t mb = ((i0 & 8) ? 0xF : 0) | ((i1 & 8) ? 0xF0 : 0) | ((i2 & 8) ? 0xF00 : 0) | ((i3 & 8) ? 0xF000 : 0) |
+                          ((i4 & 8) ? 0xF0000 : 0) | ((i5 & 8) ? 0xF00000 : 0) | ((i6 & 8) ? 0xF000000 : 0) |
+                          ((i7 & 8) ? 0xF0000000 : 0);
+      const uint32_t mbz  = mb & mz;   // mask for nonzero elements from b
+      const uint32_t maz  = ~mb & mz;  // mask for nonzero elements from a
+      const uint32_t m1a  = m1 & maz;
+      const uint32_t m1b  = m1 & mbz;
+      const uint32_t pata = ((m1a | m1a >> 8 | m1a >> 16 | m1a >> 24) & 0xFF) * 0x01010101;  // permute pattern for elements from a
+      const uint32_t patb = ((m1b | m1b >> 8 | m1b >> 16 | m1b >> 24) & 0xFF) * 0x01010101;  // permute pattern for elements from b
+
+      if(((m1 ^ pata) & 0x11111111 & maz) == 0 && ((m1 ^ patb) & 0x11111111 & mbz) == 0)
+        {
+          // Same permute pattern in all lanes:
+          // todo!: make special case for PSHUFD
+
+          // This code generates two instructions instead of one, but we are avoiding the slow lane-crossing instruction,
+          // and we are saving 64 bytes of data cache.
+          // 1. Permute a, zero elements not from a (using _mm512_maskz_shuffle_epi32)
+          __m512d ta = permute8d < (maz & 0xF) ? i0 & 7 : -1, (maz & 0xF0) ? i1 & 7 : -1, (maz & 0xF00) ? i2 & 7 : -1,
+                  (maz & 0xF000) ? i3 & 7 : -1, (maz & 0xF0000) ? i4 & 7 : -1, (maz & 0xF00000) ? i5 & 7 : -1,
+                  (maz & 0xF000000) ? i6 & 7 : -1, (maz & 0xF0000000) ? i7 & 7 : -1 > (a);
+          // write mask for elements from b
+          const __mmask16 sb = ((mbz & 0xF) ? 3 : 0) | ((mbz & 0xF0) ? 0xC : 0) | ((mbz & 0xF00) ? 0x30 : 0) |
+                               ((mbz & 0xF000) ? 0xC0 : 0) | ((mbz & 0xF0000) ? 0x300 : 0) | ((mbz & 0xF00000) ? 0xC00 : 0) |
+                               ((mbz & 0xF000000) ? 0x3000 : 0) | ((mbz & 0xF0000000) ? 0xC000 : 0);
+          // permute index for elements from b
+          const int pi = ((patb & 1) * 10 + 4) | ((((patb >> 4) & 1) * 10 + 4) << 4);
+          // 2. Permute elements from b and combine with elements from a through write mask
+          return _mm512_castsi512_pd(
+              _mm512_mask_shuffle_epi32(_mm512_castpd_si512(ta), sb, _mm512_castpd_si512(b), (_MM_PERM_ENUM)pi));
+        }
+      // not same permute pattern in all lanes. use full permute
+    }
+  // general case: full permute
+  const __m512i pmask =
+      constant16i<i0 & 0xF, 0, i1 & 0xF, 0, i2 & 0xF, 0, i3 & 0xF, 0, i4 & 0xF, 0, i5 & 0xF, 0, i6 & 0xF, 0, i7 & 0xF, 0>();
+  if(dozero)
+    {
+      return _mm512_maskz_permutex2var_pd(z, a, pmask, b);
+    }
+  else
+    {
+      return _mm512_permutex2var_pd(a, pmask, b);
+    }
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16f blend16f(Vec16f const &a, Vec16f const &b)
+{
+  // Combine indexes into a single bitfield, with 4 bits for each indicating shuffle, but not source
+  const uint64_t m1 = (i0 & 0xF) | (i1 & 0xF) << 4 | (i2 & 0xF) << 8 | (i3 & 0xF) << 12 | (i4 & 0xF) << 16 | (i5 & 0xF) << 20 |
+                      (i6 & 0xF) << 24 | (i7 & 0xFLL) << 28 | (i8 & 0xFLL) << 32 | (i9 & 0xFLL) << 36 | (i10 & 0xFLL) << 40 |
+                      (i11 & 0xFLL) << 44 | (i12 & 0xFLL) << 48 | (i13 & 0xFLL) << 52 | (i14 & 0xFLL) << 56 | (i15 & 0xFLL) << 60;
+
+  // Mask to zero out negative indexes
+  const uint64_t mz = (i0 < 0 ? 0 : 0xF) | (i1 < 0 ? 0 : 0xF0) | (i2 < 0 ? 0 : 0xF00) | (i3 < 0 ? 0 : 0xF000) |
+                      (i4 < 0 ? 0 : 0xF0000) | (i5 < 0 ? 0 : 0xF00000) | (i6 < 0 ? 0 : 0xF000000) | (i7 < 0 ? 0 : 0xF0000000ULL) |
+                      (i8 < 0 ? 0 : 0xF00000000) | (i9 < 0 ? 0 : 0xF000000000) | (i10 < 0 ? 0 : 0xF0000000000) |
+                      (i11 < 0 ? 0 : 0xF00000000000) | (i12 < 0 ? 0 : 0xF000000000000) | (i13 < 0 ? 0 : 0xF0000000000000) |
+                      (i14 < 0 ? 0 : 0xF00000000000000) | (i15 < 0 ? 0 : 0xF000000000000000);
+  const uint64_t m2 = m1 & mz;
+
+  // collect bit 4 of each index = select source
+  const uint64_t ms = ((i0 & 16) ? 0xF : 0) | ((i1 & 16) ? 0xF0 : 0) | ((i2 & 16) ? 0xF00 : 0) | ((i3 & 16) ? 0xF000 : 0) |
+                      ((i4 & 16) ? 0xF0000 : 0) | ((i5 & 16) ? 0xF00000 : 0) | ((i6 & 16) ? 0xF000000 : 0) |
+                      ((i7 & 16) ? 0xF0000000ULL : 0) | ((i8 & 16) ? 0xF00000000 : 0) | ((i9 & 16) ? 0xF000000000 : 0) |
+                      ((i10 & 16) ? 0xF0000000000 : 0) | ((i11 & 16) ? 0xF00000000000 : 0) | ((i12 & 16) ? 0xF000000000000 : 0) |
+                      ((i13 & 16) ? 0xF0000000000000 : 0) | ((i14 & 16) ? 0xF00000000000000 : 0) |
+                      ((i15 & 16) ? 0xF000000000000000 : 0);
+
+  // zeroing needed
+  const bool dozero = ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7 | i8 | i9 | i10 | i11 | i12 | i13 | i14 | i15) & 0x80) != 0;
+
+  // mask for elements not zeroed
+  const __mmask16 z = __mmask16((i0 >= 0) << 0 | (i1 >= 0) << 1 | (i2 >= 0) << 2 | (i3 >= 0) << 3 | (i4 >= 0) << 4 | (i5 >= 0) << 5 |
+                                (i6 >= 0) << 6 | (i7 >= 0) << 7 | (i8 >= 0) << 8 | (i9 >= 0) << 9 | (i10 >= 0) << 10 |
+                                (i11 >= 0) << 11 | (i12 >= 0) << 12 | (i13 >= 0) << 13 | (i14 >= 0) << 14 | (i15 >= 0) << 15);
+
+  // special case: all zero
+  if(mz == 0)
+    return _mm512_setzero_ps();
+
+  // special case: all from a
+  if((ms & mz) == 0)
+    {
+      return permute16f<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15>(a);
+    }
+
+  // special case: all from b
+  if((~ms & mz) == 0)
+    {
+      return permute16f<i0 ^ 16, i1 ^ 16, i2 ^ 16, i3 ^ 16, i4 ^ 16, i5 ^ 16, i6 ^ 16, i7 ^ 16, i8 ^ 16, i9 ^ 16, i10 ^ 16, i11 ^ 16,
+                        i12 ^ 16, i13 ^ 16, i14 ^ 16, i15 ^ 16>(b);
+    }
+
+  // special case: blend without permute
+  if(((m1 ^ 0xFEDCBA9876543210) & mz) == 0)
+    {
+      __mmask16 blendmask = __mmask16((i0 & 16) >> 4 | (i1 & 16) >> 3 | (i2 & 16) >> 2 | (i3 & 16) >> 1 | (i4 & 16) | (i5 & 16) << 1 |
+                                      (i6 & 16) << 2 | (i7 & 16) << 3 | (i8 & 16) << 4 | (i9 & 16) << 5 | (i10 & 16) << 6 |
+                                      (i11 & 16) << 7 | (i12 & 16) << 8 | (i13 & 16) << 9 | (i14 & 16) << 10 | (i15 & 16) << 11);
+      __m512 t            = _mm512_mask_blend_ps(blendmask, a, b);
+      if(dozero)
+        {
+          t = _mm512_maskz_mov_ps(z, t);
+        }
+      return t;
+    }
+
+  // special case: all data stay within their lane
+  if(((m1 ^ 0xCCCC888844440000) & 0xCCCCCCCCCCCCCCCC & mz) == 0)
+    {
+      // mask for elements from a and b
+      const uint64_t mb  = ms;
+      const uint64_t mbz = mb & mz;   // mask for nonzero elements from b
+      const uint64_t maz = ~mb & mz;  // mask for nonzero elements from a
+      const uint64_t m1a = m1 & maz;
+      const uint64_t m1b = m1 & mbz;
+      const uint64_t pata =
+          ((m1a | m1a >> 16 | m1a >> 32 | m1a >> 48) & 0xFFFF) * 0x0001000100010001;  // permute pattern for elements from a
+      const uint64_t patb =
+          ((m1b | m1b >> 16 | m1b >> 32 | m1b >> 48) & 0xFFFF) * 0x0001000100010001;  // permute pattern for elements from b
+
+      if(((m1 ^ pata) & 0x3333333333333333 & maz) == 0 && ((m1 ^ patb) & 0x3333333333333333 & mbz) == 0)
+        {
+          // Same permute pattern in all lanes:
+          // todo!: special case for SHUFPS
+
+          // This code generates two instructions instead of one, but we are avoiding the slow lane-crossing instruction,
+          // and we are saving 64 bytes of data cache.
+          // 1. Permute a, zero elements not from a (using _mm512_maskz_shuffle_epi32)
+          __m512 ta = permute16f < (maz & 0xF) ? i0 & 15 : -1, (maz & 0xF0) ? i1 & 15 : -1, (maz & 0xF00) ? i2 & 15 : -1,
+                 (maz & 0xF000) ? i3 & 15 : -1, (maz & 0xF0000) ? i4 & 15 : -1, (maz & 0xF00000) ? i5 & 15 : -1,
+                 (maz & 0xF000000) ? i6 & 15 : -1, (maz & 0xF0000000) ? i7 & 15 : -1, (maz & 0xF00000000) ? i8 & 15 : -1,
+                 (maz & 0xF000000000) ? i9 & 15 : -1, (maz & 0xF0000000000) ? i10 & 15 : -1, (maz & 0xF00000000000) ? i11 & 15 : -1,
+                 (maz & 0xF000000000000) ? i12 & 15 : -1, (maz & 0xF0000000000000) ? i13 & 15 : -1,
+                 (maz & 0xF00000000000000) ? i14 & 15 : -1, (maz & 0xF000000000000000) ? i15 & 15 : -1 > (a);
+          // write mask for elements from b
+          const __mmask16 sb = ((mbz & 0xF) ? 1 : 0) | ((mbz & 0xF0) ? 0x2 : 0) | ((mbz & 0xF00) ? 0x4 : 0) |
+                               ((mbz & 0xF000) ? 0x8 : 0) | ((mbz & 0xF0000) ? 0x10 : 0) | ((mbz & 0xF00000) ? 0x20 : 0) |
+                               ((mbz & 0xF000000) ? 0x40 : 0) | ((mbz & 0xF0000000) ? 0x80 : 0) | ((mbz & 0xF00000000) ? 0x100 : 0) |
+                               ((mbz & 0xF000000000) ? 0x200 : 0) | ((mbz & 0xF0000000000) ? 0x400 : 0) |
+                               ((mbz & 0xF00000000000) ? 0x800 : 0) | ((mbz & 0xF000000000000) ? 0x1000 : 0) |
+                               ((mbz & 0xF0000000000000) ? 0x2000 : 0) | ((mbz & 0xF00000000000000) ? 0x4000 : 0) |
+                               ((mbz & 0xF000000000000000) ? 0x8000 : 0);
+          // permute index for elements from b
+          const int pi = (patb & 3) | (((patb >> 4) & 3) << 2) | (((patb >> 8) & 3) << 4) | (((patb >> 12) & 3) << 6);
+          // 2. Permute elements from b and combine with elements from a through write mask
+          return _mm512_castsi512_ps(
+              _mm512_mask_shuffle_epi32(_mm512_castps_si512(ta), sb, _mm512_castps_si512(b), (_MM_PERM_ENUM)pi));
+        }
+      // not same permute pattern in all lanes. use full permute
+    }
+
+  // general case: full permute
+  const __m512i pmask = constant16i<i0 & 0x1F, i1 & 0x1F, i2 & 0x1F, i3 & 0x1F, i4 & 0x1F, i5 & 0x1F, i6 & 0x1F, i7 & 0x1F, i8 & 0x1F,
+                                    i9 & 0x1F, i10 & 0x1F, i11 & 0x1F, i12 & 0x1F, i13 & 0x1F, i14 & 0x1F, i15 & 0x1F>();
+  if(dozero)
+    {
+      return _mm512_maskz_permutex2var_ps(z, a, pmask, b);
+    }
+  else
+    {
+      return _mm512_permutex2var_ps(a, pmask, b);
+    }
+}
+
+/*****************************************************************************
+ *
+ *          Vector lookup functions
+ *
+ ******************************************************************************
+ *
+ * These functions use vector elements as indexes into a table.
+ * The table is given as one or more vectors or as an array.
+ *
+ * This can be used for several purposes:
+ *  - table lookup
+ *  - permute or blend with variable indexes
+ *  - blend from more than two sources
+ *  - gather non-contiguous data
+ *
+ * An index out of range may produce any value - the actual value produced is
+ * implementation dependent and may be different for different instruction
+ * sets. An index out of range does not produce an error message or exception.
+ *
+ * Example:
+ * Vec8d a(2,0,0,6,4,3,5,0);                 // index a is (  2,   0,   0,   6,   4,   3,   5,   0)
+ * Vec8d b(100,101,102,103,104,105,106,107); // table b is (100, 101, 102, 103, 104, 105, 106, 107)
+ * Vec8d c;
+ * c = lookup8 (a,b);                        // c is       (102, 100, 100, 106, 104, 103, 105, 100)
+ *
+ *****************************************************************************/
+
+static inline Vec16f lookup16(Vec16i const &index, Vec16f const &table) { return _mm512_permutexvar_ps(index, table); }
+
+template <int n>
+static inline Vec16f lookup(Vec16i const &index, float const *table)
+{
+  if(n <= 0)
+    return 0;
+  if(n <= 16)
+    {
+      Vec16f table1 = Vec16f().load((float *)table);
+      return lookup16(index, table1);
+    }
+  if(n <= 32)
+    {
+      Vec16f table1 = Vec16f().load((float *)table);
+      Vec16f table2 = Vec16f().load((float *)table + 16);
+      return _mm512_permutex2var_ps(table1, index, table2);
+    }
+  // n > 32. Limit index
+  Vec16ui index1;
+  if((n & (n - 1)) == 0)
+    {
+      // n is a power of 2, make index modulo n
+      index1 = Vec16ui(index) & (n - 1);
+    }
+  else
+    {
+      // n is not a power of 2, limit to n-1
+      index1 = min(Vec16ui(index), uint32_t(n - 1));
+    }
+  return _mm512_i32gather_ps(index1, (const float *)table, 4);
+}
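+
+// Usage sketch (illustrative; the names "tab", "idx" and "r" are placeholders):
+//   float tab[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+//   Vec16i idx(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
+//   Vec16f r = lookup<16>(idx, tab);   // r[i] = tab[idx[i]], i.e. tab reversed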
+
+static inline Vec8d lookup8(Vec8q const &index, Vec8d const &table) { return _mm512_permutexvar_pd(index, table); }
+
+template <int n>
+static inline Vec8d lookup(Vec8q const &index, double const *table)
+{
+  if(n <= 0)
+    return 0;
+  if(n <= 8)
+    {
+      Vec8d table1 = Vec8d().load((double *)table);
+      return lookup8(index, table1);
+    }
+  if(n <= 16)
+    {
+      Vec8d table1 = Vec8d().load((double *)table);
+      Vec8d table2 = Vec8d().load((double *)table + 8);
+      return _mm512_permutex2var_pd(table1, index, table2);
+    }
+  // n > 16. Limit index
+  Vec8uq index1;
+  if((n & (n - 1)) == 0)
+    {
+      // n is a power of 2, make index modulo n
+      index1 = Vec8uq(index) & (n - 1);
+    }
+  else
+    {
+      // n is not a power of 2, limit to n-1
+      index1 = min(Vec8uq(index), uint32_t(n - 1));
+    }
+  return _mm512_i64gather_pd(index1, (const double *)table, 8);
+}
+
+/*****************************************************************************
+ *
+ *          Gather functions with fixed indexes
+ *
+ *****************************************************************************/
+// Load elements from array a with indices i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16f gather16f(void const *a)
+{
+  Static_error_check<(i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7 | i8 | i9 | i10 | i11 | i12 | i13 | i14 | i15) >= 0>
+      Negative_array_index;  // Error message if index is negative
+  // find smallest and biggest index, using only compile-time constant expressions
+  const int i01min    = i0 < i1 ? i0 : i1;
+  const int i23min    = i2 < i3 ? i2 : i3;
+  const int i45min    = i4 < i5 ? i4 : i5;
+  const int i67min    = i6 < i7 ? i6 : i7;
+  const int i89min    = i8 < i9 ? i8 : i9;
+  const int i1011min  = i10 < i11 ? i10 : i11;
+  const int i1213min  = i12 < i13 ? i12 : i13;
+  const int i1415min  = i14 < i15 ? i14 : i15;
+  const int i0_3min   = i01min < i23min ? i01min : i23min;
+  const int i4_7min   = i45min < i67min ? i45min : i67min;
+  const int i8_11min  = i89min < i1011min ? i89min : i1011min;
+  const int i12_15min = i1213min < i1415min ? i1213min : i1415min;
+  const int i0_7min   = i0_3min < i4_7min ? i0_3min : i4_7min;
+  const int i8_15min  = i8_11min < i12_15min ? i8_11min : i12_15min;
+  const int imin      = i0_7min < i8_15min ? i0_7min : i8_15min;
+  const int i01max    = i0 > i1 ? i0 : i1;
+  const int i23max    = i2 > i3 ? i2 : i3;
+  const int i45max    = i4 > i5 ? i4 : i5;
+  const int i67max    = i6 > i7 ? i6 : i7;
+  const int i89max    = i8 > i9 ? i8 : i9;
+  const int i1011max  = i10 > i11 ? i10 : i11;
+  const int i1213max  = i12 > i13 ? i12 : i13;
+  const int i1415max  = i14 > i15 ? i14 : i15;
+  const int i0_3max   = i01max > i23max ? i01max : i23max;
+  const int i4_7max   = i45max > i67max ? i45max : i67max;
+  const int i8_11max  = i89max > i1011max ? i89max : i1011max;
+  const int i12_15max = i1213max > i1415max ? i1213max : i1415max;
+  const int i0_7max   = i0_3max > i4_7max ? i0_3max : i4_7max;
+  const int i8_15max  = i8_11max > i12_15max ? i8_11max : i12_15max;
+  const int imax      = i0_7max > i8_15max ? i0_7max : i8_15max;
+  if(imax - imin <= 15)
+    {
+      // load one contiguous block and permute
+      if(imax > 15)
+        {
+          // make sure we don't read past the end of the array
+          Vec16f b = Vec16f().load((float const *)a + imax - 15);
+          return permute16f<i0 - imax + 15, i1 - imax + 15, i2 - imax + 15, i3 - imax + 15, i4 - imax + 15, i5 - imax + 15,
+                            i6 - imax + 15, i7 - imax + 15, i8 - imax + 15, i9 - imax + 15, i10 - imax + 15, i11 - imax + 15,
+                            i12 - imax + 15, i13 - imax + 15, i14 - imax + 15, i15 - imax + 15>(b);
+        }
+      else
+        {
+          Vec16f b = Vec16f().load((float const *)a + imin);
+          return permute16f<i0 - imin, i1 - imin, i2 - imin, i3 - imin, i4 - imin, i5 - imin, i6 - imin, i7 - imin, i8 - imin,
+                            i9 - imin, i10 - imin, i11 - imin, i12 - imin, i13 - imin, i14 - imin, i15 - imin>(b);
+        }
+    }
+  if((i0 < imin + 16 || i0 > imax - 16) && (i1 < imin + 16 || i1 > imax - 16) && (i2 < imin + 16 || i2 > imax - 16) &&
+     (i3 < imin + 16 || i3 > imax - 16) && (i4 < imin + 16 || i4 > imax - 16) && (i5 < imin + 16 || i5 > imax - 16) &&
+     (i6 < imin + 16 || i6 > imax - 16) && (i7 < imin + 16 || i7 > imax - 16) && (i8 < imin + 16 || i8 > imax - 16) &&
+     (i9 < imin + 16 || i9 > imax - 16) && (i10 < imin + 16 || i10 > imax - 16) && (i11 < imin + 16 || i11 > imax - 16) &&
+     (i12 < imin + 16 || i12 > imax - 16) && (i13 < imin + 16 || i13 > imax - 16) && (i14 < imin + 16 || i14 > imax - 16) &&
+     (i15 < imin + 16 || i15 > imax - 16))
+    {
+      // load two contiguous blocks and blend
+      Vec16f b      = Vec16f().load((float const *)a + imin);
+      Vec16f c      = Vec16f().load((float const *)a + imax - 15);
+      const int j0  = i0 < imin + 16 ? i0 - imin : 31 - imax + i0;
+      const int j1  = i1 < imin + 16 ? i1 - imin : 31 - imax + i1;
+      const int j2  = i2 < imin + 16 ? i2 - imin : 31 - imax + i2;
+      const int j3  = i3 < imin + 16 ? i3 - imin : 31 - imax + i3;
+      const int j4  = i4 < imin + 16 ? i4 - imin : 31 - imax + i4;
+      const int j5  = i5 < imin + 16 ? i5 - imin : 31 - imax + i5;
+      const int j6  = i6 < imin + 16 ? i6 - imin : 31 - imax + i6;
+      const int j7  = i7 < imin + 16 ? i7 - imin : 31 - imax + i7;
+      const int j8  = i8 < imin + 16 ? i8 - imin : 31 - imax + i8;
+      const int j9  = i9 < imin + 16 ? i9 - imin : 31 - imax + i9;
+      const int j10 = i10 < imin + 16 ? i10 - imin : 31 - imax + i10;
+      const int j11 = i11 < imin + 16 ? i11 - imin : 31 - imax + i11;
+      const int j12 = i12 < imin + 16 ? i12 - imin : 31 - imax + i12;
+      const int j13 = i13 < imin + 16 ? i13 - imin : 31 - imax + i13;
+      const int j14 = i14 < imin + 16 ? i14 - imin : 31 - imax + i14;
+      const int j15 = i15 < imin + 16 ? i15 - imin : 31 - imax + i15;
+      return blend16f<j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15>(b, c);
+    }
+  // use gather instruction
+  return _mm512_i32gather_ps(Vec16i(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15), (const float *)a, 4);
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8d gather8d(void const *a)
+{
+  Static_error_check<(i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) >= 0> Negative_array_index;  // Error message if index is negative
+
+  const int i01min   = i0 < i1 ? i0 : i1;
+  const int i23min   = i2 < i3 ? i2 : i3;
+  const int i45min   = i4 < i5 ? i4 : i5;
+  const int i67min   = i6 < i7 ? i6 : i7;
+  const int i0123min = i01min < i23min ? i01min : i23min;
+  const int i4567min = i45min < i67min ? i45min : i67min;
+  const int imin     = i0123min < i4567min ? i0123min : i4567min;
+  const int i01max   = i0 > i1 ? i0 : i1;
+  const int i23max   = i2 > i3 ? i2 : i3;
+  const int i45max   = i4 > i5 ? i4 : i5;
+  const int i67max   = i6 > i7 ? i6 : i7;
+  const int i0123max = i01max > i23max ? i01max : i23max;
+  const int i4567max = i45max > i67max ? i45max : i67max;
+  const int imax     = i0123max > i4567max ? i0123max : i4567max;
+  if(imax - imin <= 7)
+    {
+      // load one contiguous block and permute
+      if(imax > 7)
+        {
+          // make sure we don't read past the end of the array
+          Vec8d b = Vec8d().load((double const *)a + imax - 7);
+          return permute8d<i0 - imax + 7, i1 - imax + 7, i2 - imax + 7, i3 - imax + 7, i4 - imax + 7, i5 - imax + 7, i6 - imax + 7,
+                           i7 - imax + 7>(b);
+        }
+      else
+        {
+          Vec8d b = Vec8d().load((double const *)a + imin);
+          return permute8d<i0 - imin, i1 - imin, i2 - imin, i3 - imin, i4 - imin, i5 - imin, i6 - imin, i7 - imin>(b);
+        }
+    }
+  if((i0 < imin + 8 || i0 > imax - 8) && (i1 < imin + 8 || i1 > imax - 8) && (i2 < imin + 8 || i2 > imax - 8) &&
+     (i3 < imin + 8 || i3 > imax - 8) && (i4 < imin + 8 || i4 > imax - 8) && (i5 < imin + 8 || i5 > imax - 8) &&
+     (i6 < imin + 8 || i6 > imax - 8) && (i7 < imin + 8 || i7 > imax - 8))
+    {
+      // load two contiguous blocks and blend
+      Vec8d b      = Vec8d().load((double const *)a + imin);
+      Vec8d c      = Vec8d().load((double const *)a + imax - 7);
+      const int j0 = i0 < imin + 8 ? i0 - imin : 15 - imax + i0;
+      const int j1 = i1 < imin + 8 ? i1 - imin : 15 - imax + i1;
+      const int j2 = i2 < imin + 8 ? i2 - imin : 15 - imax + i2;
+      const int j3 = i3 < imin + 8 ? i3 - imin : 15 - imax + i3;
+      const int j4 = i4 < imin + 8 ? i4 - imin : 15 - imax + i4;
+      const int j5 = i5 < imin + 8 ? i5 - imin : 15 - imax + i5;
+      const int j6 = i6 < imin + 8 ? i6 - imin : 15 - imax + i6;
+      const int j7 = i7 < imin + 8 ? i7 - imin : 15 - imax + i7;
+      return blend8d<j0, j1, j2, j3, j4, j5, j6, j7>(b, c);
+    }
+  // use gather instruction
+  return _mm512_i64gather_pd(Vec8q(i0, i1, i2, i3, i4, i5, i6, i7), (const double *)a, 8);
+}
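+
+// Usage sketch for the fixed-index gathers (illustrative; "buf" is a placeholder):
+//   double buf[12] = {0,1,2,3,4,5,6,7,8,9,10,11};
+//   Vec8d v = gather8d<0,2,4,6,8,10,11,1>(buf);   // v = (0,2,4,6,8,10,11,1)
+// Depending on the index pattern, this compiles to a single permuted load, two
+// loads plus a blend, or a gather instruction, as selected above at compile time.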
+
+/*****************************************************************************
+ *
+ *          Vector scatter functions
+ *
+ ******************************************************************************
+ *
+ * These functions write the elements of a vector to arbitrary positions in an
+ * array in memory. Each vector element is written to an array position
+ * determined by an index. An element is not written if the corresponding
+ * index is out of range.
+ * The indexes can be specified as constant template parameters or as an
+ * integer vector.
+ *
+ * The scatter functions are useful if the data are distributed in a sparse
+ * manner into the array. If the array is dense then it is more efficient
+ * to permute the data into the right positions and then write the whole
+ * permuted vector into the array.
+ *
+ * Example:
+ * Vec8d a(10,11,12,13,14,15,16,17);
+ * double b[16] = {0};
+ * scatter<0,2,14,10,1,-1,5,9>(a,b);
+ * // Now, b = {10,14,11,0,0,16,0,0,0,17,13,0,0,0,12,0}
+ *
+ *****************************************************************************/
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline void scatter(Vec16f const &data, float *array)
+{
+  __m512i indx = constant16i<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15>();
+  Vec16fb mask(i0 >= 0, i1 >= 0, i2 >= 0, i3 >= 0, i4 >= 0, i5 >= 0, i6 >= 0, i7 >= 0, i8 >= 0, i9 >= 0, i10 >= 0, i11 >= 0, i12 >= 0,
+               i13 >= 0, i14 >= 0, i15 >= 0);
+  _mm512_mask_i32scatter_ps(array, mask, indx, data, 4);
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline void scatter(Vec8d const &data, double *array)
+{
+  __m256i indx = constant8i<i0, i1, i2, i3, i4, i5, i6, i7>();
+  Vec8db mask(i0 >= 0, i1 >= 0, i2 >= 0, i3 >= 0, i4 >= 0, i5 >= 0, i6 >= 0, i7 >= 0);
+  _mm512_mask_i32scatter_pd(array, mask, indx, data, 8);
+}
+
+static inline void scatter(Vec16i const &index, uint32_t limit, Vec16f const &data, float *array)
+{
+  Vec16fb mask = Vec16ui(index) < limit;
+  _mm512_mask_i32scatter_ps(array, mask, index, data, 4);
+}
+
+static inline void scatter(Vec8q const &index, uint32_t limit, Vec8d const &data, double *array)
+{
+  Vec8db mask = Vec8uq(index) < uint64_t(limit);
+  _mm512_mask_i64scatter_pd(array, mask, index, data, 8);
+}
+
+static inline void scatter(Vec8i const &index, uint32_t limit, Vec8d const &data, double *array)
+{
+#if defined(__AVX512VL__)
+  __mmask16 mask = _mm256_cmplt_epu32_mask(index, Vec8ui(limit));
+#else
+  __mmask16 mask = _mm512_cmplt_epu32_mask(_mm512_castsi256_si512(index), _mm512_castsi256_si512(Vec8ui(limit)));
+#endif
+  _mm512_mask_i32scatter_pd(array, mask, index, data, 8);
+}
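+
+// Usage sketch for the bounded scatters (illustrative; "idx", "v" and "out" are placeholders):
+//   Vec16i idx = ...;            // arbitrary target positions
+//   Vec16f v   = ...;
+//   scatter(idx, 100, v, out);   // writes v[i] to out[idx[i]] only where
+//                                // 0 <= idx[i] < 100; other elements are skipped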
+
+/*****************************************************************************
+ *
+ *          Horizontal scan functions
+ *
+ *****************************************************************************/
+
+// Get index to the first element that is true. Return -1 if all are false
+static inline int horizontal_find_first(Vec16fb const &x) { return horizontal_find_first(Vec16ib(x)); }
+
+static inline int horizontal_find_first(Vec8db const &x) { return horizontal_find_first(Vec8qb(x)); }
+
+// Count the number of elements that are true
+static inline uint32_t horizontal_count(Vec16fb const &x) { return horizontal_count(Vec16ib(x)); }
+
+static inline uint32_t horizontal_count(Vec8db const &x) { return horizontal_count(Vec8qb(x)); }
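+
+// Usage sketch (illustrative; "x" is a Vec16f placeholder): find and count
+// elements above a threshold:
+//   int      first = horizontal_find_first(x > Vec16f(1.0f));   // -1 if none
+//   uint32_t count = horizontal_count(x > Vec16f(1.0f));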
+
+/*****************************************************************************
+ *
+ *          Boolean <-> bitfield conversion functions
+ *
+ *****************************************************************************/
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint16_t to_bits(Vec16fb x) { return to_bits(Vec16ib(x)); }
+
+// to_Vec16fb: convert integer bitfield to boolean vector
+static inline Vec16fb to_Vec16fb(uint16_t x) { return Vec16fb(to_Vec16ib(x)); }
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec8db x) { return to_bits(Vec8qb(x)); }
+
+// to_Vec8db: convert integer bitfield to boolean vector
+static inline Vec8db to_Vec8db(uint8_t x) { return Vec8db(to_Vec8qb(x)); }
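+
+// Usage sketch (illustrative; "a" and "b" are Vec16f placeholders): a comparison
+// result round-trips through a bitfield, bit i corresponding to element i:
+//   uint16_t bits = to_bits(a > b);     // one bit per element
+//   Vec16fb  m    = to_Vec16fb(bits);   // the same mask again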
+
+#ifdef VCL_NAMESPACE
+}
+#endif
+
+#endif  // VECTORF512_H
diff --git a/src/vectorclass/vectorf512e.h b/src/vectorclass/vectorf512e.h
new file mode 100644
index 0000000000000000000000000000000000000000..0753ba46492c96ce4611c7861d8d80e909d230f6
--- /dev/null
+++ b/src/vectorclass/vectorf512e.h
@@ -0,0 +1,2273 @@
+/****************************  vectorf512e.h   ******************************
+ * Author:        Agner Fog
+ * Date created:  2014-07-23
+ * Last modified: 2017-02-19
+ * Version:       1.27
+ * Project:       vector classes
+ * Description:
+ * Header file defining 512-bit floating point vector classes as interface to
+ * intrinsic functions. This version emulates each 512-bit vector with two
+ * 256-bit halves, for processors without the AVX512 instruction set.
+ *
+ * Instructions:
+ * Use Gnu, Intel or Microsoft C++ compiler. Compile for the desired
+ * instruction set; this emulated version does not require AVX512F.
+ *
+ * The following vector classes are defined here:
+ * Vec16f    Vector of  16  single precision floating point numbers
+ * Vec16fb   Vector of  16  Booleans for use with Vec16f
+ * Vec8d     Vector of   8  double precision floating point numbers
+ * Vec8db    Vector of   8  Booleans for use with Vec8d
+ *
+ * Each vector object is represented internally in the CPU as a 512-bit register.
+ * This header file defines operators and functions for these vectors.
+ *
+ * For detailed instructions, see VectorClass.pdf
+ *
+ * (c) Copyright 2014-2017 GNU General Public License http://www.gnu.org/licenses
+ *****************************************************************************/
+
+// check combination of header files
+#if defined(VECTORF512_H)
+#if VECTORF512_H != 1
+#error Two different versions of vectorf512.h included
+#endif
+#else
+#define VECTORF512_H 1
+
+#include "vectori512e.h"
+
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE
+{
+#endif
+
+/*****************************************************************************
+ *
+ *          Vec16fb: Vector of 16 Booleans for use with Vec16f
+ *
+ *****************************************************************************/
+class Vec16fb : public Vec16b
+{
+ public:
+  // Default constructor:
+  Vec16fb() {}
+  // Constructor to build from all elements:
+  Vec16fb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7, bool x8, bool x9, bool x10, bool x11, bool x12,
+          bool x13, bool x14, bool x15)
+      : Vec16b(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15)
+  {
+  }
+  // Constructor from Vec16b
+  Vec16fb(Vec16b const &x)
+  {
+    z0 = x.get_low();
+    z1 = x.get_high();
+  }
+  // Constructor from two Vec8fb
+  Vec16fb(Vec8fb const &x0, Vec8fb const &x1)
+  {
+    z0 = x0;
+    z1 = x1;
+  }
+  // Constructor to broadcast scalar value:
+  Vec16fb(bool b) : Vec16b(b) {}
+  // Assignment operator to broadcast scalar value:
+  Vec16fb &operator=(bool b)
+  {
+    *this = Vec16b(b);
+    return *this;
+  }
+
+ private:  // Prevent constructing from int, etc.
+  Vec16fb(int b);
+  Vec16fb &operator=(int x);
+
+ public:
+  // Get low and high half
+  Vec8fb get_low() const { return reinterpret_f(Vec8i(z0)); }
+  Vec8fb get_high() const { return reinterpret_f(Vec8i(z1)); }
+};
+
+// Define operators for Vec16fb
+
+// vector operator & : bitwise and
+static inline Vec16fb operator&(Vec16fb const &a, Vec16fb const &b)
+{
+  return Vec16fb(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+static inline Vec16fb operator&&(Vec16fb const &a, Vec16fb const &b) { return a & b; }
+
+// vector operator | : bitwise or
+static inline Vec16fb operator|(Vec16fb const &a, Vec16fb const &b)
+{
+  return Vec16fb(a.get_low() | b.get_low(), a.get_high() | b.get_high());
+}
+static inline Vec16fb operator||(Vec16fb const &a, Vec16fb const &b) { return a | b; }
+
+// vector operator ^ : bitwise xor
+static inline Vec16fb operator^(Vec16fb const &a, Vec16fb const &b)
+{
+  return Vec16fb(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
+}
+
+// vector operator ~ : bitwise not
+static inline Vec16fb operator~(Vec16fb const &a) { return Vec16fb(~a.get_low(), ~a.get_high()); }
+
+// vector operator ! : element not
+static inline Vec16fb operator!(Vec16fb const &a) { return ~a; }
+
+// vector operator &= : bitwise and
+static inline Vec16fb &operator&=(Vec16fb &a, Vec16fb const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator |= : bitwise or
+static inline Vec16fb &operator|=(Vec16fb &a, Vec16fb const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^= : bitwise xor
+static inline Vec16fb &operator^=(Vec16fb &a, Vec16fb const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+/*****************************************************************************
+ *
+ *          Vec8db: Vector of 8 Booleans for use with Vec8d
+ *
+ *****************************************************************************/
+
+class Vec8db : public Vec512b
+{
+ public:
+  // Default constructor:
+  Vec8db() {}
+  // Constructor to build from all elements:
+  Vec8db(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7)
+  {
+    z0 = Vec4qb(x0, x1, x2, x3);
+    z1 = Vec4qb(x4, x5, x6, x7);
+  }
+  // Construct from Vec512b
+  Vec8db(Vec512b const &x)
+  {
+    z0 = x.get_low();
+    z1 = x.get_high();
+  }
+  // Constructor from two Vec4db
+  Vec8db(Vec4db const &x0, Vec4db const &x1)
+  {
+    z0 = x0;
+    z1 = x1;
+  }
+  // Constructor to broadcast single value:
+  Vec8db(bool b) { z0 = z1 = Vec8i(-int32_t(b)); }
+  // Assignment operator to broadcast scalar value:
+  Vec8db &operator=(bool b)
+  {
+    *this = Vec8db(b);
+    return *this;
+  }
+
+ private:
+  // Prevent constructing from int, etc. because of ambiguity
+  Vec8db(int b);
+  // Prevent assigning int because of ambiguity
+  Vec8db &operator=(int x);
+
+ public:
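+  // Member function to change a single element in vector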
+  Vec8db &insert(int index, bool a)
+  {
+    if(index < 4)
+      {
+        z0 = Vec4q(z0).insert(index, -(int64_t)a);
+      }
+    else
+      {
+        z1 = Vec4q(z1).insert(index - 4, -(int64_t)a);
+      }
+    return *this;
+  }
+  // Member function extract a single element from vector
+  bool extract(uint32_t index) const
+  {
+    if(index < 4)
+      {
+        return Vec4q(z0).extract(index) != 0;
+      }
+    else
+      {
+        return Vec4q(z1).extract(index - 4) != 0;
+      }
+  }
+  // Extract a single element. Operator [] can only read an element, not write.
+  bool operator[](uint32_t index) const { return extract(index); }
+  // Get low and high half
+  Vec4db get_low() const { return reinterpret_d(Vec4q(z0)); }
+  Vec4db get_high() const { return reinterpret_d(Vec4q(z1)); }
+  static int size() { return 8; }
+};
+
+// Define operators for Vec8db
+
+// vector operator & : bitwise and
+static inline Vec8db operator&(Vec8db const &a, Vec8db const &b)
+{
+  return Vec8db(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+static inline Vec8db operator&&(Vec8db const &a, Vec8db const &b) { return a & b; }
+
+// vector operator | : bitwise or
+static inline Vec8db operator|(Vec8db const &a, Vec8db const &b)
+{
+  return Vec8db(a.get_low() | b.get_low(), a.get_high() | b.get_high());
+}
+static inline Vec8db operator||(Vec8db const &a, Vec8db const &b) { return a | b; }
+
+// vector operator ^ : bitwise xor
+static inline Vec8db operator^(Vec8db const &a, Vec8db const &b)
+{
+  return Vec8db(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
+}
+
+// vector operator ~ : bitwise not
+static inline Vec8db operator~(Vec8db const &a) { return Vec8db(~a.get_low(), ~a.get_high()); }
+
+// vector operator ! : element not
+static inline Vec8db operator!(Vec8db const &a) { return ~a; }
+
+// vector operator &= : bitwise and
+static inline Vec8db &operator&=(Vec8db &a, Vec8db const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator |= : bitwise or
+static inline Vec8db &operator|=(Vec8db &a, Vec8db const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^= : bitwise xor
+static inline Vec8db &operator^=(Vec8db &a, Vec8db const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+/*****************************************************************************
+ *
+ *          Vec16f: Vector of 16 single precision floating point values
+ *
+ *****************************************************************************/
+
+class Vec16f
+{
+ protected:
+  Vec8f z0;
+  Vec8f z1;
+
+ public:
+  // Default constructor:
+  Vec16f() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec16f(float f) { z0 = z1 = Vec8f(f); }
+  // Constructor to build from all elements:
+  Vec16f(float f0, float f1, float f2, float f3, float f4, float f5, float f6, float f7, float f8, float f9, float f10, float f11,
+         float f12, float f13, float f14, float f15)
+  {
+    z0 = Vec8f(f0, f1, f2, f3, f4, f5, f6, f7);
+    z1 = Vec8f(f8, f9, f10, f11, f12, f13, f14, f15);
+  }
+  // Constructor to build from two Vec8f:
+  Vec16f(Vec8f const &a0, Vec8f const &a1)
+  {
+    z0 = a0;
+    z1 = a1;
+  }
+  // split into two halves
+  Vec8f get_low() const { return z0; }
+  Vec8f get_high() const { return z1; }
+  // Member function to load from array (unaligned)
+  Vec16f &load(float const *p)
+  {
+    z0 = Vec8f().load(p);
+    z1 = Vec8f().load(p + 8);
+    return *this;
+  }
+  // Member function to load from array, aligned by 64
+  // You may use load_a instead of load if you are certain that p points to an address
+  // divisible by 64.
+  Vec16f &load_a(float const *p)
+  {
+    z0 = Vec8f().load_a(p);
+    z1 = Vec8f().load_a(p + 8);
+    return *this;
+  }
+  // Member function to store into array (unaligned)
+  void store(float *p) const
+  {
+    Vec8f(z0).store(p);
+    Vec8f(z1).store(p + 8);
+  }
+  // Member function to store into array, aligned by 64
+  // You may use store_a instead of store if you are certain that p points to an address
+  // divisible by 64.
+  void store_a(float *p) const
+  {
+    Vec8f(z0).store_a(p);
+    Vec8f(z1).store_a(p + 8);
+  }
+  // Partial load. Load n elements and set the rest to 0
+  Vec16f &load_partial(int n, float const *p)
+  {
+    if(n < 8)
+      {
+        z0 = Vec8f().load_partial(n, p);
+        z1 = Vec8f(0.f);
+      }
+    else
+      {
+        z0 = Vec8f().load(p);
+        z1 = Vec8f().load_partial(n - 8, p + 8);
+      }
+    return *this;
+  }
+  // Partial store. Store n elements
+  void store_partial(int n, float *p) const
+  {
+    if(n < 8)
+      {
+        Vec8f(z0).store_partial(n, p);
+      }
+    else
+      {
+        Vec8f(z0).store(p);
+        Vec8f(z1).store_partial(n - 8, p + 8);
+      }
+  }
+  // cut off vector to n elements. The last 16-n elements are set to zero
+  Vec16f &cutoff(int n)
+  {
+    if(n < 8)
+      {
+        z0 = Vec8f(z0).cutoff(n);
+        z1 = Vec8f(0.f);
+      }
+    else
+      {
+        z1 = Vec8f(z1).cutoff(n - 8);
+      }
+    return *this;
+  }
+  // Member function to change a single element in vector
+  Vec16f const &insert(uint32_t index, float value)
+  {
+    if(index < 8)
+      {
+        z0 = Vec8f(z0).insert(index, value);
+      }
+    else
+      {
+        z1 = Vec8f(z1).insert(index - 8, value);
+      }
+    return *this;
+  }
+  // Member function extract a single element from vector
+  float extract(uint32_t index) const
+  {
+    float a[16];
+    store(a);
+    return a[index & 15];
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  float operator[](uint32_t index) const { return extract(index); }
+  static int size() { return 16; }
+};
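+
+// Usage sketch (illustrative; "data" and "len" are placeholders): processing an
+// array whose length is not a multiple of 16 with the partial load/store members:
+//   for(int i = 0; i < len; i += 16)
+//     {
+//       int n = len - i < 16 ? len - i : 16;
+//       Vec16f v;
+//       v.load_partial(n, data + i);
+//       v = v * 2.0f;
+//       v.store_partial(n, data + i);
+//     }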
+
+/*****************************************************************************
+ *
+ *          Operators for Vec16f
+ *
+ *****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec16f operator+(Vec16f const &a, Vec16f const &b)
+{
+  return Vec16f(a.get_low() + b.get_low(), a.get_high() + b.get_high());
+}
+
+// vector operator + : add vector and scalar
+static inline Vec16f operator+(Vec16f const &a, float b) { return a + Vec16f(b); }
+static inline Vec16f operator+(float a, Vec16f const &b) { return Vec16f(a) + b; }
+
+// vector operator += : add
+static inline Vec16f &operator+=(Vec16f &a, Vec16f const &b)
+{
+  a = a + b;
+  return a;
+}
+
+// postfix operator ++
+static inline Vec16f operator++(Vec16f &a, int)
+{
+  Vec16f a0 = a;
+  a         = a + 1.0f;
+  return a0;
+}
+
+// prefix operator ++
+static inline Vec16f &operator++(Vec16f &a)
+{
+  a = a + 1.0f;
+  return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec16f operator-(Vec16f const &a, Vec16f const &b)
+{
+  return Vec16f(a.get_low() - b.get_low(), a.get_high() - b.get_high());
+}
+
+// vector operator - : subtract vector and scalar
+static inline Vec16f operator-(Vec16f const &a, float b) { return a - Vec16f(b); }
+static inline Vec16f operator-(float a, Vec16f const &b) { return Vec16f(a) - b; }
+
+// vector operator - : unary minus
+// Change sign bit, even for 0, INF and NAN
+static inline Vec16f operator-(Vec16f const &a) { return Vec16f(-a.get_low(), -a.get_high()); }
+
+// vector operator -= : subtract
+static inline Vec16f &operator-=(Vec16f &a, Vec16f const &b)
+{
+  a = a - b;
+  return a;
+}
+
+// postfix operator --
+static inline Vec16f operator--(Vec16f &a, int)
+{
+  Vec16f a0 = a;
+  a         = a - 1.0f;
+  return a0;
+}
+
+// prefix operator --
+static inline Vec16f &operator--(Vec16f &a)
+{
+  a = a - 1.0f;
+  return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec16f operator*(Vec16f const &a, Vec16f const &b)
+{
+  return Vec16f(a.get_low() * b.get_low(), a.get_high() * b.get_high());
+}
+
+// vector operator * : multiply vector and scalar
+static inline Vec16f operator*(Vec16f const &a, float b) { return a * Vec16f(b); }
+static inline Vec16f operator*(float a, Vec16f const &b) { return Vec16f(a) * b; }
+
+// vector operator *= : multiply
+static inline Vec16f &operator*=(Vec16f &a, Vec16f const &b)
+{
+  a = a * b;
+  return a;
+}
+
+// vector operator / : divide element by element
+static inline Vec16f operator/(Vec16f const &a, Vec16f const &b)
+{
+  return Vec16f(a.get_low() / b.get_low(), a.get_high() / b.get_high());
+}
+
+// vector operator / : divide vector and scalar
+static inline Vec16f operator/(Vec16f const &a, float b) { return a / Vec16f(b); }
+static inline Vec16f operator/(float a, Vec16f const &b) { return Vec16f(a) / b; }
+
+// vector operator /= : divide
+static inline Vec16f &operator/=(Vec16f &a, Vec16f const &b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec16fb operator==(Vec16f const &a, Vec16f const &b)
+{
+  return Vec16fb(a.get_low() == b.get_low(), a.get_high() == b.get_high());
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec16fb operator!=(Vec16f const &a, Vec16f const &b)
+{
+  return Vec16fb(a.get_low() != b.get_low(), a.get_high() != b.get_high());
+}
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec16fb operator<(Vec16f const &a, Vec16f const &b)
+{
+  return Vec16fb(a.get_low() < b.get_low(), a.get_high() < b.get_high());
+}
+
+// vector operator <= : returns true for elements for which a <= b
+static inline Vec16fb operator<=(Vec16f const &a, Vec16f const &b)
+{
+  return Vec16fb(a.get_low() <= b.get_low(), a.get_high() <= b.get_high());
+}
+
+// vector operator > : returns true for elements for which a > b
+static inline Vec16fb operator>(Vec16f const &a, Vec16f const &b) { return b < a; }
+
+// vector operator >= : returns true for elements for which a >= b
+static inline Vec16fb operator>=(Vec16f const &a, Vec16f const &b) { return b <= a; }
+
+// Bitwise logical operators
+
+// vector operator & : bitwise and
+static inline Vec16f operator&(Vec16f const &a, Vec16f const &b)
+{
+  return Vec16f(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+
+// vector operator &= : bitwise and
+static inline Vec16f &operator&=(Vec16f &a, Vec16f const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator & : bitwise and of Vec16f and Vec16fb
+static inline Vec16f operator&(Vec16f const &a, Vec16fb const &b)
+{
+  return Vec16f(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+static inline Vec16f operator&(Vec16fb const &a, Vec16f const &b) { return b & a; }
+
+// vector operator | : bitwise or
+static inline Vec16f operator|(Vec16f const &a, Vec16f const &b)
+{
+  return Vec16f(a.get_low() | b.get_low(), a.get_high() | b.get_high());
+}
+
+// vector operator |= : bitwise or
+static inline Vec16f &operator|=(Vec16f &a, Vec16f const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec16f operator^(Vec16f const &a, Vec16f const &b)
+{
+  return Vec16f(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
+}
+
+// vector operator ^= : bitwise xor
+static inline Vec16f &operator^=(Vec16f &a, Vec16f const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ! : logical not. Returns Boolean vector
+static inline Vec16fb operator!(Vec16f const &a) { return Vec16fb(!a.get_low(), !a.get_high()); }
+
+/*****************************************************************************
+ *
+ *          Functions for Vec16f
+ *
+ *****************************************************************************/
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either 0 (false) or 0xFFFFFFFF (true). No other values are allowed.
+static inline Vec16f select(Vec16fb const &s, Vec16f const &a, Vec16f const &b)
+{
+  return Vec16f(select(s.get_low(), a.get_low(), b.get_low()), select(s.get_high(), a.get_high(), b.get_high()));
+}
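+
+// Usage sketch (illustrative):
+//   Vec16f m = select(a > b, a, b);   // element-wise maximum, same as max(a, b) below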
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec16f if_add(Vec16fb const &f, Vec16f const &a, Vec16f const &b)
+{
+  return Vec16f(if_add(f.get_low(), a.get_low(), b.get_low()), if_add(f.get_high(), a.get_high(), b.get_high()));
+}
+
+// Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i]
+static inline Vec16f if_mul(Vec16fb const &f, Vec16f const &a, Vec16f const &b)
+{
+  return Vec16f(if_mul(f.get_low(), a.get_low(), b.get_low()), if_mul(f.get_high(), a.get_high(), b.get_high()));
+}
+
+// Horizontal add: Calculates the sum of all vector elements.
+static inline float horizontal_add(Vec16f const &a) { return horizontal_add(a.get_low() + a.get_high()); }
+
+// function max: a > b ? a : b
+static inline Vec16f max(Vec16f const &a, Vec16f const &b)
+{
+  return Vec16f(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high()));
+}
+
+// function min: a < b ? a : b
+static inline Vec16f min(Vec16f const &a, Vec16f const &b)
+{
+  return Vec16f(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high()));
+}
+
+// function abs: absolute value
+// Removes sign bit, even for -0.0f, -INF and -NAN
+static inline Vec16f abs(Vec16f const &a) { return Vec16f(abs(a.get_low()), abs(a.get_high())); }
+
+// function sqrt: square root
+static inline Vec16f sqrt(Vec16f const &a) { return Vec16f(sqrt(a.get_low()), sqrt(a.get_high())); }
+
+// function square: a * a
+static inline Vec16f square(Vec16f const &a) { return a * a; }
+
+// pow(Vec16f, int):
+template <typename TT>
+static Vec16f pow(Vec16f const &a, TT const &n);
+
+// Raise floating point numbers to integer power n
+template <>
+inline Vec16f pow<int>(Vec16f const &x0, int const &n)
+{
+  return pow_template_i<Vec16f>(x0, n);
+}
+
+// allow conversion from unsigned int
+template <>
+inline Vec16f pow<uint32_t>(Vec16f const &x0, uint32_t const &n)
+{
+  return pow_template_i<Vec16f>(x0, (int)n);
+}
+
+// Raise floating point numbers to integer power n, where n is a compile-time constant
+template <int n>
+static inline Vec16f pow_n(Vec16f const &a)
+{
+  if(n < 0)
+    return Vec16f(1.0f) / pow_n<-n>(a);
+  if(n == 0)
+    return Vec16f(1.0f);
+  if(n >= 256)
+    return pow(a, n);
+  Vec16f x = a;                          // a^(2^i)
+  Vec16f y;                              // accumulator
+  const int lowest = n - (n & (n - 1));  // lowest set bit in n
+  if(n & 1)
+    y = x;
+  if(n < 2)
+    return y;
+  x = x * x;  // x^2
+  if(n & 2)
+    {
+      if(lowest == 2)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 4)
+    return y;
+  x = x * x;  // x^4
+  if(n & 4)
+    {
+      if(lowest == 4)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 8)
+    return y;
+  x = x * x;  // x^8
+  if(n & 8)
+    {
+      if(lowest == 8)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 16)
+    return y;
+  x = x * x;  // x^16
+  if(n & 16)
+    {
+      if(lowest == 16)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 32)
+    return y;
+  x = x * x;  // x^32
+  if(n & 32)
+    {
+      if(lowest == 32)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 64)
+    return y;
+  x = x * x;  // x^64
+  if(n & 64)
+    {
+      if(lowest == 64)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 128)
+    return y;
+  x = x * x;  // x^128
+  if(n & 128)
+    {
+      if(lowest == 128)
+        y = x;
+      else
+        y *= x;
+    }
+  return y;
+}
+
+template <int n>
+static inline Vec16f pow(Vec16f const &a, Const_int_t<n>)
+{
+  return pow_n<n>(a);
+}
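+
+// pow_n<n> evaluates the power by repeated squaring, so e.g. pow_n<13>(a) needs
+// only 5 multiplications (a^13 = a * a^4 * a^8). Usage sketch (illustrative):
+//   Vec16f y = pow(x, Const_int_t<13>());   // same as pow_n<13>(x)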
+
+// function round: round to nearest integer (even). (result as float vector)
+static inline Vec16f round(Vec16f const &a) { return Vec16f(round(a.get_low()), round(a.get_high())); }
+
+// function truncate: round towards zero. (result as float vector)
+static inline Vec16f truncate(Vec16f const &a) { return Vec16f(truncate(a.get_low()), truncate(a.get_high())); }
+
+// function floor: round towards minus infinity. (result as float vector)
+static inline Vec16f floor(Vec16f const &a) { return Vec16f(floor(a.get_low()), floor(a.get_high())); }
+
+// function ceil: round towards plus infinity. (result as float vector)
+static inline Vec16f ceil(Vec16f const &a) { return Vec16f(ceil(a.get_low()), ceil(a.get_high())); }
+
+// function round_to_int: round to nearest integer (even). (result as integer vector)
+static inline Vec16i round_to_int(Vec16f const &a) { return Vec16i(round_to_int(a.get_low()), round_to_int(a.get_high())); }
+
+// function truncate_to_int: round towards zero. (result as integer vector)
+static inline Vec16i truncate_to_int(Vec16f const &a) { return Vec16i(truncate_to_int(a.get_low()), truncate_to_int(a.get_high())); }
+
+// function to_float: convert integer vector to float vector
+static inline Vec16f to_float(Vec16i const &a) { return Vec16f(to_float(a.get_low()), to_float(a.get_high())); }
+
+// function to_float: convert unsigned integer vector to float vector
+static inline Vec16f to_float(Vec16ui const &a) { return Vec16f(to_float(a.get_low()), to_float(a.get_high())); }
+
+// Approximate math functions
+
+// approximate reciprocal (Faster than 1.f / a.
+// relative accuracy better than 2^-11 without AVX512, 2^-14 with AVX512)
+static inline Vec16f approx_recipr(Vec16f const &a) { return Vec16f(approx_recipr(a.get_low()), approx_recipr(a.get_high())); }
+
+// approximate reciprocal squareroot (Faster than 1.f / sqrt(a).
+// Relative accuracy better than 2^-11 without AVX512, 2^-14 with AVX512)
+static inline Vec16f approx_rsqrt(Vec16f const &a) { return Vec16f(approx_rsqrt(a.get_low()), approx_rsqrt(a.get_high())); }
+
+// Fused multiply and add functions
+
+// Multiply and add
+static inline Vec16f mul_add(Vec16f const &a, Vec16f const &b, Vec16f const &c)
+{
+  return Vec16f(mul_add(a.get_low(), b.get_low(), c.get_low()), mul_add(a.get_high(), b.get_high(), c.get_high()));
+}
+
+// Multiply and subtract
+static inline Vec16f mul_sub(Vec16f const &a, Vec16f const &b, Vec16f const &c)
+{
+  return Vec16f(mul_sub(a.get_low(), b.get_low(), c.get_low()), mul_sub(a.get_high(), b.get_high(), c.get_high()));
+}
+
+// Multiply and inverse subtract
+static inline Vec16f nmul_add(Vec16f const &a, Vec16f const &b, Vec16f const &c)
+{
+  return Vec16f(nmul_add(a.get_low(), b.get_low(), c.get_low()), nmul_add(a.get_high(), b.get_high(), c.get_high()));
+}
+
+// Multiply and subtract with extra precision on the intermediate calculations,
+// even if FMA instructions not supported, using Veltkamp-Dekker split
+static inline Vec16f mul_sub_x(Vec16f const &a, Vec16f const &b, Vec16f const &c)
+{
+  return Vec16f(mul_sub_x(a.get_low(), b.get_low(), c.get_low()), mul_sub_x(a.get_high(), b.get_high(), c.get_high()));
+}
+
+// Math functions using fast bit manipulation
+
+// Extract the exponent as an integer
+// exponent(a) = floor(log2(abs(a)));
+// exponent(1.0f) = 0, exponent(0.0f) = -127, exponent(INF) = +128, exponent(NAN) = +128
+static inline Vec16i exponent(Vec16f const &a) { return Vec16i(exponent(a.get_low()), exponent(a.get_high())); }
+
+// Extract the fraction part of a floating point number
+// a = 2^exponent(a) * fraction(a), except for a = 0
+// fraction(1.0f) = 1.0f, fraction(5.0f) = 1.25f
+static inline Vec16f fraction(Vec16f const &a) { return Vec16f(fraction(a.get_low()), fraction(a.get_high())); }
+
+// Fast calculation of pow(2,n) with n integer
+// n  =    0 gives 1.0f
+// n >=  128 gives +INF
+// n <= -127 gives 0.0f
+// This function will never produce denormals, and never raise exceptions
+static inline Vec16f exp2(Vec16i const &n) { return Vec16f(exp2(n.get_low()), exp2(n.get_high())); }
+// static Vec16f exp2(Vec16f const & x); // defined in vectormath_exp.h
+
+// Categorization functions
+
+// Function sign_bit: gives true for elements that have the sign bit set
+// even for -0.0f, -INF and -NAN
+// Note that sign_bit(Vec16f(-0.0f)) gives true, while Vec16f(-0.0f) < Vec16f(0.0f) gives false
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec16fb sign_bit(Vec16f const &a) { return Vec16fb(sign_bit(a.get_low()), sign_bit(a.get_high())); }
+
+// Function sign_combine: changes the sign of a when b has the sign bit set
+// same as select(sign_bit(b), -a, a)
+static inline Vec16f sign_combine(Vec16f const &a, Vec16f const &b)
+{
+  return Vec16f(sign_combine(a.get_low(), b.get_low()), sign_combine(a.get_high(), b.get_high()));
+}
+
+// Function is_finite: gives true for elements that are normal, denormal or zero,
+// false for INF and NAN
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec16fb is_finite(Vec16f const &a) { return Vec16fb(is_finite(a.get_low()), is_finite(a.get_high())); }
+
+// Function is_inf: gives true for elements that are +INF or -INF
+// false for finite numbers and NAN
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec16fb is_inf(Vec16f const &a) { return Vec16fb(is_inf(a.get_low()), is_inf(a.get_high())); }
+
+// Function is_nan: gives true for elements that are +NAN or -NAN
+// false for finite numbers and +/-INF
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec16fb is_nan(Vec16f const &a) { return Vec16fb(is_nan(a.get_low()), is_nan(a.get_high())); }
+
+// Function is_subnormal: gives true for elements that are denormal (subnormal)
+// false for finite numbers, zero, NAN and INF
+static inline Vec16fb is_subnormal(Vec16f const &a) { return Vec16fb(is_subnormal(a.get_low()), is_subnormal(a.get_high())); }
+
+// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
+// false for finite numbers, NAN and INF
+static inline Vec16fb is_zero_or_subnormal(Vec16f const &a)
+{
+  return Vec16fb(is_zero_or_subnormal(a.get_low()), is_zero_or_subnormal(a.get_high()));
+}
+
+// Function infinite16f: returns a vector where all elements are +INF
+static inline Vec16f infinite16f()
+{
+  Vec8f inf = infinite8f();
+  return Vec16f(inf, inf);
+}
+
+// Function nan16f: returns a vector where all elements are +NAN (quiet)
+static inline Vec16f nan16f(int n = 0x10)
+{
+  Vec8f nan = nan8f(n);
+  return Vec16f(nan, nan);
+}
+
+// change signs on vectors Vec16f
+// Each index i0 - i15 is 1 for changing sign on the corresponding element, 0 for no change
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16f change_sign(Vec16f const &a)
+{
+  return Vec16f(change_sign<i0, i1, i2, i3, i4, i5, i6, i7>(a.get_low()),
+                change_sign<i8, i9, i10, i11, i12, i13, i14, i15>(a.get_high()));
+}
+
+/*****************************************************************************
+ *
+ *          Vec8d: Vector of 8 double precision floating point values
+ *
+ *****************************************************************************/
+
+class Vec8d
+{
+ protected:
+  Vec4d z0;
+  Vec4d z1;
+
+ public:
+  // Default constructor:
+  Vec8d() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec8d(double d) { z0 = z1 = Vec4d(d); }
+  // Constructor to build from all elements:
+  Vec8d(double d0, double d1, double d2, double d3, double d4, double d5, double d6, double d7)
+  {
+    z0 = Vec4d(d0, d1, d2, d3);
+    z1 = Vec4d(d4, d5, d6, d7);
+  }
+  // Constructor to build from two Vec4d:
+  Vec8d(Vec4d const &a0, Vec4d const &a1)
+  {
+    z0 = a0;
+    z1 = a1;
+  }
+  // Member function to load from array (unaligned)
+  Vec8d &load(double const *p)
+  {
+    z0.load(p);
+    z1.load(p + 4);
+    return *this;
+  }
+  // Member function to load from array, aligned by 64
+  // You may use load_a instead of load if you are certain that p points to an address
+  // divisible by 64
+  Vec8d &load_a(double const *p)
+  {
+    z0.load_a(p);
+    z1.load_a(p + 4);
+    return *this;
+  }
+  // Member function to store into array (unaligned)
+  void store(double *p) const
+  {
+    z0.store(p);
+    z1.store(p + 4);
+  }
+  // Member function to store into array, aligned by 64
+  // You may use store_a instead of store if you are certain that p points to an address
+  // divisible by 64
+  void store_a(double *p) const
+  {
+    z0.store_a(p);
+    z1.store_a(p + 4);
+  }
+  // Partial load. Load n elements and set the rest to 0
+  Vec8d &load_partial(int n, double const *p)
+  {
+    if(n < 4)
+      {
+        z0.load_partial(n, p);
+        z1 = Vec4d(0.);
+      }
+    else
+      {
+        z0.load(p);
+        z1.load_partial(n - 4, p + 4);
+      }
+    return *this;
+  }
+  // Partial store. Store n elements
+  void store_partial(int n, double *p) const
+  {
+    if(n < 4)
+      {
+        z0.store_partial(n, p);
+      }
+    else
+      {
+        z0.store(p);
+        z1.store_partial(n - 4, p + 4);
+      }
+  }
+  // cut off vector to n elements. The last 8-n elements are set to zero
+  Vec8d &cutoff(int n)
+  {
+    if(n < 4)
+      {
+        z0.cutoff(n);
+        z1 = Vec4d(0.);
+      }
+    else
+      {
+        z1.cutoff(n - 4);
+      }
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec8d const &insert(uint32_t index, double value)
+  {
+    if(index < 4)
+      {
+        z0.insert(index, value);
+      }
+    else
+      {
+        z1.insert(index - 4, value);
+      }
+    return *this;
+  }
+  // Member function extract a single element from vector
+  double extract(uint32_t index) const
+  {
+    double a[8];
+    store(a);
+    return a[index & 7];
+  }
+
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  double operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec4d:
+  Vec4d get_low() const { return z0; }
+  Vec4d get_high() const { return z1; }
+  static int size() { return 8; }
+};
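+
+// Usage sketch (illustrative): single-element access is relatively slow; prefer
+// load/store when touching more than one element:
+//   Vec8d v(0,1,2,3,4,5,6,7);
+//   double x = v[3];      // x = 3.0, via extract()
+//   v.insert(3, 42.0);    // v becomes (0,1,2,42,4,5,6,7)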
+
+/*****************************************************************************
+ *
+ *          Operators for Vec8d
+ *
+ *****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec8d operator+(Vec8d const &a, Vec8d const &b) { return Vec8d(a.get_low() + b.get_low(), a.get_high() + b.get_high()); }
+
+// vector operator + : add vector and scalar
+static inline Vec8d operator+(Vec8d const &a, double b) { return a + Vec8d(b); }
+static inline Vec8d operator+(double a, Vec8d const &b) { return Vec8d(a) + b; }
+
+// vector operator += : add
+static inline Vec8d &operator+=(Vec8d &a, Vec8d const &b)
+{
+  a = a + b;
+  return a;
+}
+
+// postfix operator ++
+static inline Vec8d operator++(Vec8d &a, int)
+{
+  Vec8d a0 = a;
+  a        = a + 1.0;
+  return a0;
+}
+
+// prefix operator ++
+static inline Vec8d &operator++(Vec8d &a)
+{
+  a = a + 1.0;
+  return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec8d operator-(Vec8d const &a, Vec8d const &b) { return Vec8d(a.get_low() - b.get_low(), a.get_high() - b.get_high()); }
+
+// vector operator - : subtract vector and scalar
+static inline Vec8d operator-(Vec8d const &a, double b) { return a - Vec8d(b); }
+static inline Vec8d operator-(double a, Vec8d const &b) { return Vec8d(a) - b; }
+
+// vector operator - : unary minus
+// Change sign bit, even for 0, INF and NAN
+static inline Vec8d operator-(Vec8d const &a) { return Vec8d(-a.get_low(), -a.get_high()); }
+
+// vector operator -= : subtract
+static inline Vec8d &operator-=(Vec8d &a, Vec8d const &b)
+{
+  a = a - b;
+  return a;
+}
+
+// postfix operator --
+static inline Vec8d operator--(Vec8d &a, int)
+{
+  Vec8d a0 = a;
+  a        = a - 1.0;
+  return a0;
+}
+
+// prefix operator --
+static inline Vec8d &operator--(Vec8d &a)
+{
+  a = a - 1.0;
+  return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec8d operator*(Vec8d const &a, Vec8d const &b) { return Vec8d(a.get_low() * b.get_low(), a.get_high() * b.get_high()); }
+
+// vector operator * : multiply vector and scalar
+static inline Vec8d operator*(Vec8d const &a, double b) { return a * Vec8d(b); }
+static inline Vec8d operator*(double a, Vec8d const &b) { return Vec8d(a) * b; }
+
+// vector operator *= : multiply
+static inline Vec8d &operator*=(Vec8d &a, Vec8d const &b)
+{
+  a = a * b;
+  return a;
+}
+
+// vector operator / : divide element by element
+static inline Vec8d operator/(Vec8d const &a, Vec8d const &b) { return Vec8d(a.get_low() / b.get_low(), a.get_high() / b.get_high()); }
+
+// vector operator / : divide vector and scalar
+static inline Vec8d operator/(Vec8d const &a, double b) { return a / Vec8d(b); }
+static inline Vec8d operator/(double a, Vec8d const &b) { return Vec8d(a) / b; }
+
+// vector operator /= : divide
+static inline Vec8d &operator/=(Vec8d &a, Vec8d const &b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec8db operator==(Vec8d const &a, Vec8d const &b)
+{
+  return Vec8db(a.get_low() == b.get_low(), a.get_high() == b.get_high());
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec8db operator!=(Vec8d const &a, Vec8d const &b)
+{
+  return Vec8db(a.get_low() != b.get_low(), a.get_high() != b.get_high());
+}
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec8db operator<(Vec8d const &a, Vec8d const &b)
+{
+  return Vec8db(a.get_low() < b.get_low(), a.get_high() < b.get_high());
+}
+
+// vector operator <= : returns true for elements for which a <= b
+static inline Vec8db operator<=(Vec8d const &a, Vec8d const &b)
+{
+  return Vec8db(a.get_low() <= b.get_low(), a.get_high() <= b.get_high());
+}
+
+// vector operator > : returns true for elements for which a > b
+static inline Vec8db operator>(Vec8d const &a, Vec8d const &b) { return b < a; }
+
+// vector operator >= : returns true for elements for which a >= b
+static inline Vec8db operator>=(Vec8d const &a, Vec8d const &b) { return b <= a; }
+
+// Bitwise logical operators
+
+// vector operator & : bitwise and
+static inline Vec8d operator&(Vec8d const &a, Vec8d const &b) { return Vec8d(a.get_low() & b.get_low(), a.get_high() & b.get_high()); }
+
+// vector operator &= : bitwise and
+static inline Vec8d &operator&=(Vec8d &a, Vec8d const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator & : bitwise and of Vec8d and Vec8db
+static inline Vec8d operator&(Vec8d const &a, Vec8db const &b)
+{
+  return Vec8d(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+
+static inline Vec8d operator&(Vec8db const &a, Vec8d const &b) { return b & a; }
+
+// vector operator | : bitwise or
+static inline Vec8d operator|(Vec8d const &a, Vec8d const &b) { return Vec8d(a.get_low() | b.get_low(), a.get_high() | b.get_high()); }
+
+// vector operator |= : bitwise or
+static inline Vec8d &operator|=(Vec8d &a, Vec8d const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec8d operator^(Vec8d const &a, Vec8d const &b) { return Vec8d(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); }
+
+// vector operator ^= : bitwise xor
+static inline Vec8d &operator^=(Vec8d &a, Vec8d const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ! : logical not. Returns Boolean vector
+static inline Vec8db operator!(Vec8d const &a) { return Vec8db(!a.get_low(), !a.get_high()); }
+
+/*****************************************************************************
+ *
+ *          Functions for Vec8d
+ *
+ *****************************************************************************/
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+static inline Vec8d select(Vec8db const &s, Vec8d const &a, Vec8d const &b)
+{
+  return Vec8d(select(s.get_low(), a.get_low(), b.get_low()), select(s.get_high(), a.get_high(), b.get_high()));
+}
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec8d if_add(Vec8db const &f, Vec8d const &a, Vec8d const &b)
+{
+  return Vec8d(if_add(f.get_low(), a.get_low(), b.get_low()), if_add(f.get_high(), a.get_high(), b.get_high()));
+}
+
+// Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i]
+static inline Vec8d if_mul(Vec8db const &f, Vec8d const &a, Vec8d const &b)
+{
+  return Vec8d(if_mul(f.get_low(), a.get_low(), b.get_low()), if_mul(f.get_high(), a.get_high(), b.get_high()));
+}
+
+// General arithmetic functions, etc.
+
+// Horizontal add: Calculates the sum of all vector elements.
+static inline double horizontal_add(Vec8d const &a) { return horizontal_add(a.get_low() + a.get_high()); }
+
+// function max: a > b ? a : b
+static inline Vec8d max(Vec8d const &a, Vec8d const &b)
+{
+  return Vec8d(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high()));
+}
+
+// function min: a < b ? a : b
+static inline Vec8d min(Vec8d const &a, Vec8d const &b)
+{
+  return Vec8d(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high()));
+}
+
+// function abs: absolute value
+// Removes sign bit, even for -0.0f, -INF and -NAN
+static inline Vec8d abs(Vec8d const &a) { return Vec8d(abs(a.get_low()), abs(a.get_high())); }
+
+// function sqrt: square root
+static inline Vec8d sqrt(Vec8d const &a) { return Vec8d(sqrt(a.get_low()), sqrt(a.get_high())); }
+
+// function square: a * a
+static inline Vec8d square(Vec8d const &a) { return a * a; }
+
+// pow(Vec8d, int):
+template <typename TT>
+static Vec8d pow(Vec8d const &a, TT const &n);
+
+// Raise floating point numbers to integer power n
+template <>
+inline Vec8d pow<int>(Vec8d const &x0, int const &n)
+{
+  return pow_template_i<Vec8d>(x0, n);
+}
+
+// allow conversion from unsigned int
+template <>
+inline Vec8d pow<uint32_t>(Vec8d const &x0, uint32_t const &n)
+{
+  return pow_template_i<Vec8d>(x0, (int)n);
+}
+
+// Raise floating point numbers to integer power n, where n is a compile-time constant
+template <int n>
+static inline Vec8d pow_n(Vec8d const &a)
+{
+  if(n < 0)
+    return Vec8d(1.0) / pow_n<-n>(a);
+  if(n == 0)
+    return Vec8d(1.0);
+  if(n >= 256)
+    return pow(a, n);
+  Vec8d x = a;                           // a^(2^i)
+  Vec8d y;                               // accumulator
+  const int lowest = n - (n & (n - 1));  // lowest set bit in n
+  if(n & 1)
+    y = x;
+  if(n < 2)
+    return y;
+  x = x * x;  // x^2
+  if(n & 2)
+    {
+      if(lowest == 2)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 4)
+    return y;
+  x = x * x;  // x^4
+  if(n & 4)
+    {
+      if(lowest == 4)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 8)
+    return y;
+  x = x * x;  // x^8
+  if(n & 8)
+    {
+      if(lowest == 8)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 16)
+    return y;
+  x = x * x;  // x^16
+  if(n & 16)
+    {
+      if(lowest == 16)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 32)
+    return y;
+  x = x * x;  // x^32
+  if(n & 32)
+    {
+      if(lowest == 32)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 64)
+    return y;
+  x = x * x;  // x^64
+  if(n & 64)
+    {
+      if(lowest == 64)
+        y = x;
+      else
+        y *= x;
+    }
+  if(n < 128)
+    return y;
+  x = x * x;  // x^128
+  if(n & 128)
+    {
+      if(lowest == 128)
+        y = x;
+      else
+        y *= x;
+    }
+  return y;
+}
+
+template <int n>
+static inline Vec8d pow(Vec8d const &a, Const_int_t<n>)
+{
+  return pow_n<n>(a);
+}
+
+// function round: round to nearest integer (even). (result as double vector)
+static inline Vec8d round(Vec8d const &a) { return Vec8d(round(a.get_low()), round(a.get_high())); }
+
+// function truncate: round towards zero. (result as double vector)
+static inline Vec8d truncate(Vec8d const &a) { return Vec8d(truncate(a.get_low()), truncate(a.get_high())); }
+
+// function floor: round towards minus infinity. (result as double vector)
+static inline Vec8d floor(Vec8d const &a) { return Vec8d(floor(a.get_low()), floor(a.get_high())); }
+
+// function ceil: round towards plus infinity. (result as double vector)
+static inline Vec8d ceil(Vec8d const &a) { return Vec8d(ceil(a.get_low()), ceil(a.get_high())); }
+
+// function round_to_int: round to nearest integer (even). (result as integer vector)
+static inline Vec8i round_to_int(Vec8d const &a)
+{
+  // Note: assume MXCSR control register is set to rounding
+  return Vec8i(round_to_int(a.get_low()), round_to_int(a.get_high()));
+}
+
+// function truncate_to_int: round towards zero. (result as integer vector)
+static inline Vec8i truncate_to_int(Vec8d const &a) { return Vec8i(truncate_to_int(a.get_low()), truncate_to_int(a.get_high())); }
+
+// function truncate_to_int64: round towards zero. (inefficient)
+static inline Vec8q truncate_to_int64(Vec8d const &a)
+{
+  return Vec8q(truncate_to_int64(a.get_low()), truncate_to_int64(a.get_high()));
+}
+
+// function truncate_to_int64_limited: round towards zero.
+// result as 64-bit integer vector, but with limited range
+static inline Vec8q truncate_to_int64_limited(Vec8d const &a)
+{
+  // Note: assume MXCSR control register is set to rounding
+  return Vec8q(truncate_to_int64_limited(a.get_low()), truncate_to_int64_limited(a.get_high()));
+}
+
+// function round_to_int64: round to nearest or even. (inefficient)
+static inline Vec8q round_to_int64(Vec8d const &a) { return Vec8q(round_to_int64(a.get_low()), round_to_int64(a.get_high())); }
+
+// function round_to_int64_limited: round to nearest integer (even)
+// result as 64-bit integer vector, but with limited range
+static inline Vec8q round_to_int64_limited(Vec8d const &a)
+{
+  // Note: assume MXCSR control register is set to rounding
+  return Vec8q(round_to_int64_limited(a.get_low()), round_to_int64_limited(a.get_high()));
+}
+
+// function to_double: convert integer vector elements to double vector (inefficient)
+static inline Vec8d to_double(Vec8q const &a) { return Vec8d(to_double(a.get_low()), to_double(a.get_high())); }
+
+// function to_double_limited: convert integer vector elements to double vector
+// limited to abs(x) < 2^31
+static inline Vec8d to_double_limited(Vec8q const &a)
+{
+  return Vec8d(to_double_limited(a.get_low()), to_double_limited(a.get_high()));
+}
+
+// function to_double: convert integer vector to double vector
+static inline Vec8d to_double(Vec8i const &a) { return Vec8d(to_double(a.get_low()), to_double(a.get_high())); }
+
+// function compress: convert two Vec8d to one Vec16f
+static inline Vec16f compress(Vec8d const &low, Vec8d const &high)
+{
+  return Vec16f(compress(low.get_low(), low.get_high()), compress(high.get_low(), high.get_high()));
+}
+
+// Function extend_low : convert Vec16f vector elements 0 - 7 to Vec8d
+static inline Vec8d extend_low(Vec16f const &a) { return Vec8d(extend_low(a.get_low()), extend_high(a.get_low())); }
+
+// Function extend_high : convert Vec16f vector elements 8 - 15 to Vec8d
+static inline Vec8d extend_high(Vec16f const &a) { return Vec8d(extend_low(a.get_high()), extend_high(a.get_high())); }
+
+// Fused multiply and add functions
+
+// Multiply and add
+static inline Vec8d mul_add(Vec8d const &a, Vec8d const &b, Vec8d const &c)
+{
+  return Vec8d(mul_add(a.get_low(), b.get_low(), c.get_low()), mul_add(a.get_high(), b.get_high(), c.get_high()));
+}
+
+// Multiply and subtract
+static inline Vec8d mul_sub(Vec8d const &a, Vec8d const &b, Vec8d const &c)
+{
+  return Vec8d(mul_sub(a.get_low(), b.get_low(), c.get_low()), mul_sub(a.get_high(), b.get_high(), c.get_high()));
+}
+
+// Negative multiply and add: -(a * b) + c
+static inline Vec8d nmul_add(Vec8d const &a, Vec8d const &b, Vec8d const &c)
+{
+  return Vec8d(nmul_add(a.get_low(), b.get_low(), c.get_low()), nmul_add(a.get_high(), b.get_high(), c.get_high()));
+}
+
+// Multiply and subtract with extra precision on the intermediate calculations,
+// even if FMA instructions not supported, using Veltkamp-Dekker split
+static inline Vec8d mul_sub_x(Vec8d const &a, Vec8d const &b, Vec8d const &c)
+{
+  return Vec8d(mul_sub_x(a.get_low(), b.get_low(), c.get_low()), mul_sub_x(a.get_high(), b.get_high(), c.get_high()));
+}
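+// Usage sketch (illustrative): a common use of mul_sub_x is to recover the rounding
+// error of a product, e.g.
+//   Vec8d p   = a * b;
+//   Vec8d err = mul_sub_x(a, b, p);   // err approximately equals exact(a*b) - p, even without FMA hardware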
+
+// Math functions using fast bit manipulation
+
+// Extract the exponent as an integer
+// exponent(a) = floor(log2(abs(a)));
+// exponent(1.0) = 0, exponent(0.0) = -1023, exponent(INF) = +1024, exponent(NAN) = +1024
+static inline Vec8q exponent(Vec8d const &a) { return Vec8q(exponent(a.get_low()), exponent(a.get_high())); }
+
+// Extract the fraction part of a floating point number
+// a = 2^exponent(a) * fraction(a), except for a = 0
+// fraction(1.0) = 1.0, fraction(5.0) = 1.25
+static inline Vec8d fraction(Vec8d const &a) { return Vec8d(fraction(a.get_low()), fraction(a.get_high())); }
+
+// Fast calculation of pow(2,n) with n integer
+// n  =     0 gives 1.0
+// n >=  1024 gives +INF
+// n <= -1023 gives 0.0
+// This function will never produce denormals, and never raise exceptions
+static inline Vec8d exp2(Vec8q const &n) { return Vec8d(exp2(n.get_low()), exp2(n.get_high())); }
+// static Vec8d exp2(Vec8d const & x); // defined in vectormath_exp.h
+
+// Categorization functions
+
+// Function sign_bit: gives true for elements that have the sign bit set
+// even for -0.0, -INF and -NAN
+// Note that sign_bit(Vec8d(-0.0)) gives true, while Vec8d(-0.0) < Vec8d(0.0) gives false
+static inline Vec8db sign_bit(Vec8d const &a) { return Vec8db(sign_bit(a.get_low()), sign_bit(a.get_high())); }
+
+// Function sign_combine: changes the sign of a when b has the sign bit set
+// same as select(sign_bit(b), -a, a)
+static inline Vec8d sign_combine(Vec8d const &a, Vec8d const &b)
+{
+  return Vec8d(sign_combine(a.get_low(), b.get_low()), sign_combine(a.get_high(), b.get_high()));
+}
+
+// Function is_finite: gives true for elements that are normal, denormal or zero,
+// false for INF and NAN
+static inline Vec8db is_finite(Vec8d const &a) { return Vec8db(is_finite(a.get_low()), is_finite(a.get_high())); }
+
+// Function is_inf: gives true for elements that are +INF or -INF
+// false for finite numbers and NAN
+static inline Vec8db is_inf(Vec8d const &a) { return Vec8db(is_inf(a.get_low()), is_inf(a.get_high())); }
+
+// Function is_nan: gives true for elements that are +NAN or -NAN
+// false for finite numbers and +/-INF
+static inline Vec8db is_nan(Vec8d const &a) { return Vec8db(is_nan(a.get_low()), is_nan(a.get_high())); }
+
+// Function is_subnormal: gives true for elements that are denormal (subnormal)
+// false for finite numbers, zero, NAN and INF
+static inline Vec8db is_subnormal(Vec8d const &a) { return Vec8db(is_subnormal(a.get_low()), is_subnormal(a.get_high())); }
+
+// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
+// false for finite numbers, NAN and INF
+static inline Vec8db is_zero_or_subnormal(Vec8d const &a)
+{
+  return Vec8db(is_zero_or_subnormal(a.get_low()), is_zero_or_subnormal(a.get_high()));
+}
+
+// Function infinite8d: returns a vector where all elements are +INF
+static inline Vec8d infinite8d()
+{
+  Vec4d inf = infinite4d();
+  return Vec8d(inf, inf);
+}
+
+// Function nan8d: returns a vector where all elements are +NAN (quiet NAN)
+static inline Vec8d nan8d(int n = 0x10)
+{
+  Vec4d nan = nan4d(n);
+  return Vec8d(nan, nan);
+}
+
+// change signs on vectors Vec8d
+// Each index i0 - i7 is 1 for changing sign on the corresponding element, 0 for no change
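+// Usage sketch (illustrative): change_sign<1,0,0,0,0,0,0,1>(a) negates elements 0 and 7
+// and leaves the remaining elements unchanged.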
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8d change_sign(Vec8d const &a)
+{
+  return Vec8d(change_sign<i0, i1, i2, i3>(a.get_low()), change_sign<i4, i5, i6, i7>(a.get_high()));
+}
+
+/*****************************************************************************
+ *
+ *          Functions for reinterpretation between vector types
+ *
+ *****************************************************************************/
+
+static inline Vec512ie reinterpret_i(Vec512ie const &x) { return x; }
+
+static inline Vec512ie reinterpret_i(Vec16f const &x) { return Vec512ie(reinterpret_i(x.get_low()), reinterpret_i(x.get_high())); }
+
+static inline Vec512ie reinterpret_i(Vec8d const &x) { return Vec512ie(reinterpret_i(x.get_low()), reinterpret_i(x.get_high())); }
+
+static inline Vec16f reinterpret_f(Vec512ie const &x)
+{
+  return Vec16f(Vec8f(reinterpret_f(x.get_low())), Vec8f(reinterpret_f(x.get_high())));
+}
+
+static inline Vec16f reinterpret_f(Vec16f const &x) { return x; }
+
+static inline Vec16f reinterpret_f(Vec8d const &x)
+{
+  return Vec16f(Vec8f(reinterpret_f(x.get_low())), Vec8f(reinterpret_f(x.get_high())));
+}
+
+static inline Vec8d reinterpret_d(Vec512ie const &x)
+{
+  return Vec8d(Vec4d(reinterpret_d(x.get_low())), Vec4d(reinterpret_d(x.get_high())));
+}
+
+static inline Vec8d reinterpret_d(Vec16f const &x)
+{
+  return Vec8d(Vec4d(reinterpret_d(x.get_low())), Vec4d(reinterpret_d(x.get_high())));
+}
+
+static inline Vec8d reinterpret_d(Vec8d const &x) { return x; }
+
+/*****************************************************************************
+ *
+ *          Vector permute functions
+ *
+ ******************************************************************************
+ *
+ * These permute functions can reorder the elements of a vector and optionally
+ * set some elements to zero.
+ *
+ * The indexes are inserted as template parameters in <>. These indexes must be
+ * constants. Each template parameter is an index to the element you want to select.
+ * An index of -1 will generate zero. An index of -256 means don't care.
+ *
+ * Example:
+ * Vec8d a(10,11,12,13,14,15,16,17);      // a is (10,11,12,13,14,15,16,17)
+ * Vec8d b;
+ * b = permute8d<0,2,7,7,-1,-1,1,1>(a);   // b is (10,12,17,17, 0, 0,11,11)
+ *
+ * A lot of the code here is metaprogramming aiming to find the instructions
+ * that best fit the template parameters and instruction set. The metacode
+ * will be reduced out to leave only a few vector instructions in release
+ * mode with optimization on.
+ *****************************************************************************/
+
+// Permute vector of 8 double
+// Index -1 gives 0, index -256 means don't care.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8d permute8d(Vec8d const &a)
+{
+  return Vec8d(blend4d<i0, i1, i2, i3>(a.get_low(), a.get_high()), blend4d<i4, i5, i6, i7>(a.get_low(), a.get_high()));
+}
+
+// Permute vector of 16 float
+// Index -1 gives 0, index -256 means don't care.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16f permute16f(Vec16f const &a)
+{
+  return Vec16f(blend8f<i0, i1, i2, i3, i4, i5, i6, i7>(a.get_low(), a.get_high()),
+                blend8f<i8, i9, i10, i11, i12, i13, i14, i15>(a.get_low(), a.get_high()));
+}
+
+/*****************************************************************************
+ *
+ *          Vector blend functions
+ *
+ ******************************************************************************
+ *
+ * These blend functions can mix elements from two different vectors and
+ * optionally set some elements to zero.
+ *
+ * The indexes are inserted as template parameters in <>. These indexes must be
+ * constants. Each template parameter is an index to the element you want to
+ * select, where higher indexes indicate an element from the second source
+ * vector. For example, if each vector has 8 elements, then indexes 0 - 7
+ * will select an element from the first vector and indexes 8 - 15 will select
+ * an element from the second vector. A negative index will generate zero.
+ *
+ * Example:
+ * Vec8d a(100,101,102,103,104,105,106,107); // a is (100, 101, 102, 103, 104, 105, 106, 107)
+ * Vec8d b(200,201,202,203,204,205,206,207); // b is (200, 201, 202, 203, 204, 205, 206, 207)
+ * Vec8d c;
+ * c = blend8d<1,0,9,8,7,-1,15,15> (a,b);    // c is (101, 100, 201, 200, 107,   0, 207, 207)
+ *
+ * A lot of the code here is metaprogramming aiming to find the instructions
+ * that best fit the template parameters and instruction set. The metacode
+ * will be reduced out to leave only a few vector instructions in release
+ * mode with optimization on.
+ *****************************************************************************/
+
+// helper function used below
+template <int n>
+static inline Vec4d select4(Vec8d const &a, Vec8d const &b)
+{
+  switch(n)
+    {
+      case 0:
+        return a.get_low();
+      case 1:
+        return a.get_high();
+      case 2:
+        return b.get_low();
+      case 3:
+        return b.get_high();
+    }
+  return Vec4d(0.);
+}
+
+// blend vectors Vec8d
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8d blend8d(Vec8d const &a, Vec8d const &b)
+{
+  const int j0 = i0 >= 0 ? i0 / 4 : i0;
+  const int j1 = i1 >= 0 ? i1 / 4 : i1;
+  const int j2 = i2 >= 0 ? i2 / 4 : i2;
+  const int j3 = i3 >= 0 ? i3 / 4 : i3;
+  const int j4 = i4 >= 0 ? i4 / 4 : i4;
+  const int j5 = i5 >= 0 ? i5 / 4 : i5;
+  const int j6 = i6 >= 0 ? i6 / 4 : i6;
+  const int j7 = i7 >= 0 ? i7 / 4 : i7;
+  Vec4d x0, x1;
+
+  const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2 >= 0 ? j2 : j3;
+  const int r1 = j4 >= 0 ? j4 : j5 >= 0 ? j5 : j6 >= 0 ? j6 : j7;
+  const int s0 = (j1 >= 0 && j1 != r0) ? j1 : (j2 >= 0 && j2 != r0) ? j2 : j3;
+  const int s1 = (j5 >= 0 && j5 != r1) ? j5 : (j6 >= 0 && j6 != r1) ? j6 : j7;
+
+  // Combine all the indexes into a single bitfield, with 4 bits for each
+  const int m1 = (i0 & 0xF) | (i1 & 0xF) << 4 | (i2 & 0xF) << 8 | (i3 & 0xF) << 12 | (i4 & 0xF) << 16 | (i5 & 0xF) << 20 |
+                 (i6 & 0xF) << 24 | (i7 & 0xF) << 28;
+
+  // Mask to zero out negative indexes
+  const int mz = (i0 < 0 ? 0 : 0xF) | (i1 < 0 ? 0 : 0xF) << 4 | (i2 < 0 ? 0 : 0xF) << 8 | (i3 < 0 ? 0 : 0xF) << 12 |
+                 (i4 < 0 ? 0 : 0xF) << 16 | (i5 < 0 ? 0 : 0xF) << 20 | (i6 < 0 ? 0 : 0xF) << 24 | (i7 < 0 ? 0 : 0xF) << 28;
+
+  if(r0 < 0)
+    {
+      x0 = Vec4d(0.);
+    }
+  else if(((m1 ^ r0 * 0x4444) & 0xCCCC & mz) == 0)
+    {
+      // i0 - i3 all from same source
+      x0 = permute4d<i0 & -13, i1 & -13, i2 & -13, i3 & -13>(select4<r0>(a, b));
+    }
+  else if((j2 < 0 || j2 == r0 || j2 == s0) && (j3 < 0 || j3 == r0 || j3 == s0))
+    {
+      // i0 - i3 all from two sources
+      const int k0 = i0 >= 0 ? i0 & 3 : i0;
+      const int k1 = (i1 >= 0 ? i1 & 3 : i1) | (j1 == s0 ? 4 : 0);
+      const int k2 = (i2 >= 0 ? i2 & 3 : i2) | (j2 == s0 ? 4 : 0);
+      const int k3 = (i3 >= 0 ? i3 & 3 : i3) | (j3 == s0 ? 4 : 0);
+      x0           = blend4d<k0, k1, k2, k3>(select4<r0>(a, b), select4<s0>(a, b));
+    }
+  else
+    {
+      // i0 - i3 from three or four different sources
+      x0 = blend4d<0, 1, 6, 7>(blend4d<i0 & -13, (i1 & -13) | 4, -0x100, -0x100>(select4<j0>(a, b), select4<j1>(a, b)),
+                               blend4d<-0x100, -0x100, i2 & -13, (i3 & -13) | 4>(select4<j2>(a, b), select4<j3>(a, b)));
+    }
+
+  if(r1 < 0)
+    {
+      x1 = Vec4d(0.);
+    }
+  else if(((m1 ^ uint32_t(r1) * 0x44440000u) & 0xCCCC0000 & mz) == 0)
+    {
+      // i4 - i7 all from same source
+      x1 = permute4d<i4 & -13, i5 & -13, i6 & -13, i7 & -13>(select4<r1>(a, b));
+    }
+  else if((j6 < 0 || j6 == r1 || j6 == s1) && (j7 < 0 || j7 == r1 || j7 == s1))
+    {
+      // i4 - i7 all from two sources
+      const int k4 = i4 >= 0 ? i4 & 3 : i4;
+      const int k5 = (i5 >= 0 ? i5 & 3 : i5) | (j5 == s1 ? 4 : 0);
+      const int k6 = (i6 >= 0 ? i6 & 3 : i6) | (j6 == s1 ? 4 : 0);
+      const int k7 = (i7 >= 0 ? i7 & 3 : i7) | (j7 == s1 ? 4 : 0);
+      x1           = blend4d<k4, k5, k6, k7>(select4<r1>(a, b), select4<s1>(a, b));
+    }
+  else
+    {
+      // i4 - i7 from three or four different sources
+      x1 = blend4d<0, 1, 6, 7>(blend4d<i4 & -13, (i5 & -13) | 4, -0x100, -0x100>(select4<j4>(a, b), select4<j5>(a, b)),
+                               blend4d<-0x100, -0x100, i6 & -13, (i7 & -13) | 4>(select4<j6>(a, b), select4<j7>(a, b)));
+    }
+
+  return Vec8d(x0, x1);
+}
+
+// helper function used below
+template <int n>
+static inline Vec8f select4(Vec16f const &a, Vec16f const &b)
+{
+  switch(n)
+    {
+      case 0:
+        return a.get_low();
+      case 1:
+        return a.get_high();
+      case 2:
+        return b.get_low();
+      case 3:
+        return b.get_high();
+    }
+  return Vec8f(0.f);
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16f blend16f(Vec16f const &a, Vec16f const &b)
+{
+  const int j0  = i0 >= 0 ? i0 / 8 : i0;
+  const int j1  = i1 >= 0 ? i1 / 8 : i1;
+  const int j2  = i2 >= 0 ? i2 / 8 : i2;
+  const int j3  = i3 >= 0 ? i3 / 8 : i3;
+  const int j4  = i4 >= 0 ? i4 / 8 : i4;
+  const int j5  = i5 >= 0 ? i5 / 8 : i5;
+  const int j6  = i6 >= 0 ? i6 / 8 : i6;
+  const int j7  = i7 >= 0 ? i7 / 8 : i7;
+  const int j8  = i8 >= 0 ? i8 / 8 : i8;
+  const int j9  = i9 >= 0 ? i9 / 8 : i9;
+  const int j10 = i10 >= 0 ? i10 / 8 : i10;
+  const int j11 = i11 >= 0 ? i11 / 8 : i11;
+  const int j12 = i12 >= 0 ? i12 / 8 : i12;
+  const int j13 = i13 >= 0 ? i13 / 8 : i13;
+  const int j14 = i14 >= 0 ? i14 / 8 : i14;
+  const int j15 = i15 >= 0 ? i15 / 8 : i15;
+
+  Vec8f x0, x1;
+
+  const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2 >= 0 ? j2 : j3 >= 0 ? j3 : j4 >= 0 ? j4 : j5 >= 0 ? j5 : j6 >= 0 ? j6 : j7;
+  const int r1 =
+      j8 >= 0 ? j8 : j9 >= 0 ? j9 : j10 >= 0 ? j10 : j11 >= 0 ? j11 : j12 >= 0 ? j12 : j13 >= 0 ? j13 : j14 >= 0 ? j14 : j15;
+  const int s0 = (j1 >= 0 && j1 != r0)
+                     ? j1
+                     : (j2 >= 0 && j2 != r0)
+                           ? j2
+                           : (j3 >= 0 && j3 != r0)
+                                 ? j3
+                                 : (j4 >= 0 && j4 != r0) ? j4 : (j5 >= 0 && j5 != r0) ? j5 : (j6 >= 0 && j6 != r0) ? j6 : j7;
+  const int s1 = (j9 >= 0 && j9 != r1)
+                     ? j9
+                     : (j10 >= 0 && j10 != r1)
+                           ? j10
+                           : (j11 >= 0 && j11 != r1)
+                                 ? j11
+                                 : (j12 >= 0 && j12 != r1) ? j12 : (j13 >= 0 && j13 != r1) ? j13 : (j14 >= 0 && j14 != r1) ? j14 : j15;
+
+  if(r0 < 0)
+    {
+      x0 = Vec8f(0.f);
+    }
+  else if(r0 == s0)
+    {
+      // i0 - i7 all from same source
+      x0 = permute8f<i0 & -25, i1 & -25, i2 & -25, i3 & -25, i4 & -25, i5 & -25, i6 & -25, i7 & -25>(select4<r0>(a, b));
+    }
+  else if((j2 < 0 || j2 == r0 || j2 == s0) && (j3 < 0 || j3 == r0 || j3 == s0) && (j4 < 0 || j4 == r0 || j4 == s0) &&
+          (j5 < 0 || j5 == r0 || j5 == s0) && (j6 < 0 || j6 == r0 || j6 == s0) && (j7 < 0 || j7 == r0 || j7 == s0))
+    {
+      // i0 - i7 all from two sources
+      const int k0 = i0 >= 0 ? (i0 & 7) : i0;
+      const int k1 = (i1 >= 0 ? (i1 & 7) : i1) | (j1 == s0 ? 8 : 0);
+      const int k2 = (i2 >= 0 ? (i2 & 7) : i2) | (j2 == s0 ? 8 : 0);
+      const int k3 = (i3 >= 0 ? (i3 & 7) : i3) | (j3 == s0 ? 8 : 0);
+      const int k4 = (i4 >= 0 ? (i4 & 7) : i4) | (j4 == s0 ? 8 : 0);
+      const int k5 = (i5 >= 0 ? (i5 & 7) : i5) | (j5 == s0 ? 8 : 0);
+      const int k6 = (i6 >= 0 ? (i6 & 7) : i6) | (j6 == s0 ? 8 : 0);
+      const int k7 = (i7 >= 0 ? (i7 & 7) : i7) | (j7 == s0 ? 8 : 0);
+      x0           = blend8f<k0, k1, k2, k3, k4, k5, k6, k7>(select4<r0>(a, b), select4<s0>(a, b));
+    }
+  else
+    {
+      // i0 - i7 from three or four different sources
+      const int n0 = j0 >= 0 ? j0 / 2 * 8 + 0 : j0;
+      const int n1 = j1 >= 0 ? j1 / 2 * 8 + 1 : j1;
+      const int n2 = j2 >= 0 ? j2 / 2 * 8 + 2 : j2;
+      const int n3 = j3 >= 0 ? j3 / 2 * 8 + 3 : j3;
+      const int n4 = j4 >= 0 ? j4 / 2 * 8 + 4 : j4;
+      const int n5 = j5 >= 0 ? j5 / 2 * 8 + 5 : j5;
+      const int n6 = j6 >= 0 ? j6 / 2 * 8 + 6 : j6;
+      const int n7 = j7 >= 0 ? j7 / 2 * 8 + 7 : j7;
+      x0           = blend8f<n0, n1, n2, n3, n4, n5, n6, n7>(
+          blend8f < j0 & 2 ? -256 : i0 & 15, j1 & 2 ? -256 : i1 & 15, j2 & 2 ? -256 : i2 & 15, j3 & 2 ? -256 : i3 & 15,
+          j4 & 2 ? -256 : i4 & 15, j5 & 2 ? -256 : i5 & 15, j6 & 2 ? -256 : i6 & 15,
+          j7 & 2 ? -256 : i7 & 15 > (a.get_low(), a.get_high()), blend8f < (j0 ^ 2) & 6 ? -256 : i0 & 15,
+          (j1 ^ 2) & 6 ? -256 : i1 & 15, (j2 ^ 2) & 6 ? -256 : i2 & 15, (j3 ^ 2) & 6 ? -256 : i3 & 15, (j4 ^ 2) & 6 ? -256 : i4 & 15,
+          (j5 ^ 2) & 6 ? -256 : i5 & 15, (j6 ^ 2) & 6 ? -256 : i6 & 15, (j7 ^ 2) & 6 ? -256 : i7 & 15 > (b.get_low(), b.get_high()));
+    }
+
+  if(r1 < 0)
+    {
+      x1 = Vec8f(0.f);
+    }
+  else if(r1 == s1)
+    {
+      // i8 - i15 all from same source
+      x1 = permute8f<i8 & -25, i9 & -25, i10 & -25, i11 & -25, i12 & -25, i13 & -25, i14 & -25, i15 & -25>(select4<r1>(a, b));
+    }
+  else if((j10 < 0 || j10 == r1 || j10 == s1) && (j11 < 0 || j11 == r1 || j11 == s1) && (j12 < 0 || j12 == r1 || j12 == s1) &&
+          (j13 < 0 || j13 == r1 || j13 == s1) && (j14 < 0 || j14 == r1 || j14 == s1) && (j15 < 0 || j15 == r1 || j15 == s1))
+    {
+      // i8 - i15 all from two sources
+      const int k8  = i8 >= 0 ? (i8 & 7) : i8;
+      const int k9  = (i9 >= 0 ? (i9 & 7) : i9) | (j9 == s1 ? 8 : 0);
+      const int k10 = (i10 >= 0 ? (i10 & 7) : i10) | (j10 == s1 ? 8 : 0);
+      const int k11 = (i11 >= 0 ? (i11 & 7) : i11) | (j11 == s1 ? 8 : 0);
+      const int k12 = (i12 >= 0 ? (i12 & 7) : i12) | (j12 == s1 ? 8 : 0);
+      const int k13 = (i13 >= 0 ? (i13 & 7) : i13) | (j13 == s1 ? 8 : 0);
+      const int k14 = (i14 >= 0 ? (i14 & 7) : i14) | (j14 == s1 ? 8 : 0);
+      const int k15 = (i15 >= 0 ? (i15 & 7) : i15) | (j15 == s1 ? 8 : 0);
+      x1            = blend8f<k8, k9, k10, k11, k12, k13, k14, k15>(select4<r1>(a, b), select4<s1>(a, b));
+    }
+  else
+    {
+      // i8 - i15 from three or four different sources
+      const int n8  = j8 >= 0 ? j8 / 2 * 8 + 0 : j8;
+      const int n9  = j9 >= 0 ? j9 / 2 * 8 + 1 : j9;
+      const int n10 = j10 >= 0 ? j10 / 2 * 8 + 2 : j10;
+      const int n11 = j11 >= 0 ? j11 / 2 * 8 + 3 : j11;
+      const int n12 = j12 >= 0 ? j12 / 2 * 8 + 4 : j12;
+      const int n13 = j13 >= 0 ? j13 / 2 * 8 + 5 : j13;
+      const int n14 = j14 >= 0 ? j14 / 2 * 8 + 6 : j14;
+      const int n15 = j15 >= 0 ? j15 / 2 * 8 + 7 : j15;
+      x1            = blend8f<n8, n9, n10, n11, n12, n13, n14, n15>(
+          blend8f < j8 & 2 ? -256 : i8 & 15, j9 & 2 ? -256 : i9 & 15, j10 & 2 ? -256 : i10 & 15, j11 & 2 ? -256 : i11 & 15,
+          j12 & 2 ? -256 : i12 & 15, j13 & 2 ? -256 : i13 & 15, j14 & 2 ? -256 : i14 & 15,
+          j15 & 2 ? -256 : i15 & 15 > (a.get_low(), a.get_high()), blend8f < (j8 ^ 2) & 6 ? -256 : i8 & 15,
+          (j9 ^ 2) & 6 ? -256 : i9 & 15, (j10 ^ 2) & 6 ? -256 : i10 & 15, (j11 ^ 2) & 6 ? -256 : i11 & 15,
+          (j12 ^ 2) & 6 ? -256 : i12 & 15, (j13 ^ 2) & 6 ? -256 : i13 & 15, (j14 ^ 2) & 6 ? -256 : i14 & 15,
+          (j15 ^ 2) & 6 ? -256 : i15 & 15 > (b.get_low(), b.get_high()));
+    }
+  return Vec16f(x0, x1);
+}
+
+/*****************************************************************************
+ *
+ *          Vector lookup functions
+ *
+ ******************************************************************************
+ *
+ * These functions use vector elements as indexes into a table.
+ * The table is given as one or more vectors or as an array.
+ *
+ * This can be used for several purposes:
+ *  - table lookup
+ *  - permute or blend with variable indexes
+ *  - blend from more than two sources
+ *  - gather non-contiguous data
+ *
+ * An index out of range may produce any value - the actual value produced is
+ * implementation dependent and may be different for different instruction
+ * sets. An index out of range does not produce an error message or exception.
+ *
+ * Example:
+ * Vec8d a(2,0,0,6,4,3,5,0);                 // index a is (  2,   0,   0,   6,   4,   3,   5,   0)
+ * Vec8d b(100,101,102,103,104,105,106,107); // table b is (100, 101, 102, 103, 104, 105, 106, 107)
+ * Vec8d c;
+ * c = lookup8 (a,b);                        // c is       (102, 100, 100, 106, 104, 103, 105, 100)
+ *
+ *****************************************************************************/
+
+static inline Vec16f lookup16(Vec16i const &index, Vec16f const &table)
+{
+  float tab[16];
+  table.store(tab);
+  Vec8f t0 = lookup<16>(index.get_low(), tab);
+  Vec8f t1 = lookup<16>(index.get_high(), tab);
+  return Vec16f(t0, t1);
+}
+
+template <int n>
+static inline Vec16f lookup(Vec16i const &index, float const *table)
+{
+  if(n <= 0)
+    return 0;
+  if(n <= 8)
+    {
+      Vec8f table1 = Vec8f().load(table);
+      return Vec16f(lookup8(index.get_low(), table1), lookup8(index.get_high(), table1));
+    }
+  if(n <= 16)
+    return lookup16(index, Vec16f().load(table));
+  // n > 16. Limit index
+  Vec16ui i1;
+  if((n & (n - 1)) == 0)
+    {
+      // n is a power of 2, make index modulo n
+      i1 = Vec16ui(index) & (n - 1);
+    }
+  else
+    {
+      // n is not a power of 2, limit to n-1
+      i1 = min(Vec16ui(index), n - 1);
+    }
+  float const *t = table;
+  return Vec16f(t[i1[0]], t[i1[1]], t[i1[2]], t[i1[3]], t[i1[4]], t[i1[5]], t[i1[6]], t[i1[7]], t[i1[8]], t[i1[9]], t[i1[10]],
+                t[i1[11]], t[i1[12]], t[i1[13]], t[i1[14]], t[i1[15]]);
+}
+
+static inline Vec8d lookup8(Vec8q const &index, Vec8d const &table)
+{
+  double tab[8];
+  table.store(tab);
+  Vec4d t0 = lookup<8>(index.get_low(), tab);
+  Vec4d t1 = lookup<8>(index.get_high(), tab);
+  return Vec8d(t0, t1);
+}
+
+template <int n>
+static inline Vec8d lookup(Vec8q const &index, double const *table)
+{
+  if(n <= 0)
+    return 0;
+  if(n <= 4)
+    {
+      Vec4d table1 = Vec4d().load(table);
+      return Vec8d(lookup4(index.get_low(), table1), lookup4(index.get_high(), table1));
+    }
+  if(n <= 8)
+    {
+      return lookup8(index, Vec8d().load(table));
+    }
+  // n > 8. Limit index
+  Vec8uq i1;
+  if((n & (n - 1)) == 0)
+    {
+      // n is a power of 2, make index modulo n
+      i1 = Vec8uq(index) & (n - 1);
+    }
+  else
+    {
+      // n is not a power of 2, limit to n-1
+      i1 = min(Vec8uq(index), n - 1);
+    }
+  double const *t = table;
+  return Vec8d(t[i1[0]], t[i1[1]], t[i1[2]], t[i1[3]], t[i1[4]], t[i1[5]], t[i1[6]], t[i1[7]]);
+}
+
+/*****************************************************************************
+ *
+ *          Gather functions with fixed indexes
+ *
+ *****************************************************************************/
+// Load elements from array a with indices i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16f gather16f(void const *a)
+{
+  Static_error_check<(i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7 | i8 | i9 | i10 | i11 | i12 | i13 | i14 | i15) >= 0>
+      Negative_array_index;  // Error message if index is negative
+  // find smallest and biggest index, using only compile-time constant expressions
+  const int i01min    = i0 < i1 ? i0 : i1;
+  const int i23min    = i2 < i3 ? i2 : i3;
+  const int i45min    = i4 < i5 ? i4 : i5;
+  const int i67min    = i6 < i7 ? i6 : i7;
+  const int i89min    = i8 < i9 ? i8 : i9;
+  const int i1011min  = i10 < i11 ? i10 : i11;
+  const int i1213min  = i12 < i13 ? i12 : i13;
+  const int i1415min  = i14 < i15 ? i14 : i15;
+  const int i0_3min   = i01min < i23min ? i01min : i23min;
+  const int i4_7min   = i45min < i67min ? i45min : i67min;
+  const int i8_11min  = i89min < i1011min ? i89min : i1011min;
+  const int i12_15min = i1213min < i1415min ? i1213min : i1415min;
+  const int i0_7min   = i0_3min < i4_7min ? i0_3min : i4_7min;
+  const int i8_15min  = i8_11min < i12_15min ? i8_11min : i12_15min;
+  const int imin      = i0_7min < i8_15min ? i0_7min : i8_15min;
+  const int i01max    = i0 > i1 ? i0 : i1;
+  const int i23max    = i2 > i3 ? i2 : i3;
+  const int i45max    = i4 > i5 ? i4 : i5;
+  const int i67max    = i6 > i7 ? i6 : i7;
+  const int i89max    = i8 > i9 ? i8 : i9;
+  const int i1011max  = i10 > i11 ? i10 : i11;
+  const int i1213max  = i12 > i13 ? i12 : i13;
+  const int i1415max  = i14 > i15 ? i14 : i15;
+  const int i0_3max   = i01max > i23max ? i01max : i23max;
+  const int i4_7max   = i45max > i67max ? i45max : i67max;
+  const int i8_11max  = i89max > i1011max ? i89max : i1011max;
+  const int i12_15max = i1213max > i1415max ? i1213max : i1415max;
+  const int i0_7max   = i0_3max > i4_7max ? i0_3max : i4_7max;
+  const int i8_15max  = i8_11max > i12_15max ? i8_11max : i12_15max;
+  const int imax      = i0_7max > i8_15max ? i0_7max : i8_15max;
+  if(imax - imin <= 15)
+    {
+      // load one contiguous block and permute
+      if(imax > 15)
+        {
+          // make sure we don't read past the end of the array
+          Vec16f b = Vec16f().load((float const *)a + imax - 15);
+          return permute16f<i0 - imax + 15, i1 - imax + 15, i2 - imax + 15, i3 - imax + 15, i4 - imax + 15, i5 - imax + 15,
+                            i6 - imax + 15, i7 - imax + 15, i8 - imax + 15, i9 - imax + 15, i10 - imax + 15, i11 - imax + 15,
+                            i12 - imax + 15, i13 - imax + 15, i14 - imax + 15, i15 - imax + 15>(b);
+        }
+      else
+        {
+          Vec16f b = Vec16f().load((float const *)a + imin);
+          return permute16f<i0 - imin, i1 - imin, i2 - imin, i3 - imin, i4 - imin, i5 - imin, i6 - imin, i7 - imin, i8 - imin,
+                            i9 - imin, i10 - imin, i11 - imin, i12 - imin, i13 - imin, i14 - imin, i15 - imin>(b);
+        }
+    }
+  if((i0 < imin + 16 || i0 > imax - 16) && (i1 < imin + 16 || i1 > imax - 16) && (i2 < imin + 16 || i2 > imax - 16) &&
+     (i3 < imin + 16 || i3 > imax - 16) && (i4 < imin + 16 || i4 > imax - 16) && (i5 < imin + 16 || i5 > imax - 16) &&
+     (i6 < imin + 16 || i6 > imax - 16) && (i7 < imin + 16 || i7 > imax - 16) && (i8 < imin + 16 || i8 > imax - 16) &&
+     (i9 < imin + 16 || i9 > imax - 16) && (i10 < imin + 16 || i10 > imax - 16) && (i11 < imin + 16 || i11 > imax - 16) &&
+     (i12 < imin + 16 || i12 > imax - 16) && (i13 < imin + 16 || i13 > imax - 16) && (i14 < imin + 16 || i14 > imax - 16) &&
+     (i15 < imin + 16 || i15 > imax - 16))
+    {
+      // load two contiguous blocks and blend
+      Vec16f b      = Vec16f().load((float const *)a + imin);
+      Vec16f c      = Vec16f().load((float const *)a + imax - 15);
+      const int j0  = i0 < imin + 16 ? i0 - imin : 31 - imax + i0;
+      const int j1  = i1 < imin + 16 ? i1 - imin : 31 - imax + i1;
+      const int j2  = i2 < imin + 16 ? i2 - imin : 31 - imax + i2;
+      const int j3  = i3 < imin + 16 ? i3 - imin : 31 - imax + i3;
+      const int j4  = i4 < imin + 16 ? i4 - imin : 31 - imax + i4;
+      const int j5  = i5 < imin + 16 ? i5 - imin : 31 - imax + i5;
+      const int j6  = i6 < imin + 16 ? i6 - imin : 31 - imax + i6;
+      const int j7  = i7 < imin + 16 ? i7 - imin : 31 - imax + i7;
+      const int j8  = i8 < imin + 16 ? i8 - imin : 31 - imax + i8;
+      const int j9  = i9 < imin + 16 ? i9 - imin : 31 - imax + i9;
+      const int j10 = i10 < imin + 16 ? i10 - imin : 31 - imax + i10;
+      const int j11 = i11 < imin + 16 ? i11 - imin : 31 - imax + i11;
+      const int j12 = i12 < imin + 16 ? i12 - imin : 31 - imax + i12;
+      const int j13 = i13 < imin + 16 ? i13 - imin : 31 - imax + i13;
+      const int j14 = i14 < imin + 16 ? i14 - imin : 31 - imax + i14;
+      const int j15 = i15 < imin + 16 ? i15 - imin : 31 - imax + i15;
+      return blend16f<j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15>(b, c);
+    }
+  // use lookup function
+  return lookup<imax + 1>(Vec16i(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15), (const float *)a);
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8d gather8d(void const *a)
+{
+  Static_error_check<(i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) >= 0> Negative_array_index;  // Error message if index is negative
+
+  const int i01min   = i0 < i1 ? i0 : i1;
+  const int i23min   = i2 < i3 ? i2 : i3;
+  const int i45min   = i4 < i5 ? i4 : i5;
+  const int i67min   = i6 < i7 ? i6 : i7;
+  const int i0123min = i01min < i23min ? i01min : i23min;
+  const int i4567min = i45min < i67min ? i45min : i67min;
+  const int imin     = i0123min < i4567min ? i0123min : i4567min;
+  const int i01max   = i0 > i1 ? i0 : i1;
+  const int i23max   = i2 > i3 ? i2 : i3;
+  const int i45max   = i4 > i5 ? i4 : i5;
+  const int i67max   = i6 > i7 ? i6 : i7;
+  const int i0123max = i01max > i23max ? i01max : i23max;
+  const int i4567max = i45max > i67max ? i45max : i67max;
+  const int imax     = i0123max > i4567max ? i0123max : i4567max;
+  if(imax - imin <= 7)
+    {
+      // load one contiguous block and permute
+      if(imax > 7)
+        {
+          // make sure we don't read past the end of the array
+          Vec8d b = Vec8d().load((double const *)a + imax - 7);
+          return permute8d<i0 - imax + 7, i1 - imax + 7, i2 - imax + 7, i3 - imax + 7, i4 - imax + 7, i5 - imax + 7, i6 - imax + 7,
+                           i7 - imax + 7>(b);
+        }
+      else
+        {
+          Vec8d b = Vec8d().load((double const *)a + imin);
+          return permute8d<i0 - imin, i1 - imin, i2 - imin, i3 - imin, i4 - imin, i5 - imin, i6 - imin, i7 - imin>(b);
+        }
+    }
+  if((i0 < imin + 8 || i0 > imax - 8) && (i1 < imin + 8 || i1 > imax - 8) && (i2 < imin + 8 || i2 > imax - 8) &&
+     (i3 < imin + 8 || i3 > imax - 8) && (i4 < imin + 8 || i4 > imax - 8) && (i5 < imin + 8 || i5 > imax - 8) &&
+     (i6 < imin + 8 || i6 > imax - 8) && (i7 < imin + 8 || i7 > imax - 8))
+    {
+      // load two contiguous blocks and blend
+      Vec8d b      = Vec8d().load((double const *)a + imin);
+      Vec8d c      = Vec8d().load((double const *)a + imax - 7);
+      const int j0 = i0 < imin + 8 ? i0 - imin : 15 - imax + i0;
+      const int j1 = i1 < imin + 8 ? i1 - imin : 15 - imax + i1;
+      const int j2 = i2 < imin + 8 ? i2 - imin : 15 - imax + i2;
+      const int j3 = i3 < imin + 8 ? i3 - imin : 15 - imax + i3;
+      const int j4 = i4 < imin + 8 ? i4 - imin : 15 - imax + i4;
+      const int j5 = i5 < imin + 8 ? i5 - imin : 15 - imax + i5;
+      const int j6 = i6 < imin + 8 ? i6 - imin : 15 - imax + i6;
+      const int j7 = i7 < imin + 8 ? i7 - imin : 15 - imax + i7;
+      return blend8d<j0, j1, j2, j3, j4, j5, j6, j7>(b, c);
+    }
+  // use lookup function
+  return lookup<imax + 1>(Vec8q(i0, i1, i2, i3, i4, i5, i6, i7), (const double *)a);
+}
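+// Usage sketch (illustrative; `tab` is a hypothetical array holding at least 8 doubles):
+//   Vec8d v = gather8d<0, 2, 4, 6, 1, 3, 5, 7>(tab);
+//   // v = (tab[0], tab[2], tab[4], tab[6], tab[1], tab[3], tab[5], tab[7]),
+//   // fetched here with a single contiguous load followed by a permute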
+
+/*****************************************************************************
+ *
+ *          Vector scatter functions
+ *
+ ******************************************************************************
+ *
+ * These functions write the elements of a vector to arbitrary positions in an
+ * array in memory. Each vector element is written to an array position
+ * determined by an index. An element is not written if the corresponding
+ * index is out of range.
+ * The indexes can be specified as constant template parameters or as an
+ * integer vector.
+ *
+ * The scatter functions are useful if the data are distributed in a sparse
+ * manner into the array. If the array is dense then it is more efficient
+ * to permute the data into the right positions and then write the whole
+ * permuted vector into the array.
+ *
+ * Example:
+ * Vec8d a(10,11,12,13,14,15,16,17);
+ * double b[16] = {0};
+ * scatter<0,2,14,10,1,-1,5,9>(a,b);
+ * // Now, b = {10,14,11,0,0,16,0,0,0,17,13,0,0,0,12,0}
+ *
+ *****************************************************************************/
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline void scatter(Vec16f const &data, float *array)
+{
+  const int index[16] = {i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15};
+  for(int i = 0; i < 16; i++)
+    {
+      if(index[i] >= 0)
+        array[index[i]] = data[i];
+    }
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline void scatter(Vec8d const &data, double *array)
+{
+  const int index[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
+  for(int i = 0; i < 8; i++)
+    {
+      if(index[i] >= 0)
+        array[index[i]] = data[i];
+    }
+}
+
+static inline void scatter(Vec16i const &index, uint32_t limit, Vec16f const &data, float *array)
+{
+  for(int i = 0; i < 16; i++)
+    {
+      if(uint32_t(index[i]) < limit)
+        array[index[i]] = data[i];
+    }
+}
+
+static inline void scatter(Vec8q const &index, uint32_t limit, Vec8d const &data, double *array)
+{
+  for(int i = 0; i < 8; i++)
+    {
+      if(uint64_t(index[i]) < uint64_t(limit))
+        array[index[i]] = data[i];
+    }
+}
+
+static inline void scatter(Vec8i const &index, uint32_t limit, Vec8d const &data, double *array)
+{
+  for(int i = 0; i < 8; i++)
+    {
+      if(uint32_t(index[i]) < limit)
+        array[index[i]] = data[i];
+    }
+}
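+// Usage sketch (illustrative; `dest` is a hypothetical array of at least 8 doubles):
+//   Vec8q idx(0, 3, 5, 100, 2, 7, 1, 6);
+//   Vec8d val(10, 11, 12, 13, 14, 15, 16, 17);
+//   scatter(idx, 8, val, dest);   // the element with index 100 is skipped because 100 >= limit 8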
+
+/*****************************************************************************
+ *
+ *          Horizontal scan functions
+ *
+ *****************************************************************************/
+
+// Get index to the first element that is true. Return -1 if all are false
+static inline int horizontal_find_first(Vec16fb const &x)
+{
+  int a1 = horizontal_find_first(x.get_low());
+  if(a1 >= 0)
+    return a1;
+  int a2 = horizontal_find_first(x.get_high());
+  if(a2 < 0)
+    return a2;
+  return a2 + 8;
+}
+
+static inline int horizontal_find_first(Vec8db const &x)
+{
+  int a1 = horizontal_find_first(x.get_low());
+  if(a1 >= 0)
+    return a1;
+  int a2 = horizontal_find_first(x.get_high());
+  if(a2 < 0)
+    return a2;
+  return a2 + 4;
+}
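+// Usage sketch (illustrative; assumes the element-wise bool constructor of Vec8db declared earlier in this header):
+//   Vec8db m(false, false, true, false, true, false, false, false);
+//   int i = horizontal_find_first(m);   // i = 2, the position of the first true element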
+
+// count the number of true elements
+static inline uint32_t horizontal_count(Vec16fb const &x) { return horizontal_count(x.get_low()) + horizontal_count(x.get_high()); }
+
+static inline uint32_t horizontal_count(Vec8db const &x) { return horizontal_count(x.get_low()) + horizontal_count(x.get_high()); }
+
+/*****************************************************************************
+ *
+ *          Boolean <-> bitfield conversion functions
+ *
+ *****************************************************************************/
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint16_t to_bits(Vec16fb const &x) { return to_bits(Vec16ib(x)); }
+
+// to_Vec16fb: convert integer bitfield to boolean vector
+static inline Vec16fb to_Vec16fb(uint16_t x) { return Vec16fb(to_Vec16ib(x)); }
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec8db const &x) { return to_bits(Vec8qb(x)); }
+
+// to_Vec8db: convert integer bitfield to boolean vector
+static inline Vec8db to_Vec8db(uint8_t x) { return Vec8db(to_Vec8qb(x)); }
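+// Usage sketch (illustrative): element i of the boolean vector maps to bit i of the bitfield
+// (lowest element -> least significant bit), so a Vec8db with only elements 0 and 3 true gives
+// to_bits(...) == 0x09, and to_Vec8db(0x09) reproduces that vector.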
+
+#ifdef VCL_NAMESPACE
+}
+#endif
+
+#endif  // VECTORF512_H
diff --git a/src/vectorclass/vectori128.h b/src/vectorclass/vectori128.h
new file mode 100644
index 0000000000000000000000000000000000000000..d8461e615312ced8e752b02e544b406aee7e5798
--- /dev/null
+++ b/src/vectorclass/vectori128.h
@@ -0,0 +1,6396 @@
+/****************************  vectori128.h   *******************************
+ * Author:        Agner Fog
+ * Date created:  2012-05-30
+ * Last modified: 2017-05-02
+ * Version:       1.28
+ * Project:       vector classes
+ * Description:
+ * Header file defining integer vector classes as interface to intrinsic
+ * functions in x86 microprocessors with SSE2 and later instruction sets
+ * up to AVX.
+ *
+ * Instructions:
+ * Use Gnu, Intel or Microsoft C++ compiler. Compile for the desired
+ * instruction set, which must be at least SSE2. Specify the supported
+ * instruction set by a command line define, e.g. __SSE4_1__ if the
+ * compiler does not automatically do so.
+ *
+ * The following vector classes are defined here:
+ * Vec128b   Vector of 128  1-bit unsigned  integers or Booleans
+ * Vec16c    Vector of  16  8-bit signed    integers
+ * Vec16uc   Vector of  16  8-bit unsigned  integers
+ * Vec16cb   Vector of  16  Booleans for use with Vec16c and Vec16uc
+ * Vec8s     Vector of   8  16-bit signed   integers
+ * Vec8us    Vector of   8  16-bit unsigned integers
+ * Vec8sb    Vector of   8  Booleans for use with Vec8s and Vec8us
+ * Vec4i     Vector of   4  32-bit signed   integers
+ * Vec4ui    Vector of   4  32-bit unsigned integers
+ * Vec4ib    Vector of   4  Booleans for use with Vec4i and Vec4ui
+ * Vec2q     Vector of   2  64-bit signed   integers
+ * Vec2uq    Vector of   2  64-bit unsigned integers
+ * Vec2qb    Vector of   2  Booleans for use with Vec2q and Vec2uq
+ *
+ * Each vector object is represented internally in the CPU as a 128-bit register.
+ * This header file defines operators and functions for these vectors.
+ *
+ * For example:
+ * Vec4i a(1,2,3,4), b(5,6,7,8), c;
+ * c = a + b;     // now c contains (6,8,10,12)
+ *
+ * For detailed instructions, see VectorClass.pdf
+ *
+ * (c) Copyright 2012-2017 GNU General Public License http://www.gnu.org/licenses
+ *****************************************************************************/
+#ifndef VECTORI128_H
+#define VECTORI128_H
+
+#include "instrset.h"  // Select supported instruction set
+
+#if INSTRSET < 2  // SSE2 required
+#error Please compile for the SSE2 instruction set or higher
+#endif
+
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE
+{
+#endif
+
+/*****************************************************************************
+ *
+ *          Vector of 128 1-bit unsigned integers or Booleans
+ *
+ *****************************************************************************/
+class Vec128b
+{
+ protected:
+  __m128i xmm;  // Integer vector
+ public:
+  // Default constructor:
+  Vec128b() {}
+  // Constructor to broadcast the same value into all elements
+  // Removed because of undesired implicit conversions
+  // Vec128b(int i) {
+  //     xmm = _mm_set1_epi32(-(i & 1));}
+
+  // Constructor to convert from type __m128i used in intrinsics:
+  Vec128b(__m128i const &x) { xmm = x; }
+  // Assignment operator to convert from type __m128i used in intrinsics:
+  Vec128b &operator=(__m128i const &x)
+  {
+    xmm = x;
+    return *this;
+  }
+  // Type cast operator to convert to __m128i used in intrinsics
+  operator __m128i() const { return xmm; }
+  // Member function to load from array (unaligned)
+  Vec128b &load(void const *p)
+  {
+    xmm = _mm_loadu_si128((__m128i const *)p);
+    return *this;
+  }
+  // Member function to load from array, aligned by 16
+  // "load_a" is faster than "load" on older Intel processors (Pentium 4, Pentium M, Core 1,
+  // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA.
+  // You may use load_a instead of load if you are certain that p points to an address
+  // divisible by 16.
+  void load_a(void const *p) { xmm = _mm_load_si128((__m128i const *)p); }
+  // Member function to store into array (unaligned)
+  void store(void *p) const { _mm_storeu_si128((__m128i *)p, xmm); }
+  // Member function to store into array, aligned by 16
+  // "store_a" is faster than "store" on older Intel processors (Pentium 4, Pentium M, Core 1,
+  // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA.
+  // You may use store_a instead of store if you are certain that p points to an address
+  // divisible by 16.
+  void store_a(void *p) const { _mm_store_si128((__m128i *)p, xmm); }
+  // Member function to change a single bit
+  // Note: This function is inefficient. Use load function if changing more than one bit
+  Vec128b const &set_bit(uint32_t index, int value)
+  {
+    static const union
+    {
+      uint64_t i[4];
+      __m128i x[2];
+    } u          = {{1, 0, 0, 1}};    // 2 vectors with bit 0 and 64 set, respectively
+    int w        = (index >> 6) & 1;  // qword index
+    int bi       = index & 0x3F;      // bit index within qword w
+    __m128i mask = u.x[w];
+    mask         = _mm_sll_epi64(mask, _mm_cvtsi32_si128(bi));  // mask with bit number bi set
+    if(value & 1)
+      {
+        xmm = _mm_or_si128(mask, xmm);
+      }
+    else
+      {
+        xmm = _mm_andnot_si128(mask, xmm);
+      }
+    return *this;
+  }
+  // Member function to get a single bit
+  // Note: This function is inefficient. Use store function if reading more than one bit
+  int get_bit(uint32_t index) const
+  {
+    union
+    {
+      __m128i x;
+      uint8_t i[16];
+    } u;
+    u.x    = xmm;
+    int w  = (index >> 3) & 0xF;  // byte index
+    int bi = index & 7;           // bit index within byte w
+    return (u.i[w] >> bi) & 1;
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  bool operator[](uint32_t index) const { return get_bit(index) != 0; }
+  static int size() { return 128; }
+};
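+// Usage sketch (illustrative; `buf` is a hypothetical 16-byte buffer):
+//   Vec128b b;
+//   b.load(buf);              // fill all 128 bits from memory
+//   b.set_bit(70, 1);         // set bit 70, i.e. bit 6 of the upper 64-bit half
+//   bool bit70 = b[70];       // reads back true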
+
+// Define operators for this class
+
+// vector operator & : bitwise and
+static inline Vec128b operator&(Vec128b const &a, Vec128b const &b) { return _mm_and_si128(a, b); }
+static inline Vec128b operator&&(Vec128b const &a, Vec128b const &b) { return a & b; }
+
+// vector operator | : bitwise or
+static inline Vec128b operator|(Vec128b const &a, Vec128b const &b) { return _mm_or_si128(a, b); }
+static inline Vec128b operator||(Vec128b const &a, Vec128b const &b) { return a | b; }
+
+// vector operator ^ : bitwise xor
+static inline Vec128b operator^(Vec128b const &a, Vec128b const &b) { return _mm_xor_si128(a, b); }
+
+// vector operator ~ : bitwise not
+static inline Vec128b operator~(Vec128b const &a) { return _mm_xor_si128(a, _mm_set1_epi32(-1)); }
+
+// vector operator &= : bitwise and
+static inline Vec128b &operator&=(Vec128b &a, Vec128b const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator |= : bitwise or
+static inline Vec128b &operator|=(Vec128b &a, Vec128b const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^= : bitwise xor
+static inline Vec128b &operator^=(Vec128b &a, Vec128b const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// Define functions for this class
+
+// function andnot: a & ~ b
+static inline Vec128b andnot(Vec128b const &a, Vec128b const &b) { return _mm_andnot_si128(b, a); }
+
+/*****************************************************************************
+ *
+ *          Generate compile-time constant vector
+ *
+ *****************************************************************************/
+// Generate a constant vector of 4 integers stored in memory.
+// Can be converted to any integer vector type
+template <int32_t i0, int32_t i1, int32_t i2, int32_t i3>
+static inline __m128i constant4i()
+{
+  static const union
+  {
+    int i[4];
+    __m128i xmm;
+  } u = {{i0, i1, i2, i3}};
+  return u.xmm;
+}
+
+template <uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3>
+static inline __m128i constant4ui()
+{
+  return constant4i<int32_t(i0), int32_t(i1), int32_t(i2), int32_t(i3)>();
+}
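+// Usage sketch (illustrative):
+//   __m128i evenmask = constant4i<-1, 0, -1, 0>();   // 32-bit elements 0 and 2 are all ones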
+
+/*****************************************************************************
+ *
+ *          selectb function
+ *
+ *****************************************************************************/
+// Select between two sources, byte by byte. Used in various functions and operators
+// Corresponds to this pseudocode:
+// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
+// Each byte in s must be either 0 (false) or 0xFF (true). No other values are allowed.
+// The implementation depends on the instruction set:
+// If SSE4.1 is supported then only bit 7 in each byte of s is checked,
+// otherwise all bits in s are used.
+static inline __m128i selectb(__m128i const &s, __m128i const &a, __m128i const &b)
+{
+#if INSTRSET >= 5  // SSE4.1 supported
+  return _mm_blendv_epi8(b, a, s);
+#else
+  return _mm_or_si128(_mm_and_si128(s, a), _mm_andnot_si128(s, b));
+#endif
+}
+
+/*****************************************************************************
+ *
+ *          Horizontal Boolean functions
+ *
+ *****************************************************************************/
+
+// horizontal_and. Returns true if all bits are 1
+static inline bool horizontal_and(Vec128b const &a)
+{
+#if INSTRSET >= 5  // SSE4.1 supported. Use PTEST
+  return _mm_testc_si128(a, constant4i<-1, -1, -1, -1>()) != 0;
+#else
+  __m128i t1 = _mm_unpackhi_epi64(a, a);  // get 64 bits down
+  __m128i t2 = _mm_and_si128(a, t1);      // and 64 bits
+#ifdef __x86_64__
+  int64_t t5 = _mm_cvtsi128_si64(t2);     // transfer 64 bits to integer
+  return t5 == int64_t(-1);
+#else
+  __m128i t3 = _mm_srli_epi64(t2, 32);  // get 32 bits down
+  __m128i t4 = _mm_and_si128(t2, t3);   // and 32 bits
+  int t5     = _mm_cvtsi128_si32(t4);   // transfer 32 bits to integer
+  return t5 == -1;
+#endif  // __x86_64__
+#endif  // INSTRSET
+}
+
+// horizontal_or. Returns true if at least one bit is 1
+static inline bool horizontal_or(Vec128b const &a)
+{
+#if INSTRSET >= 5  // SSE4.1 supported. Use PTEST
+  return !_mm_testz_si128(a, a);
+#else
+  __m128i t1 = _mm_unpackhi_epi64(a, a);  // get 64 bits down
+  __m128i t2 = _mm_or_si128(a, t1);       // or 64 bits
+#ifdef __x86_64__
+  int64_t t5 = _mm_cvtsi128_si64(t2);     // transfer 64 bits to integer
+  return t5 != int64_t(0);
+#else
+  __m128i t3 = _mm_srli_epi64(t2, 32);  // get 32 bits down
+  __m128i t4 = _mm_or_si128(t2, t3);    // or 32 bits
+  int t5     = _mm_cvtsi128_si32(t4);   // transfer to integer
+  return t5 != 0;
+#endif  // __x86_64__
+#endif  // INSTRSET
+}
+
+/*****************************************************************************
+ *
+ *          Vector of 16 8-bit signed integers
+ *
+ *****************************************************************************/
+
+class Vec16c : public Vec128b
+{
+ public:
+  // Default constructor:
+  Vec16c() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec16c(int i) { xmm = _mm_set1_epi8((char)i); }
+  // Constructor to build from all elements:
+  Vec16c(int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6, int8_t i7, int8_t i8, int8_t i9, int8_t i10,
+         int8_t i11, int8_t i12, int8_t i13, int8_t i14, int8_t i15)
+  {
+    xmm = _mm_setr_epi8(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15);
+  }
+  // Constructor to convert from type __m128i used in intrinsics:
+  Vec16c(__m128i const &x) { xmm = x; }
+  // Assignment operator to convert from type __m128i used in intrinsics:
+  Vec16c &operator=(__m128i const &x)
+  {
+    xmm = x;
+    return *this;
+  }
+  // Type cast operator to convert to __m128i used in intrinsics
+  operator __m128i() const { return xmm; }
+  // Member function to load from array (unaligned)
+  Vec16c &load(void const *p)
+  {
+    xmm = _mm_loadu_si128((__m128i const *)p);
+    return *this;
+  }
+  // Member function to load from array (aligned)
+  Vec16c &load_a(void const *p)
+  {
+    xmm = _mm_load_si128((__m128i const *)p);
+    return *this;
+  }
+  // Partial load. Load n elements and set the rest to 0
+  Vec16c &load_partial(int n, void const *p)
+  {
+    if(n >= 16)
+      load(p);
+    else if(n <= 0)
+      *this = 0;
+    else if(((int)(intptr_t)p & 0xFFF) < 0xFF0)
+      {
+        // p is at least 16 bytes from a page boundary. OK to read 16 bytes
+        load(p);
+      }
+    else
+      {
+        // worst case. read 1 byte at a time and suffer store forwarding penalty
+        char x[16];
+        for(int i = 0; i < n; i++)
+          x[i] = ((char const *)p)[i];
+        load(x);
+      }
+    cutoff(n);
+    return *this;
+  }
+  // Partial store. Store n elements
+  void store_partial(int n, void *p) const
+  {
+    if(n >= 16)
+      {
+        store(p);
+        return;
+      }
+    if(n <= 0)
+      return;
+    // we are not using _mm_maskmoveu_si128 because it is too slow on many processors
+    union
+    {
+      int8_t c[16];
+      int16_t s[8];
+      int32_t i[4];
+      int64_t q[2];
+    } u;
+    store(u.c);
+    int j = 0;
+    if(n & 8)
+      {
+        *(int64_t *)p = u.q[0];
+        j += 8;
+      }
+    if(n & 4)
+      {
+        ((int32_t *)p)[j / 4] = u.i[j / 4];
+        j += 4;
+      }
+    if(n & 2)
+      {
+        ((int16_t *)p)[j / 2] = u.s[j / 2];
+        j += 2;
+      }
+    if(n & 1)
+      {
+        ((int8_t *)p)[j] = u.c[j];
+      }
+  }
+  // cut off vector to n elements. The last 16-n elements are set to zero
+  Vec16c &cutoff(int n)
+  {
+    if(uint32_t(n) >= 16)
+      return *this;
+    static const char mask[32] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+                                  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0};
+    *this &= Vec16c().load(mask + 16 - n);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec16c const &insert(uint32_t index, int8_t value)
+  {
+    static const int8_t maskl[32] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    __m128i broad                 = _mm_set1_epi8(value);  // broadcast value into all elements
+    __m128i mask                  = _mm_loadu_si128((__m128i const *)(maskl + 16 - (index & 0x0F)));  // mask with FF at index position
+    xmm                           = selectb(mask, broad, xmm);
+    return *this;
+  }
+  // Member function extract a single element from vector
+  int8_t extract(uint32_t index) const
+  {
+    int8_t x[16];
+    store(x);
+    return x[index & 0x0F];
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  int8_t operator[](uint32_t index) const { return extract(index); }
+  static int size() { return 16; }
+};
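+// Usage sketch (illustrative; `src` and `dst` are hypothetical byte buffers of at least 5 bytes):
+//   Vec16c v;
+//   v.load_partial(5, src);    // elements 0 - 4 come from src, elements 5 - 15 are set to 0
+//   v.store_partial(5, dst);   // writes back only the first 5 bytes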
+
+/*****************************************************************************
+ *
+ *          Vec16cb: Vector of 16 Booleans for use with Vec16c and Vec16uc
+ *
+ *****************************************************************************/
+
+class Vec16cb : public Vec16c
+{
+ public:
+  // Default constructor
+  Vec16cb() {}
+  // Constructor to build from all elements:
+  Vec16cb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7, bool x8, bool x9, bool x10, bool x11, bool x12,
+          bool x13, bool x14, bool x15)
+  {
+    xmm = Vec16c(-int8_t(x0), -int8_t(x1), -int8_t(x2), -int8_t(x3), -int8_t(x4), -int8_t(x5), -int8_t(x6), -int8_t(x7), -int8_t(x8),
+                 -int8_t(x9), -int8_t(x10), -int8_t(x11), -int8_t(x12), -int8_t(x13), -int8_t(x14), -int8_t(x15));
+  }
+  // Constructor to convert from type __m128i used in intrinsics:
+  Vec16cb(__m128i const &x) { xmm = x; }
+  // Assignment operator to convert from type __m128i used in intrinsics:
+  Vec16cb &operator=(__m128i const &x)
+  {
+    xmm = x;
+    return *this;
+  }
+  // Constructor to broadcast scalar value:
+  Vec16cb(bool b) : Vec16c(-int8_t(b)) {}
+  // Assignment operator to broadcast scalar value:
+  Vec16cb &operator=(bool b)
+  {
+    *this = Vec16cb(b);
+    return *this;
+  }
+
+ private:  // Prevent constructing from int, etc.
+  Vec16cb(int b);
+  Vec16cb &operator=(int x);
+
+ public:
+  Vec16cb &insert(int index, bool a)
+  {
+    Vec16c::insert(index, -(int)a);
+    return *this;
+  }
+  // Member function extract a single element from vector
+  bool extract(uint32_t index) const { return Vec16c::extract(index) != 0; }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  bool operator[](uint32_t index) const { return extract(index); }
+};
+
+/*****************************************************************************
+ *
+ *          Define operators for Vec16cb
+ *
+ *****************************************************************************/
+
+// vector operator & : bitwise and
+static inline Vec16cb operator&(Vec16cb const &a, Vec16cb const &b) { return Vec16cb(Vec128b(a) & Vec128b(b)); }
+static inline Vec16cb operator&&(Vec16cb const &a, Vec16cb const &b) { return a & b; }
+// vector operator &= : bitwise and
+static inline Vec16cb &operator&=(Vec16cb &a, Vec16cb const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec16cb operator|(Vec16cb const &a, Vec16cb const &b) { return Vec16cb(Vec128b(a) | Vec128b(b)); }
+static inline Vec16cb operator||(Vec16cb const &a, Vec16cb const &b) { return a | b; }
+// vector operator |= : bitwise or
+static inline Vec16cb &operator|=(Vec16cb &a, Vec16cb const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec16cb operator^(Vec16cb const &a, Vec16cb const &b) { return Vec16cb(Vec128b(a) ^ Vec128b(b)); }
+// vector operator ^= : bitwise xor
+static inline Vec16cb &operator^=(Vec16cb &a, Vec16cb const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec16cb operator~(Vec16cb const &a) { return Vec16cb(~Vec128b(a)); }
+
+// vector operator ! : element not
+static inline Vec16cb operator!(Vec16cb const &a) { return ~a; }
+
+// vector function andnot
+static inline Vec16cb andnot(Vec16cb const &a, Vec16cb const &b) { return Vec16cb(andnot(Vec128b(a), Vec128b(b))); }
+
+// Horizontal Boolean functions for Vec16cb
+
+// horizontal_and. Returns true if all elements are true
+static inline bool horizontal_and(Vec16cb const &a) { return _mm_movemask_epi8(a) == 0xFFFF; }
+
+// horizontal_or. Returns true if at least one element is true
+static inline bool horizontal_or(Vec16cb const &a)
+{
+#if INSTRSET >= 5  // SSE4.1 supported. Use PTEST
+  return !_mm_testz_si128(a, a);
+#else
+  return _mm_movemask_epi8(a) != 0;
+#endif
+}
+
+/*****************************************************************************
+ *
+ *          Define operators for Vec16c
+ *
+ *****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec16c operator+(Vec16c const &a, Vec16c const &b) { return _mm_add_epi8(a, b); }
+
+// vector operator += : add
+static inline Vec16c &operator+=(Vec16c &a, Vec16c const &b)
+{
+  a = a + b;
+  return a;
+}
+
+// postfix operator ++
+static inline Vec16c operator++(Vec16c &a, int)
+{
+  Vec16c a0 = a;
+  a         = a + 1;
+  return a0;
+}
+
+// prefix operator ++
+static inline Vec16c &operator++(Vec16c &a)
+{
+  a = a + 1;
+  return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec16c operator-(Vec16c const &a, Vec16c const &b) { return _mm_sub_epi8(a, b); }
+
+// vector operator - : unary minus
+static inline Vec16c operator-(Vec16c const &a) { return _mm_sub_epi8(_mm_setzero_si128(), a); }
+
+// vector operator -= : subtract
+static inline Vec16c &operator-=(Vec16c &a, Vec16c const &b)
+{
+  a = a - b;
+  return a;
+}
+
+// postfix operator --
+static inline Vec16c operator--(Vec16c &a, int)
+{
+  Vec16c a0 = a;
+  a         = a - 1;
+  return a0;
+}
+
+// prefix operator --
+static inline Vec16c &operator--(Vec16c &a)
+{
+  a = a - 1;
+  return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec16c operator*(Vec16c const &a, Vec16c const &b)
+{
+  // There is no 8-bit multiply in SSE2. Split into two 16-bit multiplies
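+  // The low byte of each 16-bit product equals the 8-bit product of the corresponding low bytes,
+  // so the even-numbered bytes fall out of _mm_mullo_epi16 directly; the odd-numbered bytes are
+  // shifted down, multiplied, shifted back up, and merged with a 0x00FF00FF byte mask.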
+  __m128i aodd    = _mm_srli_epi16(a, 8);            // odd numbered elements of a
+  __m128i bodd    = _mm_srli_epi16(b, 8);            // odd numbered elements of b
+  __m128i muleven = _mm_mullo_epi16(a, b);           // product of even numbered elements
+  __m128i mulodd  = _mm_mullo_epi16(aodd, bodd);     // product of odd  numbered elements
+  mulodd          = _mm_slli_epi16(mulodd, 8);       // put odd numbered elements back in place
+  __m128i mask    = _mm_set1_epi32(0x00FF00FF);      // mask for even positions
+  __m128i product = selectb(mask, muleven, mulodd);  // interleave even and odd
+  return product;
+}
+
+// vector operator *= : multiply
+static inline Vec16c &operator*=(Vec16c &a, Vec16c const &b)
+{
+  a = a * b;
+  return a;
+}
+
+// vector operator << : shift left all elements
+static inline Vec16c operator<<(Vec16c const &a, int b)
+{
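+  // There is no 8-bit shift instruction; the shift is done in 16-bit lanes after clearing the
+  // bits of each byte that would otherwise spill into its neighbour.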
+  uint32_t mask = (uint32_t)0xFF >> (uint32_t)b;                // mask to remove bits that are shifted out
+  __m128i am    = _mm_and_si128(a, _mm_set1_epi8((char)mask));  // remove bits that will overflow
+  __m128i res   = _mm_sll_epi16(am, _mm_cvtsi32_si128(b));      // 16-bit shifts
+  return res;
+}
+
+// vector operator <<= : shift left
+static inline Vec16c &operator<<=(Vec16c &a, int b)
+{
+  a = a << b;
+  return a;
+}
+
+// vector operator >> : shift right arithmetic all elements
+static inline Vec16c operator>>(Vec16c const &a, int b)
+{
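+  // There is no 8-bit arithmetic shift. The even-numbered bytes are first moved to the high half
+  // of their 16-bit lane so their sign bit becomes the lane sign bit; the odd-numbered bytes
+  // already sit there. Both are shifted arithmetically and merged with the 0x00FF00FF byte mask.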
+  __m128i aeven = _mm_slli_epi16(a, 8);                            // even numbered elements of a. get sign bit in position
+  aeven         = _mm_sra_epi16(aeven, _mm_cvtsi32_si128(b + 8));  // shift arithmetic, back to position
+  __m128i aodd  = _mm_sra_epi16(a, _mm_cvtsi32_si128(b));          // shift odd numbered elements arithmetic
+  __m128i mask  = _mm_set1_epi32(0x00FF00FF);                      // mask for even positions
+  __m128i res   = selectb(mask, aeven, aodd);                      // interleave even and odd
+  return res;
+}
+
+// vector operator >>= : shift right arithmetic
+static inline Vec16c &operator>>=(Vec16c &a, int b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec16cb operator==(Vec16c const &a, Vec16c const &b) { return _mm_cmpeq_epi8(a, b); }
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec16cb operator!=(Vec16c const &a, Vec16c const &b)
+{
+#ifdef __XOP__  // AMD XOP instruction set
+  return (Vec16cb)_mm_comneq_epi8(a, b);
+#else  // SSE2 instruction set
+  return Vec16cb(Vec16c(~(a == b)));
+#endif
+}
+
+// vector operator > : returns true for elements for which a > b (signed)
+static inline Vec16cb operator>(Vec16c const &a, Vec16c const &b) { return _mm_cmpgt_epi8(a, b); }
+
+// vector operator < : returns true for elements for which a < b (signed)
+static inline Vec16cb operator<(Vec16c const &a, Vec16c const &b) { return b > a; }
+
+// vector operator >= : returns true for elements for which a >= b (signed)
+static inline Vec16cb operator>=(Vec16c const &a, Vec16c const &b)
+{
+#ifdef __XOP__  // AMD XOP instruction set
+  return (Vec16cb)_mm_comge_epi8(a, b);
+#else  // SSE2 instruction set
+  return Vec16cb(Vec16c(~(b > a)));
+#endif
+}
+
+// vector operator <= : returns true for elements for which a <= b (signed)
+static inline Vec16cb operator<=(Vec16c const &a, Vec16c const &b) { return b >= a; }
+
+// vector operator & : bitwise and
+static inline Vec16c operator&(Vec16c const &a, Vec16c const &b) { return Vec16c(Vec128b(a) & Vec128b(b)); }
+static inline Vec16c operator&&(Vec16c const &a, Vec16c const &b) { return a & b; }
+// vector operator &= : bitwise and
+static inline Vec16c &operator&=(Vec16c &a, Vec16c const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec16c operator|(Vec16c const &a, Vec16c const &b) { return Vec16c(Vec128b(a) | Vec128b(b)); }
+static inline Vec16c operator||(Vec16c const &a, Vec16c const &b) { return a | b; }
+// vector operator |= : bitwise or
+static inline Vec16c &operator|=(Vec16c &a, Vec16c const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec16c operator^(Vec16c const &a, Vec16c const &b) { return Vec16c(Vec128b(a) ^ Vec128b(b)); }
+// vector operator ^= : bitwise xor
+static inline Vec16c &operator^=(Vec16c &a, Vec16c const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec16c operator~(Vec16c const &a) { return Vec16c(~Vec128b(a)); }
+
+// vector operator ! : logical not, returns true for elements == 0
+static inline Vec16cb operator!(Vec16c const &a) { return _mm_cmpeq_epi8(a, _mm_setzero_si128()); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
+// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed.
+static inline Vec16c select(Vec16cb const &s, Vec16c const &a, Vec16c const &b) { return selectb(s, a, b); }
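+// Illustrative use (sketch only, not part of the library API): a branch-free per-byte clamp
+// can be written as
+//   Vec16c v, limit;                  // assumed loaded elsewhere
+//   v = select(v > limit, limit, v);  // v[i] = v[i] > limit[i] ? limit[i] : v[i]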
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec16c if_add(Vec16cb const &f, Vec16c const &a, Vec16c const &b) { return a + (Vec16c(f) & b); }
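+// (This works because f is 0 or -1 per element, so (Vec16c(f) & b) is b where f is true and 0
+// where it is false. Illustrative use: sum = if_add(x > 0, sum, x) accumulates only the positive
+// elements of x.)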
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline int32_t horizontal_add(Vec16c const &a)
+{
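+  // _mm_sad_epu8 against zero produces the unsigned byte sum of each 8-byte half in the low
+  // 16 bits of the corresponding 64-bit lane; the two halves are added and the result truncated
+  // to 8 bits, which gives the correct wrap-around sum of the signed bytes.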
+  __m128i sum1 = _mm_sad_epu8(a, _mm_setzero_si128());
+  __m128i sum2 = _mm_shuffle_epi32(sum1, 2);
+  __m128i sum3 = _mm_add_epi16(sum1, sum2);
+  int8_t sum4  = (int8_t)_mm_cvtsi128_si32(sum3);  // truncate to 8 bits
+  return sum4;                                     // sign extend to 32 bits
+}
+
+// Horizontal add extended: Calculates the sum of all vector elements.
+// Each element is sign-extended before addition to avoid overflow
+static inline int32_t horizontal_add_x(Vec16c const &a)
+{
+#ifdef __XOP__  // AMD XOP instruction set
+  __m128i sum1 = _mm_haddq_epi8(a);
+  __m128i sum2 = _mm_shuffle_epi32(sum1, 0x0E);  // high element
+  __m128i sum3 = _mm_add_epi32(sum1, sum2);      // sum
+  return _mm_cvtsi128_si32(sum3);
+#elif INSTRSET >= 4  // SSSE3
+  __m128i aeven = _mm_slli_epi16(a, 8);        // even numbered elements of a. get sign bit in position
+  aeven         = _mm_srai_epi16(aeven, 8);    // sign extend even numbered elements
+  __m128i aodd  = _mm_srai_epi16(a, 8);        // sign extend odd  numbered elements
+  __m128i sum1  = _mm_add_epi16(aeven, aodd);  // add even and odd elements
+  __m128i sum2  = _mm_hadd_epi16(sum1, sum1);  // horizontally add 8 elements in 3 steps
+  __m128i sum3  = _mm_hadd_epi16(sum2, sum2);
+  __m128i sum4  = _mm_hadd_epi16(sum3, sum3);
+  int16_t sum5  = (int16_t)_mm_cvtsi128_si32(sum4);  // 16 bit sum
+  return sum5;                                       // sign extend to 32 bits
+#else                // SSE2
+  __m128i aeven = _mm_slli_epi16(a, 8);             // even numbered elements of a. get sign bit in position
+  aeven         = _mm_srai_epi16(aeven, 8);         // sign extend even numbered elements
+  __m128i aodd  = _mm_srai_epi16(a, 8);             // sign extend odd  numbered elements
+  __m128i sum1  = _mm_add_epi16(aeven, aodd);       // add even and odd elements
+  __m128i sum2  = _mm_shuffle_epi32(sum1, 0x0E);    // 4 high elements
+  __m128i sum3  = _mm_add_epi16(sum1, sum2);        // 4 sums
+  __m128i sum4  = _mm_shuffle_epi32(sum3, 0x01);    // 2 high elements
+  __m128i sum5  = _mm_add_epi16(sum3, sum4);        // 2 sums
+  __m128i sum6  = _mm_shufflelo_epi16(sum5, 0x01);  // 1 high element
+  __m128i sum7  = _mm_add_epi16(sum5, sum6);        // 1 sum
+  int16_t sum8  = _mm_cvtsi128_si32(sum7);          // 16 bit sum
+  return sum8;                                      // sign extend to 32 bits
+#endif
+}
+
+// function add_saturated: add element by element, signed with saturation
+static inline Vec16c add_saturated(Vec16c const &a, Vec16c const &b) { return _mm_adds_epi8(a, b); }
+
+// function sub_saturated: subtract element by element, signed with saturation
+static inline Vec16c sub_saturated(Vec16c const &a, Vec16c const &b) { return _mm_subs_epi8(a, b); }
+
+// function max: a > b ? a : b
+static inline Vec16c max(Vec16c const &a, Vec16c const &b)
+{
+#if INSTRSET >= 5  // SSE4.1
+  return _mm_max_epi8(a, b);
+#else  // SSE2
+  __m128i signbit = _mm_set1_epi32(0x80808080);
+  __m128i a1      = _mm_xor_si128(a, signbit);  // add 0x80
+  __m128i b1      = _mm_xor_si128(b, signbit);  // add 0x80
+  __m128i m1      = _mm_max_epu8(a1, b1);       // unsigned max
+  return _mm_xor_si128(m1, signbit);            // sub 0x80
+#endif
+}
+
+// function min: a < b ? a : b
+static inline Vec16c min(Vec16c const &a, Vec16c const &b)
+{
+#if INSTRSET >= 5  // SSE4.1
+  return _mm_min_epi8(a, b);
+#else  // SSE2
+  __m128i signbit = _mm_set1_epi32(0x80808080);
+  __m128i a1      = _mm_xor_si128(a, signbit);  // add 0x80
+  __m128i b1      = _mm_xor_si128(b, signbit);  // add 0x80
+  __m128i m1      = _mm_min_epu8(a1, b1);       // unsigned min
+  return _mm_xor_si128(m1, signbit);            // sub 0x80
+#endif
+}
+
+// function abs: a >= 0 ? a : -a
+static inline Vec16c abs(Vec16c const &a)
+{
+#if INSTRSET >= 4  // SSSE3 supported
+  return _mm_sign_epi8(a, a);
+#else  // SSE2
+  __m128i nega = _mm_sub_epi8(_mm_setzero_si128(), a);
+  return _mm_min_epu8(a, nega);                            // unsigned min (the negative value is bigger when compared as unsigned)
+#endif
+}
+
+// function abs_saturated: same as abs, saturate if overflow
+static inline Vec16c abs_saturated(Vec16c const &a)
+{
+  __m128i absa   = abs(a);                                     // abs(a)
+  __m128i overfl = _mm_cmpgt_epi8(_mm_setzero_si128(), absa);  // 0 > a
+  return _mm_add_epi8(absa, overfl);                           // subtract 1 if 0x80
+}
+
+// function rotate_left: rotate each element left by b bits
+// Use negative count to rotate right
+static inline Vec16c rotate_left(Vec16c const &a, int b)
+{
+#ifdef __XOP__  // AMD XOP instruction set
+  return _mm_rot_epi8(a, _mm_set1_epi8(b));
+#else  // SSE2 instruction set
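+  // There is no 8-bit shift: even and odd bytes are isolated, each set is rotated inside 16-bit
+  // lanes by b and 8-b bits (mod 8), and the two results are recombined. A negative count rotates
+  // right because of the modulo-8 arithmetic on b.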
+  __m128i bb        = _mm_cvtsi32_si128(b & 7);            // b modulo 8
+  __m128i mbb       = _mm_cvtsi32_si128((8 - b) & 7);      // 8-b modulo 8
+  __m128i maskeven  = _mm_set1_epi32(0x00FF00FF);          // mask for even numbered bytes
+  __m128i even      = _mm_and_si128(a, maskeven);          // even numbered bytes of a
+  __m128i odd       = _mm_andnot_si128(maskeven, a);       // odd numbered bytes of a
+  __m128i evenleft  = _mm_sll_epi16(even, bb);             // even bytes of a << b
+  __m128i oddleft   = _mm_sll_epi16(odd, bb);              // odd  bytes of a << b
+  __m128i evenright = _mm_srl_epi16(even, mbb);            // even bytes of a >> 8-b
+  __m128i oddright  = _mm_srl_epi16(odd, mbb);             // odd  bytes of a >> 8-b
+  __m128i evenrot   = _mm_or_si128(evenleft, evenright);   // even bytes of a rotated
+  __m128i oddrot    = _mm_or_si128(oddleft, oddright);     // odd  bytes of a rotated
+  __m128i allrot    = selectb(maskeven, evenrot, oddrot);  // all  bytes rotated
+  return allrot;
+#endif
+}
+
+/*****************************************************************************
+ *
+ *          Vector of 16 8-bit unsigned integers
+ *
+ *****************************************************************************/
+
+class Vec16uc : public Vec16c
+{
+ public:
+  // Default constructor:
+  Vec16uc() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec16uc(uint32_t i) { xmm = _mm_set1_epi8((char)i); }
+  // Constructor to build from all elements:
+  Vec16uc(uint8_t i0, uint8_t i1, uint8_t i2, uint8_t i3, uint8_t i4, uint8_t i5, uint8_t i6, uint8_t i7, uint8_t i8, uint8_t i9,
+          uint8_t i10, uint8_t i11, uint8_t i12, uint8_t i13, uint8_t i14, uint8_t i15)
+  {
+    xmm = _mm_setr_epi8(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15);
+  }
+  // Constructor to convert from type __m128i used in intrinsics:
+  Vec16uc(__m128i const &x) { xmm = x; }
+  // Assignment operator to convert from type __m128i used in intrinsics:
+  Vec16uc &operator=(__m128i const &x)
+  {
+    xmm = x;
+    return *this;
+  }
+  // Member function to load from array (unaligned)
+  Vec16uc &load(void const *p)
+  {
+    xmm = _mm_loadu_si128((__m128i const *)p);
+    return *this;
+  }
+  // Member function to load from array (aligned)
+  Vec16uc &load_a(void const *p)
+  {
+    xmm = _mm_load_si128((__m128i const *)p);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec16uc const &insert(uint32_t index, uint8_t value)
+  {
+    Vec16c::insert(index, value);
+    return *this;
+  }
+  // Member function to extract a single element from the vector
+  uint8_t extract(uint32_t index) const { return Vec16c::extract(index); }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  uint8_t operator[](uint32_t index) const { return extract(index); }
+};
+
+// Define operators for this class
+
+// vector operator << : shift left all elements
+static inline Vec16uc operator<<(Vec16uc const &a, uint32_t b)
+{
+  uint32_t mask = (uint32_t)0xFF >> (uint32_t)b;                // mask to remove bits that are shifted out
+  __m128i am    = _mm_and_si128(a, _mm_set1_epi8((char)mask));  // remove bits that will overflow
+  __m128i res   = _mm_sll_epi16(am, _mm_cvtsi32_si128(b));      // 16-bit shifts
+  return res;
+}
+
+// vector operator << : shift left all elements
+static inline Vec16uc operator<<(Vec16uc const &a, int32_t b) { return a << (uint32_t)b; }
+
+// vector operator >> : shift right logical all elements
+static inline Vec16uc operator>>(Vec16uc const &a, uint32_t b)
+{
+  uint32_t mask = (uint32_t)0xFF << (uint32_t)b;                // mask to remove bits that are shifted out
+  __m128i am    = _mm_and_si128(a, _mm_set1_epi8((char)mask));  // remove bits that will overflow
+  __m128i res   = _mm_srl_epi16(am, _mm_cvtsi32_si128(b));      // 16-bit shifts
+  return res;
+}
+
+// vector operator >> : shift right logical all elements
+static inline Vec16uc operator>>(Vec16uc const &a, int32_t b) { return a >> (uint32_t)b; }
+
+// vector operator >>= : shift right logical
+static inline Vec16uc &operator>>=(Vec16uc &a, int b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator >= : returns true for elements for which a >= b (unsigned)
+static inline Vec16cb operator>=(Vec16uc const &a, Vec16uc const &b)
+{
+#ifdef __XOP__  // AMD XOP instruction set
+  return (Vec16cb)_mm_comge_epu8(a, b);
+#else  // SSE2 instruction set
+  return (Vec16cb)_mm_cmpeq_epi8(_mm_max_epu8(a, b), a);  // a == max(a,b)
+#endif
+}
+
+// vector operator <= : returns true for elements for which a <= b (unsigned)
+static inline Vec16cb operator<=(Vec16uc const &a, Vec16uc const &b) { return b >= a; }
+
+// vector operator > : returns true for elements for which a > b (unsigned)
+static inline Vec16cb operator>(Vec16uc const &a, Vec16uc const &b)
+{
+#ifdef __XOP__  // AMD XOP instruction set
+  return (Vec16cb)_mm_comgt_epu8(a, b);
+#else  // SSE2 instruction set
+  return Vec16cb(Vec16c(~(b >= a)));
+#endif
+}
+
+// vector operator < : returns true for elements for which a < b (unsigned)
+static inline Vec16cb operator<(Vec16uc const &a, Vec16uc const &b) { return b > a; }
+
+// vector operator + : add
+static inline Vec16uc operator+(Vec16uc const &a, Vec16uc const &b) { return Vec16uc(Vec16c(a) + Vec16c(b)); }
+
+// vector operator - : subtract
+static inline Vec16uc operator-(Vec16uc const &a, Vec16uc const &b) { return Vec16uc(Vec16c(a) - Vec16c(b)); }
+
+// vector operator * : multiply
+static inline Vec16uc operator*(Vec16uc const &a, Vec16uc const &b) { return Vec16uc(Vec16c(a) * Vec16c(b)); }
+
+// vector operator & : bitwise and
+static inline Vec16uc operator&(Vec16uc const &a, Vec16uc const &b) { return Vec16uc(Vec128b(a) & Vec128b(b)); }
+static inline Vec16uc operator&&(Vec16uc const &a, Vec16uc const &b) { return a & b; }
+
+// vector operator | : bitwise or
+static inline Vec16uc operator|(Vec16uc const &a, Vec16uc const &b) { return Vec16uc(Vec128b(a) | Vec128b(b)); }
+static inline Vec16uc operator||(Vec16uc const &a, Vec16uc const &b) { return a | b; }
+
+// vector operator ^ : bitwise xor
+static inline Vec16uc operator^(Vec16uc const &a, Vec16uc const &b) { return Vec16uc(Vec128b(a) ^ Vec128b(b)); }
+
+// vector operator ~ : bitwise not
+static inline Vec16uc operator~(Vec16uc const &a) { return Vec16uc(~Vec128b(a)); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
+// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec16uc select(Vec16cb const &s, Vec16uc const &a, Vec16uc const &b) { return selectb(s, a, b); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec16uc if_add(Vec16cb const &f, Vec16uc const &a, Vec16uc const &b) { return a + (Vec16uc(f) & b); }
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+// (Note: horizontal_add_x(Vec16uc) is slightly faster)
+static inline uint32_t horizontal_add(Vec16uc const &a)
+{
+  __m128i sum1  = _mm_sad_epu8(a, _mm_setzero_si128());
+  __m128i sum2  = _mm_shuffle_epi32(sum1, 2);
+  __m128i sum3  = _mm_add_epi16(sum1, sum2);
+  uint16_t sum4 = (uint16_t)_mm_cvtsi128_si32(sum3);  // truncate to 16 bits
+  return sum4;
+}
+
+// Horizontal add extended: Calculates the sum of all vector elements.
+// Each element is zero-extended before addition to avoid overflow
+static inline uint32_t horizontal_add_x(Vec16uc const &a)
+{
+  __m128i sum1 = _mm_sad_epu8(a, _mm_setzero_si128());
+  __m128i sum2 = _mm_shuffle_epi32(sum1, 2);
+  __m128i sum3 = _mm_add_epi16(sum1, sum2);
+  return _mm_cvtsi128_si32(sum3);
+}
+
+// function add_saturated: add element by element, unsigned with saturation
+static inline Vec16uc add_saturated(Vec16uc const &a, Vec16uc const &b) { return _mm_adds_epu8(a, b); }
+
+// function sub_saturated: subtract element by element, unsigned with saturation
+static inline Vec16uc sub_saturated(Vec16uc const &a, Vec16uc const &b) { return _mm_subs_epu8(a, b); }
+
+// function max: a > b ? a : b
+static inline Vec16uc max(Vec16uc const &a, Vec16uc const &b) { return _mm_max_epu8(a, b); }
+
+// function min: a < b ? a : b
+static inline Vec16uc min(Vec16uc const &a, Vec16uc const &b) { return _mm_min_epu8(a, b); }
+
+/*****************************************************************************
+ *
+ *          Vector of 8 16-bit signed integers
+ *
+ *****************************************************************************/
+
+class Vec8s : public Vec128b
+{
+ public:
+  // Default constructor:
+  Vec8s() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec8s(int i) { xmm = _mm_set1_epi16((int16_t)i); }
+  // Constructor to build from all elements:
+  Vec8s(int16_t i0, int16_t i1, int16_t i2, int16_t i3, int16_t i4, int16_t i5, int16_t i6, int16_t i7)
+  {
+    xmm = _mm_setr_epi16(i0, i1, i2, i3, i4, i5, i6, i7);
+  }
+  // Constructor to convert from type __m128i used in intrinsics:
+  Vec8s(__m128i const &x) { xmm = x; }
+  // Assignment operator to convert from type __m128i used in intrinsics:
+  Vec8s &operator=(__m128i const &x)
+  {
+    xmm = x;
+    return *this;
+  }
+  // Type cast operator to convert to __m128i used in intrinsics
+  operator __m128i() const { return xmm; }
+  // Member function to load from array (unaligned)
+  Vec8s &load(void const *p)
+  {
+    xmm = _mm_loadu_si128((__m128i const *)p);
+    return *this;
+  }
+  // Member function to load from array (aligned)
+  Vec8s &load_a(void const *p)
+  {
+    xmm = _mm_load_si128((__m128i const *)p);
+    return *this;
+  }
+  // Partial load. Load n elements and set the rest to 0
+  Vec8s &load_partial(int n, void const *p)
+  {
+    if(n >= 8)
+      load(p);
+    else if(n <= 0)
+      *this = 0;
+    else if(((int)(intptr_t)p & 0xFFF) < 0xFF0)
+      {
+        // p is at least 16 bytes from a page boundary. OK to read 16 bytes
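+        // (a 16-byte load that starts too close to the end of a 4 KiB page could touch the
+        // following, possibly unmapped, page; the check above keeps the whole load in p's page)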
+        load(p);
+      }
+    else
+      {
+        // worst case: read one 16-bit element at a time and suffer a store-forwarding penalty
+        int16_t x[8];
+        for(int i = 0; i < n; i++)
+          x[i] = ((int16_t const *)p)[i];
+        load(x);
+      }
+    cutoff(n);
+    return *this;
+  }
+  // Partial store. Store n elements
+  void store_partial(int n, void *p) const
+  {
+    if(n >= 8)
+      {
+        store(p);
+        return;
+      }
+    if(n <= 0)
+      return;
+    // we are not using _mm_maskmoveu_si128 because it is too slow on many processors
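+    // Instead the vector is spilled to a local union and n is decomposed into chunks of 4, 2 and
+    // 1 elements (8, 4 and 2 bytes); j tracks the number of bytes already written.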
+    union
+    {
+      int8_t c[16];
+      int16_t s[8];
+      int32_t i[4];
+      int64_t q[2];
+    } u;
+    store(u.c);
+    int j = 0;
+    if(n & 4)
+      {
+        *(int64_t *)p = u.q[0];
+        j += 8;
+      }
+    if(n & 2)
+      {
+        ((int32_t *)p)[j / 4] = u.i[j / 4];
+        j += 4;
+      }
+    if(n & 1)
+      {
+        ((int16_t *)p)[j / 2] = u.s[j / 2];
+      }
+  }
+  // cut off vector to n elements. The last 8-n elements are set to zero
+  Vec8s &cutoff(int n)
+  {
+    *this = Vec16c(xmm).cutoff(n * 2);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec8s const &insert(uint32_t index, int16_t value)
+  {
+    switch(index)
+      {
+        case 0:
+          xmm = _mm_insert_epi16(xmm, value, 0);
+          break;
+        case 1:
+          xmm = _mm_insert_epi16(xmm, value, 1);
+          break;
+        case 2:
+          xmm = _mm_insert_epi16(xmm, value, 2);
+          break;
+        case 3:
+          xmm = _mm_insert_epi16(xmm, value, 3);
+          break;
+        case 4:
+          xmm = _mm_insert_epi16(xmm, value, 4);
+          break;
+        case 5:
+          xmm = _mm_insert_epi16(xmm, value, 5);
+          break;
+        case 6:
+          xmm = _mm_insert_epi16(xmm, value, 6);
+          break;
+        case 7:
+          xmm = _mm_insert_epi16(xmm, value, 7);
+          break;
+      }
+    return *this;
+  }
+  // Member function to extract a single element from the vector
+  // Note: This function is inefficient. Use store function if extracting more than one element
+  int16_t extract(uint32_t index) const
+  {
+    switch(index)
+      {
+        case 0:
+          return (int16_t)_mm_extract_epi16(xmm, 0);
+        case 1:
+          return (int16_t)_mm_extract_epi16(xmm, 1);
+        case 2:
+          return (int16_t)_mm_extract_epi16(xmm, 2);
+        case 3:
+          return (int16_t)_mm_extract_epi16(xmm, 3);
+        case 4:
+          return (int16_t)_mm_extract_epi16(xmm, 4);
+        case 5:
+          return (int16_t)_mm_extract_epi16(xmm, 5);
+        case 6:
+          return (int16_t)_mm_extract_epi16(xmm, 6);
+        case 7:
+          return (int16_t)_mm_extract_epi16(xmm, 7);
+      }
+    return 0;
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  int16_t operator[](uint32_t index) const { return extract(index); }
+  static int size() { return 8; }
+};
+
+/*****************************************************************************
+ *
+ *          Vec8sb: Vector of 8 Booleans for use with Vec8s and Vec8us
+ *
+ *****************************************************************************/
+
+class Vec8sb : public Vec8s
+{
+ public:
+  // Constructor to build from all elements:
+  Vec8sb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7)
+  {
+    xmm = Vec8s(-int16_t(x0), -int16_t(x1), -int16_t(x2), -int16_t(x3), -int16_t(x4), -int16_t(x5), -int16_t(x6), -int16_t(x7));
+  }
+  // Default constructor:
+  Vec8sb() {}
+  // Constructor to convert from type __m128i used in intrinsics:
+  Vec8sb(__m128i const &x) { xmm = x; }
+  // Assignment operator to convert from type __m128i used in intrinsics:
+  Vec8sb &operator=(__m128i const &x)
+  {
+    xmm = x;
+    return *this;
+  }
+  // Constructor to broadcast scalar value:
+  Vec8sb(bool b) : Vec8s(-int16_t(b)) {}
+  // Assignment operator to broadcast scalar value:
+  Vec8sb &operator=(bool b)
+  {
+    *this = Vec8sb(b);
+    return *this;
+  }
+
+ private:  // Prevent constructing from int, etc.
+  Vec8sb(int b);
+  Vec8sb &operator=(int x);
+
+ public:
+  Vec8sb &insert(int index, bool a)
+  {
+    Vec8s::insert(index, -(int)a);
+    return *this;
+  }
+  // Member function to extract a single element from the vector
+  // Note: This function is inefficient. Use store function if extracting more than one element
+  bool extract(uint32_t index) const { return Vec8s::extract(index) != 0; }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  bool operator[](uint32_t index) const { return extract(index); }
+};
+
+/*****************************************************************************
+ *
+ *          Define operators for Vec8sb
+ *
+ *****************************************************************************/
+
+// vector operator & : bitwise and
+static inline Vec8sb operator&(Vec8sb const &a, Vec8sb const &b) { return Vec8sb(Vec128b(a) & Vec128b(b)); }
+static inline Vec8sb operator&&(Vec8sb const &a, Vec8sb const &b) { return a & b; }
+// vector operator &= : bitwise and
+static inline Vec8sb &operator&=(Vec8sb &a, Vec8sb const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec8sb operator|(Vec8sb const &a, Vec8sb const &b) { return Vec8sb(Vec128b(a) | Vec128b(b)); }
+static inline Vec8sb operator||(Vec8sb const &a, Vec8sb const &b) { return a | b; }
+// vector operator |= : bitwise or
+static inline Vec8sb &operator|=(Vec8sb &a, Vec8sb const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec8sb operator^(Vec8sb const &a, Vec8sb const &b) { return Vec8sb(Vec128b(a) ^ Vec128b(b)); }
+// vector operator ^= : bitwise xor
+static inline Vec8sb &operator^=(Vec8sb &a, Vec8sb const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec8sb operator~(Vec8sb const &a) { return Vec8sb(~Vec128b(a)); }
+
+// vector operator ! : element not
+static inline Vec8sb operator!(Vec8sb const &a) { return ~a; }
+
+// vector function andnot
+static inline Vec8sb andnot(Vec8sb const &a, Vec8sb const &b) { return Vec8sb(andnot(Vec128b(a), Vec128b(b))); }
+
+// Horizontal Boolean functions for Vec8sb
+
+// horizontal_and. Returns true if all elements are true
+static inline bool horizontal_and(Vec8sb const &a) { return _mm_movemask_epi8(a) == 0xFFFF; }
+
+// horizontal_or. Returns true if at least one element is true
+static inline bool horizontal_or(Vec8sb const &a)
+{
+#if INSTRSET >= 5  // SSE4.1 supported. Use PTEST
+  return !_mm_testz_si128(a, a);
+#else
+  return _mm_movemask_epi8(a) != 0;
+#endif
+}
+
+/*****************************************************************************
+ *
+ *          Define operators for Vec8s
+ *
+ *****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec8s operator+(Vec8s const &a, Vec8s const &b) { return _mm_add_epi16(a, b); }
+
+// vector operator += : add
+static inline Vec8s &operator+=(Vec8s &a, Vec8s const &b)
+{
+  a = a + b;
+  return a;
+}
+
+// postfix operator ++
+static inline Vec8s operator++(Vec8s &a, int)
+{
+  Vec8s a0 = a;
+  a        = a + 1;
+  return a0;
+}
+
+// prefix operator ++
+static inline Vec8s &operator++(Vec8s &a)
+{
+  a = a + 1;
+  return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec8s operator-(Vec8s const &a, Vec8s const &b) { return _mm_sub_epi16(a, b); }
+
+// vector operator - : unary minus
+static inline Vec8s operator-(Vec8s const &a) { return _mm_sub_epi16(_mm_setzero_si128(), a); }
+
+// vector operator -= : subtract
+static inline Vec8s &operator-=(Vec8s &a, Vec8s const &b)
+{
+  a = a - b;
+  return a;
+}
+
+// postfix operator --
+static inline Vec8s operator--(Vec8s &a, int)
+{
+  Vec8s a0 = a;
+  a        = a - 1;
+  return a0;
+}
+
+// prefix operator --
+static inline Vec8s &operator--(Vec8s &a)
+{
+  a = a - 1;
+  return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec8s operator*(Vec8s const &a, Vec8s const &b) { return _mm_mullo_epi16(a, b); }
+
+// vector operator *= : multiply
+static inline Vec8s &operator*=(Vec8s &a, Vec8s const &b)
+{
+  a = a * b;
+  return a;
+}
+
+// vector operator / : divide all elements by same integer
+// See bottom of file
+
+// vector operator << : shift left
+static inline Vec8s operator<<(Vec8s const &a, int b) { return _mm_sll_epi16(a, _mm_cvtsi32_si128(b)); }
+
+// vector operator <<= : shift left
+static inline Vec8s &operator<<=(Vec8s &a, int b)
+{
+  a = a << b;
+  return a;
+}
+
+// vector operator >> : shift right arithmetic
+static inline Vec8s operator>>(Vec8s const &a, int b) { return _mm_sra_epi16(a, _mm_cvtsi32_si128(b)); }
+
+// vector operator >>= : shift right arithmetic
+static inline Vec8s &operator>>=(Vec8s &a, int b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec8sb operator==(Vec8s const &a, Vec8s const &b) { return _mm_cmpeq_epi16(a, b); }
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec8sb operator!=(Vec8s const &a, Vec8s const &b)
+{
+#ifdef __XOP__  // AMD XOP instruction set
+  return (Vec8sb)_mm_comneq_epi16(a, b);
+#else  // SSE2 instruction set
+  return Vec8sb(~(a == b));
+#endif
+}
+
+// vector operator > : returns true for elements for which a > b
+static inline Vec8sb operator>(Vec8s const &a, Vec8s const &b) { return _mm_cmpgt_epi16(a, b); }
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec8sb operator<(Vec8s const &a, Vec8s const &b) { return b > a; }
+
+// vector operator >= : returns true for elements for which a >= b (signed)
+static inline Vec8sb operator>=(Vec8s const &a, Vec8s const &b)
+{
+#ifdef __XOP__  // AMD XOP instruction set
+  return (Vec8sb)_mm_comge_epi16(a, b);
+#else  // SSE2 instruction set
+  return Vec8sb(~(b > a));
+#endif
+}
+
+// vector operator <= : returns true for elements for which a <= b (signed)
+static inline Vec8sb operator<=(Vec8s const &a, Vec8s const &b) { return b >= a; }
+
+// vector operator & : bitwise and
+static inline Vec8s operator&(Vec8s const &a, Vec8s const &b) { return Vec8s(Vec128b(a) & Vec128b(b)); }
+static inline Vec8s operator&&(Vec8s const &a, Vec8s const &b) { return a & b; }
+// vector operator &= : bitwise and
+static inline Vec8s &operator&=(Vec8s &a, Vec8s const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec8s operator|(Vec8s const &a, Vec8s const &b) { return Vec8s(Vec128b(a) | Vec128b(b)); }
+static inline Vec8s operator||(Vec8s const &a, Vec8s const &b) { return a | b; }
+// vector operator |= : bitwise or
+static inline Vec8s &operator|=(Vec8s &a, Vec8s const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec8s operator^(Vec8s const &a, Vec8s const &b) { return Vec8s(Vec128b(a) ^ Vec128b(b)); }
+// vector operator ^= : bitwise xor
+static inline Vec8s &operator^=(Vec8s &a, Vec8s const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec8s operator~(Vec8s const &a) { return Vec8s(~Vec128b(a)); }
+
+// vector operator ! : logical not, returns true for elements == 0
+static inline Vec8s operator!(Vec8s const &a) { return _mm_cmpeq_epi16(a, _mm_setzero_si128()); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec8s select(Vec8sb const &s, Vec8s const &a, Vec8s const &b) { return selectb(s, a, b); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec8s if_add(Vec8sb const &f, Vec8s const &a, Vec8s const &b) { return a + (Vec8s(f) & b); }
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline int32_t horizontal_add(Vec8s const &a)
+{
+#ifdef __XOP__  // AMD XOP instruction set
+  __m128i sum1 = _mm_haddq_epi16(a);
+  __m128i sum2 = _mm_shuffle_epi32(sum1, 0x0E);  // high element
+  __m128i sum3 = _mm_add_epi32(sum1, sum2);      // sum
+  int16_t sum4 = _mm_cvtsi128_si32(sum3);        // truncate to 16 bits
+  return sum4;                                   // sign extend to 32 bits
+#elif INSTRSET >= 4                              // SSSE3
+  __m128i sum1 = _mm_hadd_epi16(a, a);  // horizontally add 8 elements in 3 steps
+  __m128i sum2 = _mm_hadd_epi16(sum1, sum1);
+  __m128i sum3 = _mm_hadd_epi16(sum2, sum2);
+  int16_t sum4 = (int16_t)_mm_cvtsi128_si32(sum3);  // 16 bit sum
+  return sum4;                                      // sign extend to 32 bits
+#else                                            // SSE2
+  __m128i sum1 = _mm_shuffle_epi32(a, 0x0E);        // 4 high elements
+  __m128i sum2 = _mm_add_epi16(a, sum1);            // 4 sums
+  __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);     // 2 high elements
+  __m128i sum4 = _mm_add_epi16(sum2, sum3);         // 2 sums
+  __m128i sum5 = _mm_shufflelo_epi16(sum4, 0x01);   // 1 high element
+  __m128i sum6 = _mm_add_epi16(sum4, sum5);         // 1 sum
+  int16_t sum7 = _mm_cvtsi128_si32(sum6);           // 16 bit sum
+  return sum7;                                      // sign extend to 32 bits
+#endif
+}
+
+// Horizontal add extended: Calculates the sum of all vector elements.
+// Elements are sign extended before adding to avoid overflow
+static inline int32_t horizontal_add_x(Vec8s const &a)
+{
+#ifdef __XOP__  // AMD XOP instruction set
+  __m128i sum1 = _mm_haddq_epi16(a);
+  __m128i sum2 = _mm_shuffle_epi32(sum1, 0x0E);  // high element
+  __m128i sum3 = _mm_add_epi32(sum1, sum2);      // sum
+  return _mm_cvtsi128_si32(sum3);
+#elif INSTRSET >= 4  // SSSE3
+  __m128i aeven = _mm_slli_epi32(a, 16);            // even numbered elements of a. get sign bit in position
+  aeven         = _mm_srai_epi32(aeven, 16);        // sign extend even numbered elements
+  __m128i aodd  = _mm_srai_epi32(a, 16);            // sign extend odd  numbered elements
+  __m128i sum1  = _mm_add_epi32(aeven, aodd);       // add even and odd elements
+  __m128i sum2  = _mm_hadd_epi32(sum1, sum1);       // horizontally add 4 elements in 2 steps
+  __m128i sum3  = _mm_hadd_epi32(sum2, sum2);
+  return _mm_cvtsi128_si32(sum3);
+#else                // SSE2
+  __m128i aeven = _mm_slli_epi32(a, 16);            // even numbered elements of a. get sign bit in position
+  aeven         = _mm_srai_epi32(aeven, 16);        // sign extend even numbered elements
+  __m128i aodd  = _mm_srai_epi32(a, 16);            // sign extend odd  numbered elements
+  __m128i sum1  = _mm_add_epi32(aeven, aodd);       // add even and odd elements
+  __m128i sum2  = _mm_shuffle_epi32(sum1, 0x0E);    // 2 high elements
+  __m128i sum3  = _mm_add_epi32(sum1, sum2);
+  __m128i sum4  = _mm_shuffle_epi32(sum3, 0x01);  // 1 high element
+  __m128i sum5  = _mm_add_epi32(sum3, sum4);
+  return _mm_cvtsi128_si32(sum5);                   // 32 bit sum
+#endif
+}
+
+// function add_saturated: add element by element, signed with saturation
+static inline Vec8s add_saturated(Vec8s const &a, Vec8s const &b) { return _mm_adds_epi16(a, b); }
+
+// function sub_saturated: subtract element by element, signed with saturation
+static inline Vec8s sub_saturated(Vec8s const &a, Vec8s const &b) { return _mm_subs_epi16(a, b); }
+
+// function max: a > b ? a : b
+static inline Vec8s max(Vec8s const &a, Vec8s const &b) { return _mm_max_epi16(a, b); }
+
+// function min: a < b ? a : b
+static inline Vec8s min(Vec8s const &a, Vec8s const &b) { return _mm_min_epi16(a, b); }
+
+// function abs: a >= 0 ? a : -a
+static inline Vec8s abs(Vec8s const &a)
+{
+#if INSTRSET >= 4  // SSSE3 supported
+  return _mm_sign_epi16(a, a);
+#else  // SSE2
+  __m128i nega = _mm_sub_epi16(_mm_setzero_si128(), a);
+  return _mm_max_epi16(a, nega);
+#endif
+}
+
+// function abs_saturated: same as abs, saturate if overflow
+static inline Vec8s abs_saturated(Vec8s const &a)
+{
+  __m128i absa   = abs(a);                    // abs(a)
+  __m128i overfl = _mm_srai_epi16(absa, 15);  // sign
+  return _mm_add_epi16(absa, overfl);         // subtract 1 if 0x8000
+}
+
+// function rotate_left: rotate each element left by b bits
+// Use negative count to rotate right
+static inline Vec8s rotate_left(Vec8s const &a, int b)
+{
+#ifdef __XOP__  // AMD XOP instruction set
+  return _mm_rot_epi16(a, _mm_set1_epi16(b));
+#else  // SSE2 instruction set
+  __m128i left  = _mm_sll_epi16(a, _mm_cvtsi32_si128(b & 0x0F));         // a << b
+  __m128i right = _mm_srl_epi16(a, _mm_cvtsi32_si128((16 - b) & 0x0F));  // a >> (16 - b)
+  __m128i rot   = _mm_or_si128(left, right);                             // or
+  return rot;
+#endif
+}
+
+/*****************************************************************************
+ *
+ *          Vector of 8 16-bit unsigned integers
+ *
+ *****************************************************************************/
+
+class Vec8us : public Vec8s
+{
+ public:
+  // Default constructor:
+  Vec8us() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec8us(uint32_t i) { xmm = _mm_set1_epi16((int16_t)i); }
+  // Constructor to build from all elements:
+  Vec8us(uint16_t i0, uint16_t i1, uint16_t i2, uint16_t i3, uint16_t i4, uint16_t i5, uint16_t i6, uint16_t i7)
+  {
+    xmm = _mm_setr_epi16(i0, i1, i2, i3, i4, i5, i6, i7);
+  }
+  // Constructor to convert from type __m128i used in intrinsics:
+  Vec8us(__m128i const &x) { xmm = x; }
+  // Assignment operator to convert from type __m128i used in intrinsics:
+  Vec8us &operator=(__m128i const &x)
+  {
+    xmm = x;
+    return *this;
+  }
+  // Member function to load from array (unaligned)
+  Vec8us &load(void const *p)
+  {
+    xmm = _mm_loadu_si128((__m128i const *)p);
+    return *this;
+  }
+  // Member function to load from array (aligned)
+  Vec8us &load_a(void const *p)
+  {
+    xmm = _mm_load_si128((__m128i const *)p);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec8us const &insert(uint32_t index, uint16_t value)
+  {
+    Vec8s::insert(index, value);
+    return *this;
+  }
+  // Member function to extract a single element from the vector
+  uint16_t extract(uint32_t index) const { return Vec8s::extract(index); }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  uint16_t operator[](uint32_t index) const { return extract(index); }
+};
+
+// Define operators for this class
+
+// vector operator + : add
+static inline Vec8us operator+(Vec8us const &a, Vec8us const &b) { return Vec8us(Vec8s(a) + Vec8s(b)); }
+
+// vector operator - : subtract
+static inline Vec8us operator-(Vec8us const &a, Vec8us const &b) { return Vec8us(Vec8s(a) - Vec8s(b)); }
+
+// vector operator * : multiply
+static inline Vec8us operator*(Vec8us const &a, Vec8us const &b) { return Vec8us(Vec8s(a) * Vec8s(b)); }
+
+// vector operator / : divide
+// See bottom of file
+
+// vector operator >> : shift right logical all elements
+static inline Vec8us operator>>(Vec8us const &a, uint32_t b) { return _mm_srl_epi16(a, _mm_cvtsi32_si128(b)); }
+
+// vector operator >> : shift right logical all elements
+static inline Vec8us operator>>(Vec8us const &a, int32_t b) { return a >> (uint32_t)b; }
+
+// vector operator >>= : shift right logical
+static inline Vec8us &operator>>=(Vec8us &a, int b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator << : shift left all elements
+static inline Vec8us operator<<(Vec8us const &a, uint32_t b) { return _mm_sll_epi16(a, _mm_cvtsi32_si128(b)); }
+
+// vector operator << : shift left all elements
+static inline Vec8us operator<<(Vec8us const &a, int32_t b) { return a << (uint32_t)b; }
+
+// vector operator >= : returns true for elements for which a >= b (unsigned)
+static inline Vec8sb operator>=(Vec8us const &a, Vec8us const &b)
+{
+#ifdef __XOP__  // AMD XOP instruction set
+  return _mm_comge_epu16(a, b);
+#elif INSTRSET >= 5  // SSE4.1
+  __m128i max_ab = _mm_max_epu16(a, b);  // max(a,b), unsigned
+  return _mm_cmpeq_epi16(a, max_ab);     // a == max(a,b)
+#else                // SSE2 instruction set
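+  // unsigned a >= b  <=>  the saturating difference b - a is zero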
+  __m128i s = _mm_subs_epu16(b, a);                 // b-a, saturated
+  return _mm_cmpeq_epi16(s, _mm_setzero_si128());   // s == 0
+#endif
+}
+
+// vector operator <= : returns true for elements for which a <= b (unsigned)
+static inline Vec8sb operator<=(Vec8us const &a, Vec8us const &b) { return b >= a; }
+
+// vector operator > : returns true for elements for which a > b (unsigned)
+static inline Vec8sb operator>(Vec8us const &a, Vec8us const &b)
+{
+#ifdef __XOP__  // AMD XOP instruction set
+  return (Vec8sb)_mm_comgt_epu16(a, b);
+#else  // SSE2 instruction set
+  return Vec8sb(~(b >= a));
+#endif
+}
+
+// vector operator < : returns true for elements for which a < b (unsigned)
+static inline Vec8sb operator<(Vec8us const &a, Vec8us const &b) { return b > a; }
+
+// vector operator & : bitwise and
+static inline Vec8us operator&(Vec8us const &a, Vec8us const &b) { return Vec8us(Vec128b(a) & Vec128b(b)); }
+static inline Vec8us operator&&(Vec8us const &a, Vec8us const &b) { return a & b; }
+
+// vector operator | : bitwise or
+static inline Vec8us operator|(Vec8us const &a, Vec8us const &b) { return Vec8us(Vec128b(a) | Vec128b(b)); }
+static inline Vec8us operator||(Vec8us const &a, Vec8us const &b) { return a | b; }
+
+// vector operator ^ : bitwise xor
+static inline Vec8us operator^(Vec8us const &a, Vec8us const &b) { return Vec8us(Vec128b(a) ^ Vec128b(b)); }
+
+// vector operator ~ : bitwise not
+static inline Vec8us operator~(Vec8us const &a) { return Vec8us(~Vec128b(a)); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+// Each word in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec8us select(Vec8sb const &s, Vec8us const &a, Vec8us const &b) { return selectb(s, a, b); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec8us if_add(Vec8sb const &f, Vec8us const &a, Vec8us const &b) { return a + (Vec8us(f) & b); }
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline uint32_t horizontal_add(Vec8us const &a)
+{
+#ifdef __XOP__  // AMD XOP instruction set
+  __m128i sum1  = _mm_haddq_epu16(a);
+  __m128i sum2  = _mm_shuffle_epi32(sum1, 0x0E);  // high element
+  __m128i sum3  = _mm_add_epi32(sum1, sum2);      // sum
+  uint16_t sum4 = _mm_cvtsi128_si32(sum3);        // truncate to 16 bits
+  return sum4;                                    // zero extend to 32 bits
+#elif INSTRSET >= 4                               // SSSE3
+  __m128i sum1  = _mm_hadd_epi16(a, a);  // horizontally add 8 elements in 3 steps
+  __m128i sum2  = _mm_hadd_epi16(sum1, sum1);
+  __m128i sum3  = _mm_hadd_epi16(sum2, sum2);
+  uint16_t sum4 = (uint16_t)_mm_cvtsi128_si32(sum3);  // 16 bit sum
+  return sum4;                                        // zero extend to 32 bits
+#else                                             // SSE2
+  __m128i sum1  = _mm_shuffle_epi32(a, 0x0E);       // 4 high elements
+  __m128i sum2  = _mm_add_epi16(a, sum1);           // 4 sums
+  __m128i sum3  = _mm_shuffle_epi32(sum2, 0x01);    // 2 high elements
+  __m128i sum4  = _mm_add_epi16(sum2, sum3);        // 2 sums
+  __m128i sum5  = _mm_shufflelo_epi16(sum4, 0x01);  // 1 high element
+  __m128i sum6  = _mm_add_epi16(sum4, sum5);        // 1 sum
+  uint16_t sum7 = _mm_cvtsi128_si32(sum6);          // 16 bit sum
+  return sum7;                                      // zero extend to 32 bits
+#endif
+}
+
+// Horizontal add extended: Calculates the sum of all vector elements.
+// Each element is zero-extended before addition to avoid overflow
+static inline uint32_t horizontal_add_x(Vec8us const &a)
+{
+#ifdef __XOP__  // AMD XOP instruction set
+  __m128i sum1 = _mm_haddq_epu16(a);
+  __m128i sum2 = _mm_shuffle_epi32(sum1, 0x0E);  // high element
+  __m128i sum3 = _mm_add_epi32(sum1, sum2);      // sum
+  return _mm_cvtsi128_si32(sum3);
+#elif INSTRSET >= 4  // SSSE3
+  __m128i mask  = _mm_set1_epi32(0x0000FFFF);         // mask for even positions
+  __m128i aeven = _mm_and_si128(a, mask);             // even numbered elements of a
+  __m128i aodd  = _mm_srli_epi32(a, 16);              // zero extend odd numbered elements
+  __m128i sum1  = _mm_add_epi32(aeven, aodd);         // add even and odd elements
+  __m128i sum2  = _mm_hadd_epi32(sum1, sum1);         // horizontally add 4 elements in 2 steps
+  __m128i sum3  = _mm_hadd_epi32(sum2, sum2);
+  return _mm_cvtsi128_si32(sum3);
+#else                // SSE2
+  __m128i mask  = _mm_set1_epi32(0x0000FFFF);       // mask for even positions
+  __m128i aeven = _mm_and_si128(a, mask);           // even numbered elements of a
+  __m128i aodd  = _mm_srli_epi32(a, 16);            // zero extend odd numbered elements
+  __m128i sum1  = _mm_add_epi32(aeven, aodd);       // add even and odd elements
+  __m128i sum2  = _mm_shuffle_epi32(sum1, 0x0E);    // 2 high elements
+  __m128i sum3  = _mm_add_epi32(sum1, sum2);
+  __m128i sum4  = _mm_shuffle_epi32(sum3, 0x01);  // 1 high element
+  __m128i sum5  = _mm_add_epi32(sum3, sum4);
+  return _mm_cvtsi128_si32(sum5);                   // 32 bit sum
+#endif
+}
+
+// function add_saturated: add element by element, unsigned with saturation
+static inline Vec8us add_saturated(Vec8us const &a, Vec8us const &b) { return _mm_adds_epu16(a, b); }
+
+// function sub_saturated: subtract element by element, unsigned with saturation
+static inline Vec8us sub_saturated(Vec8us const &a, Vec8us const &b) { return _mm_subs_epu16(a, b); }
+
+// function max: a > b ? a : b
+static inline Vec8us max(Vec8us const &a, Vec8us const &b)
+{
+#if INSTRSET >= 5  // SSE4.1
+  return _mm_max_epu16(a, b);
+#else  // SSE2
+  __m128i signbit = _mm_set1_epi32(0x80008000);
+  __m128i a1      = _mm_xor_si128(a, signbit);  // add 0x8000
+  __m128i b1      = _mm_xor_si128(b, signbit);  // add 0x8000
+  __m128i m1      = _mm_max_epi16(a1, b1);      // signed max
+  return _mm_xor_si128(m1, signbit);            // sub 0x8000
+#endif
+}
+
+// function min: a < b ? a : b
+static inline Vec8us min(Vec8us const &a, Vec8us const &b)
+{
+#if INSTRSET >= 5  // SSE4.1
+  return _mm_min_epu16(a, b);
+#else  // SSE2
+  __m128i signbit = _mm_set1_epi32(0x80008000);
+  __m128i a1      = _mm_xor_si128(a, signbit);  // add 0x8000
+  __m128i b1      = _mm_xor_si128(b, signbit);  // add 0x8000
+  __m128i m1      = _mm_min_epi16(a1, b1);      // signed min
+  return _mm_xor_si128(m1, signbit);            // sub 0x8000
+#endif
+}
+
+/*****************************************************************************
+ *
+ *          Vector of 4 32-bit signed integers
+ *
+ *****************************************************************************/
+
+class Vec4i : public Vec128b
+{
+ public:
+  // Default constructor:
+  Vec4i() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec4i(int i) { xmm = _mm_set1_epi32(i); }
+  // Constructor to build from all elements:
+  Vec4i(int32_t i0, int32_t i1, int32_t i2, int32_t i3) { xmm = _mm_setr_epi32(i0, i1, i2, i3); }
+  // Constructor to convert from type __m128i used in intrinsics:
+  Vec4i(__m128i const &x) { xmm = x; }
+  // Assignment operator to convert from type __m128i used in intrinsics:
+  Vec4i &operator=(__m128i const &x)
+  {
+    xmm = x;
+    return *this;
+  }
+  // Type cast operator to convert to __m128i used in intrinsics
+  operator __m128i() const { return xmm; }
+  // Member function to load from array (unaligned)
+  Vec4i &load(void const *p)
+  {
+    xmm = _mm_loadu_si128((__m128i const *)p);
+    return *this;
+  }
+  // Member function to load from array (aligned)
+  Vec4i &load_a(void const *p)
+  {
+    xmm = _mm_load_si128((__m128i const *)p);
+    return *this;
+  }
+  // Partial load. Load n elements and set the rest to 0
+  Vec4i &load_partial(int n, void const *p)
+  {
+    switch(n)
+      {
+        case 0:
+          *this = 0;
+          break;
+        case 1:
+          xmm = _mm_cvtsi32_si128(*(int32_t const *)p);
+          break;
+        case 2:
+          // intrinsic for movq is missing!
+          xmm = _mm_setr_epi32(((int32_t const *)p)[0], ((int32_t const *)p)[1], 0, 0);
+          break;
+        case 3:
+          xmm = _mm_setr_epi32(((int32_t const *)p)[0], ((int32_t const *)p)[1], ((int32_t const *)p)[2], 0);
+          break;
+        case 4:
+          load(p);
+          break;
+        default:
+          break;
+      }
+    return *this;
+  }
+  // Partial store. Store n elements
+  void store_partial(int n, void *p) const
+  {
+    union
+    {
+      int32_t i[4];
+      int64_t q[2];
+    } u;
+    switch(n)
+      {
+        case 1:
+          *(int32_t *)p = _mm_cvtsi128_si32(xmm);
+          break;
+        case 2:
+          // intrinsic for movq is missing!
+          store(u.i);
+          *(int64_t *)p = u.q[0];
+          break;
+        case 3:
+          store(u.i);
+          *(int64_t *)p     = u.q[0];
+          ((int32_t *)p)[2] = u.i[2];
+          break;
+        case 4:
+          store(p);
+          break;
+        default:
+          break;
+      }
+  }
+  // cut off vector to n elements. The last 4-n elements are set to zero
+  Vec4i &cutoff(int n)
+  {
+    *this = Vec16c(xmm).cutoff(n * 4);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec4i const &insert(uint32_t index, int32_t value)
+  {
+    static const int32_t maskl[8] = {0, 0, 0, 0, -1, 0, 0, 0};
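+    // sliding window: loading 4 ints starting (4 - index) entries into maskl puts the single -1
+    // exactly at lane 'index', giving an all-ones mask only for the element to be replaced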
+    __m128i broad                 = _mm_set1_epi32(value);                       // broadcast value into all elements
+    __m128i mask = _mm_loadu_si128((__m128i const *)(maskl + 4 - (index & 3)));  // mask with FFFFFFFF at index position
+    xmm          = selectb(mask, broad, xmm);
+    return *this;
+  }
+  // Member function to extract a single element from the vector
+  int32_t extract(uint32_t index) const
+  {
+    int32_t x[4];
+    store(x);
+    return x[index & 3];
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  int32_t operator[](uint32_t index) const { return extract(index); }
+  static int size() { return 4; }
+};
+
+/*****************************************************************************
+ *
+ *          Vec4ib: Vector of 4 Booleans for use with Vec4i and Vec4ui
+ *
+ *****************************************************************************/
+class Vec4ib : public Vec4i
+{
+ public:
+  // Default constructor:
+  Vec4ib() {}
+  // Constructor to build from all elements:
+  Vec4ib(bool x0, bool x1, bool x2, bool x3) { xmm = Vec4i(-int32_t(x0), -int32_t(x1), -int32_t(x2), -int32_t(x3)); }
+  // Constructor to convert from type __m128i used in intrinsics:
+  Vec4ib(__m128i const &x) { xmm = x; }
+  // Assignment operator to convert from type __m128i used in intrinsics:
+  Vec4ib &operator=(__m128i const &x)
+  {
+    xmm = x;
+    return *this;
+  }
+  // Constructor to broadcast scalar value:
+  Vec4ib(bool b) : Vec4i(-int32_t(b)) {}
+  // Assignment operator to broadcast scalar value:
+  Vec4ib &operator=(bool b)
+  {
+    *this = Vec4ib(b);
+    return *this;
+  }
+
+ private:  // Prevent constructing from int, etc.
+  Vec4ib(int b);
+  Vec4ib &operator=(int x);
+
+ public:
+  Vec4ib &insert(int index, bool a)
+  {
+    Vec4i::insert(index, -(int)a);
+    return *this;
+  }
+  // Member function to extract a single element from the vector
+  bool extract(uint32_t index) const { return Vec4i::extract(index) != 0; }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  bool operator[](uint32_t index) const { return extract(index); }
+};
+
+/*****************************************************************************
+ *
+ *          Define operators for Vec4ib
+ *
+ *****************************************************************************/
+
+// vector operator & : bitwise and
+static inline Vec4ib operator&(Vec4ib const &a, Vec4ib const &b) { return Vec4ib(Vec128b(a) & Vec128b(b)); }
+static inline Vec4ib operator&&(Vec4ib const &a, Vec4ib const &b) { return a & b; }
+// vector operator &= : bitwise and
+static inline Vec4ib &operator&=(Vec4ib &a, Vec4ib const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec4ib operator|(Vec4ib const &a, Vec4ib const &b) { return Vec4ib(Vec128b(a) | Vec128b(b)); }
+static inline Vec4ib operator||(Vec4ib const &a, Vec4ib const &b) { return a | b; }
+// vector operator |= : bitwise or
+static inline Vec4ib &operator|=(Vec4ib &a, Vec4ib const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec4ib operator^(Vec4ib const &a, Vec4ib const &b) { return Vec4ib(Vec128b(a) ^ Vec128b(b)); }
+// vector operator ^= : bitwise xor
+static inline Vec4ib &operator^=(Vec4ib &a, Vec4ib const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec4ib operator~(Vec4ib const &a) { return Vec4ib(~Vec128b(a)); }
+
+// vector operator ! : element not
+static inline Vec4ib operator!(Vec4ib const &a) { return ~a; }
+
+// vector function andnot
+static inline Vec4ib andnot(Vec4ib const &a, Vec4ib const &b) { return Vec4ib(andnot(Vec128b(a), Vec128b(b))); }
+
+// Horizontal Boolean functions for Vec4ib
+
+// horizontal_and. Returns true if all elements are true
+static inline bool horizontal_and(Vec4ib const &a) { return _mm_movemask_epi8(a) == 0xFFFF; }
+
+// horizontal_or. Returns true if at least one element is true
+static inline bool horizontal_or(Vec4ib const &a)
+{
+#if INSTRSET >= 5  // SSE4.1 supported. Use PTEST
+  return !_mm_testz_si128(a, a);
+#else
+  return _mm_movemask_epi8(a) != 0;
+#endif
+}
+
+/*****************************************************************************
+ *
+ *          Operators for Vec4i
+ *
+ *****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec4i operator+(Vec4i const &a, Vec4i const &b) { return _mm_add_epi32(a, b); }
+
+// vector operator += : add
+static inline Vec4i &operator+=(Vec4i &a, Vec4i const &b)
+{
+  a = a + b;
+  return a;
+}
+
+// postfix operator ++
+static inline Vec4i operator++(Vec4i &a, int)
+{
+  Vec4i a0 = a;
+  a        = a + 1;
+  return a0;
+}
+
+// prefix operator ++
+static inline Vec4i &operator++(Vec4i &a)
+{
+  a = a + 1;
+  return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec4i operator-(Vec4i const &a, Vec4i const &b) { return _mm_sub_epi32(a, b); }
+
+// vector operator - : unary minus
+static inline Vec4i operator-(Vec4i const &a) { return _mm_sub_epi32(_mm_setzero_si128(), a); }
+
+// vector operator -= : subtract
+static inline Vec4i &operator-=(Vec4i &a, Vec4i const &b)
+{
+  a = a - b;
+  return a;
+}
+
+// postfix operator --
+static inline Vec4i operator--(Vec4i &a, int)
+{
+  Vec4i a0 = a;
+  a        = a - 1;
+  return a0;
+}
+
+// prefix operator --
+static inline Vec4i &operator--(Vec4i &a)
+{
+  a = a - 1;
+  return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec4i operator*(Vec4i const &a, Vec4i const &b)
+{
+#if INSTRSET >= 5  // SSE4.1 instruction set
+  return _mm_mullo_epi32(a, b);
+#else
+  __m128i a13    = _mm_shuffle_epi32(a, 0xF5);          // (-,a3,-,a1)
+  __m128i b13    = _mm_shuffle_epi32(b, 0xF5);          // (-,b3,-,b1)
+  __m128i prod02 = _mm_mul_epu32(a, b);                 // (-,a2*b2,-,a0*b0)
+  __m128i prod13 = _mm_mul_epu32(a13, b13);             // (-,a3*b3,-,a1*b1)
+  __m128i prod01 = _mm_unpacklo_epi32(prod02, prod13);  // (-,-,a1*b1,a0*b0)
+  __m128i prod23 = _mm_unpackhi_epi32(prod02, prod13);  // (-,-,a3*b3,a2*b2)
+  return _mm_unpacklo_epi64(prod01, prod23);            // (ab3,ab2,ab1,ab0)
+#endif
+}
+
+// vector operator *= : multiply
+static inline Vec4i &operator*=(Vec4i &a, Vec4i const &b)
+{
+  a = a * b;
+  return a;
+}
+
+// vector operator / : divide all elements by same integer
+// See bottom of file
+
+// vector operator << : shift left
+static inline Vec4i operator<<(Vec4i const &a, int32_t b) { return _mm_sll_epi32(a, _mm_cvtsi32_si128(b)); }
+
+// vector operator <<= : shift left
+static inline Vec4i &operator<<=(Vec4i &a, int32_t b)
+{
+  a = a << b;
+  return a;
+}
+
+// vector operator >> : shift right arithmetic
+static inline Vec4i operator>>(Vec4i const &a, int32_t b) { return _mm_sra_epi32(a, _mm_cvtsi32_si128(b)); }
+
+// vector operator >>= : shift right arithmetic
+static inline Vec4i &operator>>=(Vec4i &a, int32_t b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec4ib operator==(Vec4i const &a, Vec4i const &b) { return _mm_cmpeq_epi32(a, b); }
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec4ib operator!=(Vec4i const &a, Vec4i const &b)
+{
+#ifdef __XOP__  // AMD XOP instruction set
+  return (Vec4ib)_mm_comneq_epi32(a, b);
+#else  // SSE2 instruction set
+  return Vec4ib(Vec4i(~(a == b)));
+#endif
+}
+
+// vector operator > : returns true for elements for which a > b
+static inline Vec4ib operator>(Vec4i const &a, Vec4i const &b) { return _mm_cmpgt_epi32(a, b); }
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec4ib operator<(Vec4i const &a, Vec4i const &b) { return b > a; }
+
+// vector operator >= : returns true for elements for which a >= b (signed)
+static inline Vec4ib operator>=(Vec4i const &a, Vec4i const &b)
+{
+#ifdef __XOP__  // AMD XOP instruction set
+  return (Vec4ib)_mm_comge_epi32(a, b);
+#else  // SSE2 instruction set
+  return Vec4ib(Vec4i(~(b > a)));
+#endif
+}
+
+// vector operator <= : returns true for elements for which a <= b (signed)
+static inline Vec4ib operator<=(Vec4i const &a, Vec4i const &b) { return b >= a; }
+
+// vector operator & : bitwise and
+static inline Vec4i operator&(Vec4i const &a, Vec4i const &b) { return Vec4i(Vec128b(a) & Vec128b(b)); }
+static inline Vec4i operator&&(Vec4i const &a, Vec4i const &b) { return a & b; }
+// vector operator &= : bitwise and
+static inline Vec4i &operator&=(Vec4i &a, Vec4i const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec4i operator|(Vec4i const &a, Vec4i const &b) { return Vec4i(Vec128b(a) | Vec128b(b)); }
+static inline Vec4i operator||(Vec4i const &a, Vec4i const &b) { return a | b; }
+// vector operator |= : bitwise or
+static inline Vec4i &operator|=(Vec4i &a, Vec4i const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec4i operator^(Vec4i const &a, Vec4i const &b) { return Vec4i(Vec128b(a) ^ Vec128b(b)); }
+// vector operator ^= : bitwise xor
+static inline Vec4i &operator^=(Vec4i &a, Vec4i const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec4i operator~(Vec4i const &a) { return Vec4i(~Vec128b(a)); }
+
+// vector operator ! : returns true for elements == 0
+static inline Vec4ib operator!(Vec4i const &a) { return _mm_cmpeq_epi32(a, _mm_setzero_si128()); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
+// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec4i select(Vec4ib const &s, Vec4i const &a, Vec4i const &b) { return selectb(s, a, b); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec4i if_add(Vec4ib const &f, Vec4i const &a, Vec4i const &b) { return a + (Vec4i(f) & b); }
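+
+// Illustrative usage sketch (not part of the original library): a comparison
+// result already satisfies the 0 / -1 mask requirement of select and if_add.
+//   Vec4i a(-3, 5, -7, 9);
+//   Vec4i clamped = select(a > Vec4i(0), a, Vec4i(0));    // (0, 5, 0, 9)
+//   Vec4i bumped  = if_add(a < Vec4i(0), a, Vec4i(100));  // (97, 5, 93, 9)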
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline int32_t horizontal_add(Vec4i const &a)
+{
+#ifdef __XOP__  // AMD XOP instruction set
+  __m128i sum1 = _mm_haddq_epi32(a);
+  __m128i sum2 = _mm_shuffle_epi32(sum1, 0x0E);  // high element
+  __m128i sum3 = _mm_add_epi32(sum1, sum2);      // sum
+  return _mm_cvtsi128_si32(sum3);                // truncate to 32 bits
+#elif INSTRSET >= 4                              // SSSE3
+  __m128i sum1 = _mm_hadd_epi32(a, a);  // horizontally add 4 elements in 2 steps
+  __m128i sum2 = _mm_hadd_epi32(sum1, sum1);
+  return _mm_cvtsi128_si32(sum2);                // 32 bit sum
+#else                                            // SSE2
+  __m128i sum1 = _mm_shuffle_epi32(a, 0x0E);                             // 2 high elements
+  __m128i sum2 = _mm_add_epi32(a, sum1);                                 // 2 sums
+  __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);                          // 1 high element
+  __m128i sum4 = _mm_add_epi32(sum2, sum3);                              // 2 sums
+  return _mm_cvtsi128_si32(sum4);                                        // 32 bit sum
+#endif
+}
+
+// Horizontal add extended: Calculates the sum of all vector elements.
+// Elements are sign extended before adding to avoid overflow
+static inline int64_t horizontal_add_x(Vec4i const &a)
+{
+#ifdef __XOP__  // AMD XOP instruction set
+  __m128i sum1 = _mm_haddq_epi32(a);
+#else  // SSE2
+  __m128i signs = _mm_srai_epi32(a, 31);         // sign of all elements
+  __m128i a01   = _mm_unpacklo_epi32(a, signs);  // sign-extended a0, a1
+  __m128i a23   = _mm_unpackhi_epi32(a, signs);  // sign-extended a2, a3
+  __m128i sum1  = _mm_add_epi64(a01, a23);       // add
+#endif
+  __m128i sum2 = _mm_unpackhi_epi64(sum1, sum1);  // high qword
+  __m128i sum3 = _mm_add_epi64(sum1, sum2);       // add
+#if defined(__x86_64__)
+  return _mm_cvtsi128_si64(sum3);  // 64 bit mode
+#else
+  union
+  {
+    __m128i x;  // silly definition of _mm_storel_epi64 requires __m128i
+    int64_t i;
+  } u;
+  _mm_storel_epi64(&u.x, sum3);
+  return u.i;
+#endif
+}
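+
+// Illustrative sketch (not part of the original library): the 32-bit sum of
+// four INT32_MIN elements wraps to 0 in horizontal_add, while horizontal_add_x
+// sign-extends each element first and returns the exact 64-bit result:
+//   Vec4i v(INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN);
+//   horizontal_add(v);     // 0 (wrapped)
+//   horizontal_add_x(v);   // -8589934592 (= 4 * INT32_MIN)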
+
+// function add_saturated: add element by element, signed with saturation
+static inline Vec4i add_saturated(Vec4i const &a, Vec4i const &b)
+{
+  __m128i sum    = _mm_add_epi32(a, b);         // a + b
+  __m128i axb    = _mm_xor_si128(a, b);         // check if a and b have different sign
+  __m128i axs    = _mm_xor_si128(a, sum);       // check if a and sum have different sign
+  __m128i overf1 = _mm_andnot_si128(axb, axs);  // check if sum has wrong sign
+  __m128i overf2 = _mm_srai_epi32(overf1, 31);  // -1 if overflow
+  __m128i asign  = _mm_srli_epi32(a, 31);       // 1  if a < 0
+  __m128i sat1   = _mm_srli_epi32(overf2, 1);   // 7FFFFFFF if overflow
+  __m128i sat2   = _mm_add_epi32(sat1, asign);  // 7FFFFFFF if positive overflow 80000000 if negative overflow
+  return selectb(overf2, sat2, sum);            // sum if not overflow, else sat2
+}
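+
+// Illustrative sketch (not part of the original library): unlike operator+,
+// add_saturated clamps to INT32_MAX / INT32_MIN instead of wrapping:
+//   Vec4i x(INT32_MAX, INT32_MIN, 1, -1);
+//   Vec4i y(1,         -1,        1,  1);
+//   add_saturated(x, y);   // (INT32_MAX, INT32_MIN, 2, 0)
+//   x + y;                 // (INT32_MIN, INT32_MAX, 2, 0)  wrapped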
+
+// function sub_saturated: subtract element by element, signed with saturation
+static inline Vec4i sub_saturated(Vec4i const &a, Vec4i const &b)
+{
+  __m128i diff   = _mm_sub_epi32(a, b);         // a - b
+  __m128i axb    = _mm_xor_si128(a, b);         // check if a and b have different sign
+  __m128i axs    = _mm_xor_si128(a, diff);      // check if a and diff have different sign
+  __m128i overf1 = _mm_and_si128(axb, axs);     // check if diff has wrong sign
+  __m128i overf2 = _mm_srai_epi32(overf1, 31);  // -1 if overflow
+  __m128i asign  = _mm_srli_epi32(a, 31);       // 1  if a < 0
+  __m128i sat1   = _mm_srli_epi32(overf2, 1);   // 7FFFFFFF if overflow
+  __m128i sat2   = _mm_add_epi32(sat1, asign);  // 7FFFFFFF if positive overflow 80000000 if negative overflow
+  return selectb(overf2, sat2, diff);           // diff if not overflow, else sat2
+}
+
+// function max: a > b ? a : b
+static inline Vec4i max(Vec4i const &a, Vec4i const &b)
+{
+#if INSTRSET >= 5  // SSE4.1 supported
+  return _mm_max_epi32(a, b);
+#else
+  __m128i greater = _mm_cmpgt_epi32(a, b);
+  return selectb(greater, a, b);
+#endif
+}
+
+// function min: a < b ? a : b
+static inline Vec4i min(Vec4i const &a, Vec4i const &b)
+{
+#if INSTRSET >= 5  // SSE4.1 supported
+  return _mm_min_epi32(a, b);
+#else
+  __m128i greater = _mm_cmpgt_epi32(a, b);
+  return selectb(greater, b, a);
+#endif
+}
+
+// function abs: a >= 0 ? a : -a
+static inline Vec4i abs(Vec4i const &a)
+{
+#if INSTRSET >= 4  // SSSE3 supported
+  return _mm_sign_epi32(a, a);
+#else  // SSE2
+  __m128i sign = _mm_srai_epi32(a, 31);   // sign of a
+  __m128i inv  = _mm_xor_si128(a, sign);  // invert bits if negative
+  return _mm_sub_epi32(inv, sign);        // add 1
+#endif
+}
+
+// function abs_saturated: same as abs, saturate if overflow
+static inline Vec4i abs_saturated(Vec4i const &a)
+{
+  __m128i absa   = abs(a);                    // abs(a)
+  __m128i overfl = _mm_srai_epi32(absa, 31);  // sign
+  return _mm_add_epi32(absa, overfl);         // subtract 1 if 0x80000000
+}
+
+// function rotate_left all elements
+// Use negative count to rotate right
+static inline Vec4i rotate_left(Vec4i const &a, int b)
+{
+#ifdef __AVX512VL__
+  return _mm_rolv_epi32(a, _mm_set1_epi32(b));
+#elif defined __XOP__  // AMD XOP instruction set
+  return _mm_rot_epi32(a, _mm_set1_epi32(b));
+#else                  // SSE2 instruction set
+  __m128i left  = _mm_sll_epi32(a, _mm_cvtsi32_si128(b & 0x1F));         // a << b
+  __m128i right = _mm_srl_epi32(a, _mm_cvtsi32_si128((32 - b) & 0x1F));  // a >> (32 - b)
+  __m128i rot   = _mm_or_si128(left, right);                             // or
+  return rot;
+#endif
+}
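+
+// Illustrative sketch (not part of the original library):
+//   Vec4i r(1);
+//   rotate_left(r, 1);    // every element becomes 2
+//   rotate_left(r, -1);   // rotate right by 1: every element becomes 0x80000000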
+
+/*****************************************************************************
+ *
+ *          Vector of 4 32-bit unsigned integers
+ *
+ *****************************************************************************/
+
+class Vec4ui : public Vec4i
+{
+ public:
+  // Default constructor:
+  Vec4ui() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec4ui(uint32_t i) { xmm = _mm_set1_epi32(i); }
+  // Constructor to build from all elements:
+  Vec4ui(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3) { xmm = _mm_setr_epi32(i0, i1, i2, i3); }
+  // Constructor to convert from type __m128i used in intrinsics:
+  Vec4ui(__m128i const &x) { xmm = x; }
+  // Assignment operator to convert from type __m128i used in intrinsics:
+  Vec4ui &operator=(__m128i const &x)
+  {
+    xmm = x;
+    return *this;
+  }
+  // Member function to load from array (unaligned)
+  Vec4ui &load(void const *p)
+  {
+    xmm = _mm_loadu_si128((__m128i const *)p);
+    return *this;
+  }
+  // Member function to load from array (aligned)
+  Vec4ui &load_a(void const *p)
+  {
+    xmm = _mm_load_si128((__m128i const *)p);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec4ui const &insert(uint32_t index, uint32_t value)
+  {
+    Vec4i::insert(index, value);
+    return *this;
+  }
+  // Member function to extract a single element from vector
+  uint32_t extract(uint32_t index) const { return Vec4i::extract(index); }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  uint32_t operator[](uint32_t index) const { return extract(index); }
+};
+
+// Define operators for this class
+
+// vector operator + : add
+static inline Vec4ui operator+(Vec4ui const &a, Vec4ui const &b) { return Vec4ui(Vec4i(a) + Vec4i(b)); }
+
+// vector operator - : subtract
+static inline Vec4ui operator-(Vec4ui const &a, Vec4ui const &b) { return Vec4ui(Vec4i(a) - Vec4i(b)); }
+
+// vector operator * : multiply
+static inline Vec4ui operator*(Vec4ui const &a, Vec4ui const &b) { return Vec4ui(Vec4i(a) * Vec4i(b)); }
+
+// vector operator / : divide
+// See bottom of file
+
+// vector operator >> : shift right logical all elements
+static inline Vec4ui operator>>(Vec4ui const &a, uint32_t b) { return _mm_srl_epi32(a, _mm_cvtsi32_si128(b)); }
+
+// vector operator >> : shift right logical all elements
+static inline Vec4ui operator>>(Vec4ui const &a, int32_t b) { return a >> (uint32_t)b; }
+
+// vector operator >>= : shift right logical
+static inline Vec4ui &operator>>=(Vec4ui &a, int b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator << : shift left all elements
+static inline Vec4ui operator<<(Vec4ui const &a, uint32_t b) { return Vec4ui((Vec4i)a << (int32_t)b); }
+
+// vector operator << : shift left all elements
+static inline Vec4ui operator<<(Vec4ui const &a, int32_t b) { return Vec4ui((Vec4i)a << (int32_t)b); }
+
+// vector operator > : returns true for elements for which a > b (unsigned)
+static inline Vec4ib operator>(Vec4ui const &a, Vec4ui const &b)
+{
+#ifdef __XOP__  // AMD XOP instruction set
+  return (Vec4ib)_mm_comgt_epu32(a, b);
+#else  // SSE2 instruction set
+  __m128i signbit = _mm_set1_epi32(0x80000000);
+  __m128i a1      = _mm_xor_si128(a, signbit);
+  __m128i b1      = _mm_xor_si128(b, signbit);
+  return (Vec4ib)_mm_cmpgt_epi32(a1, b1);      // signed compare
+#endif
+}
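+
+// Illustrative sketch (not part of the original library): flipping the sign bit
+// maps unsigned order onto signed order, so the signed compare above yields the
+// unsigned result:
+//   Vec4ui u(0x80000000u, 1, 0, 7);
+//   Vec4ui v(1,           2, 0, 7);
+//   u > v;   // (true, false, false, false)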
+
+// vector operator < : returns true for elements for which a < b (unsigned)
+static inline Vec4ib operator<(Vec4ui const &a, Vec4ui const &b) { return b > a; }
+
+// vector operator >= : returns true for elements for which a >= b (unsigned)
+static inline Vec4ib operator>=(Vec4ui const &a, Vec4ui const &b)
+{
+#ifdef __XOP__  // AMD XOP instruction set
+  return (Vec4ib)_mm_comge_epu32(a, b);
+#elif INSTRSET >= 5  // SSE4.1
+  __m128i max_ab = _mm_max_epu32(a, b);        // max(a,b), unsigned
+  return (Vec4ib)_mm_cmpeq_epi32(a, max_ab);   // a == max(a,b)
+#else                // SSE2 instruction set
+  return Vec4ib(Vec4i(~(b > a)));
+#endif
+}
+
+// vector operator <= : returns true for elements for which a <= b (unsigned)
+static inline Vec4ib operator<=(Vec4ui const &a, Vec4ui const &b) { return b >= a; }
+
+// vector operator & : bitwise and
+static inline Vec4ui operator&(Vec4ui const &a, Vec4ui const &b) { return Vec4ui(Vec128b(a) & Vec128b(b)); }
+static inline Vec4ui operator&&(Vec4ui const &a, Vec4ui const &b) { return a & b; }
+
+// vector operator | : bitwise or
+static inline Vec4ui operator|(Vec4ui const &a, Vec4ui const &b) { return Vec4ui(Vec128b(a) | Vec128b(b)); }
+static inline Vec4ui operator||(Vec4ui const &a, Vec4ui const &b) { return a | b; }
+
+// vector operator ^ : bitwise xor
+static inline Vec4ui operator^(Vec4ui const &a, Vec4ui const &b) { return Vec4ui(Vec128b(a) ^ Vec128b(b)); }
+
+// vector operator ~ : bitwise not
+static inline Vec4ui operator~(Vec4ui const &a) { return Vec4ui(~Vec128b(a)); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
+// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec4ui select(Vec4ib const &s, Vec4ui const &a, Vec4ui const &b) { return selectb(s, a, b); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec4ui if_add(Vec4ib const &f, Vec4ui const &a, Vec4ui const &b) { return a + (Vec4ui(f) & b); }
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline uint32_t horizontal_add(Vec4ui const &a) { return horizontal_add((Vec4i)a); }
+
+// Horizontal add extended: Calculates the sum of all vector elements.
+// Elements are zero extended before adding to avoid overflow
+static inline uint64_t horizontal_add_x(Vec4ui const &a)
+{
+#ifdef __XOP__  // AMD XOP instruction set
+  __m128i sum1 = _mm_haddq_epu32(a);
+#else  // SSE2
+  __m128i zero = _mm_setzero_si128();          // 0
+  __m128i a01  = _mm_unpacklo_epi32(a, zero);  // zero-extended a0, a1
+  __m128i a23  = _mm_unpackhi_epi32(a, zero);  // zero-extended a2, a3
+  __m128i sum1 = _mm_add_epi64(a01, a23);      // add
+#endif
+  __m128i sum2 = _mm_unpackhi_epi64(sum1, sum1);  // high qword
+  __m128i sum3 = _mm_add_epi64(sum1, sum2);       // add
+#if defined(_M_AMD64) || defined(_M_X64) || defined(__x86_64__) || defined(__amd64)
+  return _mm_cvtsi128_si64(sum3);  // 64 bit mode
+#else
+  union
+  {
+    __m128i x;  // silly definition of _mm_storel_epi64 requires __m128i
+    uint64_t i;
+  } u;
+  _mm_storel_epi64(&u.x, sum3);
+  return u.i;
+#endif
+}
+
+// function add_saturated: add element by element, unsigned with saturation
+static inline Vec4ui add_saturated(Vec4ui const &a, Vec4ui const &b)
+{
+  Vec4ui sum      = a + b;
+  Vec4ui aorb     = Vec4ui(a | b);
+  Vec4ui overflow = Vec4ui(sum < aorb);  // overflow if a + b < (a | b), since a + b = (a | b) + (a & b) >= (a | b) unless the sum wraps
+  return Vec4ui(sum | overflow);         // return 0xFFFFFFFF if overflow
+}
+
+// function sub_saturated: subtract element by element, unsigned with saturation
+static inline Vec4ui sub_saturated(Vec4ui const &a, Vec4ui const &b)
+{
+  Vec4ui diff      = a - b;
+  Vec4ui underflow = Vec4ui(diff > a);       // underflow if a - b > a
+  return _mm_andnot_si128(underflow, diff);  // return 0 if underflow
+}
+
+// function max: a > b ? a : b
+static inline Vec4ui max(Vec4ui const &a, Vec4ui const &b)
+{
+#if INSTRSET >= 5  // SSE4.1
+  return _mm_max_epu32(a, b);
+#else  // SSE2
+  return select(a > b, a, b);
+#endif
+}
+
+// function min: a < b ? a : b
+static inline Vec4ui min(Vec4ui const &a, Vec4ui const &b)
+{
+#if INSTRSET >= 5  // SSE4.1
+  return _mm_min_epu32(a, b);
+#else  // SSE2
+  return select(a > b, b, a);
+#endif
+}
+
+/*****************************************************************************
+ *
+ *          Vector of 2 64-bit signed integers
+ *
+ *****************************************************************************/
+
+class Vec2q : public Vec128b
+{
+ public:
+  // Default constructor:
+  Vec2q() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec2q(int64_t i)
+  {
+#if defined(_MSC_VER) && _MSC_VER < 1900 && !defined(__INTEL_COMPILER)
+    // MS compiler has no _mm_set1_epi64x in 32 bit mode
+#if defined(__x86_64__)  // 64 bit mode
+#if _MSC_VER < 1700
+    __m128i x1 = _mm_cvtsi64_si128(i);        // 64 bit load
+    xmm        = _mm_unpacklo_epi64(x1, x1);  // broadcast
+#else
+    xmm = _mm_set1_epi64x(i);
+#endif
+#else
+    union
+    {
+      int64_t q[2];
+      int32_t r[4];
+    } u;
+    u.q[0] = u.q[1] = i;
+    xmm             = _mm_setr_epi32(u.r[0], u.r[1], u.r[2], u.r[3]);
+    /*    // this will use an mm register and produce store forwarding stall:
+    union {
+        __m64 m;
+        int64_t ii;
+    } u;
+    u.ii = i;
+    xmm = _mm_set1_epi64(u.m);
+            _m_empty();        */
+
+#endif  // __x86_64__
+#else   // Other compilers
+    xmm       = _mm_set1_epi64x(i);
+#endif
+  }
+  // Constructor to build from all elements:
+  Vec2q(int64_t i0, int64_t i1)
+  {
+#if defined(_MSC_VER) && _MSC_VER < 1900 && !defined(__INTEL_COMPILER)
+    // MS compiler has no _mm_set_epi64x in 32 bit mode
+#if defined(__x86_64__)  // 64 bit mode
+#if _MSC_VER < 1700
+    __m128i x0 = _mm_cvtsi64_si128(i0);       // 64 bit load
+    __m128i x1 = _mm_cvtsi64_si128(i1);       // 64 bit load
+    xmm        = _mm_unpacklo_epi64(x0, x1);  // combine
+#else
+    xmm = _mm_set_epi64x(i1, i0);
+#endif
+#else   // MS compiler in 32-bit mode
+    union
+    {
+      int64_t q[2];
+      int32_t r[4];
+    } u;
+    u.q[0] = i0;
+    u.q[1] = i1;
+    // this is inefficient, but other solutions are worse
+    xmm = _mm_setr_epi32(u.r[0], u.r[1], u.r[2], u.r[3]);
+#endif  // __x86_64__
+#else   // Other compilers
+    xmm       = _mm_set_epi64x(i1, i0);
+#endif
+  }
+  // Constructor to convert from type __m128i used in intrinsics:
+  Vec2q(__m128i const &x) { xmm = x; }
+  // Assignment operator to convert from type __m128i used in intrinsics:
+  Vec2q &operator=(__m128i const &x)
+  {
+    xmm = x;
+    return *this;
+  }
+  // Type cast operator to convert to __m128i used in intrinsics
+  operator __m128i() const { return xmm; }
+  // Member function to load from array (unaligned)
+  Vec2q &load(void const *p)
+  {
+    xmm = _mm_loadu_si128((__m128i const *)p);
+    return *this;
+  }
+  // Member function to load from array (aligned)
+  Vec2q &load_a(void const *p)
+  {
+    xmm = _mm_load_si128((__m128i const *)p);
+    return *this;
+  }
+  // Partial load. Load n elements and set the rest to 0
+  Vec2q &load_partial(int n, void const *p)
+  {
+    switch(n)
+      {
+        case 0:
+          *this = 0;
+          break;
+        case 1:
+          // intrinsic for movq is missing!
+          *this = Vec2q(*(int64_t const *)p, 0);
+          break;
+        case 2:
+          load(p);
+          break;
+        default:
+          break;
+      }
+    return *this;
+  }
+  // Partial store. Store n elements
+  void store_partial(int n, void *p) const
+  {
+    switch(n)
+      {
+        case 1:
+          int64_t q[2];
+          store(q);
+          *(int64_t *)p = q[0];
+          break;
+        case 2:
+          store(p);
+          break;
+        default:
+          break;
+      }
+  }
+  // cut off vector to n elements. The last 2-n elements are set to zero
+  Vec2q &cutoff(int n)
+  {
+    *this = Vec16c(xmm).cutoff(n * 8);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec2q const &insert(uint32_t index, int64_t value)
+  {
+#if INSTRSET >= 5 && defined(__x86_64__)  // SSE4.1 supported, 64 bit mode
+    if(index == 0)
+      {
+        xmm = _mm_insert_epi64(xmm, value, 0);
+      }
+    else
+      {
+        xmm = _mm_insert_epi64(xmm, value, 1);
+      }
+
+#else                    // SSE2
+#if defined(__x86_64__)  // 64 bit mode
+    __m128i v = _mm_cvtsi64_si128(value);  // 64 bit load
+#else
+    union
+    {
+      __m128i m;
+      int64_t ii;
+    } u;
+    u.ii      = value;
+    __m128i v = _mm_loadl_epi64(&u.m);
+#endif
+    if(index == 0)
+      {
+        v   = _mm_unpacklo_epi64(v, v);
+        xmm = _mm_unpackhi_epi64(v, xmm);
+      }
+    else
+      {  // index = 1
+        xmm = _mm_unpacklo_epi64(xmm, v);
+      }
+#endif
+    return *this;
+  }
+  // Member function to extract a single element from vector
+  int64_t extract(uint32_t index) const
+  {
+    int64_t x[2];
+    store(x);
+    return x[index & 1];
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  int64_t operator[](uint32_t index) const { return extract(index); }
+  static int size() { return 2; }
+};
+
+/*****************************************************************************
+ *
+ *          Vec2qb: Vector of 2 Booleans for use with Vec2q and Vec2uq
+ *
+ *****************************************************************************/
+// Definition will be different for the AVX512 instruction set
+class Vec2qb : public Vec2q
+{
+ public:
+  // Default constructor:
+  Vec2qb() {}
+  // Constructor to build from all elements:
+  Vec2qb(bool x0, bool x1) { xmm = Vec2q(-int64_t(x0), -int64_t(x1)); }
+  // Constructor to convert from type __m128i used in intrinsics:
+  Vec2qb(__m128i const &x) { xmm = x; }
+  // Assignment operator to convert from type __m128i used in intrinsics:
+  Vec2qb &operator=(__m128i const &x)
+  {
+    xmm = x;
+    return *this;
+  }
+  // Constructor to broadcast scalar value:
+  Vec2qb(bool b) : Vec2q(-int64_t(b)) {}
+  // Assignment operator to broadcast scalar value:
+  Vec2qb &operator=(bool b)
+  {
+    *this = Vec2qb(b);
+    return *this;
+  }
+
+ private:  // Prevent constructing from int, etc.
+  Vec2qb(int b);
+  Vec2qb &operator=(int x);
+
+ public:
+  Vec2qb &insert(int index, bool a)
+  {
+    Vec2q::insert(index, -(int64_t)a);
+    return *this;
+  }
+  // Member function to extract a single element from vector
+  bool extract(uint32_t index) const { return Vec2q::extract(index) != 0; }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  bool operator[](uint32_t index) const { return extract(index); }
+};
+
+/*****************************************************************************
+ *
+ *          Define operators for Vec2qb
+ *
+ *****************************************************************************/
+
+// vector operator & : bitwise and
+static inline Vec2qb operator&(Vec2qb const &a, Vec2qb const &b) { return Vec2qb(Vec128b(a) & Vec128b(b)); }
+static inline Vec2qb operator&&(Vec2qb const &a, Vec2qb const &b) { return a & b; }
+// vector operator &= : bitwise and
+static inline Vec2qb &operator&=(Vec2qb &a, Vec2qb const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec2qb operator|(Vec2qb const &a, Vec2qb const &b) { return Vec2qb(Vec128b(a) | Vec128b(b)); }
+static inline Vec2qb operator||(Vec2qb const &a, Vec2qb const &b) { return a | b; }
+// vector operator |= : bitwise or
+static inline Vec2qb &operator|=(Vec2qb &a, Vec2qb const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec2qb operator^(Vec2qb const &a, Vec2qb const &b) { return Vec2qb(Vec128b(a) ^ Vec128b(b)); }
+// vector operator ^= : bitwise xor
+static inline Vec2qb &operator^=(Vec2qb &a, Vec2qb const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec2qb operator~(Vec2qb const &a) { return Vec2qb(~Vec128b(a)); }
+
+// vector operator ! : element not
+static inline Vec2qb operator!(Vec2qb const &a) { return ~a; }
+
+// vector function andnot
+static inline Vec2qb andnot(Vec2qb const &a, Vec2qb const &b) { return Vec2qb(andnot(Vec128b(a), Vec128b(b))); }
+
+// Horizontal Boolean functions for Vec2qb
+
+// horizontal_and. Returns true if all elements are true
+static inline bool horizontal_and(Vec2qb const &a) { return _mm_movemask_epi8(a) == 0xFFFF; }
+
+// horizontal_or. Returns true if at least one element is true
+static inline bool horizontal_or(Vec2qb const &a)
+{
+#if INSTRSET >= 5  // SSE4.1 supported. Use PTEST
+  return !_mm_testz_si128(a, a);
+#else
+  return _mm_movemask_epi8(a) != 0;
+#endif
+}
+
+/*****************************************************************************
+ *
+ *          Operators for Vec2q
+ *
+ *****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec2q operator+(Vec2q const &a, Vec2q const &b) { return _mm_add_epi64(a, b); }
+
+// vector operator += : add
+static inline Vec2q &operator+=(Vec2q &a, Vec2q const &b)
+{
+  a = a + b;
+  return a;
+}
+
+// postfix operator ++
+static inline Vec2q operator++(Vec2q &a, int)
+{
+  Vec2q a0 = a;
+  a        = a + 1;
+  return a0;
+}
+
+// prefix operator ++
+static inline Vec2q &operator++(Vec2q &a)
+{
+  a = a + 1;
+  return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec2q operator-(Vec2q const &a, Vec2q const &b) { return _mm_sub_epi64(a, b); }
+
+// vector operator - : unary minus
+static inline Vec2q operator-(Vec2q const &a) { return _mm_sub_epi64(_mm_setzero_si128(), a); }
+
+// vector operator -= : subtract
+static inline Vec2q &operator-=(Vec2q &a, Vec2q const &b)
+{
+  a = a - b;
+  return a;
+}
+
+// postfix operator --
+static inline Vec2q operator--(Vec2q &a, int)
+{
+  Vec2q a0 = a;
+  a        = a - 1;
+  return a0;
+}
+
+// prefix operator --
+static inline Vec2q &operator--(Vec2q &a)
+{
+  a = a - 1;
+  return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec2q operator*(Vec2q const &a, Vec2q const &b)
+{
+#if defined(__AVX512DQ__) && defined(__AVX512VL__)
+  return _mm_mullo_epi64(a, b);
+#elif INSTRSET >= 5  // SSE4.1 supported
+  // instruction does not exist. Split into 32-bit multiplies
+  __m128i bswap   = _mm_shuffle_epi32(b, 0xB1);        // b0H,b0L,b1H,b1L (swap H<->L)
+  __m128i prodlh  = _mm_mullo_epi32(a, bswap);         // a0Lb0H,a0Hb0L,a1Lb1H,a1Hb1L, 32 bit L*H products
+  __m128i zero    = _mm_setzero_si128();               // 0
+  __m128i prodlh2 = _mm_hadd_epi32(prodlh, zero);      // a0Lb0H+a0Hb0L,a1Lb1H+a1Hb1L,0,0
+  __m128i prodlh3 = _mm_shuffle_epi32(prodlh2, 0x73);  // 0, a0Lb0H+a0Hb0L, 0, a1Lb1H+a1Hb1L
+  __m128i prodll  = _mm_mul_epu32(a, b);               // a0Lb0L,a1Lb1L, 64 bit unsigned products
+  __m128i prod    = _mm_add_epi64(prodll, prodlh3);    // a0Lb0L+(a0Lb0H+a0Hb0L)<<32, a1Lb1L+(a1Lb1H+a1Hb1L)<<32
+  return prod;
+#else                // SSE2
+  int64_t aa[2], bb[2];
+  a.store(aa);  // split into elements
+  b.store(bb);
+  return Vec2q(aa[0] * bb[0], aa[1] * bb[1]);                            // multiply elements separately
+#endif
+}
+
+// vector operator *= : multiply
+static inline Vec2q &operator*=(Vec2q &a, Vec2q const &b)
+{
+  a = a * b;
+  return a;
+}
+
+// vector operator << : shift left
+static inline Vec2q operator<<(Vec2q const &a, int32_t b) { return _mm_sll_epi64(a, _mm_cvtsi32_si128(b)); }
+
+// vector operator <<= : shift left
+static inline Vec2q &operator<<=(Vec2q &a, int32_t b)
+{
+  a = a << b;
+  return a;
+}
+
+// vector operator >> : shift right arithmetic
+static inline Vec2q operator>>(Vec2q const &a, int32_t b)
+{
+  // instruction does not exist. Split into 32-bit shifts
+  if(b <= 32)
+    {
+      __m128i bb   = _mm_cvtsi32_si128(b);          // b
+      __m128i sra  = _mm_sra_epi32(a, bb);          // a >> b signed dwords
+      __m128i srl  = _mm_srl_epi64(a, bb);          // a >> b unsigned qwords
+      __m128i mask = _mm_setr_epi32(0, -1, 0, -1);  // mask for signed high part
+      return selectb(mask, sra, srl);
+    }
+  else
+    {                                               // b > 32
+      __m128i bm32 = _mm_cvtsi32_si128(b - 32);     // b - 32
+      __m128i sign = _mm_srai_epi32(a, 31);         // sign of a
+      __m128i sra2 = _mm_sra_epi32(a, bm32);        // a >> (b-32) signed dwords
+      __m128i sra3 = _mm_srli_epi64(sra2, 32);      // a >> (b-32) >> 32 (second shift unsigned qword)
+      __m128i mask = _mm_setr_epi32(0, -1, 0, -1);  // mask for high part containing only sign
+      return selectb(mask, sign, sra3);
+    }
+}
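+
+// Illustrative sketch (not part of the original library): the combined 32-bit
+// shifts propagate the sign bit across the full 64-bit element:
+//   Vec2q q(-8, 8);
+//   q >> 2;   // (-2, 2)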
+
+// vector operator >>= : shift right arithmetic
+static inline Vec2q &operator>>=(Vec2q &a, int32_t b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec2qb operator==(Vec2q const &a, Vec2q const &b)
+{
+#if INSTRSET >= 5  // SSE4.1 supported
+  return _mm_cmpeq_epi64(a, b);
+#else  // SSE2
+  // no 64 compare instruction. Do two 32 bit compares
+  __m128i com32  = _mm_cmpeq_epi32(a, b);           // 32 bit compares
+  __m128i com32s = _mm_shuffle_epi32(com32, 0xB1);  // swap low and high dwords
+  __m128i test   = _mm_and_si128(com32, com32s);    // low & high
+  __m128i teste  = _mm_srai_epi32(test, 31);        // extend sign bit to 32 bits
+  __m128i testee = _mm_shuffle_epi32(teste, 0xF5);  // extend sign bit to 64 bits
+  return Vec2qb(Vec2q(testee));
+#endif
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec2qb operator!=(Vec2q const &a, Vec2q const &b)
+{
+#ifdef __XOP__  // AMD XOP instruction set
+  return Vec2qb(_mm_comneq_epi64(a, b));
+#else  // SSE2 instruction set
+  return Vec2qb(Vec2q(~(a == b)));
+#endif
+}
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec2qb operator<(Vec2q const &a, Vec2q const &b)
+{
+#if INSTRSET >= 6  // SSE4.2 supported
+  return Vec2qb(Vec2q(_mm_cmpgt_epi64(b, a)));
+#else  // SSE2
+  // no 64 compare instruction. Subtract
+  __m128i s = _mm_sub_epi64(a, b);  // a-b
+  // a < b if (a and b have the same sign and s < 0) or (a < 0 and b >= 0)
+  // The latter () corrects for overflow
+  __m128i axb    = _mm_xor_si128(a, b);             // a ^ b
+  __m128i anb    = _mm_andnot_si128(b, a);          // a & ~b
+  __m128i snaxb  = _mm_andnot_si128(axb, s);        // s & ~(a ^ b)
+  __m128i or1    = _mm_or_si128(anb, snaxb);        // (a & ~b) | (s & ~(a ^ b))
+  __m128i teste  = _mm_srai_epi32(or1, 31);         // extend sign bit to 32 bits
+  __m128i testee = _mm_shuffle_epi32(teste, 0xF5);  // extend sign bit to 64 bits
+  return testee;
+#endif
+}
+
+// vector operator > : returns true for elements for which a > b
+static inline Vec2qb operator>(Vec2q const &a, Vec2q const &b) { return b < a; }
+
+// vector operator >= : returns true for elements for which a >= b (signed)
+static inline Vec2qb operator>=(Vec2q const &a, Vec2q const &b)
+{
+#ifdef __XOP__  // AMD XOP instruction set
+  return Vec2qb(_mm_comge_epi64(a, b));
+#else  // SSE2 instruction set
+  return Vec2qb(Vec2q(~(a < b)));
+#endif
+}
+
+// vector operator <= : returns true for elements for which a <= b (signed)
+static inline Vec2qb operator<=(Vec2q const &a, Vec2q const &b) { return b >= a; }
+
+// vector operator & : bitwise and
+static inline Vec2q operator&(Vec2q const &a, Vec2q const &b) { return Vec2q(Vec128b(a) & Vec128b(b)); }
+static inline Vec2q operator&&(Vec2q const &a, Vec2q const &b) { return a & b; }
+// vector operator &= : bitwise and
+static inline Vec2q &operator&=(Vec2q &a, Vec2q const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec2q operator|(Vec2q const &a, Vec2q const &b) { return Vec2q(Vec128b(a) | Vec128b(b)); }
+static inline Vec2q operator||(Vec2q const &a, Vec2q const &b) { return a | b; }
+// vector operator |= : bitwise or
+static inline Vec2q &operator|=(Vec2q &a, Vec2q const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec2q operator^(Vec2q const &a, Vec2q const &b) { return Vec2q(Vec128b(a) ^ Vec128b(b)); }
+// vector operator ^= : bitwise xor
+static inline Vec2q &operator^=(Vec2q &a, Vec2q const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec2q operator~(Vec2q const &a) { return Vec2q(~Vec128b(a)); }
+
+// vector operator ! : logical not, returns true for elements == 0
+static inline Vec2qb operator!(Vec2q const &a) { return a == Vec2q(_mm_setzero_si128()); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 2; i++) result[i] = s[i] ? a[i] : b[i];
+// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec2q select(Vec2qb const &s, Vec2q const &a, Vec2q const &b) { return selectb(s, a, b); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec2q if_add(Vec2qb const &f, Vec2q const &a, Vec2q const &b) { return a + (Vec2q(f) & b); }
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline int64_t horizontal_add(Vec2q const &a)
+{
+  __m128i sum1 = _mm_shuffle_epi32(a, 0x0E);  // high element
+  __m128i sum2 = _mm_add_epi64(a, sum1);      // sum
+#if defined(__x86_64__)
+  return _mm_cvtsi128_si64(sum2);  // 64 bit mode
+#else
+  union
+  {
+    __m128i x;  // silly definition of _mm_storel_epi64 requires __m128i
+    int64_t i;
+  } u;
+  _mm_storel_epi64(&u.x, sum2);
+  return u.i;
+#endif
+}
+
+// function max: a > b ? a : b
+static inline Vec2q max(Vec2q const &a, Vec2q const &b) { return select(a > b, a, b); }
+
+// function min: a < b ? a : b
+static inline Vec2q min(Vec2q const &a, Vec2q const &b) { return select(a < b, a, b); }
+
+// function abs: a >= 0 ? a : -a
+static inline Vec2q abs(Vec2q const &a)
+{
+#if INSTRSET >= 6                                          // SSE4.2 supported
+  __m128i sign = _mm_cmpgt_epi64(_mm_setzero_si128(), a);  // 0 > a
+#else                                                      // SSE2
+  __m128i signh  = _mm_srai_epi32(a, 31);           // sign in high dword
+  __m128i sign   = _mm_shuffle_epi32(signh, 0xF5);  // copy sign to low dword
+#endif
+  __m128i inv = _mm_xor_si128(a, sign);  // invert bits if negative
+  return _mm_sub_epi64(inv, sign);       // add 1
+}
+
+// function abs_saturated: same as abs, saturate if overflow
+static inline Vec2q abs_saturated(Vec2q const &a)
+{
+  __m128i absa = abs(a);                                        // abs(a)
+#if INSTRSET >= 6                                               // SSE4.2 supported
+  __m128i overfl = _mm_cmpgt_epi64(_mm_setzero_si128(), absa);  // 0 > abs(a)
+#else                                                           // SSE2
+  __m128i signh  = _mm_srai_epi32(absa, 31);        // sign in high dword
+  __m128i overfl = _mm_shuffle_epi32(signh, 0xF5);  // copy sign to low dword
+#endif
+  return _mm_add_epi64(absa, overfl);  // subtract 1 if 0x8000000000000000
+}
+
+// function rotate_left all elements
+// Use negative count to rotate right
+static inline Vec2q rotate_left(Vec2q const &a, int b)
+{
+#ifdef __AVX512VL__
+  return _mm_rolv_epi64(a, _mm_set1_epi64x(int64_t(b)));
+#elif defined __XOP__  // AMD XOP instruction set
+  return (Vec2q)_mm_rot_epi64(a, Vec2q(b));
+#else                  // SSE2 instruction set
+  __m128i left  = _mm_sll_epi64(a, _mm_cvtsi32_si128(b & 0x3F));         // a << b
+  __m128i right = _mm_srl_epi64(a, _mm_cvtsi32_si128((64 - b) & 0x3F));  // a >> (64 - b)
+  __m128i rot   = _mm_or_si128(left, right);                             // or
+  return (Vec2q)rot;
+#endif
+}
+
+/*****************************************************************************
+ *
+ *          Vector of 2 64-bit unsigned integers
+ *
+ *****************************************************************************/
+
+class Vec2uq : public Vec2q
+{
+ public:
+  // Default constructor:
+  Vec2uq() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec2uq(uint64_t i) { xmm = Vec2q(i); }
+  // Constructor to build from all elements:
+  Vec2uq(uint64_t i0, uint64_t i1) { xmm = Vec2q(i0, i1); }
+  // Constructor to convert from type __m128i used in intrinsics:
+  Vec2uq(__m128i const &x) { xmm = x; }
+  // Assignment operator to convert from type __m128i used in intrinsics:
+  Vec2uq &operator=(__m128i const &x)
+  {
+    xmm = x;
+    return *this;
+  }
+  // Member function to load from array (unaligned)
+  Vec2uq &load(void const *p)
+  {
+    xmm = _mm_loadu_si128((__m128i const *)p);
+    return *this;
+  }
+  // Member function to load from array (aligned)
+  Vec2uq &load_a(void const *p)
+  {
+    xmm = _mm_load_si128((__m128i const *)p);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec2uq const &insert(uint32_t index, uint64_t value)
+  {
+    Vec2q::insert(index, value);
+    return *this;
+  }
+  // Member function to extract a single element from vector
+  uint64_t extract(uint32_t index) const { return Vec2q::extract(index); }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  uint64_t operator[](uint32_t index) const { return extract(index); }
+};
+
+// Define operators for this class
+
+// vector operator + : add
+static inline Vec2uq operator+(Vec2uq const &a, Vec2uq const &b) { return Vec2uq(Vec2q(a) + Vec2q(b)); }
+
+// vector operator - : subtract
+static inline Vec2uq operator-(Vec2uq const &a, Vec2uq const &b) { return Vec2uq(Vec2q(a) - Vec2q(b)); }
+
+// vector operator * : multiply element by element
+static inline Vec2uq operator*(Vec2uq const &a, Vec2uq const &b) { return Vec2uq(Vec2q(a) * Vec2q(b)); }
+
+// vector operator >> : shift right logical all elements
+static inline Vec2uq operator>>(Vec2uq const &a, uint32_t b) { return _mm_srl_epi64(a, _mm_cvtsi32_si128(b)); }
+
+// vector operator >> : shift right logical all elements
+static inline Vec2uq operator>>(Vec2uq const &a, int32_t b) { return a >> (uint32_t)b; }
+
+// vector operator >>= : shift right logical
+static inline Vec2uq &operator>>=(Vec2uq &a, int b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator << : shift left all elements
+static inline Vec2uq operator<<(Vec2uq const &a, uint32_t b) { return Vec2uq((Vec2q)a << (int32_t)b); }
+
+// vector operator << : shift left all elements
+static inline Vec2uq operator<<(Vec2uq const &a, int32_t b) { return Vec2uq((Vec2q)a << b); }
+
+// vector operator > : returns true for elements for which a > b (unsigned)
+static inline Vec2qb operator>(Vec2uq const &a, Vec2uq const &b)
+{
+#if defined(__XOP__)  // AMD XOP instruction set
+  return Vec2qb(_mm_comgt_epu64(a, b));
+#elif INSTRSET >= 6  // SSE4.2
+  __m128i sign64 = constant4ui<0, 0x80000000, 0, 0x80000000>();
+  __m128i aflip  = _mm_xor_si128(a, sign64);  // flip sign bits to use signed compare
+  __m128i bflip  = _mm_xor_si128(b, sign64);
+  Vec2q cmp      = _mm_cmpgt_epi64(aflip, bflip);
+  return Vec2qb(cmp);
+#else                // SSE2 instruction set
+  __m128i sign32  = _mm_set1_epi32(0x80000000);       // sign bit of each dword
+  __m128i aflip   = _mm_xor_si128(a, sign32);         // a with sign bits flipped to use signed compare
+  __m128i bflip   = _mm_xor_si128(b, sign32);         // b with sign bits flipped to use signed compare
+  __m128i equal   = _mm_cmpeq_epi32(a, b);            // a == b, dwords
+  __m128i bigger  = _mm_cmpgt_epi32(aflip, bflip);    // a > b, dwords
+  __m128i biggerl = _mm_shuffle_epi32(bigger, 0xA0);  // a > b, low dwords copied to high dwords
+  __m128i eqbig   = _mm_and_si128(equal, biggerl);    // high part equal and low part bigger
+  __m128i hibig   = _mm_or_si128(bigger, eqbig);      // high part bigger or high part equal and low part bigger
+  __m128i big     = _mm_shuffle_epi32(hibig, 0xF5);   // result copied to low part
+  return Vec2qb(Vec2q(big));
+#endif
+}
+
+// vector operator < : returns true for elements for which a < b (unsigned)
+static inline Vec2qb operator<(Vec2uq const &a, Vec2uq const &b) { return b > a; }
+
+// vector operator >= : returns true for elements for which a >= b (unsigned)
+static inline Vec2qb operator>=(Vec2uq const &a, Vec2uq const &b)
+{
+#ifdef __XOP__  // AMD XOP instruction set
+  return Vec2qb(_mm_comge_epu64(a, b));
+#else  // SSE2 instruction set
+  return Vec2qb(Vec2q(~(b > a)));
+#endif
+}
+
+// vector operator <= : returns true for elements for which a <= b (unsigned)
+static inline Vec2qb operator<=(Vec2uq const &a, Vec2uq const &b) { return b >= a; }
+
+// vector operator & : bitwise and
+static inline Vec2uq operator&(Vec2uq const &a, Vec2uq const &b) { return Vec2uq(Vec128b(a) & Vec128b(b)); }
+static inline Vec2uq operator&&(Vec2uq const &a, Vec2uq const &b) { return a & b; }
+
+// vector operator | : bitwise or
+static inline Vec2uq operator|(Vec2uq const &a, Vec2uq const &b) { return Vec2uq(Vec128b(a) | Vec128b(b)); }
+static inline Vec2uq operator||(Vec2uq const &a, Vec2uq const &b) { return a | b; }
+
+// vector operator ^ : bitwise xor
+static inline Vec2uq operator^(Vec2uq const &a, Vec2uq const &b) { return Vec2uq(Vec128b(a) ^ Vec128b(b)); }
+
+// vector operator ~ : bitwise not
+static inline Vec2uq operator~(Vec2uq const &a) { return Vec2uq(~Vec128b(a)); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 2; i++) result[i] = s[i] ? a[i] : b[i];
+// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec2uq select(Vec2qb const &s, Vec2uq const &a, Vec2uq const &b) { return selectb(s, a, b); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec2uq if_add(Vec2qb const &f, Vec2uq const &a, Vec2uq const &b) { return a + (Vec2uq(f) & b); }
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline uint64_t horizontal_add(Vec2uq const &a) { return horizontal_add((Vec2q)a); }
+
+// function max: a > b ? a : b
+static inline Vec2uq max(Vec2uq const &a, Vec2uq const &b) { return select(a > b, a, b); }
+
+// function min: a < b ? a : b
+static inline Vec2uq min(Vec2uq const &a, Vec2uq const &b) { return select(a > b, b, a); }
+
+/*****************************************************************************
+ *
+ *          Vector permute functions
+ *
+ ******************************************************************************
+ *
+ * These permute functions can reorder the elements of a vector and optionally
+ * set some elements to zero.
+ *
+ * The indexes are inserted as template parameters in <>. These indexes must be
+ * constants. Each template parameter is an index to the element you want to
+ * select. A negative index will generate zero. An index of -256 means don't care.
+ *
+ * Example:
+ * Vec4i a(10,11,12,13);         // a is (10,11,12,13)
+ * Vec4i b, c;
+ * b = permute4i<0,0,2,2>(a);    // b is (10,10,12,12)
+ * c = permute4i<3,2,-1,-1>(a);  // c is (13,12, 0, 0)
+ *
+ * The permute functions for vectors of 8-bit integers are inefficient if
+ * the SSSE3 instruction set or later is not enabled.
+ *
+ * A lot of the code here is metaprogramming aiming to find the instructions
+ * that best fit the template parameters and instruction set. The metacode
+ * will be reduced out to leave only a few vector instructions in release
+ * mode with optimization on.
+ *****************************************************************************/
+
+template <int i0, int i1>
+static inline Vec2q permute2q(Vec2q const &a)
+{
+  if(i0 == 0)
+    {
+      if(i1 == 0)
+        {  // 0,0
+          return _mm_unpacklo_epi64(a, a);
+        }
+      else if(i1 == 1 || i1 == -0x100)
+        {  // 0,1
+          return a;
+        }
+      else
+        {  // 0,-1
+          // return _mm_mov_epi64(a); // doesn't work with MS VS 2008
+          return _mm_and_si128(a, constant4i<-1, -1, 0, 0>());
+        }
+    }
+  else if(i0 == 1)
+    {
+      if(i1 == 0)
+        {  // 1,0
+          return _mm_shuffle_epi32(a, 0x4E);
+        }
+      else if(i1 == 1)
+        {  // 1,1
+          return _mm_unpackhi_epi64(a, a);
+        }
+      else
+        {  // 1,-1
+          return _mm_srli_si128(a, 8);
+        }
+    }
+  else
+    {  // i0 < 0
+      if(i1 == 0)
+        {  // -1,0
+          return _mm_slli_si128(a, 8);
+        }
+      else if(i1 == 1)
+        {  // -1,1
+          if(i0 == -0x100)
+            return a;
+          return _mm_and_si128(a, constant4i<0, 0, -1, -1>());
+        }
+      else
+        {  // -1,-1
+          return _mm_setzero_si128();
+        }
+    }
+}
+
+template <int i0, int i1>
+static inline Vec2uq permute2uq(Vec2uq const &a)
+{
+  return Vec2uq(permute2q<i0, i1>((__m128i)a));
+}
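+
+// Illustrative sketch (not part of the original library), mirroring the
+// permute4i examples above:
+//   Vec2q a(10, 11);
+//   permute2q<1, 0>(a);    // (11, 10)  swap elements
+//   permute2q<0, -1>(a);   // (10,  0)  zero the upper element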
+
+// permute vector Vec4i
+template <int i0, int i1, int i2, int i3>
+static inline Vec4i permute4i(Vec4i const &a)
+{
+  // Combine all the indexes into a single bitfield, with 4 bits for each
+  const uint32_t m1 = (i0 & 3) | (i1 & 3) << 4 | (i2 & 3) << 8 | (i3 & 3) << 12;
+
+  // Mask to zero out negative indexes
+  const uint32_t mz = (i0 < 0 ? 0 : 0xF) | (i1 < 0 ? 0 : 0xF) << 4 | (i2 < 0 ? 0 : 0xF) << 8 | (i3 < 0 ? 0 : 0xF) << 12;
+
+  // Mask indicating required zeroing of all indexes, with 4 bits for each, 0 for index = -1, 0xF for index >= 0 or -256
+  const uint32_t ssz =
+      ((i0 & 0x80) ? 0 : 0xF) | ((i1 & 0x80) ? 0 : 0xF) << 4 | ((i2 & 0x80) ? 0 : 0xF) << 8 | ((i3 & 0x80) ? 0 : 0xF) << 12;
+
+  // Mask with 0xF for each index that is non-negative or requires zeroing (-1), 0 for don't care (-256)
+  const uint32_t md = mz | ~ssz;
+
+  // Test if permutation needed
+  const bool do_shuffle = ((m1 ^ 0x00003210) & mz) != 0;
+
+  // is zeroing needed
+  const bool do_zero = (ssz != 0xFFFF);
+
+  if(mz == 0)
+    {
+      return _mm_setzero_si128();  // special case: all zero or don't care
+    }
+  // Test if we can do with 64-bit permute only
+  if((m1 & 0x0101 & mz) == 0      // even indexes are even or negative
+     && (~m1 & 0x1010 & mz) == 0  // odd  indexes are odd  or negative
+     && ((m1 ^ ((m1 + 0x0101) << 4)) & 0xF0F0 & mz & (mz << 4)) ==
+            0  // odd index == preceding even index +1 or at least one of them negative
+     && ((mz ^ (mz << 4)) & 0xF0F0 & md & md << 4) == 0)
+    {  // each pair of indexes are both negative or both positive or one of them don't care
+      const int j0 = i0 >= 0 ? i0 / 2 : (i0 & 0x80) ? i0 : i1 >= 0 ? i1 / 2 : i1;
+      const int j1 = i2 >= 0 ? i2 / 2 : (i2 & 0x80) ? i2 : i3 >= 0 ? i3 / 2 : i3;
+      return Vec4i(permute2q<j0, j1>(Vec2q(a)));  // 64 bit permute
+    }
+#if INSTRSET >= 4  // SSSE3
+  if(do_shuffle && do_zero)
+    {
+      // With SSSE3 we can do both with the PSHUFB instruction
+      const int j0  = (i0 & 3) << 2;
+      const int j1  = (i1 & 3) << 2;
+      const int j2  = (i2 & 3) << 2;
+      const int j3  = (i3 & 3) << 2;
+      __m128i mask1 = constant4i < i0 < 0 ? -1 : j0 | (j0 + 1) << 8 | (j0 + 2) << 16 | (j0 + 3) << 24,
+              i1 < 0 ? -1 : j1 | (j1 + 1) << 8 | (j1 + 2) << 16 | (j1 + 3) << 24,
+              i2 < 0 ? -1 : j2 | (j2 + 1) << 8 | (j2 + 2) << 16 | (j2 + 3) << 24,
+              i3 < 0 ? -1 : j3 | (j3 + 1) << 8 | (j3 + 2) << 16 | (j3 + 3) << 24 > ();
+      return _mm_shuffle_epi8(a, mask1);
+    }
+#endif
+  __m128i t1;
+
+  if(do_shuffle)
+    {  // permute
+      t1 = _mm_shuffle_epi32(a, (i0 & 3) | (i1 & 3) << 2 | (i2 & 3) << 4 | (i3 & 3) << 6);
+    }
+  else
+    {
+      t1 = a;
+    }
+  if(do_zero)
+    {  // set some elements to zero
+      __m128i mask2 = constant4i<-int(i0 >= 0), -int(i1 >= 0), -int(i2 >= 0), -int(i3 >= 0)>();
+      t1            = _mm_and_si128(t1, mask2);
+    }
+  return t1;
+}
+
+template <int i0, int i1, int i2, int i3>
+static inline Vec4ui permute4ui(Vec4ui const &a)
+{
+  return Vec4ui(permute4i<i0, i1, i2, i3>(a));
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8s permute8s(Vec8s const &a)
+{
+  if((i0 & i1 & i2 & i3 & i4 & i5 & i6 & i7) < 0)
+    {
+      return _mm_setzero_si128();  // special case: all zero
+    }
+#if INSTRSET >= 4  // SSSE3
+
+  // special case: rotate
+  if(i0 >= 0 && i0 < 8 && i1 == ((i0 + 1) & 7) && i2 == ((i0 + 2) & 7) && i3 == ((i0 + 3) & 7) && i4 == ((i0 + 4) & 7) &&
+     i5 == ((i0 + 5) & 7) && i6 == ((i0 + 6) & 7) && i7 == ((i0 + 7) & 7))
+    {
+      if(i0 == 0)
+        return a;  // do nothing
+      return _mm_alignr_epi8(a, a, (i0 & 7) * 2);
+    }
+
+  // General case: Use PSHUFB
+  const int j0 = i0 < 0 ? 0xFFFF : ((i0 & 7) * 2 | ((i0 & 7) * 2 + 1) << 8);
+  const int j1 = i1 < 0 ? 0xFFFF : ((i1 & 7) * 2 | ((i1 & 7) * 2 + 1) << 8);
+  const int j2 = i2 < 0 ? 0xFFFF : ((i2 & 7) * 2 | ((i2 & 7) * 2 + 1) << 8);
+  const int j3 = i3 < 0 ? 0xFFFF : ((i3 & 7) * 2 | ((i3 & 7) * 2 + 1) << 8);
+  const int j4 = i4 < 0 ? 0xFFFF : ((i4 & 7) * 2 | ((i4 & 7) * 2 + 1) << 8);
+  const int j5 = i5 < 0 ? 0xFFFF : ((i5 & 7) * 2 | ((i5 & 7) * 2 + 1) << 8);
+  const int j6 = i6 < 0 ? 0xFFFF : ((i6 & 7) * 2 | ((i6 & 7) * 2 + 1) << 8);
+  const int j7 = i7 < 0 ? 0xFFFF : ((i7 & 7) * 2 | ((i7 & 7) * 2 + 1) << 8);
+  __m128i mask = constant4i<j0 | j1 << 16, j2 | j3 << 16, j4 | j5 << 16, j6 | j7 << 16>();
+  return _mm_shuffle_epi8(a, mask);
+
+#else  // SSE2 has no simple solution. Find the optimal permute method.
+  // Without proper metaprogramming features, we have to use constant expressions
+  // and if-statements to make sure these calculations are resolved at compile time.
+  // All this should produce at most 8 instructions in the final code, depending
+  // on the template parameters.
+
+  // Temporary vectors
+  __m128i t1, t2, t3, t4, t5, t6, t7;
+
+  // Combine all the indexes into a single bitfield, with 4 bits for each
+  const int m1 =
+      (i0 & 7) | (i1 & 7) << 4 | (i2 & 7) << 8 | (i3 & 7) << 12 | (i4 & 7) << 16 | (i5 & 7) << 20 | (i6 & 7) << 24 | (i7 & 7) << 28;
+
+  // Mask to zero out negative indexes
+  const int m2 = (i0 < 0 ? 0 : 0xF) | (i1 < 0 ? 0 : 0xF) << 4 | (i2 < 0 ? 0 : 0xF) << 8 | (i3 < 0 ? 0 : 0xF) << 12 |
+                 (i4 < 0 ? 0 : 0xF) << 16 | (i5 < 0 ? 0 : 0xF) << 20 | (i6 < 0 ? 0 : 0xF) << 24 | (i7 < 0 ? 0 : 0xF) << 28;
+
+  // Test if we can do without permute
+  const bool case0 = ((m1 ^ 0x76543210) & m2) == 0;  // all indexes point to their own place or negative
+
+  // Test if we can do with 32-bit permute only
+  const bool case1 = (m1 & 0x01010101 & m2) == 0      // even indexes are even or negative
+                     && (~m1 & 0x10101010 & m2) == 0  // odd  indexes are odd  or negative
+                     && ((m1 ^ ((m1 + 0x01010101) << 4)) & 0xF0F0F0F0 & m2 & (m2 << 4)) ==
+                            0;  // odd index == preceding even index +1 or at least one of them negative
+
+  // Test if we can do with 16-bit permute only
+  const bool case2 =
+      (((m1 & 0x44444444) ^ 0x44440000) & m2) == 0;  // indexes 0-3 point to lower 64 bits, 4-7 to higher 64 bits, or negative
+
+  if(case0)
+    {
+      // no permute needed
+      t7 = a;
+    }
+  else if(case1)
+    {
+      // 32 bit permute only
+      const int j0 = i0 >= 0 ? i0 / 2 : i1 >= 0 ? i1 / 2 : 0;
+      const int j1 = i2 >= 0 ? i2 / 2 : i3 >= 0 ? i3 / 2 : 0;
+      const int j2 = i4 >= 0 ? i4 / 2 : i5 >= 0 ? i5 / 2 : 0;
+      const int j3 = i6 >= 0 ? i6 / 2 : i7 >= 0 ? i7 / 2 : 0;
+      t7           = _mm_shuffle_epi32(a, (j0 & 3) | (j1 & 3) << 2 | (j2 & 3) << 4 | (j3 & 3) << 6);
+    }
+  else if(case2)
+    {
+      // 16 bit permute only
+      const int j0 = i0 >= 0 ? i0 & 3 : 0;
+      const int j1 = i1 >= 0 ? i1 & 3 : 1;
+      const int j2 = i2 >= 0 ? i2 & 3 : 2;
+      const int j3 = i3 >= 0 ? i3 & 3 : 3;
+      const int j4 = i4 >= 0 ? i4 & 3 : 0;
+      const int j5 = i5 >= 0 ? i5 & 3 : 1;
+      const int j6 = i6 >= 0 ? i6 & 3 : 2;
+      const int j7 = i7 >= 0 ? i7 & 3 : 3;
+      if(j0 != 0 || j1 != 1 || j2 != 2 || j3 != 3)
+        {
+          t1 = _mm_shufflelo_epi16(a, j0 | j1 << 2 | j2 << 4 | j3 << 6);
+        }
+      else
+        t1 = a;
+      if(j4 != 0 || j5 != 1 || j6 != 2 || j7 != 3)
+        {
+          t7 = _mm_shufflehi_epi16(t1, j4 | j5 << 2 | j6 << 4 | j7 << 6);
+        }
+      else
+        t7 = t1;
+    }
+  else
+    {
+      // Need at least two permute steps
+
+      // Index to where each dword of a is needed
+      const int nn = (m1 & 0x66666666) | 0x88888888;  // indicate which dwords are needed
+      const int n0 = ((((uint32_t)(nn ^ 0x00000000) - 0x22222222) & 0x88888888) ^ 0x88888888) & m2;
+      const int n1 = ((((uint32_t)(nn ^ 0x22222222) - 0x22222222) & 0x88888888) ^ 0x88888888) & m2;
+      const int n2 = ((((uint32_t)(nn ^ 0x44444444) - 0x22222222) & 0x88888888) ^ 0x88888888) & m2;
+      const int n3 = ((((uint32_t)(nn ^ 0x66666666) - 0x22222222) & 0x88888888) ^ 0x88888888) & m2;
+      // indicate which dwords are needed in low half
+      const int l0 = (n0 & 0xFFFF) != 0;
+      const int l1 = (n1 & 0xFFFF) != 0;
+      const int l2 = (n2 & 0xFFFF) != 0;
+      const int l3 = (n3 & 0xFFFF) != 0;
+      // indicate which dwords are needed in high half
+      const int h0 = (n0 & 0xFFFF0000) != 0;
+      const int h1 = (n1 & 0xFFFF0000) != 0;
+      const int h2 = (n2 & 0xFFFF0000) != 0;
+      const int h3 = (n3 & 0xFFFF0000) != 0;
+
+      // Test if we can do with two permute steps
+      const bool case3 = l0 + l1 + l2 + l3 <= 2 && h0 + h1 + h2 + h3 <= 2;
+
+      if(case3)
+        {
+          // one 32-bit permute followed by one 16-bit permute in each half.
+          // Find permute indices for 32-bit permute
+          const int j0 = l0 ? 0 : l1 ? 1 : l2 ? 2 : 3;
+          const int j1 = l3 ? 3 : l2 ? 2 : l1 ? 1 : 0;
+          const int j2 = h0 ? 0 : h1 ? 1 : h2 ? 2 : 3;
+          const int j3 = h3 ? 3 : h2 ? 2 : h1 ? 1 : 0;
+
+          // Find permute indices for low 16-bit permute
+          const int r0 = i0 < 0 ? 0 : (i0 >> 1 == j0 ? 0 : 2) + (i0 & 1);
+          const int r1 = i1 < 0 ? 1 : (i1 >> 1 == j0 ? 0 : 2) + (i1 & 1);
+          const int r2 = i2 < 0 ? 2 : (i2 >> 1 == j1 ? 2 : 0) + (i2 & 1);
+          const int r3 = i3 < 0 ? 3 : (i3 >> 1 == j1 ? 2 : 0) + (i3 & 1);
+
+          // Find permute indices for high 16-bit permute
+          const int s0 = i4 < 0 ? 0 : (i4 >> 1 == j2 ? 0 : 2) + (i4 & 1);
+          const int s1 = i5 < 0 ? 1 : (i5 >> 1 == j2 ? 0 : 2) + (i5 & 1);
+          const int s2 = i6 < 0 ? 2 : (i6 >> 1 == j3 ? 2 : 0) + (i6 & 1);
+          const int s3 = i7 < 0 ? 3 : (i7 >> 1 == j3 ? 2 : 0) + (i7 & 1);
+
+          // 32-bit permute
+          t1 = _mm_shuffle_epi32(a, j0 | j1 << 2 | j2 << 4 | j3 << 6);
+          // 16-bit permutes
+          if(r0 != 0 || r1 != 1 || r2 != 2 || r3 != 3)
+            {  // 16 bit permute of low  half
+              t2 = _mm_shufflelo_epi16(t1, r0 | r1 << 2 | r2 << 4 | r3 << 6);
+            }
+          else
+            t2 = t1;
+          if(s0 != 0 || s1 != 1 || s2 != 2 || s3 != 3)
+            {  // 16 bit permute of high half
+              t7 = _mm_shufflehi_epi16(t2, s0 | s1 << 2 | s2 << 4 | s3 << 6);
+            }
+          else
+            t7 = t2;
+        }
+      else
+        {
+          // Worst case. We need two sets of 16-bit permutes
+          t1 = _mm_shuffle_epi32(a, 0x4E);  // swap low and high 64-bits
+
+          // Find permute indices for low 16-bit permute from swapped t1
+          const int r0 = i0 < 4 ? 0 : i0 & 3;
+          const int r1 = i1 < 4 ? 1 : i1 & 3;
+          const int r2 = i2 < 4 ? 2 : i2 & 3;
+          const int r3 = i3 < 4 ? 3 : i3 & 3;
+          // Find permute indices for high 16-bit permute from swapped t1
+          const int s0 = i4 < 0 || i4 >= 4 ? 0 : i4 & 3;
+          const int s1 = i5 < 0 || i5 >= 4 ? 1 : i5 & 3;
+          const int s2 = i6 < 0 || i6 >= 4 ? 2 : i6 & 3;
+          const int s3 = i7 < 0 || i7 >= 4 ? 3 : i7 & 3;
+          // Find permute indices for low 16-bit permute from direct a
+          const int u0 = i0 < 0 || i0 >= 4 ? 0 : i0 & 3;
+          const int u1 = i1 < 0 || i1 >= 4 ? 1 : i1 & 3;
+          const int u2 = i2 < 0 || i2 >= 4 ? 2 : i2 & 3;
+          const int u3 = i3 < 0 || i3 >= 4 ? 3 : i3 & 3;
+          // Find permute indices for high 16-bit permute from direct a
+          const int v0 = i4 < 4 ? 0 : i4 & 3;
+          const int v1 = i5 < 4 ? 1 : i5 & 3;
+          const int v2 = i6 < 4 ? 2 : i6 & 3;
+          const int v3 = i7 < 4 ? 3 : i7 & 3;
+
+          // 16-bit permutes
+          if(r0 != 0 || r1 != 1 || r2 != 2 || r3 != 3)
+            {  // 16 bit permute of low  half
+              t2 = _mm_shufflelo_epi16(t1, r0 | r1 << 2 | r2 << 4 | r3 << 6);
+            }
+          else
+            t2 = t1;
+          if(u0 != 0 || u1 != 1 || u2 != 2 || u3 != 3)
+            {  // 16 bit permute of low  half
+              t3 = _mm_shufflelo_epi16(a, u0 | u1 << 2 | u2 << 4 | u3 << 6);
+            }
+          else
+            t3 = a;
+          if(s0 != 0 || s1 != 1 || s2 != 2 || s3 != 3)
+            {  // 16 bit permute of high half
+              t4 = _mm_shufflehi_epi16(t2, s0 | s1 << 2 | s2 << 4 | s3 << 6);
+            }
+          else
+            t4 = t2;
+          if(v0 != 0 || v1 != 1 || v2 != 2 || v3 != 3)
+            {  // 16 bit permute of high half
+              t5 = _mm_shufflehi_epi16(t3, v0 | v1 << 2 | v2 << 4 | v3 << 6);
+            }
+          else
+            t5 = t3;
+          // merge data from t4 and t5
+          t6 = constant4i<((i0 & 4) ? 0xFFFF : 0) | ((i1 & 4) ? 0xFFFF0000 : 0), ((i2 & 4) ? 0xFFFF : 0) | ((i3 & 4) ? 0xFFFF0000 : 0),
+                          ((i4 & 4) ? 0 : 0xFFFF) | ((i5 & 4) ? 0 : 0xFFFF0000),
+                          ((i6 & 4) ? 0 : 0xFFFF) | ((i7 & 4) ? 0 : 0xFFFF0000)>();
+          t7 = selectb(t6, t4, t5);  // select between permuted data t4 and t5
+        }
+    }
+  // Set any elements to zero if required
+  if(m2 != -1 && ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) & 0x80))
+    {
+      // some elements need to be set to 0
+      __m128i mask =
+          constant4i<(i0 < 0 ? 0xFFFF0000 : -1) & (i1 < 0 ? 0x0000FFFF : -1), (i2 < 0 ? 0xFFFF0000 : -1) & (i3 < 0 ? 0x0000FFFF : -1),
+                     (i4 < 0 ? 0xFFFF0000 : -1) & (i5 < 0 ? 0x0000FFFF : -1),
+                     (i6 < 0 ? 0xFFFF0000 : -1) & (i7 < 0 ? 0x0000FFFF : -1)>();
+      return _mm_and_si128(t7, mask);
+    }
+  else
+    {
+      return t7;
+    }
+#endif
+}
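+
+// Usage sketch for permute8s (illustrative only; the values are hypothetical):
+//   Vec8s a(10,11,12,13,14,15,16,17);
+//   Vec8s r = permute8s<2,3,4,5,6,7,0,1>(a);     // rotate left by two elements: (12,13,14,15,16,17,10,11)
+//   Vec8s z = permute8s<0,-1,2,-1,4,-1,6,-1>(a); // a negative index yields zero: (10,0,12,0,14,0,16,0)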
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8us permute8us(Vec8us const &a)
+{
+  return Vec8us(permute8s<i0, i1, i2, i3, i4, i5, i6, i7>(a));
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16c permute16c(Vec16c const &a)
+{
+  __m128i temp;
+
+  // Combine all even indexes into a single bitfield, with 4 bits for each
+  const uint32_t me = (i0 & 15) | (i2 & 15) << 4 | (i4 & 15) << 8 | (i6 & 15) << 12 | (i8 & 15) << 16 | (i10 & 15) << 20 |
+                      (i12 & 15) << 24 | (i14 & 15) << 28;
+
+  // Combine all odd indexes into a single bitfield, with 4 bits for each
+  const uint32_t mo = (i1 & 15) | (i3 & 15) << 4 | (i5 & 15) << 8 | (i7 & 15) << 12 | (i9 & 15) << 16 | (i11 & 15) << 20 |
+                      (i13 & 15) << 24 | (i15 & 15) << 28;
+
+  // Mask indicating sign of all even indexes, with 4 bits for each, 0 for negative, 0xF for non-negative
+  const uint32_t se = (i0 < 0 ? 0 : 0xF) | (i2 < 0 ? 0 : 0xF) << 4 | (i4 < 0 ? 0 : 0xF) << 8 | (i6 < 0 ? 0 : 0xF) << 12 |
+                      (i8 < 0 ? 0 : 0xF) << 16 | (i10 < 0 ? 0 : 0xF) << 20 | (i12 < 0 ? 0 : 0xF) << 24 | (i14 < 0 ? 0 : 0xF) << 28;
+
+  // Mask indicating sign of all odd indexes, with 4 bits for each, 0 for negative, 0xF for non-negative
+  const uint32_t so = (i1 < 0 ? 0 : 0xF) | (i3 < 0 ? 0 : 0xF) << 4 | (i5 < 0 ? 0 : 0xF) << 8 | (i7 < 0 ? 0 : 0xF) << 12 |
+                      (i9 < 0 ? 0 : 0xF) << 16 | (i11 < 0 ? 0 : 0xF) << 20 | (i13 < 0 ? 0 : 0xF) << 24 | (i15 < 0 ? 0 : 0xF) << 28;
+
+  // Mask indicating sign of all indexes, with 2 bits for each, 0 for negative (means set to zero or don't care), 0x3 for non-negative
+  const uint32_t ss = (se & 0x33333333) | (so & 0xCCCCCCCC);
+
+  // Mask indicating required zeroing of all indexes, with 2 bits for each, 0 for index = -1, 3 for index >= 0 or -256
+  const uint32_t ssz = ((i0 & 0x80) ? 0 : 3) | ((i1 & 0x80) ? 0 : 3) << 2 | ((i2 & 0x80) ? 0 : 3) << 4 | ((i3 & 0x80) ? 0 : 3) << 6 |
+                       ((i4 & 0x80) ? 0 : 3) << 8 | ((i5 & 0x80) ? 0 : 3) << 10 | ((i6 & 0x80) ? 0 : 3) << 12 |
+                       ((i7 & 0x80) ? 0 : 3) << 14 | ((i8 & 0x80) ? 0 : 3) << 16 | ((i9 & 0x80) ? 0 : 3) << 18 |
+                       ((i10 & 0x80) ? 0 : 3) << 20 | ((i11 & 0x80) ? 0 : 3) << 22 | ((i12 & 0x80) ? 0 : 3) << 24 |
+                       ((i13 & 0x80) ? 0 : 3) << 26 | ((i14 & 0x80) ? 0 : 3) << 28 | ((i15 & 0x80) ? 0 : 3) << 30;
+
+  // These indexes are used only to avoid bogus compiler warnings in false branches
+  const int I0  = i0 > 0 ? (i0 & 0xF) : 0;
+  const int I15 = i15 > 0 ? (i15 & 0xF) : 0;
+
+  // special case: all zero
+  if(ss == 0)
+    {
+      return _mm_setzero_si128();
+    }
+
+  // remember if extra zeroing is needed
+  bool do_and_zero = (ssz != 0xFFFFFFFFu);
+
+  // check for special shortcut cases
+  int shortcut = 0;
+
+  // check if no permutation is needed (identity)
+  if(((me ^ 0xECA86420) & se) == 0 && ((mo ^ 0xFDB97531) & so) == 0)
+    {
+      shortcut = 1;
+    }
+  // check if we can use punpcklbw
+  else if(((me ^ 0x76543210) & se) == 0 && ((mo ^ 0x76543210) & so) == 0)
+    {
+      shortcut = 2;
+    }
+  // check if we can use punpckhbw
+  else if(((me ^ 0xFEDCBA98) & se) == 0 && ((mo ^ 0xFEDCBA98) & so) == 0)
+    {
+      shortcut = 3;
+    }
+
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#pragma warning(disable : 4307)  // disable MS warning C4307: '+' : integral constant overflow
+#endif
+
+  // check if we can use byte shift right
+  else if(i0 > 0 && ((me ^ (uint32_t(I0) * 0x11111111u + 0xECA86420u)) & se) == 0 &&
+          ((mo ^ (uint32_t(I0) * 0x11111111u + 0xFDB97531u)) & so) == 0)
+    {
+      shortcut    = 4;
+      do_and_zero = ((0xFFFFFFFFu >> 2 * I0) & ~ssz) != 0;
+    }
+  // check if we can use byte shift left
+  else if(i15 >= 0 && i15 < 15 && ((mo ^ (uint32_t(I15 * 0x11111111u) - (0x02468ACEu & so))) & so) == 0 &&
+          ((me ^ (uint32_t(I15 * 0x11111111u) - (0x13579BDFu & se))) & se) == 0)
+    {
+      shortcut    = 5;
+      do_and_zero = ((0xFFFFFFFFu << 2 * (15 - I15)) & ~ssz) != 0;
+    }
+
+#if INSTRSET >= 4  // SSSE3 (PSHUFB available only under SSSE3)
+
+  // special case: rotate
+  if(i0 > 0 && i0 < 16 && i1 == ((i0 + 1) & 15) && i2 == ((i0 + 2) & 15) && i3 == ((i0 + 3) & 15) && i4 == ((i0 + 4) & 15) &&
+     i5 == ((i0 + 5) & 15) && i6 == ((i0 + 6) & 15) && i7 == ((i0 + 7) & 15) && i8 == ((i0 + 8) & 15) && i9 == ((i0 + 9) & 15) &&
+     i10 == ((i0 + 10) & 15) && i11 == ((i0 + 11) & 15) && i12 == ((i0 + 12) & 15) && i13 == ((i0 + 13) & 15) &&
+     i14 == ((i0 + 14) & 15) && i15 == ((i0 + 15) & 15))
+    {
+      temp     = _mm_alignr_epi8(a, a, i0 & 15);
+      shortcut = -1;
+    }
+  if(shortcut == 0 || do_and_zero)
+    {
+      // general case: use PSHUFB
+      __m128i mask = constant4i<(i0 & 0xFF) | (i1 & 0xFF) << 8 | (i2 & 0xFF) << 16 | (i3 & 0xFF) << 24,
+                                (i4 & 0xFF) | (i5 & 0xFF) << 8 | (i6 & 0xFF) << 16 | (i7 & 0xFF) << 24,
+                                (i8 & 0xFF) | (i9 & 0xFF) << 8 | (i10 & 0xFF) << 16 | (i11 & 0xFF) << 24,
+                                (i12 & 0xFF) | (i13 & 0xFF) << 8 | (i14 & 0xFF) << 16 | (i15 & 0xFF) << 24>();
+      temp         = _mm_shuffle_epi8(a, mask);
+      shortcut     = -1;
+      do_and_zero  = false;
+    }
+
+#endif
+
+  // Check if we can use 16-bit permute. Even numbered indexes must be even and odd numbered
+  // indexes must be equal to the preceding index + 1, except for negative indexes.
+  if(shortcut == 0 && (me & 0x11111111 & se) == 0 && ((mo ^ 0x11111111) & 0x11111111 & so) == 0 &&
+     ((me ^ mo) & 0xEEEEEEEE & se & so) == 0)
+    {
+      temp = permute8s < i0 >= 0 ? i0 / 2 : i1 >= 0 ? i1 / 2 : (i0 | i1), i2 >= 0 ? i2 / 2 : i3 >= 0 ? i3 / 2 : (i2 | i3),
+      i4 >= 0 ? i4 / 2 : i5 >= 0 ? i5 / 2 : (i4 | i5), i6 >= 0 ? i6 / 2 : i7 >= 0 ? i7 / 2 : (i6 | i7),
+      i8 >= 0 ? i8 / 2 : i9 >= 0 ? i9 / 2 : (i8 | i9), i10 >= 0 ? i10 / 2 : i11 >= 0 ? i11 / 2 : (i10 | i11),
+      i12 >= 0 ? i12 / 2 : i13 >= 0 ? i13 / 2 : (i12 | i13), i14 >= 0 ? i14 / 2 : i15 >= 0 ? i15 / 2 : (i14 | i15) > (Vec8s(a));
+      shortcut    = 100;
+      do_and_zero = (se != so && ssz != 0xFFFFFFFFu);
+    }
+
+  // Check if we can use 16-bit permute with bytes swapped. Even numbered indexes must be odd and odd
+  // numbered indexes must be equal to the preceding index - 1, except for negative indexes.
+  // (this case occurs when reversing byte order)
+  if(shortcut == 0 && ((me ^ 0x11111111) & 0x11111111 & se) == 0 && (mo & 0x11111111 & so) == 0 &&
+     ((me ^ mo) & 0xEEEEEEEE & se & so) == 0)
+    {
+      Vec16c swapped = Vec16c(rotate_left(Vec8s(a), 8));  // swap odd and even bytes
+      temp           = permute8s < i0 >= 0 ? i0 / 2 : i1 >= 0 ? i1 / 2 : (i0 | i1), i2 >= 0 ? i2 / 2 : i3 >= 0 ? i3 / 2 : (i2 | i3),
+      i4 >= 0 ? i4 / 2 : i5 >= 0 ? i5 / 2 : (i4 | i5), i6 >= 0 ? i6 / 2 : i7 >= 0 ? i7 / 2 : (i6 | i7),
+      i8 >= 0 ? i8 / 2 : i9 >= 0 ? i9 / 2 : (i8 | i9), i10 >= 0 ? i10 / 2 : i11 >= 0 ? i11 / 2 : (i10 | i11),
+      i12 >= 0 ? i12 / 2 : i13 >= 0 ? i13 / 2 : (i12 | i13), i14 >= 0 ? i14 / 2 : i15 >= 0 ? i15 / 2 : (i14 | i15) > (Vec8s(swapped));
+      shortcut    = 101;
+      do_and_zero = (se != so && ssz != 0xFFFFFFFFu);
+    }
+
+  // all shortcuts end here
+  if(shortcut)
+    {
+      switch(shortcut)
+        {
+          case 1:
+            temp = a;
+            break;
+          case 2:
+            temp = _mm_unpacklo_epi8(a, a);
+            break;
+          case 3:
+            temp = _mm_unpackhi_epi8(a, a);
+            break;
+          case 4:
+            temp = _mm_srli_si128(a, I0);
+            break;
+          case 5:
+            temp = _mm_slli_si128(a, 15 - I15);
+            break;
+          default:
+            break;  // result is already in temp
+        }
+      if(do_and_zero)
+        {
+          // additional zeroing needed
+          __m128i maskz =
+              constant4i<(i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF00) | (i2 < 0 ? 0 : 0xFF0000) | (i3 < 0 ? 0 : 0xFF000000),
+                         (i4 < 0 ? 0 : 0xFF) | (i5 < 0 ? 0 : 0xFF00) | (i6 < 0 ? 0 : 0xFF0000) | (i7 < 0 ? 0 : 0xFF000000),
+                         (i8 < 0 ? 0 : 0xFF) | (i9 < 0 ? 0 : 0xFF00) | (i10 < 0 ? 0 : 0xFF0000) | (i11 < 0 ? 0 : 0xFF000000),
+                         (i12 < 0 ? 0 : 0xFF) | (i13 < 0 ? 0 : 0xFF00) | (i14 < 0 ? 0 : 0xFF0000) | (i15 < 0 ? 0 : 0xFF000000)>();
+          temp = _mm_and_si128(temp, maskz);
+        }
+      return temp;
+    }
+
+  // complicated cases: use 16-bit permute up to four times
+  const bool e2e = (~me & 0x11111111 & se) != 0;  // even bytes of source to even bytes of destination
+  const bool e2o = (~mo & 0x11111111 & so) != 0;  // even bytes of source to odd  bytes of destination
+  const bool o2e = (me & 0x11111111 & se) != 0;   // odd  bytes of source to even bytes of destination
+  const bool o2o = (mo & 0x11111111 & so) != 0;   // odd  bytes of source to odd  bytes of destination
+
+  Vec16c swapped, te2e, te2o, to2e, to2o, combeven, combodd;
+
+  if(e2o || o2e)
+    swapped = rotate_left(Vec8s(a), 8);  // swap odd and even bytes
+
+  // even-to-even bytes
+  if(e2e)
+    te2e = permute8s < (i0 & 1) ? -1 : i0 / 2, (i2 & 1) ? -1 : i2 / 2, (i4 & 1) ? -1 : i4 / 2, (i6 & 1) ? -1 : i6 / 2,
+    (i8 & 1) ? -1 : i8 / 2, (i10 & 1) ? -1 : i10 / 2, (i12 & 1) ? -1 : i12 / 2, (i14 & 1) ? -1 : i14 / 2 > (Vec8s(a));
+  // odd-to-even bytes
+  if(o2e)
+    to2e = permute8s < (i0 & 1) ? i0 / 2 : -1, (i2 & 1) ? i2 / 2 : -1, (i4 & 1) ? i4 / 2 : -1, (i6 & 1) ? i6 / 2 : -1,
+    (i8 & 1) ? i8 / 2 : -1, (i10 & 1) ? i10 / 2 : -1, (i12 & 1) ? i12 / 2 : -1, (i14 & 1) ? i14 / 2 : -1 > (Vec8s(swapped));
+  // even-to-odd bytes
+  if(e2o)
+    te2o = permute8s < (i1 & 1) ? -1 : i1 / 2, (i3 & 1) ? -1 : i3 / 2, (i5 & 1) ? -1 : i5 / 2, (i7 & 1) ? -1 : i7 / 2,
+    (i9 & 1) ? -1 : i9 / 2, (i11 & 1) ? -1 : i11 / 2, (i13 & 1) ? -1 : i13 / 2, (i15 & 1) ? -1 : i15 / 2 > (Vec8s(swapped));
+  // odd-to-odd bytes
+  if(o2o)
+    to2o = permute8s < (i1 & 1) ? i1 / 2 : -1, (i3 & 1) ? i3 / 2 : -1, (i5 & 1) ? i5 / 2 : -1, (i7 & 1) ? i7 / 2 : -1,
+    (i9 & 1) ? i9 / 2 : -1, (i11 & 1) ? i11 / 2 : -1, (i13 & 1) ? i13 / 2 : -1, (i15 & 1) ? i15 / 2 : -1 > (Vec8s(a));
+
+  if(e2e && o2e)
+    combeven = te2e | to2e;
+  else if(e2e)
+    combeven = te2e;
+  else if(o2e)
+    combeven = to2e;
+  else
+    combeven = _mm_setzero_si128();
+
+  if(e2o && o2o)
+    combodd = te2o | to2o;
+  else if(e2o)
+    combodd = te2o;
+  else if(o2o)
+    combodd = to2o;
+  else
+    combodd = _mm_setzero_si128();
+
+  __m128i maske = constant4i<  // mask for the used even bytes
+      (i0 < 0 ? 0 : 0xFF) | (i2 < 0 ? 0 : 0xFF0000), (i4 < 0 ? 0 : 0xFF) | (i6 < 0 ? 0 : 0xFF0000),
+      (i8 < 0 ? 0 : 0xFF) | (i10 < 0 ? 0 : 0xFF0000), (i12 < 0 ? 0 : 0xFF) | (i14 < 0 ? 0 : 0xFF0000)>();
+  __m128i masko = constant4i<  // mask for the used odd bytes
+      (i1 < 0 ? 0 : 0xFF00) | (i3 < 0 ? 0 : 0xFF000000), (i5 < 0 ? 0 : 0xFF00) | (i7 < 0 ? 0 : 0xFF000000),
+      (i9 < 0 ? 0 : 0xFF00) | (i11 < 0 ? 0 : 0xFF000000), (i13 < 0 ? 0 : 0xFF00) | (i15 < 0 ? 0 : 0xFF000000)>();
+
+  return _mm_or_si128(  // combine even and odd bytes
+      _mm_and_si128(combeven, maske), _mm_and_si128(combodd, masko));
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16uc permute16uc(Vec16uc const &a)
+{
+  return Vec16uc(permute16c<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15>(a));
+}
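+
+// Usage sketch for permute16c (illustrative only): reversing the byte order of a vector,
+// which is handled by the "16-bit permute with bytes swapped" path above when SSSE3 is not available.
+//   Vec16c x(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
+//   Vec16c y = permute16c<15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0>(x);  // y is (15,14,...,1,0)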
+
+/*****************************************************************************
+ *
+ *          Vector blend functions
+ *
+ ******************************************************************************
+ *
+ * These blend functions can mix elements from two different vectors and
+ * optionally set some elements to zero.
+ *
+ * The indexes are inserted as template parameters in <>. These indexes must be
+ * constants. Each template parameter is an index to the element you want to
+ * select, where higher indexes indicate an element from the second source
+ * vector. For example, if each vector has 4 elements, then indexes 0 - 3
+ * will select an element from the first vector and indexes 4 - 7 will select
+ * an element from the second vector. A negative index will generate zero.
+ *
+ * The blend functions for vectors of 8-bit integers are inefficient if
+ * the SSSE3 instruction set or later is not enabled.
+ *
+ * Example:
+ * Vec4i a(100,101,102,103);         // a is (100, 101, 102, 103)
+ * Vec4i b(200,201,202,203);         // b is (200, 201, 202, 203)
+ * Vec4i c;
+ * c = blend4i<1,4,-1,7> (a,b);      // c is (101, 200,   0, 203)
+ *
+ * A lot of the code here is metaprogramming aiming to find the instructions
+ * that best fit the template parameters and instruction set. The metacode
+ * will be reduced out to leave only a few vector instructions in release
+ * mode with optimization on.
+ *****************************************************************************/
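+
+// Additional usage sketch (illustrative only; the values are hypothetical):
+//   Vec8s u(0,1,2,3,4,5,6,7);
+//   Vec8s v(10,11,12,13,14,15,16,17);
+//   Vec8s w = blend8s<0,8,1,9,2,10,-1,-1>(u,v);  // interleave with zeroing: (0,10,1,11,2,12,0,0)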
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16c blend16c(Vec16c const &a, Vec16c const &b)
+{
+  // Combine bit 0-3 of all even indexes into a single bitfield, with 4 bits for each
+  const int me = (i0 & 15) | (i2 & 15) << 4 | (i4 & 15) << 8 | (i6 & 15) << 12 | (i8 & 15) << 16 | (i10 & 15) << 20 |
+                 (i12 & 15) << 24 | (i14 & 15) << 28;
+
+  // Combine bit 0-3 of all odd indexes into a single bitfield, with 4 bits for each
+  const int mo = (i1 & 15) | (i3 & 15) << 4 | (i5 & 15) << 8 | (i7 & 15) << 12 | (i9 & 15) << 16 | (i11 & 15) << 20 |
+                 (i13 & 15) << 24 | (i15 & 15) << 28;
+
+  // Mask indicating sign of all even indexes, with 4 bits for each, 0 for negative, 0xF for non-negative
+  const int se = (i0 < 0 ? 0 : 0xF) | (i2 < 0 ? 0 : 0xF) << 4 | (i4 < 0 ? 0 : 0xF) << 8 | (i6 < 0 ? 0 : 0xF) << 12 |
+                 (i8 < 0 ? 0 : 0xF) << 16 | (i10 < 0 ? 0 : 0xF) << 20 | (i12 < 0 ? 0 : 0xF) << 24 | (i14 < 0 ? 0 : 0xF) << 28;
+
+  // Mask indicating sign of all odd indexes, with 4 bits for each, 0 for negative, 0xF for non-negative
+  const int so = (i1 < 0 ? 0 : 0xF) | (i3 < 0 ? 0 : 0xF) << 4 | (i5 < 0 ? 0 : 0xF) << 8 | (i7 < 0 ? 0 : 0xF) << 12 |
+                 (i9 < 0 ? 0 : 0xF) << 16 | (i11 < 0 ? 0 : 0xF) << 20 | (i13 < 0 ? 0 : 0xF) << 24 | (i15 < 0 ? 0 : 0xF) << 28;
+
+  // Combine bit 4 of all even indexes into a single bitfield, with 4 bits for each
+  const int ne = (i0 & 16) >> 4 | (i2 & 16) | (i4 & 16) << 4 | (i6 & 16) << 8 | (i8 & 16) << 12 | (i10 & 16) << 16 | (i12 & 16) << 20 |
+                 (i14 & 16) << 24;
+
+  // Combine bit 4 of all odd indexes into a single bitfield, with 4 bits for each
+  const int no = (i1 & 16) >> 4 | (i3 & 16) | (i5 & 16) << 4 | (i7 & 16) << 8 | (i9 & 16) << 12 | (i11 & 16) << 16 | (i13 & 16) << 20 |
+                 (i15 & 16) << 24;
+
+  // Check if zeroing needed
+  const bool do_zero =
+      ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7 | i8 | i9 | i10 | i11 | i12 | i13 | i14 | i15) & 0x80) != 0;  // needs zeroing
+
+  // no elements from b
+  if(((ne & se) | (no & so)) == 0)
+    {
+      return permute16c<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15>(a);
+    }
+
+  // no elements from a
+  if((((ne ^ 0x11111111) & se) | ((no ^ 0x11111111) & so)) == 0)
+    {
+      return permute16c<i0 ^ 16, i1 ^ 16, i2 ^ 16, i3 ^ 16, i4 ^ 16, i5 ^ 16, i6 ^ 16, i7 ^ 16, i8 ^ 16, i9 ^ 16, i10 ^ 16, i11 ^ 16,
+                        i12 ^ 16, i13 ^ 16, i14 ^ 16, i15 ^ 16>(b);
+    }
+  __m128i t;
+
+  // check if we can use punpcklbw
+  if(((me ^ 0x76543210) & se) == 0 && ((mo ^ 0x76543210) & so) == 0)
+    {
+      if((ne & se) == 0 && ((no ^ 0x11111111) & so) == 0)
+        {
+          t = _mm_unpacklo_epi8(a, b);
+        }
+      if((no & so) == 0 && ((ne ^ 0x11111111) & se) == 0)
+        {
+          t = _mm_unpacklo_epi8(b, a);
+        }
+      if(do_zero)
+        {
+          // additional zeroing needed
+          __m128i maskz =
+              constant4i<(i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF00) | (i2 < 0 ? 0 : 0xFF0000) | (i3 < 0 ? 0 : 0xFF000000),
+                         (i4 < 0 ? 0 : 0xFF) | (i5 < 0 ? 0 : 0xFF00) | (i6 < 0 ? 0 : 0xFF0000) | (i7 < 0 ? 0 : 0xFF000000),
+                         (i8 < 0 ? 0 : 0xFF) | (i9 < 0 ? 0 : 0xFF00) | (i10 < 0 ? 0 : 0xFF0000) | (i11 < 0 ? 0 : 0xFF000000),
+                         (i12 < 0 ? 0 : 0xFF) | (i13 < 0 ? 0 : 0xFF00) | (i14 < 0 ? 0 : 0xFF0000) | (i15 < 0 ? 0 : 0xFF000000)>();
+          t = _mm_and_si128(t, maskz);
+        }
+      return t;
+    }
+
+  // check if we can use punpckhbw
+  if(((me ^ 0xFEDCBA98) & se) == 0 && ((mo ^ 0xFEDCBA98) & so) == 0)
+    {
+      if((ne & se) == 0 && ((no ^ 0x11111111) & so) == 0)
+        {
+          t = _mm_unpackhi_epi8(a, b);
+        }
+      if((no & so) == 0 && ((ne ^ 0x11111111) & se) == 0)
+        {
+          t = _mm_unpackhi_epi8(b, a);
+        }
+      if(do_zero)
+        {
+          // additional zeroing needed
+          __m128i maskz =
+              constant4i<(i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF00) | (i2 < 0 ? 0 : 0xFF0000) | (i3 < 0 ? 0 : 0xFF000000),
+                         (i4 < 0 ? 0 : 0xFF) | (i5 < 0 ? 0 : 0xFF00) | (i6 < 0 ? 0 : 0xFF0000) | (i7 < 0 ? 0 : 0xFF000000),
+                         (i8 < 0 ? 0 : 0xFF) | (i9 < 0 ? 0 : 0xFF00) | (i10 < 0 ? 0 : 0xFF0000) | (i11 < 0 ? 0 : 0xFF000000),
+                         (i12 < 0 ? 0 : 0xFF) | (i13 < 0 ? 0 : 0xFF00) | (i14 < 0 ? 0 : 0xFF0000) | (i15 < 0 ? 0 : 0xFF000000)>();
+          t = _mm_and_si128(t, maskz);
+        }
+      return t;
+    }
+
+#if INSTRSET >= 4  // SSSE3
+  // special case: shift left
+  if(i0 > 0 && i0 < 16 && i1 == i0 + 1 && i2 == i0 + 2 && i3 == i0 + 3 && i4 == i0 + 4 && i5 == i0 + 5 && i6 == i0 + 6 &&
+     i7 == i0 + 7 && i8 == i0 + 8 && i9 == i0 + 9 && i10 == i0 + 10 && i11 == i0 + 11 && i12 == i0 + 12 && i13 == i0 + 13 &&
+     i14 == i0 + 14 && i15 == i0 + 15)
+    {
+      return _mm_alignr_epi8(b, a, (i0 & 15));
+    }
+
+  // special case: shift right
+  if(i0 > 15 && i0 < 32 && i1 == ((i0 + 1) & 31) && i2 == ((i0 + 2) & 31) && i3 == ((i0 + 3) & 31) && i4 == ((i0 + 4) & 31) &&
+     i5 == ((i0 + 5) & 31) && i6 == ((i0 + 6) & 31) && i7 == ((i0 + 7) & 31) && i8 == ((i0 + 8) & 31) && i9 == ((i0 + 9) & 31) &&
+     i10 == ((i0 + 10) & 31) && i11 == ((i0 + 11) & 31) && i12 == ((i0 + 12) & 31) && i13 == ((i0 + 13) & 31) &&
+     i14 == ((i0 + 14) & 31) && i15 == ((i0 + 15) & 31))
+    {
+      return _mm_alignr_epi8(a, b, (i0 & 15));
+    }
+#endif
+
+#if INSTRSET >= 5  // SSE4.1 supported
+  // special case: blend without permute
+  if(((me ^ 0xECA86420) & se) == 0 && ((mo ^ 0xFDB97531) & so) == 0)
+    {
+      __m128i maskbl =
+          constant4i<((i0 & 16) ? 0xFF : 0) | ((i1 & 16) ? 0xFF00 : 0) | ((i2 & 16) ? 0xFF0000 : 0) | ((i3 & 16) ? 0xFF000000 : 0),
+                     ((i4 & 16) ? 0xFF : 0) | ((i5 & 16) ? 0xFF00 : 0) | ((i6 & 16) ? 0xFF0000 : 0) | ((i7 & 16) ? 0xFF000000 : 0),
+                     ((i8 & 16) ? 0xFF : 0) | ((i9 & 16) ? 0xFF00 : 0) | ((i10 & 16) ? 0xFF0000 : 0) | ((i11 & 16) ? 0xFF000000 : 0),
+                     ((i12 & 16) ? 0xFF : 0) | ((i13 & 16) ? 0xFF00 : 0) | ((i14 & 16) ? 0xFF0000 : 0) |
+                         ((i15 & 16) ? 0xFF000000 : 0)>();
+      t = _mm_blendv_epi8(a, b, maskbl);
+      if(do_zero)
+        {
+          // additional zeroing needed
+          __m128i maskz =
+              constant4i<(i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF00) | (i2 < 0 ? 0 : 0xFF0000) | (i3 < 0 ? 0 : 0xFF000000),
+                         (i4 < 0 ? 0 : 0xFF) | (i5 < 0 ? 0 : 0xFF00) | (i6 < 0 ? 0 : 0xFF0000) | (i7 < 0 ? 0 : 0xFF000000),
+                         (i8 < 0 ? 0 : 0xFF) | (i9 < 0 ? 0 : 0xFF00) | (i10 < 0 ? 0 : 0xFF0000) | (i11 < 0 ? 0 : 0xFF000000),
+                         (i12 < 0 ? 0 : 0xFF) | (i13 < 0 ? 0 : 0xFF00) | (i14 < 0 ? 0 : 0xFF0000) | (i15 < 0 ? 0 : 0xFF000000)>();
+          t = _mm_and_si128(t, maskz);
+        }
+      return t;
+    }
+#endif  // SSE4.1
+
+#if defined(__XOP__)  // Use AMD XOP instruction VPPERM
+  __m128i mask = constant4i<(i0 < 0 ? 0x80 : (i0 & 31)) | (i1 < 0 ? 0x80 : (i1 & 31)) << 8 | (i2 < 0 ? 0x80 : (i2 & 31)) << 16 |
+                                (i3 < 0 ? 0x80 : (i3 & 31)) << 24,
+                            (i4 < 0 ? 0x80 : (i4 & 31)) | (i5 < 0 ? 0x80 : (i5 & 31)) << 8 | (i6 < 0 ? 0x80 : (i6 & 31)) << 16 |
+                                (i7 < 0 ? 0x80 : (i7 & 31)) << 24,
+                            (i8 < 0 ? 0x80 : (i8 & 31)) | (i9 < 0 ? 0x80 : (i9 & 31)) << 8 | (i10 < 0 ? 0x80 : (i10 & 31)) << 16 |
+                                (i11 < 0 ? 0x80 : (i11 & 31)) << 24,
+                            (i12 < 0 ? 0x80 : (i12 & 31)) | (i13 < 0 ? 0x80 : (i13 & 31)) << 8 | (i14 < 0 ? 0x80 : (i14 & 31)) << 16 |
+                                (i15 < 0 ? 0x80 : (i15 & 31)) << 24>();
+  return _mm_perm_epi8(a, b, mask);
+
+#elif INSTRSET >= 4  // SSSE3
+
+  // general case. Use PSHUFB
+  __m128i maska = constant4i<((i0 & 0x90) ? 0xFF : (i0 & 15)) | ((i1 & 0x90) ? 0xFF : (i1 & 15)) << 8 |
+                                 ((i2 & 0x90) ? 0xFF : (i2 & 15)) << 16 | ((i3 & 0x90) ? 0xFF : (i3 & 15)) << 24,
+                             ((i4 & 0x90) ? 0xFF : (i4 & 15)) | ((i5 & 0x90) ? 0xFF : (i5 & 15)) << 8 |
+                                 ((i6 & 0x90) ? 0xFF : (i6 & 15)) << 16 | ((i7 & 0x90) ? 0xFF : (i7 & 15)) << 24,
+                             ((i8 & 0x90) ? 0xFF : (i8 & 15)) | ((i9 & 0x90) ? 0xFF : (i9 & 15)) << 8 |
+                                 ((i10 & 0x90) ? 0xFF : (i10 & 15)) << 16 | ((i11 & 0x90) ? 0xFF : (i11 & 15)) << 24,
+                             ((i12 & 0x90) ? 0xFF : (i12 & 15)) | ((i13 & 0x90) ? 0xFF : (i13 & 15)) << 8 |
+                                 ((i14 & 0x90) ? 0xFF : (i14 & 15)) << 16 | ((i15 & 0x90) ? 0xFF : (i15 & 15)) << 24>();
+  __m128i maskb =
+      constant4i<(((i0 ^ 0x10) & 0x90) ? 0xFF : (i0 & 15)) | (((i1 ^ 0x10) & 0x90) ? 0xFF : (i1 & 15)) << 8 |
+                     (((i2 ^ 0x10) & 0x90) ? 0xFF : (i2 & 15)) << 16 | (((i3 ^ 0x10) & 0x90) ? 0xFF : (i3 & 15)) << 24,
+                 (((i4 ^ 0x10) & 0x90) ? 0xFF : (i4 & 15)) | (((i5 ^ 0x10) & 0x90) ? 0xFF : (i5 & 15)) << 8 |
+                     (((i6 ^ 0x10) & 0x90) ? 0xFF : (i6 & 15)) << 16 | (((i7 ^ 0x10) & 0x90) ? 0xFF : (i7 & 15)) << 24,
+                 (((i8 ^ 0x10) & 0x90) ? 0xFF : (i8 & 15)) | (((i9 ^ 0x10) & 0x90) ? 0xFF : (i9 & 15)) << 8 |
+                     (((i10 ^ 0x10) & 0x90) ? 0xFF : (i10 & 15)) << 16 | (((i11 ^ 0x10) & 0x90) ? 0xFF : (i11 & 15)) << 24,
+                 (((i12 ^ 0x10) & 0x90) ? 0xFF : (i12 & 15)) | (((i13 ^ 0x10) & 0x90) ? 0xFF : (i13 & 15)) << 8 |
+                     (((i14 ^ 0x10) & 0x90) ? 0xFF : (i14 & 15)) << 16 | (((i15 ^ 0x10) & 0x90) ? 0xFF : (i15 & 15)) << 24>();
+  __m128i a1 = _mm_shuffle_epi8(a, maska);
+  __m128i b1 = _mm_shuffle_epi8(b, maskb);
+  return _mm_or_si128(a1, b1);
+
+#else  // SSE2
+  // combine two permutes
+  __m128i a1 = permute16c < (uint32_t)i0 < 16 ? i0 : -1, (uint32_t)i1 < 16 ? i1 : -1, (uint32_t)i2 < 16 ? i2 : -1,
+          (uint32_t)i3 < 16 ? i3 : -1, (uint32_t)i4 < 16 ? i4 : -1, (uint32_t)i5 < 16 ? i5 : -1, (uint32_t)i6 < 16 ? i6 : -1,
+          (uint32_t)i7 < 16 ? i7 : -1, (uint32_t)i8 < 16 ? i8 : -1, (uint32_t)i9 < 16 ? i9 : -1, (uint32_t)i10 < 16 ? i10 : -1,
+          (uint32_t)i11 < 16 ? i11 : -1, (uint32_t)i12 < 16 ? i12 : -1, (uint32_t)i13 < 16 ? i13 : -1, (uint32_t)i14 < 16 ? i14 : -1,
+          (uint32_t)i15 < 16 ? i15 : -1 > (a);
+  __m128i b1 = permute16c < (uint32_t)(i0 ^ 16) < 16 ? (i0 ^ 16) : -1, (uint32_t)(i1 ^ 16) < 16 ? (i1 ^ 16) : -1,
+          (uint32_t)(i2 ^ 16) < 16 ? (i2 ^ 16) : -1, (uint32_t)(i3 ^ 16) < 16 ? (i3 ^ 16) : -1,
+          (uint32_t)(i4 ^ 16) < 16 ? (i4 ^ 16) : -1, (uint32_t)(i5 ^ 16) < 16 ? (i5 ^ 16) : -1,
+          (uint32_t)(i6 ^ 16) < 16 ? (i6 ^ 16) : -1, (uint32_t)(i7 ^ 16) < 16 ? (i7 ^ 16) : -1,
+          (uint32_t)(i8 ^ 16) < 16 ? (i8 ^ 16) : -1, (uint32_t)(i9 ^ 16) < 16 ? (i9 ^ 16) : -1,
+          (uint32_t)(i10 ^ 16) < 16 ? (i10 ^ 16) : -1, (uint32_t)(i11 ^ 16) < 16 ? (i11 ^ 16) : -1,
+          (uint32_t)(i12 ^ 16) < 16 ? (i12 ^ 16) : -1, (uint32_t)(i13 ^ 16) < 16 ? (i13 ^ 16) : -1,
+          (uint32_t)(i14 ^ 16) < 16 ? (i14 ^ 16) : -1, (uint32_t)(i15 ^ 16) < 16 ? (i15 ^ 16) : -1 > (b);
+  return _mm_or_si128(a1, b1);
+
+#endif
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16uc blend16uc(Vec16uc const &a, Vec16uc const &b)
+{
+  return Vec16uc(blend16c<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15>(a, b));
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8s blend8s(Vec8s const &a, Vec8s const &b)
+{
+  // Combine all the indexes into a single bitfield, with 4 bits for each
+  const int m1 = (i0 & 0xF) | (i1 & 0xF) << 4 | (i2 & 0xF) << 8 | (i3 & 0xF) << 12 | (i4 & 0xF) << 16 | (i5 & 0xF) << 20 |
+                 (i6 & 0xF) << 24 | (i7 & 0xF) << 28;
+
+  // Mask to zero out negative indexes
+  const int mz = (i0 < 0 ? 0 : 0xF) | (i1 < 0 ? 0 : 0xF) << 4 | (i2 < 0 ? 0 : 0xF) << 8 | (i3 < 0 ? 0 : 0xF) << 12 |
+                 (i4 < 0 ? 0 : 0xF) << 16 | (i5 < 0 ? 0 : 0xF) << 20 | (i6 < 0 ? 0 : 0xF) << 24 | (i7 < 0 ? 0 : 0xF) << 28;
+
+  // Some elements must be set to zero
+  const bool do_zero = (mz != -1) && ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) & 0x80) != 0;
+
+  // set when temp holds a partially finished result that still needs zeroing
+  bool zeroing_pending = false;
+
+  // partially finished result
+  __m128i temp;
+
+  if((m1 & 0x88888888 & mz) == 0)
+    {
+      // no elements from b
+      return permute8s<i0, i1, i2, i3, i4, i5, i6, i7>(a);
+    }
+
+  if(((m1 ^ 0x88888888) & 0x88888888 & mz) == 0)
+    {
+      // no elements from a
+      return permute8s<i0 & ~8, i1 & ~8, i2 & ~8, i3 & ~8, i4 & ~8, i5 & ~8, i6 & ~8, i7 & ~8>(b);
+    }
+
+  // special case: PUNPCKLWD
+  if(((m1 ^ 0xB3A29180) & mz) == 0)
+    {
+      temp = _mm_unpacklo_epi16(a, b);
+      if(do_zero)
+        zeroing_pending = true;
+      else
+        return temp;
+    }
+  if(((m1 ^ 0x3B2A1908) & mz) == 0)
+    {
+      temp = _mm_unpacklo_epi16(b, a);
+      if(do_zero)
+        zeroing_pending = true;
+      else
+        return temp;
+    }
+  // special case: PUNPCKHWD
+  if(((m1 ^ 0xF7E6D5C4) & mz) == 0)
+    {
+      temp = _mm_unpackhi_epi16(a, b);
+      if(do_zero)
+        zeroing_pending = true;
+      else
+        return temp;
+    }
+  if(((m1 ^ 0x7F6E5D4C) & mz) == 0)
+    {
+      temp = _mm_unpackhi_epi16(b, a);
+      if(do_zero)
+        zeroing_pending = true;
+      else
+        return temp;
+    }
+
+#if INSTRSET >= 4  // SSSE3
+  // special case: shift left
+  if(i0 > 0 && i0 < 8 && ((m1 ^ ((i0 & 7) * 0x11111111u + 0x76543210u)) & mz) == 0)
+    {
+      temp = _mm_alignr_epi8(b, a, (i0 & 7) * 2);
+      if(do_zero)
+        zeroing_pending = true;
+      else
+        return temp;
+    }
+
+  // special case: shift right
+  if(i0 > 8 && i0 < 16 && ((m1 ^ 0x88888888 ^ ((i0 & 7) * 0x11111111u + 0x76543210u)) & mz) == 0)
+    {
+      temp = _mm_alignr_epi8(a, b, (i0 & 7) * 2);
+      if(do_zero)
+        zeroing_pending = true;
+      else
+        return temp;
+    }
+#endif  // SSSE3
+
+#if INSTRSET >= 5  // SSE4.1 supported
+  // special case: blending without permuting
+  if((((m1 & ~0x88888888) ^ 0x76543210) & mz) == 0)
+    {
+      temp = _mm_blend_epi16(a, b,
+                             (i0 >> 3 & 1) | (i1 >> 3 & 1) << 1 | (i2 >> 3 & 1) << 2 | (i3 >> 3 & 1) << 3 | (i4 >> 3 & 1) << 4 |
+                                 (i5 >> 3 & 1) << 5 | (i6 >> 3 & 1) << 6 | (i7 >> 3 & 1) << 7);
+      if(do_zero)
+        zeroing_pending = true;
+      else
+        return temp;
+    }
+#endif  // SSE4.1
+
+  if(zeroing_pending)
+    {
+      // additional zeroing of temp needed
+      __m128i maskz =
+          constant4i<(i0 < 0 ? 0 : 0xFFFF) | (i1 < 0 ? 0 : 0xFFFF0000), (i2 < 0 ? 0 : 0xFFFF) | (i3 < 0 ? 0 : 0xFFFF0000),
+                     (i4 < 0 ? 0 : 0xFFFF) | (i5 < 0 ? 0 : 0xFFFF0000), (i6 < 0 ? 0 : 0xFFFF) | (i7 < 0 ? 0 : 0xFFFF0000)>();
+      return _mm_and_si128(temp, maskz);
+    }
+
+    // general case
+#ifdef __XOP__  // Use AMD XOP instruction PPERM
+  __m128i mask = constant4i<(i0 < 0 ? 0x8080 : (i0 * 2 & 31) | ((i0 * 2 & 31) + 1) << 8) |
+                                (i1 < 0 ? 0x80800000 : ((i1 * 2 & 31) << 16) | ((i1 * 2 & 31) + 1) << 24),
+                            (i2 < 0 ? 0x8080 : (i2 * 2 & 31) | ((i2 * 2 & 31) + 1) << 8) |
+                                (i3 < 0 ? 0x80800000 : ((i3 * 2 & 31) << 16) | ((i3 * 2 & 31) + 1) << 24),
+                            (i4 < 0 ? 0x8080 : (i4 * 2 & 31) | ((i4 * 2 & 31) + 1) << 8) |
+                                (i5 < 0 ? 0x80800000 : ((i5 * 2 & 31) << 16) | ((i5 * 2 & 31) + 1) << 24),
+                            (i6 < 0 ? 0x8080 : (i6 * 2 & 31) | ((i6 * 2 & 31) + 1) << 8) |
+                                (i7 < 0 ? 0x80800000 : ((i7 * 2 & 31) << 16) | ((i7 * 2 & 31) + 1) << 24)>();
+  return _mm_perm_epi8(a, b, mask);
+#else
+  // combine two permutes
+  __m128i a1 = permute8s < (uint32_t)i0 < 8 ? i0 : -1, (uint32_t)i1 < 8 ? i1 : -1, (uint32_t)i2 < 8 ? i2 : -1,
+          (uint32_t)i3 < 8 ? i3 : -1, (uint32_t)i4 < 8 ? i4 : -1, (uint32_t)i5 < 8 ? i5 : -1, (uint32_t)i6 < 8 ? i6 : -1,
+          (uint32_t)i7 < 8 ? i7 : -1 > (a);
+  __m128i b1 = permute8s < (uint32_t)(i0 ^ 8) < 8 ? (i0 ^ 8) : -1, (uint32_t)(i1 ^ 8) < 8 ? (i1 ^ 8) : -1,
+          (uint32_t)(i2 ^ 8) < 8 ? (i2 ^ 8) : -1, (uint32_t)(i3 ^ 8) < 8 ? (i3 ^ 8) : -1, (uint32_t)(i4 ^ 8) < 8 ? (i4 ^ 8) : -1,
+          (uint32_t)(i5 ^ 8) < 8 ? (i5 ^ 8) : -1, (uint32_t)(i6 ^ 8) < 8 ? (i6 ^ 8) : -1, (uint32_t)(i7 ^ 8) < 8 ? (i7 ^ 8) : -1 > (b);
+  return _mm_or_si128(a1, b1);
+
+#endif
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8us blend8us(Vec8us const &a, Vec8us const &b)
+{
+  return Vec8us(blend8s<i0, i1, i2, i3, i4, i5, i6, i7>(a, b));
+}
+
+template <int i0, int i1, int i2, int i3>
+static inline Vec4i blend4i(Vec4i const &a, Vec4i const &b)
+{
+  // Combine all the indexes into a single bitfield, with 8 bits for each
+  const int m1 = (i0 & 7) | (i1 & 7) << 8 | (i2 & 7) << 16 | (i3 & 7) << 24;
+
+  // Mask to zero out negative indexes
+  const int mz = (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF) << 8 | (i2 < 0 ? 0 : 0xFF) << 16 | (i3 < 0 ? 0 : 0xFF) << 24;
+
+  // Some elements must be set to zero
+  const bool do_zero = (mz != -1) && ((i0 | i1 | i2 | i3) & 0x80) != 0;
+
+  // set when temp holds a partially finished result that still needs zeroing
+  bool zeroing_pending = false;
+
+  // partially finished result
+  __m128i temp;
+#if defined(_MSC_VER) || defined(__clang__)
+  temp = a;  // avoid spurious warning message for temp unused
+#endif
+
+  // special case: no elements from b
+  if((m1 & 0x04040404 & mz) == 0)
+    {
+      return permute4i<i0, i1, i2, i3>(a);
+    }
+
+  // special case: no elements from a
+  if(((m1 ^ 0x04040404) & 0x04040404 & mz) == 0)
+    {
+      return permute4i<i0 & ~4, i1 & ~4, i2 & ~4, i3 & ~4>(b);
+    }
+
+  // special case: PUNPCKLDQ
+  if(((m1 ^ 0x05010400) & mz) == 0)
+    {
+      temp = _mm_unpacklo_epi32(a, b);
+      if(do_zero)
+        zeroing_pending = true;
+      else
+        return temp;
+    }
+  if(((m1 ^ 0x01050004) & mz) == 0)
+    {
+      temp = _mm_unpacklo_epi32(b, a);
+      if(do_zero)
+        zeroing_pending = true;
+      else
+        return temp;
+    }
+
+  // special case: PUNPCKHDQ
+  if(((m1 ^ 0x07030602) & mz) == 0)
+    {
+      temp = _mm_unpackhi_epi32(a, b);
+      if(do_zero)
+        zeroing_pending = true;
+      else
+        return temp;
+    }
+  if(((m1 ^ 0x03070206) & mz) == 0)
+    {
+      temp = _mm_unpackhi_epi32(b, a);
+      if(do_zero)
+        zeroing_pending = true;
+      else
+        return temp;
+    }
+
+#if INSTRSET >= 4  // SSSE3
+  // special case: shift left
+  if(i0 > 0 && i0 < 4 && ((m1 ^ ((i0 & 3) * 0x01010101u + 0x03020100u)) & mz) == 0)
+    {
+      temp = _mm_alignr_epi8(b, a, (i0 & 3) * 4);
+      if(do_zero)
+        zeroing_pending = true;
+      else
+        return temp;
+    }
+
+  // special case: shift right
+  if(i0 > 4 && i0 < 8 && ((m1 ^ 0x04040404 ^ ((i0 & 3) * 0x01010101u + 0x03020100u)) & mz) == 0)
+    {
+      temp = _mm_alignr_epi8(a, b, (i0 & 3) * 4);
+      if(do_zero)
+        zeroing_pending = true;
+      else
+        return temp;
+    }
+#endif  // SSSE3
+
+#if INSTRSET >= 5  // SSE4.1 supported
+  if((((m1 & ~0x04040404) ^ 0x03020100) & mz) == 0)
+    {
+      // blending without permuting
+      temp = _mm_blend_epi16(
+          a, b, ((i0 >> 2) & 1) * 3 | ((((i1 >> 2) & 1) * 3) << 2) | ((((i2 >> 2) & 1) * 3) << 4) | ((((i3 >> 2) & 1) * 3) << 6));
+      if(do_zero)
+        zeroing_pending = true;
+      else
+        return temp;
+    }
+#endif  // SSE4.1
+
+  if(zeroing_pending)
+    {
+      // additional zeroing of temp needed
+      __m128i maskz = constant4i<(i0 < 0 ? 0 : -1), (i1 < 0 ? 0 : -1), (i2 < 0 ? 0 : -1), (i3 < 0 ? 0 : -1)>();
+      return _mm_and_si128(temp, maskz);
+    }
+
+    // general case
+#ifdef __XOP__  // Use AMD XOP instruction PPERM
+  __m128i mask = constant4i < i0 < 0
+                     ? 0x80808080
+                     : (i0 * 4 & 31) + (((i0 * 4 & 31) + 1) << 8) + (((i0 * 4 & 31) + 2) << 16) + (((i0 * 4 & 31) + 3) << 24),
+          i1 < 0 ? 0x80808080 : (i1 * 4 & 31) + (((i1 * 4 & 31) + 1) << 8) + (((i1 * 4 & 31) + 2) << 16) + (((i1 * 4 & 31) + 3) << 24),
+          i2 < 0 ? 0x80808080 : (i2 * 4 & 31) + (((i2 * 4 & 31) + 1) << 8) + (((i2 * 4 & 31) + 2) << 16) + (((i2 * 4 & 31) + 3) << 24),
+          i3 < 0 ? 0x80808080
+                 : (i3 * 4 & 31) + (((i3 * 4 & 31) + 1) << 8) + (((i3 * 4 & 31) + 2) << 16) + (((i3 * 4 & 31) + 3) << 24) > ();
+  return _mm_perm_epi8(a, b, mask);
+
+#else  // combine two permutes
+  __m128i a1 = permute4i < (uint32_t)i0 < 4 ? i0 : -1, (uint32_t)i1 < 4 ? i1 : -1, (uint32_t)i2 < 4 ? i2 : -1,
+          (uint32_t)i3 < 4 ? i3 : -1 > (a);
+  __m128i b1 = permute4i < (uint32_t)(i0 ^ 4) < 4 ? (i0 ^ 4) : -1, (uint32_t)(i1 ^ 4) < 4 ? (i1 ^ 4) : -1,
+          (uint32_t)(i2 ^ 4) < 4 ? (i2 ^ 4) : -1, (uint32_t)(i3 ^ 4) < 4 ? (i3 ^ 4) : -1 > (b);
+  return _mm_or_si128(a1, b1);
+#endif
+}
+
+template <int i0, int i1, int i2, int i3>
+static inline Vec4ui blend4ui(Vec4ui const &a, Vec4ui const &b)
+{
+  return Vec4ui(blend4i<i0, i1, i2, i3>(a, b));
+}
+
+template <int i0, int i1>
+static inline Vec2q blend2q(Vec2q const &a, Vec2q const &b)
+{
+  // Combine all the indexes into a single bitfield, with 8 bits for each
+  const int m1 = (i0 & 3) | (i1 & 3) << 8;
+
+  // Mask to zero out negative indexes
+  const int mz = (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF) << 8;
+
+  // no elements from b
+  if((m1 & 0x0202 & mz) == 0)
+    {
+      return permute2q<i0, i1>(a);
+    }
+  // no elements from a
+  if(((m1 ^ 0x0202) & 0x0202 & mz) == 0)
+    {
+      return permute2q<i0 & ~2, i1 & ~2>(b);
+    }
+  // (all cases where one index is -1 or -256 would go to the above cases)
+
+  // special case: PUNPCKLQDQ
+  if(i0 == 0 && i1 == 2)
+    {
+      return _mm_unpacklo_epi64(a, b);
+    }
+  if(i0 == 2 && i1 == 0)
+    {
+      return _mm_unpacklo_epi64(b, a);
+    }
+  // special case: PUNPCKHQDQ
+  if(i0 == 1 && i1 == 3)
+    {
+      return _mm_unpackhi_epi64(a, b);
+    }
+  if(i0 == 3 && i1 == 1)
+    {
+      return _mm_unpackhi_epi64(b, a);
+    }
+
+#if INSTRSET >= 4  // SSSE3
+  // special case: shift left
+  if(i0 == 1 && i1 == 2)
+    {
+      return _mm_alignr_epi8(b, a, 8);
+    }
+  // special case: shift right
+  if(i0 == 3 && i1 == 0)
+    {
+      return _mm_alignr_epi8(a, b, 8);
+    }
+#endif  // SSSE3
+
+#if INSTRSET >= 5  // SSE4.1 supported
+  if(((m1 & ~0x0202) ^ 0x0100) == 0 && mz == 0xFFFF)
+    {
+      // blending without permuting
+      return _mm_blend_epi16(a, b, (i0 >> 1 & 1) * 0xF | ((i1 >> 1 & 1) * 0xF) << 4);
+    }
+#endif  // SSE4.1
+
+  // general case. combine two permutes
+  // (all cases are caught by the above special cases if SSE4.1 or higher is supported)
+  __m128i a1, b1;
+  a1 = permute2q < (uint32_t)i0 < 2 ? i0 : -1, (uint32_t)i1 < 2 ? i1 : -1 > (a);
+  b1 = permute2q < (uint32_t)(i0 ^ 2) < 2 ? (i0 ^ 2) : -1, (uint32_t)(i1 ^ 2) < 2 ? (i1 ^ 2) : -1 > (b);
+  return _mm_or_si128(a1, b1);
+}
+
+template <int i0, int i1>
+static inline Vec2uq blend2uq(Vec2uq const &a, Vec2uq const &b)
+{
+  return Vec2uq(blend2q<i0, i1>((__m128i)a, (__m128i)b));
+}
+
+/*****************************************************************************
+ *
+ *          Vector lookup functions
+ *
+ ******************************************************************************
+ *
+ * These functions use vector elements as indexes into a table.
+ * The table is given as one or more vectors or as an array.
+ *
+ * This can be used for several purposes:
+ *  - table lookup
+ *  - permute or blend with variable indexes
+ *  - blend from more than two sources
+ *  - gather non-contiguous data
+ *
+ * An index out of range may produce any value - the actual value produced is
+ * implementation dependent and may be different for different instruction
+ * sets. An index out of range does not produce an error message or exception.
+ *
+ * Example:
+ * Vec4i a(2,0,0,3);           // index a is (  2,   0,   0,   3)
+ * Vec4i b(100,101,102,103);   // table b is (100, 101, 102, 103)
+ * Vec4i c;
+ * c = lookup4 (a,b);          // c is (102, 100, 100, 103)
+ *
+ *****************************************************************************/
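+
+// Additional usage sketch for the templated array lookup (illustrative only; the table contents are hypothetical):
+//   static const int32_t table[8] = {0,10,20,30,40,50,60,70};
+//   Vec4i idx(7,0,3,3);
+//   Vec4i val = lookup<8>(idx, table);   // val is (70, 0, 30, 30)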
+
+static inline Vec16c lookup16(Vec16c const &index, Vec16c const &table)
+{
+#if INSTRSET >= 5  // SSSE3
+  return _mm_shuffle_epi8(table, index);
+#else
+  uint8_t ii[16];
+  int8_t tt[16], rr[16];
+  table.store(tt);
+  index.store(ii);
+  for(int j = 0; j < 16; j++)
+    rr[j] = tt[ii[j] & 0x0F];
+  return Vec16c().load(rr);
+#endif
+}
+
+static inline Vec16c lookup32(Vec16c const &index, Vec16c const &table0, Vec16c const &table1)
+{
+#ifdef __XOP__  // AMD XOP instruction set. Use VPPERM
+  return _mm_perm_epi8(table0, table1, index);
+#elif INSTRSET >= 5  // SSSE3
+  Vec16c r0 = _mm_shuffle_epi8(table0, index + 0x70);           // make negative index for values >= 16
+  Vec16c r1 = _mm_shuffle_epi8(table1, (index ^ 0x10) + 0x70);  // make negative index for values <  16
+  return r0 | r1;
+#else
+  uint8_t ii[16];
+  int8_t tt[32], rr[16];  // tt must hold both tables (32 bytes)
+  table0.store(tt);
+  table1.store(tt + 16);
+  index.store(ii);
+  for(int j = 0; j < 16; j++)
+    rr[j] = tt[ii[j] & 0x1F];
+  return Vec16c().load(rr);
+#endif
+}
+
+template <int n>
+static inline Vec16c lookup(Vec16c const &index, void const *table)
+{
+  if(n <= 0)
+    return 0;
+  if(n <= 16)
+    return lookup16(index, Vec16c().load(table));
+  if(n <= 32)
+    return lookup32(index, Vec16c().load(table), Vec16c().load((int8_t *)table + 16));
+  // n > 32. Limit index
+  Vec16uc index1;
+  if((n & (n - 1)) == 0)
+    {
+      // n is a power of 2, make index modulo n
+      index1 = Vec16uc(index) & uint8_t(n - 1);
+    }
+  else
+    {
+      // n is not a power of 2, limit to n-1
+      index1 = min(Vec16uc(index), uint8_t(n - 1));
+    }
+  uint8_t ii[16];
+  index1.store(ii);
+  int8_t rr[16];
+  for(int j = 0; j < 16; j++)
+    {
+      rr[j] = ((int8_t *)table)[ii[j]];
+    }
+  return Vec16c().load(rr);
+}
+
+static inline Vec8s lookup8(Vec8s const &index, Vec8s const &table)
+{
+#if INSTRSET >= 5  // SSSE3
+  return _mm_shuffle_epi8(table, index * 0x202 + 0x100);
+#else
+  int16_t ii[8], tt[8], rr[8];
+  table.store(tt);
+  index.store(ii);
+  for(int j = 0; j < 8; j++)
+    rr[j] = tt[ii[j] & 0x07];
+  return Vec8s().load(rr);
+#endif
+}
+
+static inline Vec8s lookup16(Vec8s const &index, Vec8s const &table0, Vec8s const &table1)
+{
+#ifdef __XOP__  // AMD XOP instruction set. Use VPPERM
+  return _mm_perm_epi8(table0, table1, index * 0x202 + 0x100);
+#elif INSTRSET >= 5  // SSSE3
+  Vec8s r0 = _mm_shuffle_epi8(table0, Vec16c(index * 0x202) + Vec16c(Vec8s(0x7170)));
+  Vec8s r1 = _mm_shuffle_epi8(table1, Vec16c(index * 0x202 ^ 0x1010) + Vec16c(Vec8s(0x7170)));
+  return r0 | r1;
+#else
+  int16_t ii[8], tt[16], rr[8];
+  table0.store(tt);
+  table1.store(tt + 8);
+  index.store(ii);
+  for(int j = 0; j < 8; j++)
+    rr[j] = tt[ii[j] & 0x0F];
+  return Vec8s().load(rr);
+#endif
+}
+
+template <int n>
+static inline Vec8s lookup(Vec8s const &index, void const *table)
+{
+  if(n <= 0)
+    return 0;
+  if(n <= 8)
+    return lookup8(index, Vec8s().load(table));
+  if(n <= 16)
+    return lookup16(index, Vec8s().load(table), Vec8s().load((int16_t *)table + 8));
+  // n > 16. Limit index
+  Vec8us index1;
+  if((n & (n - 1)) == 0)
+    {
+      // n is a power of 2, make index modulo n
+      index1 = Vec8us(index) & (n - 1);
+    }
+  else
+    {
+      // n is not a power of 2, limit to n-1
+      index1 = min(Vec8us(index), n - 1);
+    }
+#if INSTRSET >= 8  // AVX2. Use gather instruction (VPGATHERDD)
+  Vec8s t1 = _mm_i32gather_epi32((const int *)table, __m128i((Vec4i(index1)) & (Vec4i(0x0000FFFF))), 2);  // even positions
+  Vec8s t2 = _mm_i32gather_epi32((const int *)table, _mm_srli_epi32(index1, 16), 2);                      // odd  positions
+  return blend8s<0, 8, 2, 10, 4, 12, 6, 14>(t1, t2);
+#else
+  uint16_t ii[8];
+  index1.store(ii);
+  return Vec8s(((int16_t *)table)[ii[0]], ((int16_t *)table)[ii[1]], ((int16_t *)table)[ii[2]], ((int16_t *)table)[ii[3]],
+               ((int16_t *)table)[ii[4]], ((int16_t *)table)[ii[5]], ((int16_t *)table)[ii[6]], ((int16_t *)table)[ii[7]]);
+#endif
+}
+
+static inline Vec4i lookup4(Vec4i const &index, Vec4i const &table)
+{
+#if INSTRSET >= 5  // SSSE3
+  return _mm_shuffle_epi8(table, index * 0x04040404 + 0x03020100);
+#else
+  return Vec4i(table[index[0]], table[index[1]], table[index[2]], table[index[3]]);
+#endif
+}
+
+static inline Vec4i lookup8(Vec4i const &index, Vec4i const &table0, Vec4i const &table1)
+{
+  // return Vec4i(lookup16(Vec8s(index * 0x20002 + 0x10000), Vec8s(table0), Vec8s(table1)));
+#ifdef __XOP__  // AMD XOP instruction set. Use VPPERM
+  return _mm_perm_epi8(table0, table1, index * 0x04040404 + 0x03020100);
+#elif INSTRSET >= 8  // AVX2. Use VPERMD
+  __m256i table01 = _mm256_inserti128_si256(_mm256_castsi128_si256(table0), table1, 1);  // join tables into 256 bit vector
+
+#if defined(_MSC_VER) && _MSC_VER < 1700 && !defined(__INTEL_COMPILER)
+  // bug in MS VS 11 beta: operands in wrong order
+  return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(_mm256_castsi128_si256(index), table01));
+#elif defined(GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__)
+  // Gcc 4.7.0 also has operands in wrong order
+  return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(_mm256_castsi128_si256(index), table01));
+#else
+  return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(table01, _mm256_castsi128_si256(index)));
+#endif  // bug
+
+#elif INSTRSET >= 4  // SSSE3
+  Vec4i r0 = _mm_shuffle_epi8(table0, Vec16c(index * 0x04040404) + Vec16c(Vec4i(0x73727170)));
+  Vec4i r1 = _mm_shuffle_epi8(table1, Vec16c(index * 0x04040404 ^ 0x10101010) + Vec16c(Vec4i(0x73727170)));
+  return r0 | r1;
+#else                // SSE2
+  int32_t ii[4], tt[8], rr[4];
+  table0.store(tt);
+  table1.store(tt + 4);
+  index.store(ii);
+  for(int j = 0; j < 4; j++)
+    rr[j] = tt[ii[j] & 0x07];
+  return Vec4i().load(rr);
+#endif
+}
+
+static inline Vec4i lookup16(Vec4i const &index, Vec4i const &table0, Vec4i const &table1, Vec4i const &table2, Vec4i const &table3)
+{
+#if INSTRSET >= 8                                                                        // AVX2. Use VPERMD
+  __m256i table01 = _mm256_inserti128_si256(_mm256_castsi128_si256(table0), table1, 1);  // join tables into 256 bit vector
+  __m256i table23 = _mm256_inserti128_si256(_mm256_castsi128_si256(table2), table3, 1);  // join tables into 256 bit vector
+#if defined(_MSC_VER) && _MSC_VER < 1700 && !defined(__INTEL_COMPILER)
+  // bug in MS VS 11 beta: operands in wrong order
+  __m128i r0 = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(_mm256_castsi128_si256(index), table01));
+  __m128i r1 = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(_mm256_castsi128_si256(index ^ 8), table23));
+#elif defined(GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__)
+  // Gcc 4.7.0 also has operands in wrong order
+  __m128i r0 = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(_mm256_castsi128_si256(index), table01));
+  __m128i r1 = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(_mm256_castsi128_si256(index ^ 8), table23));
+#else
+  __m128i r0 = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(table01, _mm256_castsi128_si256(index)));
+  __m128i r1 = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(table23, _mm256_castsi128_si256(index ^ 8)));
+#endif  // bug
+  return _mm_blendv_epi8(r0, r1, index > 7);  // select r1 where index is 8-15
+
+#elif defined(__XOP__)  // AMD XOP instruction set. Use VPPERM
+  Vec4i r0 = _mm_perm_epi8(table0, table1, ((index)*0x04040404u + 0x63626160u) & 0X9F9F9F9Fu);
+  Vec4i r1 = _mm_perm_epi8(table2, table3, ((index ^ 8) * 0x04040404u + 0x63626160u) & 0X9F9F9F9Fu);
+  return r0 | r1;
+
+#elif INSTRSET >= 5  // SSSE3
+  Vec16c aa = Vec16c(Vec4i(0x73727170));
+  Vec4i r0  = _mm_shuffle_epi8(table0, Vec16c((index)*0x04040404) + aa);
+  Vec4i r1  = _mm_shuffle_epi8(table1, Vec16c((index ^ 4) * 0x04040404) + aa);
+  Vec4i r2  = _mm_shuffle_epi8(table2, Vec16c((index ^ 8) * 0x04040404) + aa);
+  Vec4i r3  = _mm_shuffle_epi8(table3, Vec16c((index ^ 12) * 0x04040404) + aa);
+  return (r0 | r1) | (r2 | r3);
+
+#else  // SSE2
+  int32_t ii[4], tt[16], rr[4];
+  table0.store(tt);
+  table1.store(tt + 4);
+  table2.store(tt + 8);
+  table3.store(tt + 12);
+  index.store(ii);
+  for(int j = 0; j < 4; j++)
+    rr[j] = tt[ii[j] & 0x0F];
+  return Vec4i().load(rr);
+#endif
+}
+
+template <int n>
+static inline Vec4i lookup(Vec4i const &index, void const *table)
+{
+  if(n <= 0)
+    return 0;
+  if(n <= 4)
+    return lookup4(index, Vec4i().load(table));
+  if(n <= 8)
+    return lookup8(index, Vec4i().load(table), Vec4i().load((int32_t *)table + 4));
+  // n > 8. Limit index
+  Vec4ui index1;
+  if((n & (n - 1)) == 0)
+    {
+      // n is a power of 2, make index modulo n
+      index1 = Vec4ui(index) & (n - 1);
+    }
+  else
+    {
+      // n is not a power of 2, limit to n-1
+      index1 = min(Vec4ui(index), n - 1);
+    }
+#if INSTRSET >= 8  // AVX2. Use gather instruction (VPGATHERDD)
+  return _mm_i32gather_epi32((const int *)table, index1, 4);
+#else
+  uint32_t ii[4];
+  index1.store(ii);
+  return Vec4i(((int32_t *)table)[ii[0]], ((int32_t *)table)[ii[1]], ((int32_t *)table)[ii[2]], ((int32_t *)table)[ii[3]]);
+#endif
+}
+
+static inline Vec2q lookup2(Vec2q const &index, Vec2q const &table)
+{
+#if INSTRSET >= 5  // SSSE3
+  return _mm_shuffle_epi8(table, index * 0x0808080808080808ll + 0x0706050403020100ll);
+#else
+  int64_t ii[2], tt[2];
+  table.store(tt);
+  index.store(ii);
+  return Vec2q(tt[int(ii[0])], tt[int(ii[1])]);
+#endif
+}
+
+template <int n>
+static inline Vec2q lookup(Vec2q const &index, void const *table)
+{
+  if(n <= 0)
+    return 0;
+  // n > 0. Limit index
+  Vec2uq index1;
+  if((n & (n - 1)) == 0)
+    {
+      // n is a power of 2, make index modulo n
+      index1 = Vec2uq(index) & (n - 1);
+    }
+  else
+    {
+      // n is not a power of 2, limit to n-1.
+      // There is no 64-bit min instruction, but we can use the 32-bit unsigned min,
+      // since n is a 32-bit integer
+      index1 = Vec2uq(min(Vec2uq(index), constant4i<n - 1, 0, n - 1, 0>()));
+    }
+  uint32_t ii[4];
+  index1.store(ii);  // use only lower 32 bits of each index
+  int64_t const *tt = (int64_t const *)table;
+  return Vec2q(tt[ii[0]], tt[ii[2]]);
+}
+
+/*****************************************************************************
+ *
+ *          Other permutations with variable indexes
+ *
+ *****************************************************************************/
+
+// Function shift_bytes_up: shift whole vector left by b bytes.
+// You may use a permute function instead if b is a compile-time constant
+static inline Vec16c shift_bytes_up(Vec16c const &a, int b)
+{
+  if((uint32_t)b > 15)
+    return _mm_setzero_si128();
+#if INSTRSET >= 4  // SSSE3
+  static const char mask[32] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+                                0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15};
+  return Vec16c(_mm_shuffle_epi8(a, Vec16c().load(mask + 16 - b)));
+#else
+  Vec2uq a1 = Vec2uq(a);
+  if(b < 8)
+    {
+      a1 = (a1 << (b * 8)) | (permute2uq<-1, 0>(a1) >> (64 - (b * 8)));
+    }
+  else
+    {
+      a1 = permute2uq<-1, 0>(a1) << ((b - 8) * 8);
+    }
+  return Vec16c(a1);
+#endif
+}
+
+// Function shift_bytes_down: shift whole vector right by b bytes
+// You may use a permute function instead if b is a compile-time constant
+static inline Vec16c shift_bytes_down(Vec16c const &a, int b)
+{
+  if((uint32_t)b > 15)
+    return _mm_setzero_si128();
+#if INSTRSET >= 4  // SSSE3
+  static const char mask[32] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+                                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+  return Vec16c(_mm_shuffle_epi8(a, Vec16c().load(mask + b)));
+#else
+  Vec2uq a1 = Vec2uq(a);
+  if(b < 8)
+    {
+      a1 = (a1 >> (b * 8)) | (permute2uq<1, -1>(a1) << (64 - (b * 8)));
+    }
+  else
+    {
+      a1 = permute2uq<1, -1>(a1) >> ((b - 8) * 8);
+    }
+  return Vec16c(a1);
+#endif
+}
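+
+// Usage sketch (illustrative only, not part of the original vectorclass code):
+// shifting a byte vector by a run-time count; bytes shifted in are zero.
+//
+//     Vec16c v(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+//     Vec16c up   = shift_bytes_up(v, 2);    // {0, 0, 1, 2, ..., 14}
+//     Vec16c down = shift_bytes_down(v, 2);  // {3, 4, ..., 16, 0, 0}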
+
+/*****************************************************************************
+ *
+ *          Gather functions with fixed indexes
+ *
+ *****************************************************************************/
+// Load elements from array a with indices i0, i1, i2, i3
+template <int i0, int i1, int i2, int i3>
+static inline Vec4i gather4i(void const *a)
+{
+  Static_error_check<(i0 | i1 | i2 | i3) >= 0> Negative_array_index;  // Error message if index is negative
+  const int i01min = i0 < i1 ? i0 : i1;
+  const int i23min = i2 < i3 ? i2 : i3;
+  const int imin   = i01min < i23min ? i01min : i23min;
+  const int i01max = i0 > i1 ? i0 : i1;
+  const int i23max = i2 > i3 ? i2 : i3;
+  const int imax   = i01max > i23max ? i01max : i23max;
+  if(imax - imin <= 3)
+    {
+      // load one contiguous block and permute
+      if(imax > 3)
+        {
+          // make sure we don't read past the end of the array
+          Vec4i b = Vec4i().load((int32_t const *)a + imax - 3);
+          return permute4i<i0 - imax + 3, i1 - imax + 3, i2 - imax + 3, i3 - imax + 3>(b);
+        }
+      else
+        {
+          Vec4i b = Vec4i().load((int32_t const *)a + imin);
+          return permute4i<i0 - imin, i1 - imin, i2 - imin, i3 - imin>(b);
+        }
+    }
+  if((i0 < imin + 4 || i0 > imax - 4) && (i1 < imin + 4 || i1 > imax - 4) && (i2 < imin + 4 || i2 > imax - 4) &&
+     (i3 < imin + 4 || i3 > imax - 4))
+    {
+      // load two contiguous blocks and blend
+      Vec4i b      = Vec4i().load((int32_t const *)a + imin);
+      Vec4i c      = Vec4i().load((int32_t const *)a + imax - 3);
+      const int j0 = i0 < imin + 4 ? i0 - imin : 7 - imax + i0;
+      const int j1 = i1 < imin + 4 ? i1 - imin : 7 - imax + i1;
+      const int j2 = i2 < imin + 4 ? i2 - imin : 7 - imax + i2;
+      const int j3 = i3 < imin + 4 ? i3 - imin : 7 - imax + i3;
+      return blend4i<j0, j1, j2, j3>(b, c);
+    }
+    // use AVX2 gather if available
+#if INSTRSET >= 8
+  return _mm_i32gather_epi32((const int *)a, Vec4i(i0, i1, i2, i3), 4);
+#else
+  return lookup<imax + 1>(Vec4i(i0, i1, i2, i3), a);
+#endif
+}
+
+// Load elements from array a with indices i0, i1
+template <int i0, int i1>
+static inline Vec2q gather2q(void const *a)
+{
+  Static_error_check<(i0 | i1) >= 0> Negative_array_index;  // Error message if index is negative
+  const int imin = i0 < i1 ? i0 : i1;
+  const int imax = i0 > i1 ? i0 : i1;
+  if(imax - imin <= 1)
+    {
+      // load one contiguous block and permute
+      if(imax > 1)
+        {
+          // make sure we don't read past the end of the array
+          Vec2q b = Vec2q().load((int64_t const *)a + imax - 1);
+          return permute2q<i0 - imax + 1, i1 - imax + 1>(b);
+        }
+      else
+        {
+          Vec2q b = Vec2q().load((int64_t const *)a + imin);
+          return permute2q<i0 - imin, i1 - imin>(b);
+        }
+    }
+  return Vec2q(((int64_t *)a)[i0], ((int64_t *)a)[i1]);
+}
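+
+// Usage sketch (illustrative only, not part of the original vectorclass code):
+// gathering with compile-time indexes; each template parameter selects the
+// array element that ends up in the corresponding vector lane.
+//
+//     int32_t arr[8] = {0, 10, 20, 30, 40, 50, 60, 70};
+//     Vec4i g = gather4i<6, 2, 2, 0>(arr);   // g = {60, 20, 20, 0}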
+
+/*****************************************************************************
+ *
+ *          Vector scatter functions
+ *
+ ******************************************************************************
+ *
+ * These functions write the elements of a vector to arbitrary positions in an
+ * array in memory. Each vector element is written to an array position
+ * determined by an index. An element is not written if the corresponding
+ * index is out of range.
+ * The indexes can be specified as constant template parameters or as an
+ * integer vector.
+ *
+ * The scatter functions are useful if the data are distributed in a sparse
+ * manner into the array. If the array is dense then it is more efficient
+ * to permute the data into the right positions and then write the whole
+ * permuted vector into the array.
+ *
+ * Example:
+ * Vec8q a(10,11,12,13,14,15,16,17);
+ * int64_t b[16] = {0};
+ * scatter<0,2,14,10,1,-1,5,9>(a,b);
+ * // Now, b = {10,14,11,0,0,16,0,0,0,17,13,0,0,0,12,0}
+ *
+ *****************************************************************************/
+
+template <int i0, int i1, int i2, int i3>
+static inline void scatter(Vec4i const &data, void *array)
+{
+#if defined(__AVX512VL__)
+  __m128i indx   = constant4i<i0, i1, i2, i3>();
+  __mmask16 mask = uint16_t(i0 >= 0 | (i1 >= 0) << 1 | (i2 >= 0) << 2 | (i3 >= 0) << 3);
+  _mm_mask_i32scatter_epi32((int *)array, mask, indx, data, 4);
+#else
+  int32_t *arr       = (int32_t *)array;
+  const int index[4] = {i0, i1, i2, i3};
+  for(int i = 0; i < 4; i++)
+    {
+      if(index[i] >= 0)
+        arr[index[i]] = data[i];
+    }
+#endif
+}
+
+template <int i0, int i1>
+static inline void scatter(Vec2q const &data, void *array)
+{
+  int64_t *arr = (int64_t *)array;
+  if(i0 >= 0)
+    arr[i0] = data[0];
+  if(i1 >= 0)
+    arr[i1] = data[1];
+}
+
+static inline void scatter(Vec4i const &index, uint32_t limit, Vec4i const &data, void *array)
+{
+#if defined(__AVX512VL__)
+  __mmask16 mask = _mm_cmplt_epu32_mask(index, Vec4ui(limit));
+  _mm_mask_i32scatter_epi32((int *)array, mask, index, data, 4);
+#else
+  int32_t *arr = (int32_t *)array;
+  for(int i = 0; i < 4; i++)
+    {
+      if(uint32_t(index[i]) < limit)
+        arr[index[i]] = data[i];
+    }
+#endif
+}
+
+static inline void scatter(Vec2q const &index, uint32_t limit, Vec2q const &data, void *array)
+{
+  int64_t *arr = (int64_t *)array;
+  if(uint64_t(index[0]) < uint64_t(limit))
+    arr[index[0]] = data[0];
+  if(uint64_t(index[1]) < uint64_t(limit))
+    arr[index[1]] = data[1];
+}
+
+static inline void scatter(Vec4i const &index, uint32_t limit, Vec2q const &data, void *array)
+{
+  int64_t *arr = (int64_t *)array;
+  if(uint32_t(index[0]) < limit)
+    arr[index[0]] = data[0];
+  if(uint32_t(index[1]) < limit)
+    arr[index[1]] = data[1];
+}
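+
+// Usage sketch (illustrative only, not part of the original vectorclass code):
+// scatter with a run-time index vector; elements whose index is >= limit are
+// silently skipped, so the limit doubles as a bounds check.
+//
+//     int32_t out[4] = {0, 0, 0, 0};
+//     Vec4i data(1, 2, 3, 4);
+//     Vec4i idx(3, 0, 9, 1);                 // index 9 is out of range
+//     scatter(idx, 4, data, out);            // out = {2, 4, 0, 1}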
+
+/*****************************************************************************
+ *
+ *          Functions for conversion between integer sizes
+ *
+ *****************************************************************************/
+
+// Extend 8-bit integers to 16-bit integers, signed and unsigned
+
+// Function extend_low : extends the low 8 elements to 16 bits with sign extension
+static inline Vec8s extend_low(Vec16c const &a)
+{
+  __m128i sign = _mm_cmpgt_epi8(_mm_setzero_si128(), a);  // 0 > a
+  return _mm_unpacklo_epi8(a, sign);                      // interleave with sign extensions
+}
+
+// Function extend_high : extends the high 8 elements to 16 bits with sign extension
+static inline Vec8s extend_high(Vec16c const &a)
+{
+  __m128i sign = _mm_cmpgt_epi8(_mm_setzero_si128(), a);  // 0 > a
+  return _mm_unpackhi_epi8(a, sign);                      // interleave with sign extensions
+}
+
+// Function extend_low : extends the low 8 elements to 16 bits with zero extension
+static inline Vec8us extend_low(Vec16uc const &a)
+{
+  return _mm_unpacklo_epi8(a, _mm_setzero_si128());  // interleave with zero extensions
+}
+
+// Function extend_high : extends the high 8 elements to 16 bits with zero extension
+static inline Vec8us extend_high(Vec16uc const &a)
+{
+  return _mm_unpackhi_epi8(a, _mm_setzero_si128());  // interleave with zero extensions
+}
+
+// Extend 16-bit integers to 32-bit integers, signed and unsigned
+
+// Function extend_low : extends the low 4 elements to 32 bits with sign extension
+static inline Vec4i extend_low(Vec8s const &a)
+{
+  __m128i sign = _mm_srai_epi16(a, 15);  // sign bit
+  return _mm_unpacklo_epi16(a, sign);    // interleave with sign extensions
+}
+
+// Function extend_high : extends the high 4 elements to 32 bits with sign extension
+static inline Vec4i extend_high(Vec8s const &a)
+{
+  __m128i sign = _mm_srai_epi16(a, 15);  // sign bit
+  return _mm_unpackhi_epi16(a, sign);    // interleave with sign extensions
+}
+
+// Function extend_low : extends the low 4 elements to 32 bits with zero extension
+static inline Vec4ui extend_low(Vec8us const &a)
+{
+  return _mm_unpacklo_epi16(a, _mm_setzero_si128());  // interleave with zero extensions
+}
+
+// Function extend_high : extends the high 4 elements to 32 bits with zero extension
+static inline Vec4ui extend_high(Vec8us const &a)
+{
+  return _mm_unpackhi_epi16(a, _mm_setzero_si128());  // interleave with zero extensions
+}
+
+// Extend 32-bit integers to 64-bit integers, signed and unsigned
+
+// Function extend_low : extends the low 2 elements to 64 bits with sign extension
+static inline Vec2q extend_low(Vec4i const &a)
+{
+  __m128i sign = _mm_srai_epi32(a, 31);  // sign bit
+  return _mm_unpacklo_epi32(a, sign);    // interleave with sign extensions
+}
+
+// Function extend_high : extends the high 2 elements to 64 bits with sign extension
+static inline Vec2q extend_high(Vec4i const &a)
+{
+  __m128i sign = _mm_srai_epi32(a, 31);  // sign bit
+  return _mm_unpackhi_epi32(a, sign);    // interleave with sign extensions
+}
+
+// Function extend_low : extends the low 2 elements to 64 bits with zero extension
+static inline Vec2uq extend_low(Vec4ui const &a)
+{
+  return _mm_unpacklo_epi32(a, _mm_setzero_si128());  // interleave with zero extensions
+}
+
+// Function extend_high : extends the high 2 elements to 64 bits with zero extension
+static inline Vec2uq extend_high(Vec4ui const &a)
+{
+  return _mm_unpackhi_epi32(a, _mm_setzero_si128());  // interleave with zero extensions
+}
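+
+// Usage sketch (illustrative only, not part of the original vectorclass code):
+// widening a vector before arithmetic that would overflow in 8 bits.
+// (src is a hypothetical buffer of 16 int8_t values.)
+//
+//     Vec16c v;
+//     v.load(src);                           // load 16 signed bytes
+//     Vec8s lo = extend_low(v);              // elements 0..7  widened to 16 bits
+//     Vec8s hi = extend_high(v);             // elements 8..15 widened to 16 bits
+//     Vec8s sums = lo + hi;                  // add without 8-bit overflow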
+
+// Compress 16-bit integers to 8-bit integers, signed and unsigned, with and without saturation
+
+// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers
+// Overflow wraps around
+static inline Vec16c compress(Vec8s const &low, Vec8s const &high)
+{
+  __m128i mask  = _mm_set1_epi32(0x00FF00FF);  // mask for low bytes
+  __m128i lowm  = _mm_and_si128(low, mask);    // bytes of low
+  __m128i highm = _mm_and_si128(high, mask);   // bytes of high
+  return _mm_packus_epi16(lowm, highm);        // unsigned pack
+}
+
+// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers
+// Signed, with saturation
+static inline Vec16c compress_saturated(Vec8s const &low, Vec8s const &high) { return _mm_packs_epi16(low, high); }
+
+// Function compress : packs two vectors of 16-bit integers to one vector of 8-bit integers
+// Unsigned, overflow wraps around
+static inline Vec16uc compress(Vec8us const &low, Vec8us const &high) { return Vec16uc(compress((Vec8s)low, (Vec8s)high)); }
+
+// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers
+// Unsigned, with saturation
+static inline Vec16uc compress_saturated(Vec8us const &low, Vec8us const &high)
+{
+#if INSTRSET >= 5                                 // SSE4.1 supported
+  __m128i maxval = _mm_set1_epi32(0x00FF00FF);    // maximum value
+  __m128i minval = _mm_setzero_si128();           // minimum value = 0
+  __m128i low1   = _mm_min_epu16(low, maxval);    // upper limit
+  __m128i high1  = _mm_min_epu16(high, maxval);   // upper limit
+  __m128i low2   = _mm_max_epu16(low1, minval);   // lower limit
+  __m128i high2  = _mm_max_epu16(high1, minval);  // lower limit
+  return _mm_packus_epi16(low2, high2);           // this instruction saturates from signed 32 bit to unsigned 16 bit
+#else
+  __m128i zero    = _mm_setzero_si128();             // 0
+  __m128i signlow = _mm_cmpgt_epi16(zero, low);      // sign bit of low
+  __m128i signhi  = _mm_cmpgt_epi16(zero, high);     // sign bit of high
+  __m128i slow2   = _mm_srli_epi16(signlow, 8);      // FF if low negative
+  __m128i shigh2  = _mm_srli_epi16(signhi, 8);       // FF if high negative
+  __m128i maskns  = _mm_set1_epi32(0x7FFF7FFF);      // mask for removing sign bit
+  __m128i lowns   = _mm_and_si128(low, maskns);      // low,  with sign bit removed
+  __m128i highns  = _mm_and_si128(high, maskns);     // high, with sign bit removed
+  __m128i lowo    = _mm_or_si128(lowns, slow2);      // low,  sign bit replaced by 00FF
+  __m128i higho   = _mm_or_si128(highns, shigh2);    // high, sign bit replaced by 00FF
+  return _mm_packus_epi16(lowo, higho);              // this instruction saturates from signed 16 bit to unsigned 8 bit
+#endif
+}
+
+// Compress 32-bit integers to 16-bit integers, signed and unsigned, with and without saturation
+
+// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers
+// Overflow wraps around
+static inline Vec8s compress(Vec4i const &low, Vec4i const &high)
+{
+#if INSTRSET >= 5                              // SSE4.1 supported
+  __m128i mask  = _mm_set1_epi32(0x0000FFFF);  // mask for low words
+  __m128i lowm  = _mm_and_si128(low, mask);    // bytes of low
+  __m128i highm = _mm_and_si128(high, mask);   // bytes of high
+  return _mm_packus_epi32(lowm, highm);        // unsigned pack
+#else
+  __m128i low1  = _mm_shufflelo_epi16(low, 0xD8);    // low words in place
+  __m128i high1 = _mm_shufflelo_epi16(high, 0xD8);   // low words in place
+  __m128i low2  = _mm_shufflehi_epi16(low1, 0xD8);   // low words in place
+  __m128i high2 = _mm_shufflehi_epi16(high1, 0xD8);  // low words in place
+  __m128i low3  = _mm_shuffle_epi32(low2, 0xD8);     // low dwords of low  to pos. 0 and 32
+  __m128i high3 = _mm_shuffle_epi32(high2, 0xD8);    // low dwords of high to pos. 0 and 32
+  return _mm_unpacklo_epi64(low3, high3);            // interleave
+#endif
+}
+
+// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers
+// Signed with saturation
+static inline Vec8s compress_saturated(Vec4i const &low, Vec4i const &high)
+{
+  return _mm_packs_epi32(low, high);  // pack with signed saturation
+}
+
+// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers
+// Unsigned, overflow wraps around
+static inline Vec8us compress(Vec4ui const &low, Vec4ui const &high) { return Vec8us(compress((Vec4i)low, (Vec4i)high)); }
+
+// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers
+// Unsigned, with saturation
+static inline Vec8us compress_saturated(Vec4ui const &low, Vec4ui const &high)
+{
+#if INSTRSET >= 5                                 // SSE4.1 supported
+  __m128i maxval = _mm_set1_epi32(0x0000FFFF);    // maximum value
+  __m128i minval = _mm_setzero_si128();           // minimum value = 0
+  __m128i low1   = _mm_min_epu32(low, maxval);    // upper limit
+  __m128i high1  = _mm_min_epu32(high, maxval);   // upper limit
+  __m128i low2   = _mm_max_epu32(low1, minval);   // lower limit
+  __m128i high2  = _mm_max_epu32(high1, minval);  // lower limit
+  return _mm_packus_epi32(low2, high2);           // this instruction saturates from signed 32 bit to unsigned 16 bit
+#else
+  __m128i zero     = _mm_setzero_si128();            // 0
+  __m128i lowzero  = _mm_cmpeq_epi16(low, zero);     // FFFF for each 16-bit word of low that is zero
+  __m128i highzero = _mm_cmpeq_epi16(high, zero);    // FFFF for each 16-bit word of high that is zero
+  __m128i mone     = _mm_set1_epi32(-1);             // FFFFFFFF
+  __m128i lownz    = _mm_xor_si128(lowzero, mone);   // FFFF for each 16-bit word of low that is nonzero
+  __m128i highnz   = _mm_xor_si128(highzero, mone);  // FFFF for each 16-bit word of high that is nonzero
+  __m128i lownz2   = _mm_srli_epi32(lownz, 16);      // shift down to low dword
+  __m128i highnz2  = _mm_srli_epi32(highnz, 16);     // shift down to low dword
+  __m128i lowsatur = _mm_or_si128(low, lownz2);      // low, saturated
+  __m128i hisatur  = _mm_or_si128(high, highnz2);    // high, saturated
+  return Vec8us(compress(Vec4i(lowsatur), Vec4i(hisatur)));
+#endif
+}
+
+// Compress 64-bit integers to 32-bit integers, signed and unsigned, with and without saturation
+
+// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers
+// Overflow wraps around
+static inline Vec4i compress(Vec2q const &low, Vec2q const &high)
+{
+  __m128i low2  = _mm_shuffle_epi32(low, 0xD8);   // low dwords of low  to pos. 0 and 32
+  __m128i high2 = _mm_shuffle_epi32(high, 0xD8);  // low dwords of high to pos. 0 and 32
+  return _mm_unpacklo_epi64(low2, high2);         // interleave
+}
+
+// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers
+// Signed, with saturation
+// This function is very inefficient unless the SSE4.2 instruction set is supported
+static inline Vec4i compress_saturated(Vec2q const &low, Vec2q const &high)
+{
+  Vec2q maxval = _mm_set_epi32(0, 0x7FFFFFFF, 0, 0x7FFFFFFF);
+  Vec2q minval = _mm_set_epi32(-1, 0x80000000, -1, 0x80000000);
+  Vec2q low1   = min(low, maxval);
+  Vec2q high1  = min(high, maxval);
+  Vec2q low2   = max(low1, minval);
+  Vec2q high2  = max(high1, minval);
+  return compress(low2, high2);
+}
+
+// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers
+// Unsigned, overflow wraps around
+static inline Vec4ui compress(Vec2uq const &low, Vec2uq const &high) { return Vec4ui(compress((Vec2q)low, (Vec2q)high)); }
+
+// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers
+// Unsigned, with saturation
+static inline Vec4ui compress_saturated(Vec2uq const &low, Vec2uq const &high)
+{
+  __m128i zero     = _mm_setzero_si128();            // 0
+  __m128i lowzero  = _mm_cmpeq_epi32(low, zero);     // FFFFFFFF for each dword of low that is zero
+  __m128i highzero = _mm_cmpeq_epi32(high, zero);    // FFFFFFFF for each dword of high that is zero
+  __m128i mone     = _mm_set1_epi32(-1);             // FFFFFFFF
+  __m128i lownz    = _mm_xor_si128(lowzero, mone);   // FFFFFFFF for each dword of low that is nonzero
+  __m128i highnz   = _mm_xor_si128(highzero, mone);  // FFFFFFFF for each dword of high that is nonzero
+  __m128i lownz2   = _mm_srli_epi64(lownz, 32);      // shift down to low dword
+  __m128i highnz2  = _mm_srli_epi64(highnz, 32);     // shift down to low dword
+  __m128i lowsatur = _mm_or_si128(low, lownz2);      // low, saturated
+  __m128i hisatur  = _mm_or_si128(high, highnz2);    // high, saturated
+  return Vec4ui(compress(Vec2q(lowsatur), Vec2q(hisatur)));
+}
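+
+// Usage sketch (illustrative only, not part of the original vectorclass code):
+// narrowing two 32-bit vectors back into one 16-bit vector; compress wraps on
+// overflow while compress_saturated clamps to the representable range.
+//
+//     Vec4i lo32(1, 2, 70000, -70000);
+//     Vec4i hi32(5, 6, 7, 8);
+//     Vec8s wrapped = compress(lo32, hi32);            // 70000 wraps to 4464
+//     Vec8s clamped = compress_saturated(lo32, hi32);  // 70000 clamps to 32767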
+
+/*****************************************************************************
+ *
+ *          Helper functions for division and bit scan
+ *
+ *****************************************************************************/
+
+// Define popcount function. Gives sum of bits
+#if INSTRSET >= 6  // SSE4.2
+                   // popcnt instruction is not officially part of the SSE4.2 instruction set,
+                   // but available in all known processors with SSE4.2
+#if defined(__GNUC__) || defined(__clang__)
+static inline uint32_t vml_popcnt(uint32_t a) __attribute__((pure));
+static inline uint32_t vml_popcnt(uint32_t a)
+{
+  uint32_t r;
+  __asm("popcnt %1, %0" : "=r"(r) : "r"(a) :);
+  return r;
+}
+#else
+static inline uint32_t vml_popcnt(uint32_t a)
+{
+  return _mm_popcnt_u32(a);  // MS intrinsic
+}
+#endif  // platform
+#else   // no SSE4.2
+static inline uint32_t vml_popcnt(uint32_t a)
+{
+  // popcnt instruction not available
+  uint32_t b = a - ((a >> 1) & 0x55555555);
+  uint32_t c = (b & 0x33333333) + ((b >> 2) & 0x33333333);
+  uint32_t d = (c + (c >> 4)) & 0x0F0F0F0F;
+  uint32_t e = d * 0x01010101;
+  return e >> 24;
+}
+#endif
+
+// Define bit-scan-forward function. Gives index to lowest set bit
+#if defined(__GNUC__) || defined(__clang__)
+static inline uint32_t bit_scan_forward(uint32_t a) __attribute__((pure));
+static inline uint32_t bit_scan_forward(uint32_t a)
+{
+  uint32_t r;
+  __asm("bsfl %1, %0" : "=r"(r) : "r"(a) :);
+  return r;
+}
+#else
+static inline uint32_t bit_scan_forward(uint32_t a)
+{
+  unsigned long r;
+  _BitScanForward(&r, a);  // defined in intrin.h for MS and Intel compilers
+  return r;
+}
+#endif
+
+// Define bit-scan-reverse function. Gives index to highest set bit = floor(log2(a))
+#if defined(__GNUC__) || defined(__clang__)
+static inline uint32_t bit_scan_reverse(uint32_t a) __attribute__((pure));
+static inline uint32_t bit_scan_reverse(uint32_t a)
+{
+  uint32_t r;
+  __asm("bsrl %1, %0" : "=r"(r) : "r"(a) :);
+  return r;
+}
+#else
+static inline uint32_t bit_scan_reverse(uint32_t a)
+{
+  unsigned long r;
+  _BitScanReverse(&r, a);  // defined in intrin.h for MS and Intel compilers
+  return r;
+}
+#endif
+
+// Same function, for compile-time constants.
+// We need template metaprogramming for calculating this function at compile time.
+// This may take a long time to compile because of the template recursion.
+// Todo: replace this with a constexpr function when C++14 becomes available
+template <uint32_t n>
+struct BitScanR
+{
+  enum
+  {
+    val = (n >= 0x10 ? 4 + (BitScanR<(n >> 4)>::val) : n < 2 ? 0 : n < 4 ? 1 : n < 8 ? 2 : 3)
+  };
+};
+template <>
+struct BitScanR<0>
+{
+  enum
+  {
+    val = 0
+  };
+};  // Avoid infinite template recursion
+
+#define bit_scan_reverse_const(n) (BitScanR<n>::val)  // n must be a valid compile-time constant
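+
+// Usage sketch (illustrative only, not part of the original vectorclass code):
+//
+//     uint32_t bits = 0x50;                          // binary 0101 0000
+//     uint32_t lo   = bit_scan_forward(bits);        // 4  (lowest set bit)
+//     uint32_t hi   = bit_scan_reverse(bits);        // 6  (highest set bit)
+//     uint32_t n    = vml_popcnt(bits);              // 2  (number of set bits)
+//     const int k   = bit_scan_reverse_const(1024);  // 10, evaluated at compile time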
+
+/*****************************************************************************
+ *
+ *          Integer division operators
+ *
+ ******************************************************************************
+ *
+ * The instruction set does not support integer vector division. Instead, we
+ * are using a method for fast integer division based on multiplication and
+ * shift operations. This method is faster than simple integer division if the
+ * same divisor is used multiple times.
+ *
+ * All elements in a vector are divided by the same divisor. It is not possible
+ * to divide different elements of the same vector by different divisors.
+ *
+ * The parameters used for fast division are stored in an object of a
+ * Divisor class. This object can be created implicitly, for example in:
+ *        Vec4i a, b; int c;
+ *        a = b / c;
+ * or explicitly as:
+ *        a = b / Divisor_i(c);
+ *
+ * It takes more time to compute the parameters used for fast division than to
+ * do the division. Therefore, it is advantageous to use the same divisor object
+ * multiple times. For example, to divide 80 unsigned short integers by 10:
+ *
+ *        uint16_t dividends[80], quotients[80];         // numbers to work with
+ *        Divisor_us div10(10);                          // make divisor object for dividing by 10
+ *        Vec8us temp;                                   // temporary vector
+ *        for (int i = 0; i < 80; i += 8) {              // loop for 8 elements per iteration
+ *            temp.load(dividends+i);                    // load 8 elements
+ *            temp /= div10;                             // divide each element by 10
+ *            temp.store(quotients+i);                   // store 8 elements
+ *        }
+ *
+ * The parameters for fast division can also be computed at compile time. This is
+ * an advantage if the divisor is known at compile time. Use the const_int or const_uint
+ * macro to do this. For example, for signed integers:
+ *        Vec8s a, b;
+ *        a = b / const_int(10);
+ * Or, for unsigned integers:
+ *        Vec8us a, b;
+ *        a = b / const_uint(10);
+ *
+ * The division of a vector of 16-bit integers is faster than division of a vector
+ * of other integer sizes.
+ *
+ *
+ * Mathematical formula, used for signed division with fixed or variable divisor:
+ * (From T. Granlund and P. L. Montgomery: Division by Invariant Integers Using Multiplication,
+ * Proceedings of the SIGPLAN 1994 Conference on Programming Language Design and Implementation.
+ * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556 )
+ * x = dividend
+ * d = abs(divisor)
+ * w = integer word size, bits
+ * L = ceil(log2(d)) = bit_scan_reverse(d-1)+1
+ * L = max(L,1)
+ * m = 1 + 2^(w+L-1)/d - 2^w                      [division should overflow to 0 if d = 1]
+ * sh1 = L-1
+ * q = x + (m*x >> w)                             [high part of signed multiplication with 2w bits]
+ * q = (q >> sh1) - (x<0 ? -1 : 0)
+ * if (divisor < 0) q = -q
+ * result trunc(x/d) = q
+ *
+ * Mathematical formula, used for unsigned division with variable divisor:
+ * (Also from T. Granlund and P. L. Montgomery)
+ * x = dividend
+ * d = divisor
+ * w = integer word size, bits
+ * L = ceil(log2(d)) = bit_scan_reverse(d-1)+1
+ * m = 1 + 2^w * (2^L-d) / d                      [2^L should overflow to 0 if L = w]
+ * sh1 = min(L,1)
+ * sh2 = max(L-1,0)
+ * t = m*x >> w                                   [high part of unsigned multiplication with 2w bits]
+ * result floor(x/d) = (((x-t) >> sh1) + t) >> sh2
+ *
+ * Mathematical formula, used for unsigned division with fixed divisor:
+ * (From Terje Mathisen, unpublished)
+ * x = dividend
+ * d = divisor
+ * w = integer word size, bits
+ * b = floor(log2(d)) = bit_scan_reverse(d)
+ * f = 2^(w+b) / d                                [exact division]
+ * If f is an integer, then d is a power of 2: go to case A
+ * If the fractional part of f is < 0.5 then go to case B
+ * If the fractional part of f is > 0.5 then go to case C
+ * Case A:  [shift only]
+ * result = x >> b
+ * Case B:  [round down f and compensate by adding one to x]
+ * result = ((x+1)*floor(f)) >> (w+b)             [high part of unsigned multiplication with 2w bits]
+ * Case C:  [round up f, no compensation for rounding error]
+ * result = (x*ceil(f)) >> (w+b)                  [high part of unsigned multiplication with 2w bits]
+ *
+ *
+ *****************************************************************************/
+
+// encapsulate parameters for fast division on vector of 4 32-bit signed integers
+class Divisor_i
+{
+ protected:
+  __m128i multiplier;  // multiplier used in fast division
+  __m128i shift1;      // shift count used in fast division
+  __m128i sign;        // sign of divisor
+ public:
+  Divisor_i(){};  // Default constructor
+  Divisor_i(int32_t d)
+  {  // Constructor with divisor
+    set(d);
+  }
+  Divisor_i(int m, int s1, int sgn)
+  {  // Constructor with precalculated multiplier, shift and sign
+    multiplier = _mm_set1_epi32(m);
+    shift1     = _mm_cvtsi32_si128(s1);
+    sign       = _mm_set1_epi32(sgn);
+  }
+  void set(int32_t d)
+  {  // Set or change divisor, calculate parameters
+    const int32_t d1 = ::abs(d);
+    int32_t sh, m;
+    if(d1 > 1)
+      {
+        sh = bit_scan_reverse(d1 - 1);  // shift count = ceil(log2(d1))-1 = (bit_scan_reverse(d1-1)+1)-1
+        m  = int32_t((int64_t(1) << (32 + sh)) / d1 - ((int64_t(1) << 32) - 1));  // calculate multiplier
+      }
+    else
+      {
+        m  = 1;  // for d1 = 1
+        sh = 0;
+        if(d == 0)
+          m /= d;  // provoke error here if d = 0
+        if(uint32_t(d) == 0x80000000u)
+          {  // fix overflow for this special case
+            m  = 0x80000001;
+            sh = 30;
+          }
+      }
+    multiplier = _mm_set1_epi32(m);               // broadcast multiplier
+    shift1     = _mm_setr_epi32(sh, 0, 0, 0);     // shift count
+    sign       = _mm_set1_epi32(d < 0 ? -1 : 0);  // sign of divisor
+  }
+  __m128i getm() const
+  {  // get multiplier
+    return multiplier;
+  }
+  __m128i gets1() const
+  {  // get shift count
+    return shift1;
+  }
+  __m128i getsign() const
+  {  // get sign of divisor
+    return sign;
+  }
+};
+
+// encapsulate parameters for fast division on vector of 4 32-bit unsigned integers
+class Divisor_ui
+{
+ protected:
+  __m128i multiplier;  // multiplier used in fast division
+  __m128i shift1;      // shift count 1 used in fast division
+  __m128i shift2;      // shift count 2 used in fast division
+ public:
+  Divisor_ui(){};  // Default constructor
+  Divisor_ui(uint32_t d)
+  {  // Constructor with divisor
+    set(d);
+  }
+  Divisor_ui(uint32_t m, int s1, int s2)
+  {  // Constructor with precalculated multiplier and shifts
+    multiplier = _mm_set1_epi32(m);
+    shift1     = _mm_setr_epi32(s1, 0, 0, 0);
+    shift2     = _mm_setr_epi32(s2, 0, 0, 0);
+  }
+  void set(uint32_t d)
+  {  // Set or change divisor, calculate parameters
+    uint32_t L, L2, sh1, sh2, m;
+    switch(d)
+      {
+        case 0:
+          m = sh1 = sh2 = 1 / d;  // provoke error for d = 0
+          break;
+        case 1:
+          m   = 1;
+          sh1 = sh2 = 0;  // parameters for d = 1
+          break;
+        case 2:
+          m   = 1;
+          sh1 = 1;
+          sh2 = 0;  // parameters for d = 2
+          break;
+        default:                                             // general case for d > 2
+          L   = bit_scan_reverse(d - 1) + 1;                 // ceil(log2(d))
+          L2  = L < 32 ? 1 << L : 0;                         // 2^L, overflow to 0 if L = 32
+          m   = 1 + uint32_t((uint64_t(L2 - d) << 32) / d);  // multiplier
+          sh1 = 1;
+          sh2 = L - 1;  // shift counts
+      }
+    multiplier = _mm_set1_epi32(m);
+    shift1     = _mm_setr_epi32(sh1, 0, 0, 0);
+    shift2     = _mm_setr_epi32(sh2, 0, 0, 0);
+  }
+  __m128i getm() const
+  {  // get multiplier
+    return multiplier;
+  }
+  __m128i gets1() const
+  {  // get shift count 1
+    return shift1;
+  }
+  __m128i gets2() const
+  {  // get shift count 2
+    return shift2;
+  }
+};
+
+// encapsulate parameters for fast division on vector of 8 16-bit signed integers
+class Divisor_s
+{
+ protected:
+  __m128i multiplier;  // multiplier used in fast division
+  __m128i shift1;      // shift count used in fast division
+  __m128i sign;        // sign of divisor
+ public:
+  Divisor_s(){};  // Default constructor
+  Divisor_s(int16_t d)
+  {  // Constructor with divisor
+    set(d);
+  }
+  Divisor_s(int16_t m, int s1, int sgn)
+  {  // Constructor with precalculated multiplier, shift and sign
+    multiplier = _mm_set1_epi16(m);
+    shift1     = _mm_setr_epi32(s1, 0, 0, 0);
+    sign       = _mm_set1_epi32(sgn);
+  }
+  void set(int16_t d)
+  {  // Set or change divisor, calculate parameters
+    const int32_t d1 = ::abs(d);
+    int32_t sh, m;
+    if(d1 > 1)
+      {
+        sh = bit_scan_reverse(d1 - 1);  // shift count = ceil(log2(d1))-1 = (bit_scan_reverse(d1-1)+1)-1
+        m  = ((int32_t(1) << (16 + sh)) / d1 - ((int32_t(1) << 16) - 1));  // calculate multiplier
+      }
+    else
+      {
+        m  = 1;  // for d1 = 1
+        sh = 0;
+        if(d == 0)
+          m /= d;  // provoke error here if d = 0
+        if(uint16_t(d) == 0x8000u)
+          {  // fix overflow for this special case
+            m  = 0x8001;
+            sh = 14;
+          }
+      }
+    multiplier = _mm_set1_epi16(int16_t(m));      // broadcast multiplier
+    shift1     = _mm_setr_epi32(sh, 0, 0, 0);     // shift count
+    sign       = _mm_set1_epi32(d < 0 ? -1 : 0);  // sign of divisor
+  }
+  __m128i getm() const
+  {  // get multiplier
+    return multiplier;
+  }
+  __m128i gets1() const
+  {  // get shift count
+    return shift1;
+  }
+  __m128i getsign() const
+  {  // get sign of divisor
+    return sign;
+  }
+};
+
+// encapsulate parameters for fast division on vector of 8 16-bit unsigned integers
+class Divisor_us
+{
+ protected:
+  __m128i multiplier;  // multiplier used in fast division
+  __m128i shift1;      // shift count 1 used in fast division
+  __m128i shift2;      // shift count 2 used in fast division
+ public:
+  Divisor_us(){};  // Default constructor
+  Divisor_us(uint16_t d)
+  {  // Constructor with divisor
+    set(d);
+  }
+  Divisor_us(uint16_t m, int s1, int s2)
+  {  // Constructor with precalculated multiplier and shifts
+    multiplier = _mm_set1_epi16(m);
+    shift1     = _mm_setr_epi32(s1, 0, 0, 0);
+    shift2     = _mm_setr_epi32(s2, 0, 0, 0);
+  }
+  void set(uint16_t d)
+  {  // Set or change divisor, calculate parameters
+    uint16_t L, L2, sh1, sh2, m;
+    switch(d)
+      {
+        case 0:
+          m = sh1 = sh2 = 1 / d;  // provoke error for d = 0
+          break;
+        case 1:
+          m   = 1;
+          sh1 = sh2 = 0;  // parameters for d = 1
+          break;
+        case 2:
+          m   = 1;
+          sh1 = 1;
+          sh2 = 0;  // parameters for d = 2
+          break;
+        default:                                             // general case for d > 2
+          L   = (uint16_t)bit_scan_reverse(d - 1) + 1;       // ceil(log2(d))
+          L2  = uint16_t(1 << L);                            // 2^L, overflow to 0 if L = 16
+          m   = 1 + uint16_t((uint32_t(L2 - d) << 16) / d);  // multiplier
+          sh1 = 1;
+          sh2 = L - 1;  // shift counts
+      }
+    multiplier = _mm_set1_epi16(m);
+    shift1     = _mm_setr_epi32(sh1, 0, 0, 0);
+    shift2     = _mm_setr_epi32(sh2, 0, 0, 0);
+  }
+  __m128i getm() const
+  {  // get multiplier
+    return multiplier;
+  }
+  __m128i gets1() const
+  {  // get shift count 1
+    return shift1;
+  }
+  __m128i gets2() const
+  {  // get shift count 2
+    return shift2;
+  }
+};
+
+// vector operator / : divide each element by divisor
+
+// vector of 4 32-bit signed integers
+static inline Vec4i operator/(Vec4i const &a, Divisor_i const &d)
+{
+#if defined(__XOP__) && defined(GCC_VERSION) && GCC_VERSION <= 40702 /*??*/ && !defined(__INTEL_COMPILER) && !defined(__clang__)
+#define XOP_MUL_BUG  // GCC has bug in XOP multiply
+// Bug found in GCC version 4.7.0 and 4.7.1
+#endif
+// todo: test this when GCC bug is fixed
+#if defined(__XOP__) && !defined(XOP_MUL_BUG)
+  __m128i t1  = _mm_mul_epi32(a, d.getm());                          // 32x32->64 bit signed multiplication of a[0] and a[2]
+  __m128i t2  = _mm_srli_epi64(t1, 32);                              // high dword of result 0 and 2
+  __m128i t3  = _mm_macchi_epi32(a, d.getm(), _mm_setzero_si128());  // 32x32->64 bit signed multiplication of a[1] and a[3]
+  __m128i t5  = _mm_set_epi32(-1, 0, -1, 0);                         // mask of dword 1 and 3
+  __m128i t7  = _mm_blendv_epi8(t2, t3, t5);                         // blend two results
+  __m128i t8  = _mm_add_epi32(t7, a);                                // add
+  __m128i t9  = _mm_sra_epi32(t8, d.gets1());                        // shift right arithmetic
+  __m128i t10 = _mm_srai_epi32(a, 31);                               // sign of a
+  __m128i t11 = _mm_sub_epi32(t10, d.getsign());                     // sign of a - sign of d
+  __m128i t12 = _mm_sub_epi32(t9, t11);                              // + 1 if a < 0, -1 if d < 0
+  return _mm_xor_si128(t12, d.getsign());                            // change sign if divisor negative
+
+#elif INSTRSET >= 5 && !defined(XOP_MUL_BUG)  // SSE4.1 supported
+  __m128i t1 = _mm_mul_epi32(a, d.getm());        // 32x32->64 bit signed multiplication of a[0] and a[2]
+  __m128i t2 = _mm_srli_epi64(t1, 32);            // high dword of result 0 and 2
+  __m128i t3 = _mm_srli_epi64(a, 32);             // get a[1] and a[3] into position for multiplication
+  __m128i t4 = _mm_mul_epi32(t3, d.getm());       // 32x32->64 bit signed multiplication of a[1] and a[3]
+  __m128i t5 = _mm_set_epi32(-1, 0, -1, 0);       // mask of dword 1 and 3
+  __m128i t7 = _mm_blendv_epi8(t2, t4, t5);       // blend two results
+  __m128i t8 = _mm_add_epi32(t7, a);              // add
+  __m128i t9 = _mm_sra_epi32(t8, d.gets1());      // shift right arithmetic
+  __m128i t10 = _mm_srai_epi32(a, 31);            // sign of a
+  __m128i t11 = _mm_sub_epi32(t10, d.getsign());  // sign of a - sign of d
+  __m128i t12 = _mm_sub_epi32(t9, t11);           // + 1 if a < 0, -1 if d < 0
+  return _mm_xor_si128(t12, d.getsign());         // change sign if divisor negative
+#else                                         // not SSE4.1
+  __m128i t1 = _mm_mul_epu32(a, d.getm());   // 32x32->64 bit unsigned multiplication of a[0] and a[2]
+  __m128i t2 = _mm_srli_epi64(t1, 32);       // high dword of result 0 and 2
+  __m128i t3 = _mm_srli_epi64(a, 32);        // get a[1] and a[3] into position for multiplication
+  __m128i t4 = _mm_mul_epu32(t3, d.getm());  // 32x32->64 bit unsigned multiplication of a[1] and a[3]
+  __m128i t5 = _mm_set_epi32(-1, 0, -1, 0);  // mask of dword 1 and 3
+  __m128i t6 = _mm_and_si128(t4, t5);        // high dword of result 1 and 3
+  __m128i t7 = _mm_or_si128(t2, t6);         // combine all four results of unsigned high mul into one vector
+  // convert unsigned to signed high multiplication (from: H S Warren: Hacker's delight, 2003, p. 132)
+  __m128i u1  = _mm_srai_epi32(a, 31);           // sign of a
+  __m128i u2  = _mm_srai_epi32(d.getm(), 31);    // sign of m [ m is always negative, except for abs(d) = 1 ]
+  __m128i u3  = _mm_and_si128(d.getm(), u1);     // m * sign of a
+  __m128i u4  = _mm_and_si128(a, u2);            // a * sign of m
+  __m128i u5  = _mm_add_epi32(u3, u4);           // sum of sign corrections
+  __m128i u6  = _mm_sub_epi32(t7, u5);           // high multiplication result converted to signed
+  __m128i t8  = _mm_add_epi32(u6, a);            // add a
+  __m128i t9  = _mm_sra_epi32(t8, d.gets1());    // shift right arithmetic
+  __m128i t10 = _mm_sub_epi32(u1, d.getsign());  // sign of a - sign of d
+  __m128i t11 = _mm_sub_epi32(t9, t10);          // + 1 if a < 0, -1 if d < 0
+  return _mm_xor_si128(t11, d.getsign());        // change sign if divisor negative
+#endif
+}
+
+// vector of 4 32-bit unsigned integers
+static inline Vec4ui operator/(Vec4ui const &a, Divisor_ui const &d)
+{
+  __m128i t1 = _mm_mul_epu32(a, d.getm());   // 32x32->64 bit unsigned multiplication of a[0] and a[2]
+  __m128i t2 = _mm_srli_epi64(t1, 32);       // high dword of result 0 and 2
+  __m128i t3 = _mm_srli_epi64(a, 32);        // get a[1] and a[3] into position for multiplication
+  __m128i t4 = _mm_mul_epu32(t3, d.getm());  // 32x32->64 bit unsigned multiplication of a[1] and a[3]
+  __m128i t5 = _mm_set_epi32(-1, 0, -1, 0);  // mask of dword 1 and 3
+#if INSTRSET >= 5                            // SSE4.1 supported
+  __m128i t7 = _mm_blendv_epi8(t2, t4, t5);  // blend two results
+#else
+  __m128i t6 = _mm_and_si128(t4, t5);             // high dword of result 1 and 3
+  __m128i t7 = _mm_or_si128(t2, t6);              // combine all four results into one vector
+#endif
+  __m128i t8  = _mm_sub_epi32(a, t7);          // subtract
+  __m128i t9  = _mm_srl_epi32(t8, d.gets1());  // shift right logical
+  __m128i t10 = _mm_add_epi32(t7, t9);         // add
+  return _mm_srl_epi32(t10, d.gets2());        // shift right logical
+}
+
+// vector of 8 16-bit signed integers
+static inline Vec8s operator/(Vec8s const &a, Divisor_s const &d)
+{
+  __m128i t1 = _mm_mulhi_epi16(a, d.getm());    // multiply high signed words
+  __m128i t2 = _mm_add_epi16(t1, a);            // + a
+  __m128i t3 = _mm_sra_epi16(t2, d.gets1());    // shift right arithmetic
+  __m128i t4 = _mm_srai_epi16(a, 15);           // sign of a
+  __m128i t5 = _mm_sub_epi16(t4, d.getsign());  // sign of a - sign of d
+  __m128i t6 = _mm_sub_epi16(t3, t5);           // + 1 if a < 0, -1 if d < 0
+  return _mm_xor_si128(t6, d.getsign());        // change sign if divisor negative
+}
+
+// vector of 8 16-bit unsigned integers
+static inline Vec8us operator/(Vec8us const &a, Divisor_us const &d)
+{
+  __m128i t1 = _mm_mulhi_epu16(a, d.getm());  // multiply high unsigned words
+  __m128i t2 = _mm_sub_epi16(a, t1);          // subtract
+  __m128i t3 = _mm_srl_epi16(t2, d.gets1());  // shift right logical
+  __m128i t4 = _mm_add_epi16(t1, t3);         // add
+  return _mm_srl_epi16(t4, d.gets2());        // shift right logical
+}
+
+// vector of 16 8-bit signed integers
+static inline Vec16c operator/(Vec16c const &a, Divisor_s const &d)
+{
+  // expand into two Vec8s
+  Vec8s low  = extend_low(a) / d;
+  Vec8s high = extend_high(a) / d;
+  return compress(low, high);
+}
+
+// vector of 16 8-bit unsigned integers
+static inline Vec16uc operator/(Vec16uc const &a, Divisor_us const &d)
+{
+  // expand into two Vec8s
+  Vec8us low  = extend_low(a) / d;
+  Vec8us high = extend_high(a) / d;
+  return compress(low, high);
+}
+
+// vector operator /= : divide
+static inline Vec8s &operator/=(Vec8s &a, Divisor_s const &d)
+{
+  a = a / d;
+  return a;
+}
+
+// vector operator /= : divide
+static inline Vec8us &operator/=(Vec8us &a, Divisor_us const &d)
+{
+  a = a / d;
+  return a;
+}
+
+// vector operator /= : divide
+static inline Vec4i &operator/=(Vec4i &a, Divisor_i const &d)
+{
+  a = a / d;
+  return a;
+}
+
+// vector operator /= : divide
+static inline Vec4ui &operator/=(Vec4ui &a, Divisor_ui const &d)
+{
+  a = a / d;
+  return a;
+}
+
+// vector operator /= : divide
+static inline Vec16c &operator/=(Vec16c &a, Divisor_s const &d)
+{
+  a = a / d;
+  return a;
+}
+
+// vector operator /= : divide
+static inline Vec16uc &operator/=(Vec16uc &a, Divisor_us const &d)
+{
+  a = a / d;
+  return a;
+}
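+
+// Usage sketch (illustrative only, not part of the original vectorclass code):
+// reusing one Divisor_i object amortizes the cost of computing the magic
+// multiplier when many vectors are divided by the same run-time divisor.
+//
+//     Divisor_i div7(7);                     // parameters computed once
+//     Vec4i a(14, -15, 6, 700);
+//     Vec4i q = a / div7;                    // q = {2, -2, 0, 100}, truncated toward zero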
+
+/*****************************************************************************
+ *
+ *          Integer division 2: divisor is a compile-time constant
+ *
+ *****************************************************************************/
+
+// Divide Vec4i by compile-time constant
+template <int32_t d>
+static inline Vec4i divide_by_i(Vec4i const &x)
+{
+  Static_error_check<(d != 0)> Dividing_by_zero;  // Error message if dividing by zero
+  if(d == 1)
+    return x;
+  if(d == -1)
+    return -x;
+  if(uint32_t(d) == 0x80000000u)
+    return Vec4i(x == Vec4i(0x80000000)) & 1;  // prevent overflow when changing sign
+  const uint32_t d1 =
+      d > 0 ? uint32_t(d) : uint32_t(-d);  // compile-time abs(d). (force GCC compiler to treat d as 32 bits, not 64 bits)
+  if((d1 & (d1 - 1)) == 0)
+    {
+      // d1 is a power of 2. use shift
+      const int k = bit_scan_reverse_const(d1);
+      __m128i sign;
+      if(k > 1)
+        sign = _mm_srai_epi32(x, k - 1);
+      else
+        sign = x;                                     // k copies of sign bit
+      __m128i bias   = _mm_srli_epi32(sign, 32 - k);  // bias = x >= 0 ? 0 : k-1
+      __m128i xpbias = _mm_add_epi32(x, bias);        // x + bias
+      __m128i q      = _mm_srai_epi32(xpbias, k);     // (x + bias) >> k
+      if(d > 0)
+        return q;                                    // d > 0: return  q
+      return _mm_sub_epi32(_mm_setzero_si128(), q);  // d < 0: return -q
+    }
+  // general case
+  const int32_t sh   = bit_scan_reverse_const(uint32_t(d1) - 1);  // ceil(log2(d1)) - 1. (d1 < 2 handled by power of 2 case)
+  const int32_t mult = int(1 + (uint64_t(1) << (32 + sh)) / uint32_t(d1) - (int64_t(1) << 32));  // multiplier
+  const Divisor_i div(mult, sh, d < 0 ? -1 : 0);
+  return x / div;
+}
+
+// define Vec4i a / const_int(d)
+template <int32_t d>
+static inline Vec4i operator/(Vec4i const &a, Const_int_t<d>)
+{
+  return divide_by_i<d>(a);
+}
+
+// define Vec4i a / const_uint(d)
+template <uint32_t d>
+static inline Vec4i operator/(Vec4i const &a, Const_uint_t<d>)
+{
+  Static_error_check<(d < 0x80000000u)> Error_overflow_dividing_signed_by_unsigned;  // Error: dividing signed by overflowing unsigned
+  return divide_by_i<int32_t(d)>(a);                                                 // signed divide
+}
+
+// vector operator /= : divide
+template <int32_t d>
+static inline Vec4i &operator/=(Vec4i &a, Const_int_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec4i &operator/=(Vec4i &a, Const_uint_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// Divide Vec4ui by compile-time constant
+template <uint32_t d>
+static inline Vec4ui divide_by_ui(Vec4ui const &x)
+{
+  Static_error_check<(d != 0)> Dividing_by_zero;  // Error message if dividing by zero
+  if(d == 1)
+    return x;                               // divide by 1
+  const int b = bit_scan_reverse_const(d);  // floor(log2(d))
+  if((uint32_t(d) & (uint32_t(d) - 1)) == 0)
+    {
+      // d is a power of 2. use shift
+      return _mm_srli_epi32(x, b);  // x >> b
+    }
+  // general case (d > 2)
+  uint32_t mult         = uint32_t((uint64_t(1) << (b + 32)) / d);         // multiplier = 2^(32+b) / d
+  const uint64_t rem    = (uint64_t(1) << (b + 32)) - uint64_t(d) * mult;  // remainder 2^(32+b) % d
+  const bool round_down = (2 * rem < d);                                   // check if fraction is less than 0.5
+  if(!round_down)
+    {
+      mult = mult + 1;  // round up mult
+    }
+  // do 32*32->64 bit unsigned multiplication and get high part of result
+  const __m128i multv = _mm_set_epi32(0, mult, 0, mult);  // zero-extend mult and broadcast
+  __m128i t1          = _mm_mul_epu32(x, multv);          // 32x32->64 bit unsigned multiplication of x[0] and x[2]
+  if(round_down)
+    {
+      t1 = _mm_add_epi64(t1, multv);  // compensate for rounding error. (x+1)*m replaced by x*m+m to avoid overflow
+    }
+  __m128i t2 = _mm_srli_epi64(t1, 32);    // high dword of result 0 and 2
+  __m128i t3 = _mm_srli_epi64(x, 32);     // get x[1] and x[3] into position for multiplication
+  __m128i t4 = _mm_mul_epu32(t3, multv);  // 32x32->64 bit unsigned multiplication of x[1] and x[3]
+  if(round_down)
+    {
+      t4 = _mm_add_epi64(t4, multv);  // compensate for rounding error. (x+1)*m replaced by x*m+m to avoid overflow
+    }
+  __m128i t5 = _mm_set_epi32(-1, 0, -1, 0);  // mask of dword 1 and 3
+#if INSTRSET >= 5                            // SSE4.1 supported
+  __m128i t7 = _mm_blendv_epi8(t2, t4, t5);  // blend two results
+#else
+  __m128i t6 = _mm_and_si128(t4, t5);             // high dword of result 1 and 3
+  __m128i t7 = _mm_or_si128(t2, t6);              // combine all four results into one vector
+#endif
+  Vec4ui q = _mm_srli_epi32(t7, b);  // shift right by b
+  return q;                          // no overflow possible
+}
+
+// define Vec4ui a / const_uint(d)
+template <uint32_t d>
+static inline Vec4ui operator/(Vec4ui const &a, Const_uint_t<d>)
+{
+  return divide_by_ui<d>(a);
+}
+
+// define Vec4ui a / const_int(d)
+template <int32_t d>
+static inline Vec4ui operator/(Vec4ui const &a, Const_int_t<d>)
+{
+  Static_error_check<(d >= 0)> Error_dividing_unsigned_by_negative;  // Error: dividing unsigned by negative is ambiguous
+  return divide_by_ui<d>(a);                                         // unsigned divide
+}
+
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec4ui &operator/=(Vec4ui &a, Const_uint_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator /= : divide
+template <int32_t d>
+static inline Vec4ui &operator/=(Vec4ui &a, Const_int_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// Divide Vec8s by compile-time constant
+template <int d>
+static inline Vec8s divide_by_i(Vec8s const &x)
+{
+  const int16_t d0 = int16_t(d);                   // truncate d to 16 bits
+  Static_error_check<(d0 != 0)> Dividing_by_zero;  // Error message if dividing by zero
+  if(d0 == 1)
+    return x;  // divide by  1
+  if(d0 == -1)
+    return -x;  // divide by -1
+  if(uint16_t(d0) == 0x8000u)
+    return Vec8s(x == Vec8s(0x8000)) & 1;  // prevent overflow when changing sign
+  // if (d > 0x7FFF || d < -0x8000) return 0;                      // not relevant when d truncated to 16 bits
+  const uint16_t d1 = d0 > 0 ? d0 : -d0;  // compile-time abs(d0)
+  if((d1 & (d1 - 1)) == 0)
+    {
+      // d is a power of 2. use shift
+      const int k = bit_scan_reverse_const(uint32_t(d1));
+      __m128i sign;
+      if(k > 1)
+        sign = _mm_srai_epi16(x, k - 1);
+      else
+        sign = x;                                     // k copies of sign bit
+      __m128i bias   = _mm_srli_epi16(sign, 16 - k);  // bias = x >= 0 ? 0 : k-1
+      __m128i xpbias = _mm_add_epi16(x, bias);        // x + bias
+      __m128i q      = _mm_srai_epi16(xpbias, k);     // (x + bias) >> k
+      if(d0 > 0)
+        return q;                                    // d0 > 0: return  q
+      return _mm_sub_epi16(_mm_setzero_si128(), q);  // d0 < 0: return -q
+    }
+  // general case
+  const int L        = bit_scan_reverse_const(uint16_t(d1 - 1)) + 1;            // ceil(log2(d)). (d < 2 handled above)
+  const int16_t mult = int16_t(1 + (1u << (15 + L)) / uint32_t(d1) - 0x10000);  // multiplier
+  const int shift1   = L - 1;
+  const Divisor_s div(mult, shift1, d0 > 0 ? 0 : -1);
+  return x / div;
+}
+
+// define Vec8s a / const_int(d)
+template <int d>
+static inline Vec8s operator/(Vec8s const &a, Const_int_t<d>)
+{
+  return divide_by_i<d>(a);
+}
+
+// define Vec8s a / const_uint(d)
+template <uint32_t d>
+static inline Vec8s operator/(Vec8s const &a, Const_uint_t<d>)
+{
+  Static_error_check<(d < 0x8000u)> Error_overflow_dividing_signed_by_unsigned;  // Error: dividing signed by overflowing unsigned
+  return divide_by_i<int(d)>(a);                                                 // signed divide
+}
+
+// vector operator /= : divide
+template <int32_t d>
+static inline Vec8s &operator/=(Vec8s &a, Const_int_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec8s &operator/=(Vec8s &a, Const_uint_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// Divide Vec8us by compile-time constant
+template <uint32_t d>
+static inline Vec8us divide_by_ui(Vec8us const &x)
+{
+  const uint16_t d0 = uint16_t(d);                 // truncate d to 16 bits
+  Static_error_check<(d0 != 0)> Dividing_by_zero;  // Error message if dividing by zero
+  if(d0 == 1)
+    return x;                                // divide by 1
+  const int b = bit_scan_reverse_const(d0);  // floor(log2(d))
+  if((d0 & (d0 - 1)) == 0)
+    {
+      // d is a power of 2. use shift
+      return _mm_srli_epi16(x, b);  // x >> b
+    }
+  // general case (d > 2)
+  uint16_t mult         = uint16_t((uint32_t(1) << (b + 16)) / d0);         // multiplier = 2^(16+b) / d
+  const uint32_t rem    = (uint32_t(1) << (b + 16)) - uint32_t(d0) * mult;  // remainder 2^(16+b) % d
+  const bool round_down = (2 * rem < d0);                                   // check if fraction is less than 0.5
+  Vec8us x1             = x;
+  if(round_down)
+    {
+      x1 = x1 + 1;  // round down mult and compensate by adding 1 to x
+    }
+  else
+    {
+      mult = mult + 1;  // round up mult. no compensation needed
+    }
+  const __m128i multv = _mm_set1_epi16(mult);        // broadcast mult
+  __m128i xm          = _mm_mulhi_epu16(x1, multv);  // high part of 16x16->32 bit unsigned multiplication
+  Vec8us q            = _mm_srli_epi16(xm, b);       // shift right by b
+  if(round_down)
+    {
+      Vec8sb overfl = (x1 == (Vec8us)_mm_setzero_si128());  // check for overflow of x+1
+      return select(overfl, Vec8us(mult >> b), q);          // deal with overflow (rarely needed)
+    }
+  else
+    {
+      return q;  // no overflow possible
+    }
+}
+
+// define Vec8us a / const_uint(d)
+template <uint32_t d>
+static inline Vec8us operator/(Vec8us const &a, Const_uint_t<d>)
+{
+  return divide_by_ui<d>(a);
+}
+
+// define Vec8us a / const_int(d)
+template <int d>
+static inline Vec8us operator/(Vec8us const &a, Const_int_t<d>)
+{
+  Static_error_check<(d >= 0)> Error_dividing_unsigned_by_negative;  // Error: dividing unsigned by negative is ambiguous
+  return divide_by_ui<d>(a);                                         // unsigned divide
+}
+
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec8us &operator/=(Vec8us &a, Const_uint_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator /= : divide
+template <int32_t d>
+static inline Vec8us &operator/=(Vec8us &a, Const_int_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// define Vec16c a / const_int(d)
+template <int d>
+static inline Vec16c operator/(Vec16c const &a, Const_int_t<d>)
+{
+  // expand into two Vec8s
+  Vec8s low  = extend_low(a) / Const_int_t<d>();
+  Vec8s high = extend_high(a) / Const_int_t<d>();
+  return compress(low, high);
+}
+
+// define Vec16c a / const_uint(d)
+template <uint32_t d>
+static inline Vec16c operator/(Vec16c const &a, Const_uint_t<d>)
+{
+  Static_error_check<(uint8_t(d) < 0x80u)>
+      Error_overflow_dividing_signed_by_unsigned;  // Error: dividing signed by overflowing unsigned
+  return a / Const_int_t<d>();                     // signed divide
+}
+
+// vector operator /= : divide
+template <int32_t d>
+static inline Vec16c &operator/=(Vec16c &a, Const_int_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec16c &operator/=(Vec16c &a, Const_uint_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// define Vec16uc a / const_uint(d)
+template <uint32_t d>
+static inline Vec16uc operator/(Vec16uc const &a, Const_uint_t<d>)
+{
+  // expand into two Vec8us
+  Vec8us low  = extend_low(a) / Const_uint_t<d>();
+  Vec8us high = extend_high(a) / Const_uint_t<d>();
+  return compress(low, high);
+}
+
+// define Vec16uc a / const_int(d)
+template <int d>
+static inline Vec16uc operator/(Vec16uc const &a, Const_int_t<d>)
+{
+  Static_error_check<(int8_t(d) >= 0)> Error_dividing_unsigned_by_negative;  // Error: dividing unsigned by negative is ambiguous
+  return a / Const_uint_t<d>();                                              // unsigned divide
+}
+
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec16uc &operator/=(Vec16uc &a, Const_uint_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator /= : divide
+template <int32_t d>
+static inline Vec16uc &operator/=(Vec16uc &a, Const_int_t<d> b)
+{
+  a = a / b;
+  return a;
+}
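+
+// Usage sketch (illustrative only, not part of the original vectorclass code):
+// with a compile-time constant divisor the magic multiplier is computed by the
+// compiler, so the division itself costs only a multiply and a few shifts.
+//
+//     Vec8us a(0, 10, 20, 30, 40, 50, 60, 70);
+//     Vec8us q = a / const_uint(10);         // q = {0, 1, 2, 3, 4, 5, 6, 7}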
+
+/*****************************************************************************
+ *
+ *          Horizontal scan functions
+ *
+ *****************************************************************************/
+
+// Get index to the first element that is true. Return -1 if all are false
+static inline int horizontal_find_first(Vec16cb const &x)
+{
+  uint32_t a = _mm_movemask_epi8(x);
+  if(a == 0)
+    return -1;
+  int32_t b = bit_scan_forward(a);
+  return b;
+}
+
+static inline int horizontal_find_first(Vec8sb const &x)
+{
+  return horizontal_find_first(Vec16cb(x)) >> 1;  // must use signed shift
+}
+
+static inline int horizontal_find_first(Vec4ib const &x)
+{
+  return horizontal_find_first(Vec16cb(x)) >> 2;  // must use signed shift
+}
+
+static inline int horizontal_find_first(Vec2qb const &x)
+{
+  return horizontal_find_first(Vec16cb(x)) >> 3;  // must use signed shift
+}
+
+// Count the number of elements that are true
+static inline uint32_t horizontal_count(Vec16cb const &x)
+{
+  uint32_t a = _mm_movemask_epi8(x);
+  return vml_popcnt(a);
+}
+
+static inline uint32_t horizontal_count(Vec8sb const &x) { return horizontal_count(Vec16cb(x)) >> 1; }
+
+static inline uint32_t horizontal_count(Vec4ib const &x) { return horizontal_count(Vec16cb(x)) >> 2; }
+
+static inline uint32_t horizontal_count(Vec2qb const &x) { return horizontal_count(Vec16cb(x)) >> 3; }
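+
+// Usage sketch (illustrative only, not part of the original vectorclass code):
+//
+//     Vec4i a(3, 8, 15, 8);
+//     Vec4ib m = a > Vec4i(7);               // {false, true, true, true}
+//     int first  = horizontal_find_first(m); // 1
+//     uint32_t n = horizontal_count(m);      // 3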
+
+/*****************************************************************************
+ *
+ *          Boolean <-> bitfield conversion functions
+ *
+ *****************************************************************************/
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint16_t to_bits(Vec16cb const &x) { return (uint16_t)_mm_movemask_epi8(x); }
+
+// to_Vec16bc: convert integer bitfield to boolean vector
+static inline Vec16cb to_Vec16cb(uint16_t x)
+{
+  static const uint32_t table[16] = {// lookup-table
+                                     0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF, 0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF,
+                                     0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF, 0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF};
+  uint32_t a0                     = table[x & 0xF];
+  uint32_t a1                     = table[(x >> 4) & 0xF];
+  uint32_t a2                     = table[(x >> 8) & 0xF];
+  uint32_t a3                     = table[(x >> 12) & 0xF];
+  return Vec16cb(Vec16c(Vec4ui(a0, a1, a2, a3)));
+}
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec8sb const &x)
+{
+  __m128i a = _mm_packs_epi16(x, x);  // 16-bit words to bytes
+  return (uint8_t)_mm_movemask_epi8(a);
+}
+
+// to_Vec8sb: convert integer bitfield to boolean vector
+static inline Vec8sb to_Vec8sb(uint8_t x)
+{
+  static const uint32_t table[16] = {// lookup-table
+                                     0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF, 0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF,
+                                     0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF, 0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF};
+  uint32_t a0                     = table[x & 0xF];
+  uint32_t a1                     = table[(x >> 4) & 0xF];
+  Vec4ui b                        = Vec4ui(a0, a1, a0, a1);
+  return _mm_unpacklo_epi8(b, b);  // duplicate bytes to 16-bit words
+}
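+
+// Usage sketch (illustrative only, not part of the original vectorclass code):
+// packing a boolean vector into a plain integer bitmask and back.
+//
+//     Vec8s a(1, -2, 3, -4, 5, -6, 7, -8);
+//     Vec8sb m     = a < Vec8s(0);           // {F, T, F, T, F, T, F, T}
+//     uint8_t bits = to_bits(m);             // 0xAA (element 0 -> bit 0)
+//     Vec8sb m2    = to_Vec8sb(bits);        // same mask reconstructed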
+
+#if INSTRSET < 9 || MAX_VECTOR_SIZE < 512
+// These functions are defined in Vectori512.h if AVX512 instruction set is used
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec4ib const &x)
+{
+  __m128i a = _mm_packs_epi32(x, x);  // 32-bit dwords to 16-bit words
+  __m128i b = _mm_packs_epi16(a, a);  // 16-bit words to bytes
+  return _mm_movemask_epi8(b) & 0xF;
+}
+
+// to_Vec4ib: convert integer bitfield to boolean vector
+static inline Vec4ib to_Vec4ib(uint8_t x)
+{
+  static const uint32_t table[16] = {// lookup-table
+                                     0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF, 0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF,
+                                     0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF, 0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF};
+  uint32_t a                      = table[x & 0xF];            // 4 bytes
+  __m128i b                       = _mm_cvtsi32_si128(a);      // transfer to vector register
+  __m128i c                       = _mm_unpacklo_epi8(b, b);   // duplicate bytes to 16-bit words
+  __m128i d                       = _mm_unpacklo_epi16(c, c);  // duplicate 16-bit words to 32-bit dwords
+  return d;
+}
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec2qb const &x)
+{
+  uint32_t a = _mm_movemask_epi8(x);
+  return (a & 1) | ((a >> 7) & 2);
+}
+
+// to_Vec2qb: convert integer bitfield to boolean vector
+static inline Vec2qb to_Vec2qb(uint8_t x) { return Vec2qb(Vec2q(-(x & 1), -((x >> 1) & 1))); }
+
+#else  // function prototypes here only
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec4ib x);
+
+// to_Vec4ib: convert integer bitfield to boolean vector
+static inline Vec4ib to_Vec4ib(uint8_t x);
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec2qb x);
+
+// to_Vec2qb: convert integer bitfield to boolean vector
+static inline Vec2qb to_Vec2qb(uint8_t x);
+
+#endif  // INSTRSET < 9 || MAX_VECTOR_SIZE < 512
+
+#ifdef VCL_NAMESPACE
+}
+#endif
+
+#endif  // VECTORI128_H
diff --git a/src/vectorclass/vectori256.h b/src/vectorclass/vectori256.h
new file mode 100644
index 0000000000000000000000000000000000000000..56668ab78742ffc00dc0d3582657549931928065
--- /dev/null
+++ b/src/vectorclass/vectori256.h
@@ -0,0 +1,5468 @@
+/****************************  vectori256.h   *******************************
+ * Author:        Agner Fog
+ * Date created:  2012-05-30
+ * Last modified: 2017-02-19
+ * Version:       1.27
+ * Project:       vector classes
+ * Description:
+ * Header file defining integer vector classes as interface to intrinsic
+ * functions in x86 microprocessors with AVX2 and later instruction sets.
+ *
+ * Instructions:
+ * Use Gnu, Intel or Microsoft C++ compiler. Compile for the desired
+ * instruction set, which must be at least AVX2.
+ *
+ * The following vector classes are defined here:
+ * Vec256b   Vector of 256  1-bit unsigned  integers or Booleans
+ * Vec32c    Vector of  32  8-bit signed    integers
+ * Vec32uc   Vector of  32  8-bit unsigned  integers
+ * Vec32cb   Vector of  32  Booleans for use with Vec32c and Vec32uc
+ * Vec16s    Vector of  16  16-bit signed   integers
+ * Vec16us   Vector of  16  16-bit unsigned integers
+ * Vec16sb   Vector of  16  Booleans for use with Vec16s and Vec16us
+ * Vec8i     Vector of   8  32-bit signed   integers
+ * Vec8ui    Vector of   8  32-bit unsigned integers
+ * Vec8ib    Vector of   8  Booleans for use with Vec8i and Vec8ui
+ * Vec4q     Vector of   4  64-bit signed   integers
+ * Vec4uq    Vector of   4  64-bit unsigned integers
+ * Vec4qb    Vector of   4  Booleans for use with Vec4q and Vec4uq
+ *
+ * Each vector object is represented internally in the CPU as a 256-bit register.
+ * This header file defines operators and functions for these vectors.
+ *
+ * For example:
+ * Vec8i a(1,2,3,4,5,6,7,8), b(9,10,11,12,13,14,15,16), c;
+ * c = a + b;     // now c contains (10,12,14,16,18,20,22,24)
+ *
+ * For detailed instructions, see VectorClass.pdf
+ *
+ * (c) Copyright 2012-2017 GNU General Public License http://www.gnu.org/licenses
+ *****************************************************************************/
+
+// check combination of header files
+#if defined(VECTORI256_H)
+#if VECTORI256_H != 2
+#error Two different versions of vectori256.h included
+#endif
+#else
+#define VECTORI256_H 2
+
+#ifdef VECTORF256_H
+#error Please put header file vectori256.h before vectorf256.h
+#endif
+
+#if INSTRSET < 8  // AVX2 required
+#error Wrong instruction set for vectori256.h, AVX2 required or use vectori256e.h
+#endif
+
+#include "vectori128.h"
+
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE
+{
+#endif
+
+/*****************************************************************************
+ *
+ *         Join two 128-bit vectors
+ *
+ *****************************************************************************/
+#define set_m128ir(lo, hi) _mm256_inserti128_si256(_mm256_castsi128_si256(lo), (hi), 1)
+
+/*****************************************************************************
+ *
+ *          Vector of 256 1-bit unsigned integers or Booleans
+ *
+ *****************************************************************************/
+class Vec256b
+{
+ protected:
+  __m256i ymm;  // Integer vector
+ public:
+  // Default constructor:
+  Vec256b() {}
+  // Constructor to broadcast the same value into all elements
+  // Removed because of undesired implicit conversions
+  // Vec256b(int i) {
+  //    ymm = _mm256_set1_epi32(-(i & 1));}
+
+  // Constructor to build from two Vec128b:
+  Vec256b(Vec128b const &a0, Vec128b const &a1) { ymm = set_m128ir(a0, a1); }
+  // Constructor to convert from type __m256i used in intrinsics:
+  Vec256b(__m256i const &x) { ymm = x; }
+  // Assignment operator to convert from type __m256i used in intrinsics:
+  Vec256b &operator=(__m256i const &x)
+  {
+    ymm = x;
+    return *this;
+  }
+  // Type cast operator to convert to __m256i used in intrinsics
+  operator __m256i() const { return ymm; }
+  // Member function to load from array (unaligned)
+  Vec256b &load(void const *p)
+  {
+    ymm = _mm256_loadu_si256((__m256i const *)p);
+    return *this;
+  }
+  // Member function to load from array, aligned by 32
+  // You may use load_a instead of load if you are certain that p points to an address
+  // divisible by 32, but there is hardly any speed advantage of load_a on modern processors
+  Vec256b &load_a(void const *p)
+  {
+    ymm = _mm256_load_si256((__m256i const *)p);
+    return *this;
+  }
+  // Member function to store into array (unaligned)
+  void store(void *p) const { _mm256_storeu_si256((__m256i *)p, ymm); }
+  // Member function to store into array, aligned by 32
+  // You may use store_a instead of store if you are certain that p points to an address
+  // divisible by 32, but there is hardly any speed advantage of store_a on modern processors
+  void store_a(void *p) const { _mm256_store_si256((__m256i *)p, ymm); }
+  // Member function to change a single bit
+  // Note: This function is inefficient. Use load function if changing more than one bit
+  Vec256b const &set_bit(uint32_t index, int value)
+  {
+    static uint64_t m[8] = {0, 0, 0, 0, 1, 0, 0, 0};
+    int wi               = (index >> 6) & 3;  // qword index
+    int bi               = index & 0x3F;      // bit index within qword w
+
+    __m256i mask = Vec256b().load(m + 4 - wi);                     // 1 in qword number wi
+    mask         = _mm256_sll_epi64(mask, _mm_cvtsi32_si128(bi));  // mask with bit number b set
+    if(value & 1)
+      {
+        ymm = _mm256_or_si256(mask, ymm);
+      }
+    else
+      {
+        ymm = _mm256_andnot_si256(mask, ymm);
+      }
+    return *this;
+  }
+  // Member function to get a single bit
+  // Note: This function is inefficient. Use store function if reading more than one bit
+  int get_bit(uint32_t index) const
+  {
+    union
+    {
+      __m256i x;
+      uint8_t i[32];
+    } u;
+    u.x    = ymm;
+    int wi = (index >> 3) & 0x1F;  // byte index
+    int bi = index & 7;            // bit index within byte w
+    return (u.i[wi] >> bi) & 1;
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  bool operator[](uint32_t index) const { return get_bit(index) != 0; }
+  // Member functions to split into two Vec128b:
+  Vec128b get_low() const { return _mm256_castsi256_si128(ymm); }
+  Vec128b get_high() const { return _mm256_extractf128_si256(ymm, 1); }
+  static int size() { return 256; }
+};
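+
+// Usage sketch (illustrative): treating a Vec256b as raw bit storage. The buffer
+// name is made up for the example.
+//   alignas(32) uint64_t buf[4] = {0, 0, 0, 0};
+//   Vec256b bits;
+//   bits.load_a(buf);           // aligned load of all 256 bits
+//   bits.set_bit(70, 1);        // sets bit 6 of the second 64-bit word
+//   int b = bits.get_bit(70);   // b == 1
+//   bits.store_a(buf);          // buf[1] is now 0x40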
+
+// Define operators for this class
+
+// vector operator & : bitwise and
+static inline Vec256b operator&(Vec256b const &a, Vec256b const &b) { return _mm256_and_si256(a, b); }
+static inline Vec256b operator&&(Vec256b const &a, Vec256b const &b) { return a & b; }
+
+// vector operator | : bitwise or
+static inline Vec256b operator|(Vec256b const &a, Vec256b const &b) { return _mm256_or_si256(a, b); }
+static inline Vec256b operator||(Vec256b const &a, Vec256b const &b) { return a | b; }
+
+// vector operator ^ : bitwise xor
+static inline Vec256b operator^(Vec256b const &a, Vec256b const &b) { return _mm256_xor_si256(a, b); }
+
+// vector operator ~ : bitwise not
+static inline Vec256b operator~(Vec256b const &a) { return _mm256_xor_si256(a, _mm256_set1_epi32(-1)); }
+
+// vector operator &= : bitwise and
+static inline Vec256b &operator&=(Vec256b &a, Vec256b const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator |= : bitwise or
+static inline Vec256b &operator|=(Vec256b &a, Vec256b const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^= : bitwise xor
+static inline Vec256b &operator^=(Vec256b &a, Vec256b const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// Define functions for this class
+
+// function andnot: a & ~ b
+static inline Vec256b andnot(Vec256b const &a, Vec256b const &b) { return _mm256_andnot_si256(b, a); }
+
+/*****************************************************************************
+ *
+ *          Generate compile-time constant vector
+ *
+ *****************************************************************************/
+// Generate a constant vector of 8 integers stored in memory.
+// Can be converted to any integer vector type
+template <int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4, int32_t i5, int32_t i6, int32_t i7>
+static inline __m256i constant8i()
+{
+  static const union
+  {
+    int32_t i[8];
+    __m256i ymm;
+  } u = {{i0, i1, i2, i3, i4, i5, i6, i7}};
+  return u.ymm;
+}
+
+template <uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7>
+static inline __m256i constant8ui()
+{
+  return constant8i<int32_t(i0), int32_t(i1), int32_t(i2), int32_t(i3), int32_t(i4), int32_t(i5), int32_t(i6), int32_t(i7)>();
+}
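+
+// Usage sketch (illustrative): the constant is built once in static storage and
+// can be converted to any of the 256-bit integer vector classes defined below.
+//   Vec8i ramp = constant8i<0, 1, 2, 3, 4, 5, 6, 7>();   // (0,1,2,3,4,5,6,7)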
+
+/*****************************************************************************
+ *
+ *          selectb function
+ *
+ *****************************************************************************/
+// Select between two sources, byte by byte. Used in various functions and operators
+// Corresponds to this pseudocode:
+// for (int i = 0; i < 32; i++) result[i] = s[i] ? a[i] : b[i];
+// Each byte in s must be either 0 (false) or 0xFF (true). No other values are allowed.
+// Only bit 7 in each byte of s is checked.
+static inline __m256i selectb(__m256i const &s, __m256i const &a, __m256i const &b) { return _mm256_blendv_epi8(b, a, s); }
+
+/*****************************************************************************
+ *
+ *          Horizontal Boolean functions
+ *
+ *****************************************************************************/
+
+// horizontal_and. Returns true if all bits are 1
+static inline bool horizontal_and(Vec256b const &a)
+{
+  return _mm256_testc_si256(a, constant8i<-1, -1, -1, -1, -1, -1, -1, -1>()) != 0;
+}
+
+// horizontal_or. Returns true if at least one bit is 1
+static inline bool horizontal_or(Vec256b const &a) { return !_mm256_testz_si256(a, a); }
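+
+// Usage sketch (illustrative): an early-out test over a whole comparison result.
+//   Vec8i x(1, 2, 3, 4, 5, 6, 7, 8);
+//   if(horizontal_and(x < Vec8i(100))) { /* all eight elements are below 100 */ }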
+
+/*****************************************************************************
+ *
+ *          Vector of 32 8-bit signed integers
+ *
+ *****************************************************************************/
+
+class Vec32c : public Vec256b
+{
+ public:
+  // Default constructor:
+  Vec32c() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec32c(int i) { ymm = _mm256_set1_epi8((char)i); }
+  // Constructor to build from all elements:
+  Vec32c(int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6, int8_t i7, int8_t i8, int8_t i9, int8_t i10,
+         int8_t i11, int8_t i12, int8_t i13, int8_t i14, int8_t i15, int8_t i16, int8_t i17, int8_t i18, int8_t i19, int8_t i20,
+         int8_t i21, int8_t i22, int8_t i23, int8_t i24, int8_t i25, int8_t i26, int8_t i27, int8_t i28, int8_t i29, int8_t i30,
+         int8_t i31)
+  {
+    ymm = _mm256_setr_epi8(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22,
+                           i23, i24, i25, i26, i27, i28, i29, i30, i31);
+  }
+  // Constructor to build from two Vec16c:
+  Vec32c(Vec16c const &a0, Vec16c const &a1) { ymm = set_m128ir(a0, a1); }
+  // Constructor to convert from type __m256i used in intrinsics:
+  Vec32c(__m256i const &x) { ymm = x; }
+  // Assignment operator to convert from type __m256i used in intrinsics:
+  Vec32c &operator=(__m256i const &x)
+  {
+    ymm = x;
+    return *this;
+  }
+  // Type cast operator to convert to __m256i used in intrinsics
+  operator __m256i() const { return ymm; }
+  // Member function to load from array (unaligned)
+  Vec32c &load(void const *p)
+  {
+    ymm = _mm256_loadu_si256((__m256i const *)p);
+    return *this;
+  }
+  // Member function to load from array, aligned by 32
+  Vec32c &load_a(void const *p)
+  {
+    ymm = _mm256_load_si256((__m256i const *)p);
+    return *this;
+  }
+  // Partial load. Load n elements and set the rest to 0
+  Vec32c &load_partial(int n, void const *p)
+  {
+    if(n <= 0)
+      {
+        *this = 0;
+      }
+    else if(n <= 16)
+      {
+        *this = Vec32c(Vec16c().load_partial(n, p), 0);
+      }
+    else if(n < 32)
+      {
+        *this = Vec32c(Vec16c().load(p), Vec16c().load_partial(n - 16, (char const *)p + 16));
+      }
+    else
+      {
+        load(p);
+      }
+    return *this;
+  }
+  // Partial store. Store n elements
+  void store_partial(int n, void *p) const
+  {
+    if(n <= 0)
+      {
+        return;
+      }
+    else if(n <= 16)
+      {
+        get_low().store_partial(n, p);
+      }
+    else if(n < 32)
+      {
+        get_low().store(p);
+        get_high().store_partial(n - 16, (char *)p + 16);
+      }
+    else
+      {
+        store(p);
+      }
+  }
+  // cut off vector to n elements. The last 32-n elements are set to zero
+  Vec32c &cutoff(int n)
+  {
+    if(uint32_t(n) >= 32)
+      return *this;
+    static const union
+    {
+      int32_t i[16];
+      char c[64];
+    } mask = {{-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}};
+    *this &= Vec32c().load(mask.c + 32 - n);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec32c const &insert(uint32_t index, int8_t value)
+  {
+    static const int8_t maskl[64] = {0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                     -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    __m256i broad                 = _mm256_set1_epi8(value);                            // broadcast value into all elements
+    __m256i mask = _mm256_loadu_si256((__m256i const *)(maskl + 32 - (index & 0x1F)));  // mask with FF at index position
+    ymm          = selectb(mask, broad, ymm);
+    return *this;
+  }
+  // Member function extract a single element from vector
+  int8_t extract(uint32_t index) const
+  {
+    int8_t x[32];
+    store(x);
+    return x[index & 0x1F];
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  int8_t operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec16c:
+  Vec16c get_low() const { return _mm256_castsi256_si128(ymm); }
+  Vec16c get_high() const
+  {
+#if defined(_MSC_VER) && _MSC_VER <= 1700 && !defined(__INTEL_COMPILER)
+    return _mm256_extractf128_si256(ymm, 1);  // workaround bug in MS compiler VS 11
+#else
+    return _mm256_extracti128_si256(ymm, 1);
+#endif
+  }
+  static int size() { return 32; }
+};
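+
+// Usage sketch (illustrative): processing an array whose length is not a multiple
+// of 32 with the partial load/store members above. Names are made up.
+//   int8_t data[40];
+//   Vec32c head, tail;
+//   head.load(data);                          // first 32 elements
+//   tail.load_partial(40 - 32, data + 32);    // remaining 8 elements, rest zeroed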
+
+/*****************************************************************************
+ *
+ *          Vec32cb: Vector of 32 Booleans for use with Vec32c and Vec32uc
+ *
+ *****************************************************************************/
+
+class Vec32cb : public Vec32c
+{
+ public:
+  // Default constructor:
+  Vec32cb() {}
+  // Constructor to build from all elements:
+  Vec32cb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7, bool x8, bool x9, bool x10, bool x11, bool x12,
+          bool x13, bool x14, bool x15, bool x16, bool x17, bool x18, bool x19, bool x20, bool x21, bool x22, bool x23, bool x24,
+          bool x25, bool x26, bool x27, bool x28, bool x29, bool x30, bool x31)
+      : Vec32c(-int8_t(x0), -int8_t(x1), -int8_t(x2), -int8_t(x3), -int8_t(x4), -int8_t(x5), -int8_t(x6), -int8_t(x7), -int8_t(x8),
+               -int8_t(x9), -int8_t(x10), -int8_t(x11), -int8_t(x12), -int8_t(x13), -int8_t(x14), -int8_t(x15), -int8_t(x16),
+               -int8_t(x17), -int8_t(x18), -int8_t(x19), -int8_t(x20), -int8_t(x21), -int8_t(x22), -int8_t(x23), -int8_t(x24),
+               -int8_t(x25), -int8_t(x26), -int8_t(x27), -int8_t(x28), -int8_t(x29), -int8_t(x30), -int8_t(x31))
+  {
+  }
+  // Constructor to convert from type __m256i used in intrinsics:
+  Vec32cb(__m256i const &x) { ymm = x; }
+  // Assignment operator to convert from type __m256i used in intrinsics:
+  Vec32cb &operator=(__m256i const &x)
+  {
+    ymm = x;
+    return *this;
+  }
+  // Constructor to broadcast scalar value:
+  Vec32cb(bool b) : Vec32c(-int8_t(b)) {}
+  // Assignment operator to broadcast scalar value:
+  Vec32cb &operator=(bool b)
+  {
+    *this = Vec32cb(b);
+    return *this;
+  }
+
+ private:  // Prevent constructing from int, etc.
+  Vec32cb(int b);
+  Vec32cb &operator=(int x);
+
+ public:
+  // Member functions to split into two Vec16c:
+  Vec16cb get_low() const { return Vec16cb(Vec32c::get_low()); }
+  Vec16cb get_high() const { return Vec16cb(Vec32c::get_high()); }
+  Vec32cb &insert(int index, bool a)
+  {
+    Vec32c::insert(index, -(int)a);
+    return *this;
+  }
+  // Member function extract a single element from vector
+  bool extract(uint32_t index) const { return Vec32c::extract(index) != 0; }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  bool operator[](uint32_t index) const { return extract(index); }
+};
+
+/*****************************************************************************
+ *
+ *          Define operators for Vec32cb
+ *
+ *****************************************************************************/
+
+// vector operator & : bitwise and
+static inline Vec32cb operator&(Vec32cb const &a, Vec32cb const &b) { return Vec32cb(Vec256b(a) & Vec256b(b)); }
+static inline Vec32cb operator&&(Vec32cb const &a, Vec32cb const &b) { return a & b; }
+// vector operator &= : bitwise and
+static inline Vec32cb &operator&=(Vec32cb &a, Vec32cb const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec32cb operator|(Vec32cb const &a, Vec32cb const &b) { return Vec32cb(Vec256b(a) | Vec256b(b)); }
+static inline Vec32cb operator||(Vec32cb const &a, Vec32cb const &b) { return a | b; }
+// vector operator |= : bitwise or
+static inline Vec32cb &operator|=(Vec32cb &a, Vec32cb const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec32cb operator^(Vec32cb const &a, Vec32cb const &b) { return Vec32cb(Vec256b(a) ^ Vec256b(b)); }
+// vector operator ^= : bitwise xor
+static inline Vec32cb &operator^=(Vec32cb &a, Vec32cb const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec32cb operator~(Vec32cb const &a) { return Vec32cb(~Vec256b(a)); }
+
+// vector operator ! : element not
+static inline Vec32cb operator!(Vec32cb const &a) { return ~a; }
+
+// vector function andnot
+static inline Vec32cb andnot(Vec32cb const &a, Vec32cb const &b) { return Vec32cb(andnot(Vec256b(a), Vec256b(b))); }
+
+/*****************************************************************************
+ *
+ *          Operators for Vec32c
+ *
+ *****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec32c operator+(Vec32c const &a, Vec32c const &b) { return _mm256_add_epi8(a, b); }
+
+// vector operator += : add
+static inline Vec32c &operator+=(Vec32c &a, Vec32c const &b)
+{
+  a = a + b;
+  return a;
+}
+
+// postfix operator ++
+static inline Vec32c operator++(Vec32c &a, int)
+{
+  Vec32c a0 = a;
+  a         = a + 1;
+  return a0;
+}
+
+// prefix operator ++
+static inline Vec32c &operator++(Vec32c &a)
+{
+  a = a + 1;
+  return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec32c operator-(Vec32c const &a, Vec32c const &b) { return _mm256_sub_epi8(a, b); }
+
+// vector operator - : unary minus
+static inline Vec32c operator-(Vec32c const &a) { return _mm256_sub_epi8(_mm256_setzero_si256(), a); }
+
+// vector operator -= : subtract
+static inline Vec32c &operator-=(Vec32c &a, Vec32c const &b)
+{
+  a = a - b;
+  return a;
+}
+
+// postfix operator --
+static inline Vec32c operator--(Vec32c &a, int)
+{
+  Vec32c a0 = a;
+  a         = a - 1;
+  return a0;
+}
+
+// prefix operator --
+static inline Vec32c &operator--(Vec32c &a)
+{
+  a = a - 1;
+  return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec32c operator*(Vec32c const &a, Vec32c const &b)
+{
+  // There is no 8-bit multiply in AVX2. Split into two 16-bit multiplies
+  __m256i aodd    = _mm256_srli_epi16(a, 8);         // odd numbered elements of a
+  __m256i bodd    = _mm256_srli_epi16(b, 8);         // odd numbered elements of b
+  __m256i muleven = _mm256_mullo_epi16(a, b);        // product of even numbered elements
+  __m256i mulodd  = _mm256_mullo_epi16(aodd, bodd);  // product of odd  numbered elements
+  mulodd          = _mm256_slli_epi16(mulodd, 8);    // put odd numbered elements back in place
+  __m256i mask    = _mm256_set1_epi32(0x00FF00FF);   // mask for even positions
+  __m256i product = selectb(mask, muleven, mulodd);  // interleave even and odd
+  return product;
+}
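+
+// Example of the wrap-around semantics (illustrative): for 8-bit elements,
+// 100 * 3 = 300 = 0x12C truncates to 0x2C = 44 in the result.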
+
+// vector operator *= : multiply
+static inline Vec32c &operator*=(Vec32c &a, Vec32c const &b)
+{
+  a = a * b;
+  return a;
+}
+
+// vector operator << : shift left all elements
+static inline Vec32c operator<<(Vec32c const &a, int b)
+{
+  uint32_t mask = (uint32_t)0xFF >> (uint32_t)b;                      // mask to remove bits that are shifted out
+  __m256i am    = _mm256_and_si256(a, _mm256_set1_epi8((char)mask));  // remove bits that will overflow
+  __m256i res   = _mm256_sll_epi16(am, _mm_cvtsi32_si128(b));         // 16-bit shifts
+  return res;
+}
+
+// vector operator <<= : shift left
+static inline Vec32c &operator<<=(Vec32c &a, int b)
+{
+  a = a << b;
+  return a;
+}
+
+// vector operator >> : shift right arithmetic all elements
+static inline Vec32c operator>>(Vec32c const &a, int b)
+{
+  __m256i aeven = _mm256_slli_epi16(a, 8);                            // even numbered elements of a. get sign bit in position
+  aeven         = _mm256_sra_epi16(aeven, _mm_cvtsi32_si128(b + 8));  // shift arithmetic, back to position
+  __m256i aodd  = _mm256_sra_epi16(a, _mm_cvtsi32_si128(b));          // shift odd numbered elements arithmetic
+  __m256i mask  = _mm256_set1_epi32(0x00FF00FF);                      // mask for even positions
+  __m256i res   = selectb(mask, aeven, aodd);                         // interleave even and odd
+  return res;
+}
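+
+// Example (illustrative): the shift is arithmetic for this signed type, so an
+// element holding -8 becomes -4 after a >> 1; the unsigned Vec32uc overload
+// further below shifts in zero bits instead.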
+
+// vector operator >>= : shift right arithmetic
+static inline Vec32c &operator>>=(Vec32c &a, int b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec32cb operator==(Vec32c const &a, Vec32c const &b) { return _mm256_cmpeq_epi8(a, b); }
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec32cb operator!=(Vec32c const &a, Vec32c const &b) { return Vec32cb(Vec32c(~(a == b))); }
+
+// vector operator > : returns true for elements for which a > b (signed)
+static inline Vec32cb operator>(Vec32c const &a, Vec32c const &b) { return _mm256_cmpgt_epi8(a, b); }
+
+// vector operator < : returns true for elements for which a < b (signed)
+static inline Vec32cb operator<(Vec32c const &a, Vec32c const &b) { return b > a; }
+
+// vector operator >= : returns true for elements for which a >= b (signed)
+static inline Vec32cb operator>=(Vec32c const &a, Vec32c const &b) { return Vec32cb(Vec32c(~(b > a))); }
+
+// vector operator <= : returns true for elements for which a <= b (signed)
+static inline Vec32cb operator<=(Vec32c const &a, Vec32c const &b) { return b >= a; }
+
+// vector operator & : bitwise and
+static inline Vec32c operator&(Vec32c const &a, Vec32c const &b) { return Vec32c(Vec256b(a) & Vec256b(b)); }
+static inline Vec32c operator&&(Vec32c const &a, Vec32c const &b) { return a & b; }
+// vector operator &= : bitwise and
+static inline Vec32c &operator&=(Vec32c &a, Vec32c const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec32c operator|(Vec32c const &a, Vec32c const &b) { return Vec32c(Vec256b(a) | Vec256b(b)); }
+static inline Vec32c operator||(Vec32c const &a, Vec32c const &b) { return a | b; }
+// vector operator |= : bitwise or
+static inline Vec32c &operator|=(Vec32c &a, Vec32c const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec32c operator^(Vec32c const &a, Vec32c const &b) { return Vec32c(Vec256b(a) ^ Vec256b(b)); }
+// vector operator ^= : bitwise xor
+static inline Vec32c &operator^=(Vec32c &a, Vec32c const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec32c operator~(Vec32c const &a) { return Vec32c(~Vec256b(a)); }
+
+// vector operator ! : logical not, returns true for elements == 0
+static inline Vec32cb operator!(Vec32c const &a) { return _mm256_cmpeq_epi8(a, _mm256_setzero_si256()); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 32; i++) result[i] = s[i] ? a[i] : b[i];
+// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed.
+static inline Vec32c select(Vec32cb const &s, Vec32c const &a, Vec32c const &b) { return selectb(s, a, b); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec32c if_add(Vec32cb const &f, Vec32c const &a, Vec32c const &b) { return a + (Vec32c(f) & b); }
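+
+// Example (illustrative): if_add only adds where the mask element is true.
+//   Vec32c acc(10), inc(1);
+//   Vec32cb m = acc > Vec32c(5);    // all true here
+//   acc = if_add(m, acc, inc);      // every element becomes 11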
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline int32_t horizontal_add(Vec32c const &a)
+{
+  __m256i sum1 = _mm256_sad_epu8(a, _mm256_setzero_si256());
+  __m256i sum2 = _mm256_shuffle_epi32(sum1, 2);
+  __m256i sum3 = _mm256_add_epi16(sum1, sum2);
+#if defined(_MSC_VER) && _MSC_VER <= 1700 && !defined(__INTEL_COMPILER)
+  __m128i sum4 = _mm256_extractf128_si256(sum3, 1);  // bug in MS VS 11
+#else
+  __m128i sum4 = _mm256_extracti128_si256(sum3, 1);
+#endif
+  __m128i sum5 = _mm_add_epi16(_mm256_castsi256_si128(sum3), sum4);
+  int8_t sum6  = (int8_t)_mm_cvtsi128_si32(sum5);  // truncate to 8 bits
+  return sum6;                                     // sign extend to 32 bits
+}
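+
+// Example (illustrative): the result is truncated to 8 bits before sign extension,
+// so 32 elements of value 8 sum to 256 and wrap to 0; use horizontal_add_x below
+// when the true sum is needed.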
+
+// Horizontal add extended: Calculates the sum of all vector elements.
+// Each element is sign-extended before addition to avoid overflow
+static inline int32_t horizontal_add_x(Vec32c const &a)
+{
+  __m256i aeven = _mm256_slli_epi16(a, 8);        // even numbered elements of a. get sign bit in position
+  aeven         = _mm256_srai_epi16(aeven, 8);    // sign extend even numbered elements
+  __m256i aodd  = _mm256_srai_epi16(a, 8);        // sign extend odd  numbered elements
+  __m256i sum1  = _mm256_add_epi16(aeven, aodd);  // add even and odd elements
+  __m256i sum2  = _mm256_hadd_epi16(sum1, sum1);  // horizontally add 2x8 elements in 3 steps
+  __m256i sum3  = _mm256_hadd_epi16(sum2, sum2);
+  __m256i sum4  = _mm256_hadd_epi16(sum3, sum3);
+#if defined(_MSC_VER) && _MSC_VER <= 1700 && !defined(__INTEL_COMPILER)
+  __m128i sum5  = _mm256_extractf128_si256(sum4, 1);                  // bug in MS VS 11
+#else
+  __m128i sum5 = _mm256_extracti128_si256(sum4, 1);  // get high sum
+#endif
+  __m128i sum6  = _mm_add_epi16(_mm256_castsi256_si128(sum4), sum5);  // add high and low sum
+  int16_t sum7  = (int16_t)_mm_cvtsi128_si32(sum6);                   // 16 bit sum
+  return sum7;                                                        // sign extend to 32 bits
+}
+
+// function add_saturated: add element by element, signed with saturation
+static inline Vec32c add_saturated(Vec32c const &a, Vec32c const &b) { return _mm256_adds_epi8(a, b); }
+
+// function sub_saturated: subtract element by element, signed with saturation
+static inline Vec32c sub_saturated(Vec32c const &a, Vec32c const &b) { return _mm256_subs_epi8(a, b); }
+
+// function max: a > b ? a : b
+static inline Vec32c max(Vec32c const &a, Vec32c const &b) { return _mm256_max_epi8(a, b); }
+
+// function min: a < b ? a : b
+static inline Vec32c min(Vec32c const &a, Vec32c const &b) { return _mm256_min_epi8(a, b); }
+
+// function abs: a >= 0 ? a : -a
+static inline Vec32c abs(Vec32c const &a) { return _mm256_sign_epi8(a, a); }
+
+// function abs_saturated: same as abs, saturate if overflow
+static inline Vec32c abs_saturated(Vec32c const &a)
+{
+  __m256i absa   = abs(a);                                           // abs(a)
+  __m256i overfl = _mm256_cmpgt_epi8(_mm256_setzero_si256(), absa);  // 0 > a
+  return _mm256_add_epi8(absa, overfl);                              // subtract 1 if 0x80
+}
+
+// function rotate_left: rotate the bits of each element left by b
+// Use a negative count to rotate right
+static inline Vec32c rotate_left(Vec32c const &a, int b)
+{
+  __m128i bb        = _mm_cvtsi32_si128(b & 7);              // b modulo 8
+  __m128i mbb       = _mm_cvtsi32_si128((8 - b) & 7);        // 8-b modulo 8
+  __m256i maskeven  = _mm256_set1_epi32(0x00FF00FF);         // mask for even numbered bytes
+  __m256i even      = _mm256_and_si256(a, maskeven);         // even numbered bytes of a
+  __m256i odd       = _mm256_andnot_si256(maskeven, a);      // odd numbered bytes of a
+  __m256i evenleft  = _mm256_sll_epi16(even, bb);            // even bytes of a << b
+  __m256i oddleft   = _mm256_sll_epi16(odd, bb);             // odd  bytes of a << b
+  __m256i evenright = _mm256_srl_epi16(even, mbb);           // even bytes of a >> 8-b
+  __m256i oddright  = _mm256_srl_epi16(odd, mbb);            // odd  bytes of a >> 8-b
+  __m256i evenrot   = _mm256_or_si256(evenleft, evenright);  // even bytes of a rotated
+  __m256i oddrot    = _mm256_or_si256(oddleft, oddright);    // odd  bytes of a rotated
+  __m256i allrot    = selectb(maskeven, evenrot, oddrot);    // all  bytes rotated
+  return allrot;
+}
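+
+// Example (illustrative): rotate_left(Vec32c(0x81), 1) gives 0x03 in every element,
+// since the top bit wraps around to bit 0.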
+
+/*****************************************************************************
+ *
+ *          Vector of 32 8-bit unsigned integers
+ *
+ *****************************************************************************/
+
+class Vec32uc : public Vec32c
+{
+ public:
+  // Default constructor:
+  Vec32uc() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec32uc(uint32_t i) { ymm = _mm256_set1_epi8((char)i); }
+  // Constructor to build from all elements:
+  Vec32uc(uint8_t i0, uint8_t i1, uint8_t i2, uint8_t i3, uint8_t i4, uint8_t i5, uint8_t i6, uint8_t i7, uint8_t i8, uint8_t i9,
+          uint8_t i10, uint8_t i11, uint8_t i12, uint8_t i13, uint8_t i14, uint8_t i15, uint8_t i16, uint8_t i17, uint8_t i18,
+          uint8_t i19, uint8_t i20, uint8_t i21, uint8_t i22, uint8_t i23, uint8_t i24, uint8_t i25, uint8_t i26, uint8_t i27,
+          uint8_t i28, uint8_t i29, uint8_t i30, uint8_t i31)
+  {
+    ymm = _mm256_setr_epi8(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22,
+                           i23, i24, i25, i26, i27, i28, i29, i30, i31);
+  }
+  // Constructor to build from two Vec16uc:
+  Vec32uc(Vec16uc const &a0, Vec16uc const &a1) { ymm = set_m128ir(a0, a1); }
+  // Constructor to convert from type __m256i used in intrinsics:
+  Vec32uc(__m256i const &x) { ymm = x; }
+  // Assignment operator to convert from type __m256i used in intrinsics:
+  Vec32uc &operator=(__m256i const &x)
+  {
+    ymm = x;
+    return *this;
+  }
+  // Member function to load from array (unaligned)
+  Vec32uc &load(void const *p)
+  {
+    ymm = _mm256_loadu_si256((__m256i const *)p);
+    return *this;
+  }
+  // Member function to load from array, aligned by 32
+  Vec32uc &load_a(void const *p)
+  {
+    ymm = _mm256_load_si256((__m256i const *)p);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec32uc const &insert(uint32_t index, uint8_t value)
+  {
+    Vec32c::insert(index, value);
+    return *this;
+  }
+  // Member function extract a single element from vector
+  uint8_t extract(uint32_t index) const { return Vec32c::extract(index); }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  uint8_t operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec16uc:
+  Vec16uc get_low() const { return _mm256_castsi256_si128(ymm); }
+  Vec16uc get_high() const { return _mm256_extractf128_si256(ymm, 1); }
+};
+
+// Define operators for this class
+
+// vector operator + : add
+static inline Vec32uc operator+(Vec32uc const &a, Vec32uc const &b) { return Vec32uc(Vec32c(a) + Vec32c(b)); }
+
+// vector operator - : subtract
+static inline Vec32uc operator-(Vec32uc const &a, Vec32uc const &b) { return Vec32uc(Vec32c(a) - Vec32c(b)); }
+
+// vector operator * : multiply
+static inline Vec32uc operator*(Vec32uc const &a, Vec32uc const &b) { return Vec32uc(Vec32c(a) * Vec32c(b)); }
+
+// vector operator << : shift left all elements
+static inline Vec32uc operator<<(Vec32uc const &a, uint32_t b)
+{
+  uint32_t mask = (uint32_t)0xFF >> (uint32_t)b;                      // mask to remove bits that are shifted out
+  __m256i am    = _mm256_and_si256(a, _mm256_set1_epi8((char)mask));  // remove bits that will overflow
+  __m256i res   = _mm256_sll_epi16(am, _mm_cvtsi32_si128(b));         // 16-bit shifts
+  return res;
+}
+
+// vector operator << : shift left all elements
+static inline Vec32uc operator<<(Vec32uc const &a, int32_t b) { return a << (uint32_t)b; }
+
+// vector operator >> : shift right logical all elements
+static inline Vec32uc operator>>(Vec32uc const &a, uint32_t b)
+{
+  uint32_t mask = (uint32_t)0xFF << (uint32_t)b;                      // mask to remove bits that are shifted out
+  __m256i am    = _mm256_and_si256(a, _mm256_set1_epi8((char)mask));  // remove bits that will overflow
+  __m256i res   = _mm256_srl_epi16(am, _mm_cvtsi32_si128(b));         // 16-bit shifts
+  return res;
+}
+
+// vector operator >> : shift right logical all elements
+static inline Vec32uc operator>>(Vec32uc const &a, int32_t b) { return a >> (uint32_t)b; }
+
+// vector operator >>= : shift right logical
+static inline Vec32uc &operator>>=(Vec32uc &a, uint32_t b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator >= : returns true for elements for which a >= b (unsigned)
+static inline Vec32cb operator>=(Vec32uc const &a, Vec32uc const &b)
+{
+  return _mm256_cmpeq_epi8(_mm256_max_epu8(a, b), a);  // a == max(a,b)
+}
+
+// vector operator <= : returns true for elements for which a <= b (unsigned)
+static inline Vec32cb operator<=(Vec32uc const &a, Vec32uc const &b) { return b >= a; }
+
+// vector operator > : returns true for elements for which a > b (unsigned)
+static inline Vec32cb operator>(Vec32uc const &a, Vec32uc const &b) { return Vec32cb(Vec32c(~(b >= a))); }
+
+// vector operator < : returns true for elements for which a < b (unsigned)
+static inline Vec32cb operator<(Vec32uc const &a, Vec32uc const &b) { return b > a; }
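+
+// Note (illustrative): AVX2 has no unsigned byte compare-greater instruction, so
+// a >= b is implemented above as a == max(a,b); for example an element pair
+// a = 200, b = 100 compares as true because max(a,b) == a.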
+
+// vector operator & : bitwise and
+static inline Vec32uc operator&(Vec32uc const &a, Vec32uc const &b) { return Vec32uc(Vec256b(a) & Vec256b(b)); }
+static inline Vec32uc operator&&(Vec32uc const &a, Vec32uc const &b) { return a & b; }
+
+// vector operator | : bitwise or
+static inline Vec32uc operator|(Vec32uc const &a, Vec32uc const &b) { return Vec32uc(Vec256b(a) | Vec256b(b)); }
+static inline Vec32uc operator||(Vec32uc const &a, Vec32uc const &b) { return a | b; }
+
+// vector operator ^ : bitwise xor
+static inline Vec32uc operator^(Vec32uc const &a, Vec32uc const &b) { return Vec32uc(Vec256b(a) ^ Vec256b(b)); }
+
+// vector operator ~ : bitwise not
+static inline Vec32uc operator~(Vec32uc const &a) { return Vec32uc(~Vec256b(a)); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 32; i++) result[i] = s[i] ? a[i] : b[i];
+// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec32uc select(Vec32cb const &s, Vec32uc const &a, Vec32uc const &b) { return selectb(s, a, b); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec32uc if_add(Vec32cb const &f, Vec32uc const &a, Vec32uc const &b) { return a + (Vec32uc(f) & b); }
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+// (Note: horizontal_add_x(Vec32uc) is slightly faster)
+static inline uint32_t horizontal_add(Vec32uc const &a)
+{
+  __m256i sum1 = _mm256_sad_epu8(a, _mm256_setzero_si256());
+  __m256i sum2 = _mm256_shuffle_epi32(sum1, 2);
+  __m256i sum3 = _mm256_add_epi16(sum1, sum2);
+#if defined(_MSC_VER) && _MSC_VER <= 1700 && !defined(__INTEL_COMPILER)
+  __m128i sum4 = _mm256_extractf128_si256(sum3, 1);  // bug in MS compiler VS 11
+#else
+  __m128i sum4 = _mm256_extracti128_si256(sum3, 1);
+#endif
+  __m128i sum5 = _mm_add_epi16(_mm256_castsi256_si128(sum3), sum4);
+  uint8_t sum6 = (uint8_t)_mm_cvtsi128_si32(sum5);  // truncate to 8 bits
+  return sum6;                                      // zero extend to 32 bits
+}
+
+// Horizontal add extended: Calculates the sum of all vector elements.
+// Each element is zero-extended before addition to avoid overflow
+static inline uint32_t horizontal_add_x(Vec32uc const &a)
+{
+  __m256i sum1 = _mm256_sad_epu8(a, _mm256_setzero_si256());
+  __m256i sum2 = _mm256_shuffle_epi32(sum1, 2);
+  __m256i sum3 = _mm256_add_epi16(sum1, sum2);
+#if defined(_MSC_VER) && _MSC_VER <= 1700 && !defined(__INTEL_COMPILER)
+  __m128i sum4 = _mm256_extractf128_si256(sum3, 1);  // bug in MS compiler VS 11
+#else
+  __m128i sum4 = _mm256_extracti128_si256(sum3, 1);
+#endif
+  __m128i sum5 = _mm_add_epi16(_mm256_castsi256_si128(sum3), sum4);
+  return _mm_cvtsi128_si32(sum5);
+}
+
+// function add_saturated: add element by element, unsigned with saturation
+static inline Vec32uc add_saturated(Vec32uc const &a, Vec32uc const &b) { return _mm256_adds_epu8(a, b); }
+
+// function sub_saturated: subtract element by element, unsigned with saturation
+static inline Vec32uc sub_saturated(Vec32uc const &a, Vec32uc const &b) { return _mm256_subs_epu8(a, b); }
+
+// function max: a > b ? a : b
+static inline Vec32uc max(Vec32uc const &a, Vec32uc const &b) { return _mm256_max_epu8(a, b); }
+
+// function min: a < b ? a : b
+static inline Vec32uc min(Vec32uc const &a, Vec32uc const &b) { return _mm256_min_epu8(a, b); }
+
+/*****************************************************************************
+ *
+ *          Vector of 16 16-bit signed integers
+ *
+ *****************************************************************************/
+
+class Vec16s : public Vec256b
+{
+ public:
+  // Default constructor:
+  Vec16s() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec16s(int i) { ymm = _mm256_set1_epi16((int16_t)i); }
+  // Constructor to build from all elements:
+  Vec16s(int16_t i0, int16_t i1, int16_t i2, int16_t i3, int16_t i4, int16_t i5, int16_t i6, int16_t i7, int16_t i8, int16_t i9,
+         int16_t i10, int16_t i11, int16_t i12, int16_t i13, int16_t i14, int16_t i15)
+  {
+    ymm = _mm256_setr_epi16(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15);
+  }
+  // Constructor to build from two Vec8s:
+  Vec16s(Vec8s const &a0, Vec8s const &a1) { ymm = set_m128ir(a0, a1); }
+  // Constructor to convert from type __m256i used in intrinsics:
+  Vec16s(__m256i const &x) { ymm = x; }
+  // Assignment operator to convert from type __m256i used in intrinsics:
+  Vec16s &operator=(__m256i const &x)
+  {
+    ymm = x;
+    return *this;
+  }
+  // Type cast operator to convert to __m256i used in intrinsics
+  operator __m256i() const { return ymm; }
+  // Member function to load from array (unaligned)
+  Vec16s &load(void const *p)
+  {
+    ymm = _mm256_loadu_si256((__m256i const *)p);
+    return *this;
+  }
+  // Member function to load from array, aligned by 32
+  Vec16s &load_a(void const *p)
+  {
+    ymm = _mm256_load_si256((__m256i const *)p);
+    return *this;
+  }
+  // Partial load. Load n elements and set the rest to 0
+  Vec16s &load_partial(int n, void const *p)
+  {
+    if(n <= 0)
+      {
+        *this = 0;
+      }
+    else if(n <= 8)
+      {
+        *this = Vec16s(Vec8s().load_partial(n, p), 0);
+      }
+    else if(n < 16)
+      {
+        *this = Vec16s(Vec8s().load(p), Vec8s().load_partial(n - 8, (int16_t const *)p + 8));
+      }
+    else
+      {
+        load(p);
+      }
+    return *this;
+  }
+  // Partial store. Store n elements
+  void store_partial(int n, void *p) const
+  {
+    if(n <= 0)
+      {
+        return;
+      }
+    else if(n <= 8)
+      {
+        get_low().store_partial(n, p);
+      }
+    else if(n < 16)
+      {
+        get_low().store(p);
+        get_high().store_partial(n - 8, (int16_t *)p + 8);
+      }
+    else
+      {
+        store(p);
+      }
+  }
+  // cut off vector to n elements. The last 16-n elements are set to zero
+  Vec16s &cutoff(int n)
+  {
+    *this = Vec32c(*this).cutoff(n * 2);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec16s const &insert(uint32_t index, int16_t value)
+  {
+    static const int16_t m[32] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    __m256i mask               = Vec256b().load(m + 16 - (index & 0x0F));
+    __m256i broad              = _mm256_set1_epi16(value);
+    ymm                        = selectb(mask, broad, ymm);
+    return *this;
+  }
+  // Member function extract a single element from vector
+  int16_t extract(uint32_t index) const
+  {
+    int16_t x[16];
+    store(x);
+    return x[index & 0x0F];
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  int16_t operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec8s:
+  Vec8s get_low() const { return _mm256_castsi256_si128(ymm); }
+  Vec8s get_high() const { return _mm256_extractf128_si256(ymm, 1); }
+  static int size() { return 16; }
+};
+
+/*****************************************************************************
+ *
+ *          Vec16sb: Vector of 16 Booleans for use with Vec16s and Vec16us
+ *
+ *****************************************************************************/
+class Vec16sb : public Vec16s
+{
+ public:
+  // Default constructor:
+  Vec16sb() {}
+  // Constructor to build from all elements:
+  Vec16sb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7, bool x8, bool x9, bool x10, bool x11, bool x12,
+          bool x13, bool x14, bool x15)
+      : Vec16s(-int16_t(x0), -int16_t(x1), -int16_t(x2), -int16_t(x3), -int16_t(x4), -int16_t(x5), -int16_t(x6), -int16_t(x7),
+               -int16_t(x8), -int16_t(x9), -int16_t(x10), -int16_t(x11), -int16_t(x12), -int16_t(x13), -int16_t(x14), -int16_t(x15))
+  {
+  }
+  // Constructor to convert from type __m256i used in intrinsics:
+  Vec16sb(__m256i const &x) { ymm = x; }
+  // Assignment operator to convert from type __m256i used in intrinsics:
+  Vec16sb &operator=(__m256i const &x)
+  {
+    ymm = x;
+    return *this;
+  }
+  // Constructor to broadcast scalar value:
+  Vec16sb(bool b) : Vec16s(-int16_t(b)) {}
+  // Assignment operator to broadcast scalar value:
+  Vec16sb &operator=(bool b)
+  {
+    *this = Vec16sb(b);
+    return *this;
+  }
+
+ private:  // Prevent constructing from int, etc.
+  Vec16sb(int b);
+  Vec16sb &operator=(int x);
+
+ public:
+  Vec8sb get_low() const { return Vec8sb(Vec16s::get_low()); }
+  Vec8sb get_high() const { return Vec8sb(Vec16s::get_high()); }
+  Vec16sb &insert(int index, bool a)
+  {
+    Vec16s::insert(index, -(int)a);
+    return *this;
+  }
+  // Member function extract a single element from vector
+  bool extract(uint32_t index) const { return Vec16s::extract(index) != 0; }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  bool operator[](uint32_t index) const { return extract(index); }
+};
+
+/*****************************************************************************
+ *
+ *          Define operators for Vec16sb
+ *
+ *****************************************************************************/
+
+// vector operator & : bitwise and
+static inline Vec16sb operator&(Vec16sb const &a, Vec16sb const &b) { return Vec16sb(Vec256b(a) & Vec256b(b)); }
+static inline Vec16sb operator&&(Vec16sb const &a, Vec16sb const &b) { return a & b; }
+// vector operator &= : bitwise and
+static inline Vec16sb &operator&=(Vec16sb &a, Vec16sb const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec16sb operator|(Vec16sb const &a, Vec16sb const &b) { return Vec16sb(Vec256b(a) | Vec256b(b)); }
+static inline Vec16sb operator||(Vec16sb const &a, Vec16sb const &b) { return a | b; }
+// vector operator |= : bitwise or
+static inline Vec16sb &operator|=(Vec16sb &a, Vec16sb const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec16sb operator^(Vec16sb const &a, Vec16sb const &b) { return Vec16sb(Vec256b(a) ^ Vec256b(b)); }
+// vector operator ^= : bitwise xor
+static inline Vec16sb &operator^=(Vec16sb &a, Vec16sb const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec16sb operator~(Vec16sb const &a) { return Vec16sb(~Vec256b(a)); }
+
+// vector operator ! : element not
+static inline Vec16sb operator!(Vec16sb const &a) { return ~a; }
+
+// vector function andnot
+static inline Vec16sb andnot(Vec16sb const &a, Vec16sb const &b) { return Vec16sb(andnot(Vec256b(a), Vec256b(b))); }
+
+/*****************************************************************************
+ *
+ *          Operators for Vec16s
+ *
+ *****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec16s operator+(Vec16s const &a, Vec16s const &b) { return _mm256_add_epi16(a, b); }
+
+// vector operator += : add
+static inline Vec16s &operator+=(Vec16s &a, Vec16s const &b)
+{
+  a = a + b;
+  return a;
+}
+
+// postfix operator ++
+static inline Vec16s operator++(Vec16s &a, int)
+{
+  Vec16s a0 = a;
+  a         = a + 1;
+  return a0;
+}
+
+// prefix operator ++
+static inline Vec16s &operator++(Vec16s &a)
+{
+  a = a + 1;
+  return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec16s operator-(Vec16s const &a, Vec16s const &b) { return _mm256_sub_epi16(a, b); }
+
+// vector operator - : unary minus
+static inline Vec16s operator-(Vec16s const &a) { return _mm256_sub_epi16(_mm256_setzero_si256(), a); }
+
+// vector operator -= : subtract
+static inline Vec16s &operator-=(Vec16s &a, Vec16s const &b)
+{
+  a = a - b;
+  return a;
+}
+
+// postfix operator --
+static inline Vec16s operator--(Vec16s &a, int)
+{
+  Vec16s a0 = a;
+  a         = a - 1;
+  return a0;
+}
+
+// prefix operator --
+static inline Vec16s &operator--(Vec16s &a)
+{
+  a = a - 1;
+  return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec16s operator*(Vec16s const &a, Vec16s const &b) { return _mm256_mullo_epi16(a, b); }
+
+// vector operator *= : multiply
+static inline Vec16s &operator*=(Vec16s &a, Vec16s const &b)
+{
+  a = a * b;
+  return a;
+}
+
+// vector operator / : divide all elements by same integer
+// See bottom of file
+
+// vector operator << : shift left
+static inline Vec16s operator<<(Vec16s const &a, int b) { return _mm256_sll_epi16(a, _mm_cvtsi32_si128(b)); }
+
+// vector operator <<= : shift left
+static inline Vec16s &operator<<=(Vec16s &a, int b)
+{
+  a = a << b;
+  return a;
+}
+
+// vector operator >> : shift right arithmetic
+static inline Vec16s operator>>(Vec16s const &a, int b) { return _mm256_sra_epi16(a, _mm_cvtsi32_si128(b)); }
+
+// vector operator >>= : shift right arithmetic
+static inline Vec16s &operator>>=(Vec16s &a, int b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec16sb operator==(Vec16s const &a, Vec16s const &b) { return _mm256_cmpeq_epi16(a, b); }
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec16sb operator!=(Vec16s const &a, Vec16s const &b) { return Vec16sb(Vec16s(~(a == b))); }
+
+// vector operator > : returns true for elements for which a > b
+static inline Vec16sb operator>(Vec16s const &a, Vec16s const &b) { return _mm256_cmpgt_epi16(a, b); }
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec16sb operator<(Vec16s const &a, Vec16s const &b) { return b > a; }
+
+// vector operator >= : returns true for elements for which a >= b (signed)
+static inline Vec16sb operator>=(Vec16s const &a, Vec16s const &b) { return Vec16sb(Vec16s(~(b > a))); }
+
+// vector operator <= : returns true for elements for which a <= b (signed)
+static inline Vec16sb operator<=(Vec16s const &a, Vec16s const &b) { return b >= a; }
+
+// vector operator & : bitwise and
+static inline Vec16s operator&(Vec16s const &a, Vec16s const &b) { return Vec16s(Vec256b(a) & Vec256b(b)); }
+static inline Vec16s operator&&(Vec16s const &a, Vec16s const &b) { return a & b; }
+// vector operator &= : bitwise and
+static inline Vec16s &operator&=(Vec16s &a, Vec16s const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec16s operator|(Vec16s const &a, Vec16s const &b) { return Vec16s(Vec256b(a) | Vec256b(b)); }
+static inline Vec16s operator||(Vec16s const &a, Vec16s const &b) { return a | b; }
+// vector operator |= : bitwise or
+static inline Vec16s &operator|=(Vec16s &a, Vec16s const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec16s operator^(Vec16s const &a, Vec16s const &b) { return Vec16s(Vec256b(a) ^ Vec256b(b)); }
+// vector operator ^= : bitwise xor
+static inline Vec16s &operator^=(Vec16s &a, Vec16s const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec16s operator~(Vec16s const &a) { return Vec16s(~Vec256b(a)); }
+
+// vector operator ! : logical not, returns true for elements == 0
+static inline Vec16sb operator!(Vec16s const &a) { return _mm256_cmpeq_epi16(a, _mm256_setzero_si256()); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
+// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec16s select(Vec16sb const &s, Vec16s const &a, Vec16s const &b) { return selectb(s, a, b); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec16s if_add(Vec16sb const &f, Vec16s const &a, Vec16s const &b) { return a + (Vec16s(f) & b); }
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline int32_t horizontal_add(Vec16s const &a)
+{
+  __m256i sum1 = _mm256_hadd_epi16(a, a);  // horizontally add 2x8 elements in 3 steps
+  __m256i sum2 = _mm256_hadd_epi16(sum1, sum1);
+  __m256i sum3 = _mm256_hadd_epi16(sum2, sum2);
+#if defined(_MSC_VER) && _MSC_VER <= 1700 && !defined(__INTEL_COMPILER)
+  __m128i sum4 = _mm256_extractf128_si256(sum3, 1);                  // bug in MS compiler VS 11
+#else
+  __m128i sum4 = _mm256_extracti128_si256(sum3, 1);  // get high part
+#endif
+  __m128i sum5 = _mm_add_epi16(_mm256_castsi256_si128(sum3), sum4);  // add low and high parts
+  int16_t sum6 = (int16_t)_mm_cvtsi128_si32(sum5);                   // truncate to 16 bits
+  return sum6;                                                       // sign extend to 32 bits
+}
+
+// Horizontal add extended: Calculates the sum of all vector elements.
+// Elements are sign extended before adding to avoid overflow
+static inline int32_t horizontal_add_x(Vec16s const &a)
+{
+  __m256i aeven = _mm256_slli_epi32(a, 16);       // even numbered elements of a. get sign bit in position
+  aeven         = _mm256_srai_epi32(aeven, 16);   // sign extend even numbered elements
+  __m256i aodd  = _mm256_srai_epi32(a, 16);       // sign extend odd  numbered elements
+  __m256i sum1  = _mm256_add_epi32(aeven, aodd);  // add even and odd elements
+  __m256i sum2  = _mm256_hadd_epi32(sum1, sum1);  // horizontally add 2x4 elements in 2 steps
+  __m256i sum3  = _mm256_hadd_epi32(sum2, sum2);
+#if defined(_MSC_VER) && _MSC_VER <= 1700 && !defined(__INTEL_COMPILER)
+  __m128i sum4  = _mm256_extractf128_si256(sum3, 1);  // bug in MS compiler VS 11
+#else
+  __m128i sum4 = _mm256_extracti128_si256(sum3, 1);
+#endif
+  __m128i sum5  = _mm_add_epi32(_mm256_castsi256_si128(sum3), sum4);
+  return _mm_cvtsi128_si32(sum5);
+}
+
+// function add_saturated: add element by element, signed with saturation
+static inline Vec16s add_saturated(Vec16s const &a, Vec16s const &b) { return _mm256_adds_epi16(a, b); }
+
+// function sub_saturated: subtract element by element, signed with saturation
+static inline Vec16s sub_saturated(Vec16s const &a, Vec16s const &b) { return _mm256_subs_epi16(a, b); }
+
+// function max: a > b ? a : b
+static inline Vec16s max(Vec16s const &a, Vec16s const &b) { return _mm256_max_epi16(a, b); }
+
+// function min: a < b ? a : b
+static inline Vec16s min(Vec16s const &a, Vec16s const &b) { return _mm256_min_epi16(a, b); }
+
+// function abs: a >= 0 ? a : -a
+static inline Vec16s abs(Vec16s const &a) { return _mm256_sign_epi16(a, a); }
+
+// function abs_saturated: same as abs, saturate if overflow
+static inline Vec16s abs_saturated(Vec16s const &a)
+{
+  __m256i absa   = abs(a);                       // abs(a)
+  __m256i overfl = _mm256_srai_epi16(absa, 15);  // sign
+  return _mm256_add_epi16(absa, overfl);         // subtract 1 if 0x8000
+}
+
+// function rotate_left: rotate the bits of each element left by b
+// Use a negative count to rotate right
+static inline Vec16s rotate_left(Vec16s const &a, int b)
+{
+  __m256i left  = _mm256_sll_epi16(a, _mm_cvtsi32_si128(b & 0x0F));         // a << b
+  __m256i right = _mm256_srl_epi16(a, _mm_cvtsi32_si128((16 - b) & 0x0F));  // a >> (16 - b)
+  __m256i rot   = _mm256_or_si256(left, right);                             // or
+  return rot;
+}
+
+/*****************************************************************************
+ *
+ *          Vector of 16 16-bit unsigned integers
+ *
+ *****************************************************************************/
+
+class Vec16us : public Vec16s
+{
+ public:
+  // Default constructor:
+  Vec16us() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec16us(uint32_t i) { ymm = _mm256_set1_epi16((int16_t)i); }
+  // Constructor to build from all elements:
+  Vec16us(uint16_t i0, uint16_t i1, uint16_t i2, uint16_t i3, uint16_t i4, uint16_t i5, uint16_t i6, uint16_t i7, uint16_t i8,
+          uint16_t i9, uint16_t i10, uint16_t i11, uint16_t i12, uint16_t i13, uint16_t i14, uint16_t i15)
+  {
+    ymm = _mm256_setr_epi16(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15);
+  }
+  // Constructor to build from two Vec8us:
+  Vec16us(Vec8us const &a0, Vec8us const &a1) { ymm = set_m128ir(a0, a1); }
+  // Constructor to convert from type __m256i used in intrinsics:
+  Vec16us(__m256i const &x) { ymm = x; }
+  // Assignment operator to convert from type __m256i used in intrinsics:
+  Vec16us &operator=(__m256i const &x)
+  {
+    ymm = x;
+    return *this;
+  }
+  // Member function to load from array (unaligned)
+  Vec16us &load(void const *p)
+  {
+    ymm = _mm256_loadu_si256((__m256i const *)p);
+    return *this;
+  }
+  // Member function to load from array, aligned by 32
+  Vec16us &load_a(void const *p)
+  {
+    ymm = _mm256_load_si256((__m256i const *)p);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec16us const &insert(uint32_t index, uint16_t value)
+  {
+    Vec16s::insert(index, value);
+    return *this;
+  }
+  // Member function extract a single element from vector
+  uint16_t extract(uint32_t index) const { return Vec16s::extract(index); }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  uint16_t operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec8us:
+  Vec8us get_low() const { return _mm256_castsi256_si128(ymm); }
+  Vec8us get_high() const { return _mm256_extractf128_si256(ymm, 1); }
+};
+
+// Define operators for this class
+
+// vector operator + : add
+static inline Vec16us operator+(Vec16us const &a, Vec16us const &b) { return Vec16us(Vec16s(a) + Vec16s(b)); }
+
+// vector operator - : subtract
+static inline Vec16us operator-(Vec16us const &a, Vec16us const &b) { return Vec16us(Vec16s(a) - Vec16s(b)); }
+
+// vector operator * : multiply
+static inline Vec16us operator*(Vec16us const &a, Vec16us const &b) { return Vec16us(Vec16s(a) * Vec16s(b)); }
+
+// vector operator / : divide
+// See bottom of file
+
+// vector operator >> : shift right logical all elements
+static inline Vec16us operator>>(Vec16us const &a, uint32_t b) { return _mm256_srl_epi16(a, _mm_cvtsi32_si128(b)); }
+
+// vector operator >> : shift right logical all elements
+static inline Vec16us operator>>(Vec16us const &a, int32_t b) { return a >> (uint32_t)b; }
+
+// vector operator >>= : shift right logical
+static inline Vec16us &operator>>=(Vec16us &a, uint32_t b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator << : shift left all elements
+static inline Vec16us operator<<(Vec16us const &a, uint32_t b) { return _mm256_sll_epi16(a, _mm_cvtsi32_si128(b)); }
+
+// vector operator << : shift left all elements
+static inline Vec16us operator<<(Vec16us const &a, int32_t b) { return a << (uint32_t)b; }
+
+// vector operator >= : returns true for elements for which a >= b (unsigned)
+static inline Vec16sb operator>=(Vec16us const &a, Vec16us const &b)
+{
+  __m256i max_ab = _mm256_max_epu16(a, b);  // max(a,b), unsigned
+  return _mm256_cmpeq_epi16(a, max_ab);     // a == max(a,b)
+}
+
+// vector operator <= : returns true for elements for which a <= b (unsigned)
+static inline Vec16sb operator<=(Vec16us const &a, Vec16us const &b) { return b >= a; }
+
+// vector operator > : returns true for elements for which a > b (unsigned)
+static inline Vec16sb operator>(Vec16us const &a, Vec16us const &b) { return Vec16sb(Vec16s(~(b >= a))); }
+
+// vector operator < : returns true for elements for which a < b (unsigned)
+static inline Vec16sb operator<(Vec16us const &a, Vec16us const &b) { return b > a; }
+
+// vector operator & : bitwise and
+static inline Vec16us operator&(Vec16us const &a, Vec16us const &b) { return Vec16us(Vec256b(a) & Vec256b(b)); }
+static inline Vec16us operator&&(Vec16us const &a, Vec16us const &b) { return a & b; }
+
+// vector operator | : bitwise or
+static inline Vec16us operator|(Vec16us const &a, Vec16us const &b) { return Vec16us(Vec256b(a) | Vec256b(b)); }
+static inline Vec16us operator||(Vec16us const &a, Vec16us const &b) { return a | b; }
+
+// vector operator ^ : bitwise xor
+static inline Vec16us operator^(Vec16us const &a, Vec16us const &b) { return Vec16us(Vec256b(a) ^ Vec256b(b)); }
+
+// vector operator ~ : bitwise not
+static inline Vec16us operator~(Vec16us const &a) { return Vec16us(~Vec256b(a)); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
+// Each word in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec16us select(Vec16sb const &s, Vec16us const &a, Vec16us const &b) { return selectb(s, a, b); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec16us if_add(Vec16sb const &f, Vec16us const &a, Vec16us const &b) { return a + (Vec16us(f) & b); }
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline uint32_t horizontal_add(Vec16us const &a)
+{
+  __m256i sum1 = _mm256_hadd_epi16(a, a);  // horizontally add 2x8 elements in 3 steps
+  __m256i sum2 = _mm256_hadd_epi16(sum1, sum1);
+  __m256i sum3 = _mm256_hadd_epi16(sum2, sum2);
+#if defined(_MSC_VER) && _MSC_VER <= 1700 && !defined(__INTEL_COMPILER)
+  __m128i sum4 = _mm256_extractf128_si256(sum3, 1);                  // bug in MS compiler VS 11
+#else
+  __m128i sum4 = _mm256_extracti128_si256(sum3, 1);                         // get high part
+#endif
+  __m128i sum5 = _mm_add_epi32(_mm256_castsi256_si128(sum3), sum4);  // add low and high parts
+  return _mm_cvtsi128_si32(sum5);
+}
+
+// Horizontal add extended: Calculates the sum of all vector elements.
+// Each element is zero-extended before addition to avoid overflow
+static inline uint32_t horizontal_add_x(Vec16us const &a)
+{
+  __m256i mask  = _mm256_set1_epi32(0x0000FFFF);  // mask for even positions
+  __m256i aeven = _mm256_and_si256(a, mask);      // even numbered elements of a
+  __m256i aodd  = _mm256_srli_epi32(a, 16);       // zero extend odd numbered elements
+  __m256i sum1  = _mm256_add_epi32(aeven, aodd);  // add even and odd elements
+  __m256i sum2  = _mm256_hadd_epi32(sum1, sum1);  // horizontally add 2x4 elements in 2 steps
+  __m256i sum3  = _mm256_hadd_epi32(sum2, sum2);
+#if defined(_MSC_VER) && _MSC_VER <= 1700 && !defined(__INTEL_COMPILER)
+  __m128i sum4  = _mm256_extractf128_si256(sum3, 1);                  // bug in MS compiler VS 11
+#else
+  __m128i sum4 = _mm256_extracti128_si256(sum3, 1);                         // get high part
+#endif
+  __m128i sum5  = _mm_add_epi32(_mm256_castsi256_si128(sum3), sum4);  // add low and high parts
+  return _mm_cvtsi128_si32(sum5);
+}
+
+// function add_saturated: add element by element, unsigned with saturation
+static inline Vec16us add_saturated(Vec16us const &a, Vec16us const &b) { return _mm256_adds_epu16(a, b); }
+
+// function sub_saturated: subtract element by element, unsigned with saturation
+static inline Vec16us sub_saturated(Vec16us const &a, Vec16us const &b) { return _mm256_subs_epu16(a, b); }
+
+// function max: a > b ? a : b
+static inline Vec16us max(Vec16us const &a, Vec16us const &b) { return _mm256_max_epu16(a, b); }
+
+// function min: a < b ? a : b
+static inline Vec16us min(Vec16us const &a, Vec16us const &b) { return _mm256_min_epu16(a, b); }
+
+/*****************************************************************************
+ *
+ *          Vector of 8 32-bit signed integers
+ *
+ *****************************************************************************/
+
+class Vec8i : public Vec256b
+{
+ public:
+  // Default constructor:
+  Vec8i() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec8i(int i) { ymm = _mm256_set1_epi32(i); }
+  // Constructor to build from all elements:
+  Vec8i(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4, int32_t i5, int32_t i6, int32_t i7)
+  {
+    ymm = _mm256_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7);
+  }
+  // Constructor to build from two Vec4i:
+  Vec8i(Vec4i const &a0, Vec4i const &a1) { ymm = set_m128ir(a0, a1); }
+  // Constructor to convert from type __m256i used in intrinsics:
+  Vec8i(__m256i const &x) { ymm = x; }
+  // Assignment operator to convert from type __m256i used in intrinsics:
+  Vec8i &operator=(__m256i const &x)
+  {
+    ymm = x;
+    return *this;
+  }
+  // Type cast operator to convert to __m256i used in intrinsics
+  operator __m256i() const { return ymm; }
+  // Member function to load from array (unaligned)
+  Vec8i &load(void const *p)
+  {
+    ymm = _mm256_loadu_si256((__m256i const *)p);
+    return *this;
+  }
+  // Member function to load from array, aligned by 32
+  Vec8i &load_a(void const *p)
+  {
+    ymm = _mm256_load_si256((__m256i const *)p);
+    return *this;
+  }
+  // Partial load. Load n elements and set the rest to 0
+  Vec8i &load_partial(int n, void const *p)
+  {
+    if(n <= 0)
+      {
+        *this = 0;
+      }
+    else if(n <= 4)
+      {
+        *this = Vec8i(Vec4i().load_partial(n, p), 0);
+      }
+    else if(n < 8)
+      {
+        *this = Vec8i(Vec4i().load(p), Vec4i().load_partial(n - 4, (int32_t const *)p + 4));
+      }
+    else
+      {
+        load(p);
+      }
+    return *this;
+  }
+  // Partial store. Store n elements
+  void store_partial(int n, void *p) const
+  {
+    if(n <= 0)
+      {
+        return;
+      }
+    else if(n <= 4)
+      {
+        get_low().store_partial(n, p);
+      }
+    else if(n < 8)
+      {
+        get_low().store(p);
+        get_high().store_partial(n - 4, (int32_t *)p + 4);
+      }
+    else
+      {
+        store(p);
+      }
+  }
+  // cut off vector to n elements. The last 8-n elements are set to zero
+  Vec8i &cutoff(int n)
+  {
+    *this = Vec32c(*this).cutoff(n * 4);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec8i const &insert(uint32_t index, int32_t value)
+  {
+    static const int32_t maskl[16] = {0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0};
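+    // Loading 8 dwords starting at maskl + 8 - index slides the single -1 entry of the table to
+    // lane 'index', giving a selection mask that is all ones in that element and zero elsewhere.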
+    __m256i broad                  = _mm256_set1_epi32(value);                 // broadcast value into all elements
+    __m256i mask                   = Vec256b().load(maskl + 8 - (index & 7));  // mask with FFFFFFFF at index position
+    ymm                            = selectb(mask, broad, ymm);
+    return *this;
+  }
+  // Member function extract a single element from vector
+  int32_t extract(uint32_t index) const
+  {
+    int32_t x[8];
+    store(x);
+    return x[index & 7];
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  int32_t operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec4i:
+  Vec4i get_low() const { return _mm256_castsi256_si128(ymm); }
+  Vec4i get_high() const { return _mm256_extractf128_si256(ymm, 1); }
+  static int size() { return 8; }
+};
+
+/*****************************************************************************
+ *
+ *          Vec8ib: Vector of 8 Booleans for use with Vec8i and Vec8ui
+ *
+ *****************************************************************************/
+
+class Vec8ib : public Vec8i
+{
+ public:
+  // Default constructor:
+  Vec8ib() {}
+  // Constructor to build from all elements:
+  Vec8ib(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7)
+      : Vec8i(-int32_t(x0), -int32_t(x1), -int32_t(x2), -int32_t(x3), -int32_t(x4), -int32_t(x5), -int32_t(x6), -int32_t(x7))
+  {
+  }
+  // Constructor to convert from type __m256i used in intrinsics:
+  Vec8ib(__m256i const &x) { ymm = x; }
+  // Assignment operator to convert from type __m256i used in intrinsics:
+  Vec8ib &operator=(__m256i const &x)
+  {
+    ymm = x;
+    return *this;
+  }
+  // Constructor to broadcast scalar value:
+  Vec8ib(bool b) : Vec8i(-int32_t(b)) {}
+  // Assignment operator to broadcast scalar value:
+  Vec8ib &operator=(bool b)
+  {
+    *this = Vec8ib(b);
+    return *this;
+  }
+
+ private:  // Prevent constructing from int, etc.
+  Vec8ib(int b);
+  Vec8ib &operator=(int x);
+
+ public:
+  Vec4ib get_low() const { return Vec4ib(Vec8i::get_low()); }
+  Vec4ib get_high() const { return Vec4ib(Vec8i::get_high()); }
+  Vec8ib &insert(int index, bool a)
+  {
+    Vec8i::insert(index, -(int)a);
+    return *this;
+  }
+  // Member function extract a single element from vector
+  bool extract(uint32_t index) const { return Vec8i::extract(index) != 0; }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  bool operator[](uint32_t index) const { return extract(index); }
+};
+
+/*****************************************************************************
+ *
+ *          Define operators for Vec8ib
+ *
+ *****************************************************************************/
+
+// vector operator & : bitwise and
+static inline Vec8ib operator&(Vec8ib const &a, Vec8ib const &b) { return Vec8ib(Vec256b(a) & Vec256b(b)); }
+static inline Vec8ib operator&&(Vec8ib const &a, Vec8ib const &b) { return a & b; }
+// vector operator &= : bitwise and
+static inline Vec8ib &operator&=(Vec8ib &a, Vec8ib const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec8ib operator|(Vec8ib const &a, Vec8ib const &b) { return Vec8ib(Vec256b(a) | Vec256b(b)); }
+static inline Vec8ib operator||(Vec8ib const &a, Vec8ib const &b) { return a | b; }
+// vector operator |= : bitwise or
+static inline Vec8ib &operator|=(Vec8ib &a, Vec8ib const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec8ib operator^(Vec8ib const &a, Vec8ib const &b) { return Vec8ib(Vec256b(a) ^ Vec256b(b)); }
+// vector operator ^= : bitwise xor
+static inline Vec8ib &operator^=(Vec8ib &a, Vec8ib const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec8ib operator~(Vec8ib const &a) { return Vec8ib(~Vec256b(a)); }
+
+// vector operator ! : element not
+static inline Vec8ib operator!(Vec8ib const &a) { return ~a; }
+
+// vector function andnot
+static inline Vec8ib andnot(Vec8ib const &a, Vec8ib const &b) { return Vec8ib(andnot(Vec256b(a), Vec256b(b))); }
+
+/*****************************************************************************
+ *
+ *          Operators for Vec8i
+ *
+ *****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec8i operator+(Vec8i const &a, Vec8i const &b) { return _mm256_add_epi32(a, b); }
+
+// vector operator += : add
+static inline Vec8i &operator+=(Vec8i &a, Vec8i const &b)
+{
+  a = a + b;
+  return a;
+}
+
+// postfix operator ++
+static inline Vec8i operator++(Vec8i &a, int)
+{
+  Vec8i a0 = a;
+  a        = a + 1;
+  return a0;
+}
+
+// prefix operator ++
+static inline Vec8i &operator++(Vec8i &a)
+{
+  a = a + 1;
+  return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec8i operator-(Vec8i const &a, Vec8i const &b) { return _mm256_sub_epi32(a, b); }
+
+// vector operator - : unary minus
+static inline Vec8i operator-(Vec8i const &a) { return _mm256_sub_epi32(_mm256_setzero_si256(), a); }
+
+// vector operator -= : subtract
+static inline Vec8i &operator-=(Vec8i &a, Vec8i const &b)
+{
+  a = a - b;
+  return a;
+}
+
+// postfix operator --
+static inline Vec8i operator--(Vec8i &a, int)
+{
+  Vec8i a0 = a;
+  a        = a - 1;
+  return a0;
+}
+
+// prefix operator --
+static inline Vec8i &operator--(Vec8i &a)
+{
+  a = a - 1;
+  return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec8i operator*(Vec8i const &a, Vec8i const &b) { return _mm256_mullo_epi32(a, b); }
+
+// vector operator *= : multiply
+static inline Vec8i &operator*=(Vec8i &a, Vec8i const &b)
+{
+  a = a * b;
+  return a;
+}
+
+// vector operator / : divide all elements by same integer
+// See bottom of file
+
+// vector operator << : shift left
+static inline Vec8i operator<<(Vec8i const &a, int32_t b) { return _mm256_sll_epi32(a, _mm_cvtsi32_si128(b)); }
+
+// vector operator <<= : shift left
+static inline Vec8i &operator<<=(Vec8i &a, int32_t b)
+{
+  a = a << b;
+  return a;
+}
+
+// vector operator >> : shift right arithmetic
+static inline Vec8i operator>>(Vec8i const &a, int32_t b) { return _mm256_sra_epi32(a, _mm_cvtsi32_si128(b)); }
+
+// vector operator >>= : shift right arithmetic
+static inline Vec8i &operator>>=(Vec8i &a, int32_t b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec8ib operator==(Vec8i const &a, Vec8i const &b) { return _mm256_cmpeq_epi32(a, b); }
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec8ib operator!=(Vec8i const &a, Vec8i const &b) { return Vec8ib(Vec8i(~(a == b))); }
+
+// vector operator > : returns true for elements for which a > b
+static inline Vec8ib operator>(Vec8i const &a, Vec8i const &b) { return _mm256_cmpgt_epi32(a, b); }
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec8ib operator<(Vec8i const &a, Vec8i const &b) { return b > a; }
+
+// vector operator >= : returns true for elements for which a >= b (signed)
+static inline Vec8ib operator>=(Vec8i const &a, Vec8i const &b) { return Vec8ib(Vec8i(~(b > a))); }
+
+// vector operator <= : returns true for elements for which a <= b (signed)
+static inline Vec8ib operator<=(Vec8i const &a, Vec8i const &b) { return b >= a; }
+
+// vector operator & : bitwise and
+static inline Vec8i operator&(Vec8i const &a, Vec8i const &b) { return Vec8i(Vec256b(a) & Vec256b(b)); }
+static inline Vec8i operator&&(Vec8i const &a, Vec8i const &b) { return a & b; }
+// vector operator &= : bitwise and
+static inline Vec8i &operator&=(Vec8i &a, Vec8i const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec8i operator|(Vec8i const &a, Vec8i const &b) { return Vec8i(Vec256b(a) | Vec256b(b)); }
+static inline Vec8i operator||(Vec8i const &a, Vec8i const &b) { return a | b; }
+// vector operator |= : bitwise or
+static inline Vec8i &operator|=(Vec8i &a, Vec8i const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec8i operator^(Vec8i const &a, Vec8i const &b) { return Vec8i(Vec256b(a) ^ Vec256b(b)); }
+// vector operator ^= : bitwise xor
+static inline Vec8i &operator^=(Vec8i &a, Vec8i const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec8i operator~(Vec8i const &a) { return Vec8i(~Vec256b(a)); }
+
+// vector operator ! : returns true for elements == 0
+static inline Vec8ib operator!(Vec8i const &a) { return _mm256_cmpeq_epi32(a, _mm256_setzero_si256()); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec8i select(Vec8ib const &s, Vec8i const &a, Vec8i const &b) { return selectb(s, a, b); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec8i if_add(Vec8ib const &f, Vec8i const &a, Vec8i const &b) { return a + (Vec8i(f) & b); }
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline int32_t horizontal_add(Vec8i const &a)
+{
+  __m256i sum1 = _mm256_hadd_epi32(a, a);  // horizontally add 2x4 elements in 2 steps
+  __m256i sum2 = _mm256_hadd_epi32(sum1, sum1);
+#if defined(_MSC_VER) && _MSC_VER <= 1700 && !defined(__INTEL_COMPILER)
+  __m128i sum3 = _mm256_extractf128_si256(sum2, 1);                  // bug in MS VS 11
+#else
+  __m128i sum3 = _mm256_extracti128_si256(sum2, 1);                         // get high part
+#endif
+  __m128i sum4 = _mm_add_epi32(_mm256_castsi256_si128(sum2), sum3);  // add low and high parts
+  return _mm_cvtsi128_si32(sum4);
+}
+
+// Horizontal add extended: Calculates the sum of all vector elements.
+// Elements are sign extended before adding to avoid overflow
+// static inline int64_t horizontal_add_x (Vec8i const & a); // defined below
+
+// function add_saturated: add element by element, signed with saturation
+static inline Vec8i add_saturated(Vec8i const &a, Vec8i const &b)
+{
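+  // Signed overflow occurs exactly when a and b have the same sign but the wrapped sum has the
+  // opposite sign; the xor/andnot sequence below isolates that condition in the sign bit, and the
+  // overflowed lanes are then replaced by INT32_MAX or INT32_MIN depending on the sign of a.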
+  __m256i sum    = _mm256_add_epi32(a, b);         // a + b
+  __m256i axb    = _mm256_xor_si256(a, b);         // check if a and b have different sign
+  __m256i axs    = _mm256_xor_si256(a, sum);       // check if a and sum have different sign
+  __m256i overf1 = _mm256_andnot_si256(axb, axs);  // check if sum has wrong sign
+  __m256i overf2 = _mm256_srai_epi32(overf1, 31);  // -1 if overflow
+  __m256i asign  = _mm256_srli_epi32(a, 31);       // 1  if a < 0
+  __m256i sat1   = _mm256_srli_epi32(overf2, 1);   // 7FFFFFFF if overflow
+  __m256i sat2   = _mm256_add_epi32(sat1, asign);  // 7FFFFFFF if positive overflow 80000000 if negative overflow
+  return selectb(overf2, sat2, sum);               // sum if not overflow, else sat2
+}
+
+// function sub_saturated: subtract element by element, signed with saturation
+static inline Vec8i sub_saturated(Vec8i const &a, Vec8i const &b)
+{
+  __m256i diff   = _mm256_sub_epi32(a, b);         // a - b
+  __m256i axb    = _mm256_xor_si256(a, b);         // check if a and b have different sign
+  __m256i axs    = _mm256_xor_si256(a, diff);      // check if a and diff have different sign
+  __m256i overf1 = _mm256_and_si256(axb, axs);     // check if diff has wrong sign
+  __m256i overf2 = _mm256_srai_epi32(overf1, 31);  // -1 if overflow
+  __m256i asign  = _mm256_srli_epi32(a, 31);       // 1  if a < 0
+  __m256i sat1   = _mm256_srli_epi32(overf2, 1);   // 7FFFFFFF if overflow
+  __m256i sat2   = _mm256_add_epi32(sat1, asign);  // 7FFFFFFF if positive overflow 80000000 if negative overflow
+  return selectb(overf2, sat2, diff);              // diff if not overflow, else sat2
+}
+
+// function max: a > b ? a : b
+static inline Vec8i max(Vec8i const &a, Vec8i const &b) { return _mm256_max_epi32(a, b); }
+
+// function min: a < b ? a : b
+static inline Vec8i min(Vec8i const &a, Vec8i const &b) { return _mm256_min_epi32(a, b); }
+
+// function abs: a >= 0 ? a : -a
+static inline Vec8i abs(Vec8i const &a) { return _mm256_sign_epi32(a, a); }
+
+// function abs_saturated: same as abs, saturate if overflow
+static inline Vec8i abs_saturated(Vec8i const &a)
+{
+  __m256i absa   = abs(a);                       // abs(a)
+  __m256i overfl = _mm256_srai_epi32(absa, 31);  // sign
+  return _mm256_add_epi32(absa, overfl);         // subtract 1 if 0x80000000
+}
+
+// function rotate_left all elements
+// Use negative count to rotate right
+static inline Vec8i rotate_left(Vec8i const &a, int b)
+{
+#ifdef __AVX512VL__
+  return _mm256_rolv_epi32(a, _mm256_set1_epi32(b));
+#else
+  __m256i left = _mm256_sll_epi32(a, _mm_cvtsi32_si128(b & 0x1F));          // a << b
+  __m256i right = _mm256_srl_epi32(a, _mm_cvtsi32_si128((32 - b) & 0x1F));  // a >> (32 - b)
+  __m256i rot = _mm256_or_si256(left, right);                               // or
+  return rot;
+#endif
+}
+
+/*****************************************************************************
+ *
+ *          Vector of 8 32-bit unsigned integers
+ *
+ *****************************************************************************/
+
+class Vec8ui : public Vec8i
+{
+ public:
+  // Default constructor:
+  Vec8ui() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec8ui(uint32_t i) { ymm = _mm256_set1_epi32(i); }
+  // Constructor to build from all elements:
+  Vec8ui(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7)
+  {
+    ymm = _mm256_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7);
+  }
+  // Constructor to build from two Vec4ui:
+  Vec8ui(Vec4ui const &a0, Vec4ui const &a1) { ymm = set_m128ir(a0, a1); }
+  // Constructor to convert from type __m256i used in intrinsics:
+  Vec8ui(__m256i const &x) { ymm = x; }
+  // Assignment operator to convert from type __m256i used in intrinsics:
+  Vec8ui &operator=(__m256i const &x)
+  {
+    ymm = x;
+    return *this;
+  }
+  // Member function to load from array (unaligned)
+  Vec8ui &load(void const *p)
+  {
+    ymm = _mm256_loadu_si256((__m256i const *)p);
+    return *this;
+  }
+  // Member function to load from array, aligned by 32
+  Vec8ui &load_a(void const *p)
+  {
+    ymm = _mm256_load_si256((__m256i const *)p);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec8ui const &insert(uint32_t index, uint32_t value)
+  {
+    Vec8i::insert(index, value);
+    return *this;
+  }
+  // Member function extract a single element from vector
+  uint32_t extract(uint32_t index) const { return Vec8i::extract(index); }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  uint32_t operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec4ui:
+  Vec4ui get_low() const { return _mm256_castsi256_si128(ymm); }
+  Vec4ui get_high() const { return _mm256_extractf128_si256(ymm, 1); }
+};
+
+// Define operators for this class
+
+// vector operator + : add
+static inline Vec8ui operator+(Vec8ui const &a, Vec8ui const &b) { return Vec8ui(Vec8i(a) + Vec8i(b)); }
+
+// vector operator - : subtract
+static inline Vec8ui operator-(Vec8ui const &a, Vec8ui const &b) { return Vec8ui(Vec8i(a) - Vec8i(b)); }
+
+// vector operator * : multiply
+static inline Vec8ui operator*(Vec8ui const &a, Vec8ui const &b) { return Vec8ui(Vec8i(a) * Vec8i(b)); }
+
+// vector operator / : divide
+// See bottom of file
+
+// vector operator >> : shift right logical all elements
+static inline Vec8ui operator>>(Vec8ui const &a, uint32_t b) { return _mm256_srl_epi32(a, _mm_cvtsi32_si128(b)); }
+
+// vector operator >> : shift right logical all elements
+static inline Vec8ui operator>>(Vec8ui const &a, int32_t b) { return a >> (uint32_t)b; }
+
+// vector operator >>= : shift right logical
+static inline Vec8ui &operator>>=(Vec8ui &a, uint32_t b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator << : shift left all elements
+static inline Vec8ui operator<<(Vec8ui const &a, uint32_t b) { return Vec8ui((Vec8i)a << (int32_t)b); }
+
+// vector operator << : shift left all elements
+static inline Vec8ui operator<<(Vec8ui const &a, int32_t b) { return Vec8ui((Vec8i)a << (int32_t)b); }
+
+// vector operator > : returns true for elements for which a > b (unsigned)
+static inline Vec8ib operator>(Vec8ui const &a, Vec8ui const &b)
+{
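+  // AVX2 has no unsigned 32-bit compare: flipping the sign bit of both operands biases the values
+  // by 2^31, which maps unsigned order onto signed order, so a signed compare of the flipped
+  // values yields the unsigned result.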
+  __m256i signbit = _mm256_set1_epi32(0x80000000);
+  __m256i a1      = _mm256_xor_si256(a, signbit);
+  __m256i b1      = _mm256_xor_si256(b, signbit);
+  return _mm256_cmpgt_epi32(a1, b1);  // signed compare
+}
+
+// vector operator < : returns true for elements for which a < b (unsigned)
+static inline Vec8ib operator<(Vec8ui const &a, Vec8ui const &b) { return b > a; }
+
+// vector operator >= : returns true for elements for which a >= b (unsigned)
+static inline Vec8ib operator>=(Vec8ui const &a, Vec8ui const &b)
+{
+  __m256i max_ab = _mm256_max_epu32(a, b);  // max(a,b), unsigned
+  return _mm256_cmpeq_epi32(a, max_ab);     // a == max(a,b)
+}
+
+// vector operator <= : returns true for elements for which a <= b (unsigned)
+static inline Vec8ib operator<=(Vec8ui const &a, Vec8ui const &b) { return b >= a; }
+
+// vector operator & : bitwise and
+static inline Vec8ui operator&(Vec8ui const &a, Vec8ui const &b) { return Vec8ui(Vec256b(a) & Vec256b(b)); }
+static inline Vec8ui operator&&(Vec8ui const &a, Vec8ui const &b) { return a & b; }
+
+// vector operator | : bitwise or
+static inline Vec8ui operator|(Vec8ui const &a, Vec8ui const &b) { return Vec8ui(Vec256b(a) | Vec256b(b)); }
+static inline Vec8ui operator||(Vec8ui const &a, Vec8ui const &b) { return a | b; }
+
+// vector operator ^ : bitwise xor
+static inline Vec8ui operator^(Vec8ui const &a, Vec8ui const &b) { return Vec8ui(Vec256b(a) ^ Vec256b(b)); }
+
+// vector operator ~ : bitwise not
+static inline Vec8ui operator~(Vec8ui const &a) { return Vec8ui(~Vec256b(a)); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+// Each word in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec8ui select(Vec8ib const &s, Vec8ui const &a, Vec8ui const &b) { return selectb(s, a, b); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec8ui if_add(Vec8ib const &f, Vec8ui const &a, Vec8ui const &b) { return a + (Vec8ui(f) & b); }
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline uint32_t horizontal_add(Vec8ui const &a) { return horizontal_add((Vec8i)a); }
+
+// Horizontal add extended: Calculates the sum of all vector elements.
+// Elements are zero extended before adding to avoid overflow
+// static inline uint64_t horizontal_add_x (Vec8ui const & a); // defined later
+
+// function add_saturated: add element by element, unsigned with saturation
+static inline Vec8ui add_saturated(Vec8ui const &a, Vec8ui const &b)
+{
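+  // Since a + b = (a | b) + (a & b), the exact sum is never smaller than (a | b); a wrapped sum
+  // is, so the comparison below flags exactly the lanes that overflowed, and OR-ing the all-ones
+  // flag into the sum saturates those lanes to 0xFFFFFFFF.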
+  Vec8ui sum      = a + b;
+  Vec8ui aorb     = Vec8ui(a | b);
+  Vec8ui overflow = Vec8ui(sum < aorb);  // overflow if a + b < (a | b)
+  return Vec8ui(sum | overflow);         // return 0xFFFFFFFF if overflow
+}
+
+// function sub_saturated: subtract element by element, unsigned with saturation
+static inline Vec8ui sub_saturated(Vec8ui const &a, Vec8ui const &b)
+{
+  Vec8ui diff      = a - b;
+  Vec8ui underflow = Vec8ui(diff > a);          // underflow if a - b > a
+  return _mm256_andnot_si256(underflow, diff);  // return 0 if underflow
+}
+
+// function max: a > b ? a : b
+static inline Vec8ui max(Vec8ui const &a, Vec8ui const &b) { return _mm256_max_epu32(a, b); }
+
+// function min: a < b ? a : b
+static inline Vec8ui min(Vec8ui const &a, Vec8ui const &b) { return _mm256_min_epu32(a, b); }
+
+/*****************************************************************************
+ *
+ *          Vector of 4 64-bit signed integers
+ *
+ *****************************************************************************/
+
+class Vec4q : public Vec256b
+{
+ public:
+  // Default constructor:
+  Vec4q() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec4q(int64_t i)
+  {
+#if defined(_MSC_VER) && _MSC_VER < 1900 && !defined(__x86_64__) && !defined(__INTEL_COMPILER)
+    // MS compiler cannot use _mm256_set1_epi64x in 32-bit mode, cannot put 64-bit values
+    // into an xmm register without going through mmx registers, and does not issue EMMS
+    union
+    {
+      int64_t q[4];
+      int32_t r[8];
+    } u;
+    u.q[0] = u.q[1] = u.q[2] = u.q[3] = i;
+    ymm                               = _mm256_setr_epi32(u.r[0], u.r[1], u.r[2], u.r[3], u.r[4], u.r[5], u.r[6], u.r[7]);
+#else
+    ymm = _mm256_set1_epi64x(i);
+#endif
+  }
+  // Constructor to build from all elements:
+  Vec4q(int64_t i0, int64_t i1, int64_t i2, int64_t i3)
+  {
+#if defined(_MSC_VER) && _MSC_VER < 1900 && !defined(__x86_64__) && !defined(__INTEL_COMPILER)
+    // MS compiler cannot put 64-bit values into an xmm register without going through
+    // mmx registers, and does not issue EMMS
+    union
+    {
+      int64_t q[4];
+      int32_t r[8];
+    } u;
+    u.q[0] = i0;
+    u.q[1] = i1;
+    u.q[2] = i2;
+    u.q[3] = i3;
+    ymm    = _mm256_setr_epi32(u.r[0], u.r[1], u.r[2], u.r[3], u.r[4], u.r[5], u.r[6], u.r[7]);
+#else
+    ymm = _mm256_setr_epi64x(i0, i1, i2, i3);
+#endif
+  }
+  // Constructor to build from two Vec2q:
+  Vec4q(Vec2q const &a0, Vec2q const &a1) { ymm = set_m128ir(a0, a1); }
+  // Constructor to convert from type __m256i used in intrinsics:
+  Vec4q(__m256i const &x) { ymm = x; }
+  // Assignment operator to convert from type __m256i used in intrinsics:
+  Vec4q &operator=(__m256i const &x)
+  {
+    ymm = x;
+    return *this;
+  }
+  // Type cast operator to convert to __m256i used in intrinsics
+  operator __m256i() const { return ymm; }
+  // Member function to load from array (unaligned)
+  Vec4q &load(void const *p)
+  {
+    ymm = _mm256_loadu_si256((__m256i const *)p);
+    return *this;
+  }
+  // Member function to load from array, aligned by 32
+  Vec4q &load_a(void const *p)
+  {
+    ymm = _mm256_load_si256((__m256i const *)p);
+    return *this;
+  }
+  // Partial load. Load n elements and set the rest to 0
+  Vec4q &load_partial(int n, void const *p)
+  {
+    if(n <= 0)
+      {
+        *this = 0;
+      }
+    else if(n <= 2)
+      {
+        *this = Vec4q(Vec2q().load_partial(n, p), 0);
+      }
+    else if(n < 4)
+      {
+        *this = Vec4q(Vec2q().load(p), Vec2q().load_partial(n - 2, (int64_t const *)p + 2));
+      }
+    else
+      {
+        load(p);
+      }
+    return *this;
+  }
+  // Partial store. Store n elements
+  void store_partial(int n, void *p) const
+  {
+    if(n <= 0)
+      {
+        return;
+      }
+    else if(n <= 2)
+      {
+        get_low().store_partial(n, p);
+      }
+    else if(n < 4)
+      {
+        get_low().store(p);
+        get_high().store_partial(n - 2, (int64_t *)p + 2);
+      }
+    else
+      {
+        store(p);
+      }
+  }
+  // cut off vector to n elements. The last 4-n elements are set to zero
+  Vec4q &cutoff(int n)
+  {
+    *this = Vec32c(*this).cutoff(n * 8);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec4q const &insert(uint32_t index, int64_t value)
+  {
+    Vec4q x(value);
+    switch(index)
+      {
+        case 0:
+          ymm = _mm256_blend_epi32(ymm, x, 0x03);
+          break;
+        case 1:
+          ymm = _mm256_blend_epi32(ymm, x, 0x0C);
+          break;
+        case 2:
+          ymm = _mm256_blend_epi32(ymm, x, 0x30);
+          break;
+        case 3:
+          ymm = _mm256_blend_epi32(ymm, x, 0xC0);
+          break;
+      }
+    return *this;
+  }
+  // Member function extract a single element from vector
+  int64_t extract(uint32_t index) const
+  {
+    int64_t x[4];
+    store(x);
+    return x[index & 3];
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  int64_t operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec2q:
+  Vec2q get_low() const { return _mm256_castsi256_si128(ymm); }
+  Vec2q get_high() const { return _mm256_extractf128_si256(ymm, 1); }
+  static int size() { return 4; }
+};
+
+/*****************************************************************************
+ *
+ *          Vec4qb: Vector of 4 Booleans for use with Vec4q and Vec4uq
+ *
+ *****************************************************************************/
+
+class Vec4qb : public Vec4q
+{
+ public:
+  // Default constructor:
+  Vec4qb() {}
+  // Constructor to build from all elements:
+  Vec4qb(bool x0, bool x1, bool x2, bool x3) : Vec4q(-int64_t(x0), -int64_t(x1), -int64_t(x2), -int64_t(x3)) {}
+  // Constructor to convert from type __m256i used in intrinsics:
+  Vec4qb(__m256i const &x) { ymm = x; }
+  // Assignment operator to convert from type __m256i used in intrinsics:
+  Vec4qb &operator=(__m256i const &x)
+  {
+    ymm = x;
+    return *this;
+  }
+  // Constructor to broadcast scalar value:
+  Vec4qb(bool b) : Vec4q(-int64_t(b)) {}
+  // Assignment operator to broadcast scalar value:
+  Vec4qb &operator=(bool b)
+  {
+    *this = Vec4qb(b);
+    return *this;
+  }
+
+ private:  // Prevent constructing from int, etc.
+  Vec4qb(int b);
+  Vec4qb &operator=(int x);
+
+ public:
+  // Member functions to split into two Vec2qb:
+  Vec2qb get_low() const { return Vec2qb(Vec4q::get_low()); }
+  Vec2qb get_high() const { return Vec2qb(Vec4q::get_high()); }
+  Vec4qb &insert(int index, bool a)
+  {
+    Vec4q::insert(index, -(int64_t)a);
+    return *this;
+  }
+  // Member function extract a single element from vector
+  bool extract(uint32_t index) const { return Vec4q::extract(index) != 0; }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  bool operator[](uint32_t index) const { return extract(index); }
+};
+
+/*****************************************************************************
+ *
+ *          Define operators for Vec4qb
+ *
+ *****************************************************************************/
+
+// vector operator & : bitwise and
+static inline Vec4qb operator&(Vec4qb const &a, Vec4qb const &b) { return Vec4qb(Vec256b(a) & Vec256b(b)); }
+static inline Vec4qb operator&&(Vec4qb const &a, Vec4qb const &b) { return a & b; }
+// vector operator &= : bitwise and
+static inline Vec4qb &operator&=(Vec4qb &a, Vec4qb const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec4qb operator|(Vec4qb const &a, Vec4qb const &b) { return Vec4qb(Vec256b(a) | Vec256b(b)); }
+static inline Vec4qb operator||(Vec4qb const &a, Vec4qb const &b) { return a | b; }
+// vector operator |= : bitwise or
+static inline Vec4qb &operator|=(Vec4qb &a, Vec4qb const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec4qb operator^(Vec4qb const &a, Vec4qb const &b) { return Vec4qb(Vec256b(a) ^ Vec256b(b)); }
+// vector operator ^= : bitwise xor
+static inline Vec4qb &operator^=(Vec4qb &a, Vec4qb const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec4qb operator~(Vec4qb const &a) { return Vec4qb(~Vec256b(a)); }
+
+// vector operator ! : element not
+static inline Vec4qb operator!(Vec4qb const &a) { return ~a; }
+
+// vector function andnot
+static inline Vec4qb andnot(Vec4qb const &a, Vec4qb const &b) { return Vec4qb(andnot(Vec256b(a), Vec256b(b))); }
+
+/*****************************************************************************
+ *
+ *          Operators for Vec4q
+ *
+ *****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec4q operator+(Vec4q const &a, Vec4q const &b) { return _mm256_add_epi64(a, b); }
+
+// vector operator += : add
+static inline Vec4q &operator+=(Vec4q &a, Vec4q const &b)
+{
+  a = a + b;
+  return a;
+}
+
+// postfix operator ++
+static inline Vec4q operator++(Vec4q &a, int)
+{
+  Vec4q a0 = a;
+  a        = a + 1;
+  return a0;
+}
+
+// prefix operator ++
+static inline Vec4q &operator++(Vec4q &a)
+{
+  a = a + 1;
+  return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec4q operator-(Vec4q const &a, Vec4q const &b) { return _mm256_sub_epi64(a, b); }
+
+// vector operator - : unary minus
+static inline Vec4q operator-(Vec4q const &a) { return _mm256_sub_epi64(_mm256_setzero_si256(), a); }
+
+// vector operator -= : subtract
+static inline Vec4q &operator-=(Vec4q &a, Vec4q const &b)
+{
+  a = a - b;
+  return a;
+}
+
+// postfix operator --
+static inline Vec4q operator--(Vec4q &a, int)
+{
+  Vec4q a0 = a;
+  a        = a - 1;
+  return a0;
+}
+
+// prefix operator --
+static inline Vec4q &operator--(Vec4q &a)
+{
+  a = a - 1;
+  return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec4q operator*(Vec4q const &a, Vec4q const &b)
+{
+#if defined(__AVX512DQ__) && defined(__AVX512VL__)
+  return _mm256_mullo_epi64(a, b);
+#else
+  // instruction does not exist. Split into 32-bit multiplies
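+  // With a = aH*2^32 + aL and b = bH*2^32 + bL, the low 64 bits of a*b are
+  // aL*bL + ((aL*bH + aH*bL) << 32). The cross terms are formed with 32-bit multiplies, summed,
+  // moved to the upper dword of each qword, and added to the full 64-bit aL*bL product.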
+  __m256i bswap = _mm256_shuffle_epi32(b, 0xB1);          // swap H<->L
+  __m256i prodlh = _mm256_mullo_epi32(a, bswap);          // 32 bit L*H products
+  __m256i zero = _mm256_setzero_si256();                  // 0
+  __m256i prodlh2 = _mm256_hadd_epi32(prodlh, zero);      // a0Lb0H+a0Hb0L,a1Lb1H+a1Hb1L,0,0
+  __m256i prodlh3 = _mm256_shuffle_epi32(prodlh2, 0x73);  // 0, a0Lb0H+a0Hb0L, 0, a1Lb1H+a1Hb1L
+  __m256i prodll = _mm256_mul_epu32(a, b);                // a0Lb0L,a1Lb1L, 64 bit unsigned products
+  __m256i prod = _mm256_add_epi64(prodll, prodlh3);       // a0Lb0L+(a0Lb0H+a0Hb0L)<<32, a1Lb1L+(a1Lb1H+a1Hb1L)<<32
+  return prod;
+#endif
+}
+
+// vector operator *= : multiply
+static inline Vec4q &operator*=(Vec4q &a, Vec4q const &b)
+{
+  a = a * b;
+  return a;
+}
+
+// vector operator << : shift left
+static inline Vec4q operator<<(Vec4q const &a, int32_t b) { return _mm256_sll_epi64(a, _mm_cvtsi32_si128(b)); }
+
+// vector operator <<= : shift left
+static inline Vec4q &operator<<=(Vec4q &a, int32_t b)
+{
+  a = a << b;
+  return a;
+}
+
+// vector operator >> : shift right arithmetic
+static inline Vec4q operator>>(Vec4q const &a, int32_t b)
+{
+  // instruction does not exist. Split into 32-bit shifts
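+  // AVX2 has no 64-bit arithmetic shift. For b <= 32 the high dword of each qword is shifted
+  // arithmetically and the low dword logically (the 64-bit logical shift pulls the needed bits
+  // down from the high dword); the two results are blended per dword. For b > 32 the high dword
+  // is shifted arithmetically by b - 32, moved to the low position, and the sign fills the high dword.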
+  if(b <= 32)
+    {
+      __m128i bb   = _mm_cvtsi32_si128(b);                      // b
+      __m256i sra  = _mm256_sra_epi32(a, bb);                   // a >> b signed dwords
+      __m256i srl  = _mm256_srl_epi64(a, bb);                   // a >> b unsigned qwords
+      __m256i mask = constant8i<0, -1, 0, -1, 0, -1, 0, -1>();  // mask for signed high part
+      return selectb(mask, sra, srl);
+    }
+  else
+    {                                                           // b > 32
+      __m128i bm32 = _mm_cvtsi32_si128(b - 32);                 // b - 32
+      __m256i sign = _mm256_srai_epi32(a, 31);                  // sign of a
+      __m256i sra2 = _mm256_sra_epi32(a, bm32);                 // a >> (b-32) signed dwords
+      __m256i sra3 = _mm256_srli_epi64(sra2, 32);               // a >> (b-32) >> 32 (second shift unsigned qword)
+      __m256i mask = constant8i<0, -1, 0, -1, 0, -1, 0, -1>();  // mask for high part containing only sign
+      return selectb(mask, sign, sra3);
+    }
+}
+
+// vector operator >>= : shift right arithmetic
+static inline Vec4q &operator>>=(Vec4q &a, int32_t b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec4qb operator==(Vec4q const &a, Vec4q const &b) { return _mm256_cmpeq_epi64(a, b); }
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec4qb operator!=(Vec4q const &a, Vec4q const &b) { return Vec4qb(Vec4q(~(a == b))); }
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec4qb operator<(Vec4q const &a, Vec4q const &b) { return _mm256_cmpgt_epi64(b, a); }
+
+// vector operator > : returns true for elements for which a > b
+static inline Vec4qb operator>(Vec4q const &a, Vec4q const &b) { return b < a; }
+
+// vector operator >= : returns true for elements for which a >= b (signed)
+static inline Vec4qb operator>=(Vec4q const &a, Vec4q const &b) { return Vec4qb(Vec4q(~(a < b))); }
+
+// vector operator <= : returns true for elements for which a <= b (signed)
+static inline Vec4qb operator<=(Vec4q const &a, Vec4q const &b) { return b >= a; }
+
+// vector operator & : bitwise and
+static inline Vec4q operator&(Vec4q const &a, Vec4q const &b) { return Vec4q(Vec256b(a) & Vec256b(b)); }
+static inline Vec4q operator&&(Vec4q const &a, Vec4q const &b) { return a & b; }
+// vector operator &= : bitwise and
+static inline Vec4q &operator&=(Vec4q &a, Vec4q const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec4q operator|(Vec4q const &a, Vec4q const &b) { return Vec4q(Vec256b(a) | Vec256b(b)); }
+static inline Vec4q operator||(Vec4q const &a, Vec4q const &b) { return a | b; }
+// vector operator |= : bitwise or
+static inline Vec4q &operator|=(Vec4q &a, Vec4q const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec4q operator^(Vec4q const &a, Vec4q const &b) { return Vec4q(Vec256b(a) ^ Vec256b(b)); }
+// vector operator ^= : bitwise xor
+static inline Vec4q &operator^=(Vec4q &a, Vec4q const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec4q operator~(Vec4q const &a) { return Vec4q(~Vec256b(a)); }
+
+// vector operator ! : logical not, returns true for elements == 0
+static inline Vec4qb operator!(Vec4q const &a) { return a == Vec4q(_mm256_setzero_si256()); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
+// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec4q select(Vec4qb const &s, Vec4q const &a, Vec4q const &b) { return selectb(s, a, b); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec4q if_add(Vec4qb const &f, Vec4q const &a, Vec4q const &b) { return a + (Vec4q(f) & b); }
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline int64_t horizontal_add(Vec4q const &a)
+{
+  __m256i sum1 = _mm256_shuffle_epi32(a, 0x0E);                      // high element
+  __m256i sum2 = _mm256_add_epi64(a, sum1);                          // sum
+#if defined(_MSC_VER) && _MSC_VER <= 1700 && !defined(__INTEL_COMPILER)
+  __m128i sum3 = _mm256_extractf128_si256(sum2, 1);                  // bug in MS compiler VS 11
+#else
+  __m128i sum3 = _mm256_extracti128_si256(sum2, 1);  // get high part
+#endif
+  __m128i sum4 = _mm_add_epi64(_mm256_castsi256_si128(sum2), sum3);  // add low and high parts
+#if defined(__x86_64__)
+  return _mm_cvtsi128_si64(sum4);                                    // 64 bit mode
+#else
+  union
+  {
+    __m128i x;  // _mm_storel_epi64 requires an __m128i operand
+    uint64_t i;
+  } u;
+  _mm_storel_epi64(&u.x, sum4);
+  return u.i;
+#endif
+}
+
+// function max: a > b ? a : b
+static inline Vec4q max(Vec4q const &a, Vec4q const &b) { return select(a > b, a, b); }
+
+// function min: a < b ? a : b
+static inline Vec4q min(Vec4q const &a, Vec4q const &b) { return select(a < b, a, b); }
+
+// function abs: a >= 0 ? a : -a
+static inline Vec4q abs(Vec4q const &a)
+{
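+  // Two's complement negation of the negative lanes: xor with the all-ones sign mask flips the
+  // bits, and subtracting -1 adds the final 1; lanes where sign is 0 pass through unchanged.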
+  __m256i sign = _mm256_cmpgt_epi64(_mm256_setzero_si256(), a);  // 0 > a
+  __m256i inv  = _mm256_xor_si256(a, sign);                      // invert bits if negative
+  return _mm256_sub_epi64(inv, sign);                            // add 1
+}
+
+// function abs_saturated: same as abs, saturate if overflow
+static inline Vec4q abs_saturated(Vec4q const &a)
+{
+  __m256i absa   = abs(a);                                            // abs(a)
+  __m256i overfl = _mm256_cmpgt_epi64(_mm256_setzero_si256(), absa);  // 0 > a
+  return _mm256_add_epi64(absa, overfl);                              // subtract 1 if 0x8000000000000000
+}
+
+// function rotate_left all elements
+// Use negative count to rotate right
+static inline Vec4q rotate_left(Vec4q const &a, int b)
+{
+#ifdef __AVX512VL__
+  return _mm256_rolv_epi64(a, _mm256_set1_epi64x(int64_t(b)));
+#else
+  __m256i left = _mm256_sll_epi64(a, _mm_cvtsi32_si128(b & 0x3F));          // a << b
+  __m256i right = _mm256_srl_epi64(a, _mm_cvtsi32_si128((64 - b) & 0x3F));  // a >> (64 - b)
+  __m256i rot = _mm256_or_si256(left, right);                               // or
+  return rot;
+#endif
+}
+
+/*****************************************************************************
+ *
+ *          Vector of 4 64-bit unsigned integers
+ *
+ *****************************************************************************/
+
+class Vec4uq : public Vec4q
+{
+ public:
+  // Default constructor:
+  Vec4uq() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec4uq(uint64_t i) { ymm = Vec4q(i); }
+  // Constructor to build from all elements:
+  Vec4uq(uint64_t i0, uint64_t i1, uint64_t i2, uint64_t i3) { ymm = Vec4q(i0, i1, i2, i3); }
+  // Constructor to build from two Vec2uq:
+  Vec4uq(Vec2uq const &a0, Vec2uq const &a1) { ymm = set_m128ir(a0, a1); }
+  // Constructor to convert from type __m256i used in intrinsics:
+  Vec4uq(__m256i const &x) { ymm = x; }
+  // Assignment operator to convert from type __m256i used in intrinsics:
+  Vec4uq &operator=(__m256i const &x)
+  {
+    ymm = x;
+    return *this;
+  }
+  // Member function to load from array (unaligned)
+  Vec4uq &load(void const *p)
+  {
+    ymm = _mm256_loadu_si256((__m256i const *)p);
+    return *this;
+  }
+  // Member function to load from array, aligned by 32
+  Vec4uq &load_a(void const *p)
+  {
+    ymm = _mm256_load_si256((__m256i const *)p);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec4uq const &insert(uint32_t index, uint64_t value)
+  {
+    Vec4q::insert(index, value);
+    return *this;
+  }
+  // Member function extract a single element from vector
+  uint64_t extract(uint32_t index) const { return Vec4q::extract(index); }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  uint64_t operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec2uq:
+  Vec2uq get_low() const { return _mm256_castsi256_si128(ymm); }
+  Vec2uq get_high() const { return _mm256_extractf128_si256(ymm, 1); }
+};
+
+// Define operators for this class
+
+// vector operator + : add
+static inline Vec4uq operator+(Vec4uq const &a, Vec4uq const &b) { return Vec4uq(Vec4q(a) + Vec4q(b)); }
+
+// vector operator - : subtract
+static inline Vec4uq operator-(Vec4uq const &a, Vec4uq const &b) { return Vec4uq(Vec4q(a) - Vec4q(b)); }
+
+// vector operator * : multiply element by element
+static inline Vec4uq operator*(Vec4uq const &a, Vec4uq const &b) { return Vec4uq(Vec4q(a) * Vec4q(b)); }
+
+// vector operator >> : shift right logical all elements
+static inline Vec4uq operator>>(Vec4uq const &a, uint32_t b) { return _mm256_srl_epi64(a, _mm_cvtsi32_si128(b)); }
+
+// vector operator >> : shift right logical all elements
+static inline Vec4uq operator>>(Vec4uq const &a, int32_t b) { return a >> (uint32_t)b; }
+
+// vector operator >>= : shift right logical
+static inline Vec4uq &operator>>=(Vec4uq &a, uint32_t b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator << : shift left all elements
+static inline Vec4uq operator<<(Vec4uq const &a, uint32_t b) { return Vec4uq((Vec4q)a << (int32_t)b); }
+
+// vector operator << : shift left all elements
+static inline Vec4uq operator<<(Vec4uq const &a, int32_t b) { return Vec4uq((Vec4q)a << b); }
+
+// vector operator > : returns true for elements for which a > b (unsigned)
+static inline Vec4qb operator>(Vec4uq const &a, Vec4uq const &b)
+{
+  __m256i sign64 = Vec4uq(0x8000000000000000);
+  __m256i aflip  = _mm256_xor_si256(a, sign64);
+  __m256i bflip  = _mm256_xor_si256(b, sign64);
+  Vec4q cmp      = _mm256_cmpgt_epi64(aflip, bflip);
+  return Vec4qb(cmp);
+}
+
+// vector operator < : returns true for elements for which a < b (unsigned)
+static inline Vec4qb operator<(Vec4uq const &a, Vec4uq const &b) { return b > a; }
+
+// vector operator >= : returns true for elements for which a >= b (unsigned)
+static inline Vec4qb operator>=(Vec4uq const &a, Vec4uq const &b) { return Vec4qb(Vec4q(~(b > a))); }
+
+// vector operator <= : returns true for elements for which a <= b (unsigned)
+static inline Vec4qb operator<=(Vec4uq const &a, Vec4uq const &b) { return b >= a; }
+
+// vector operator & : bitwise and
+static inline Vec4uq operator&(Vec4uq const &a, Vec4uq const &b) { return Vec4uq(Vec256b(a) & Vec256b(b)); }
+static inline Vec4uq operator&&(Vec4uq const &a, Vec4uq const &b) { return a & b; }
+
+// vector operator | : bitwise or
+static inline Vec4uq operator|(Vec4uq const &a, Vec4uq const &b) { return Vec4uq(Vec256b(a) | Vec256b(b)); }
+static inline Vec4uq operator||(Vec4uq const &a, Vec4uq const &b) { return a | b; }
+
+// vector operator ^ : bitwise xor
+static inline Vec4uq operator^(Vec4uq const &a, Vec4uq const &b) { return Vec4uq(Vec256b(a) ^ Vec256b(b)); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
+// Each word in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec4uq select(Vec4qb const &s, Vec4uq const &a, Vec4uq const &b) { return selectb(s, a, b); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec4uq if_add(Vec4qb const &f, Vec4uq const &a, Vec4uq const &b) { return a + (Vec4uq(f) & b); }
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline uint64_t horizontal_add(Vec4uq const &a) { return horizontal_add((Vec4q)a); }
+
+// Horizontal add extended: Calculates the sum of all vector elements.
+// Elements are sign/zero extended before adding to avoid overflow
+static inline int64_t horizontal_add_x(Vec8i const &a)
+{
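+  // Interleaving each dword with its sign produces sign-extended 64-bit lanes: unpacklo yields
+  // elements 0,1,4,5 and unpackhi yields 2,3,6,7, so their 64-bit sums cannot overflow.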
+  __m256i signs = _mm256_srai_epi32(a, 31);         // sign of all elements
+  Vec4q a01     = _mm256_unpacklo_epi32(a, signs);  // sign-extended a0, a1, a4, a5
+  Vec4q a23     = _mm256_unpackhi_epi32(a, signs);  // sign-extended a2, a3, a6, a7
+  return horizontal_add(a01 + a23);
+}
+
+static inline uint64_t horizontal_add_x(Vec8ui const &a)
+{
+  __m256i zero = _mm256_setzero_si256();          // 0
+  __m256i a01  = _mm256_unpacklo_epi32(a, zero);  // zero-extended a0, a1
+  __m256i a23  = _mm256_unpackhi_epi32(a, zero);  // zero-extended a2, a3
+  return horizontal_add(Vec4q(a01) + Vec4q(a23));
+}
+
+// function max: a > b ? a : b
+static inline Vec4uq max(Vec4uq const &a, Vec4uq const &b) { return Vec4uq(select(a > b, a, b)); }
+
+// function min: a < b ? a : b
+static inline Vec4uq min(Vec4uq const &a, Vec4uq const &b) { return Vec4uq(select(a > b, b, a)); }
+
+/*****************************************************************************
+ *
+ *          Vector permute functions
+ *
+ ******************************************************************************
+ *
+ * These permute functions can reorder the elements of a vector and optionally
+ * set some elements to zero.
+ *
+ * The indexes are inserted as template parameters in <>. These indexes must be
+ * constants. Each template parameter is an index to the element you want to select.
+ * An index of -1 will generate zero. An index of -256 means don't care.
+ *
+ * Example:
+ * Vec8i a(10,11,12,13,14,15,16,17);      // a is (10,11,12,13,14,15,16,17)
+ * Vec8i b;
+ * b = permute8i<0,2,7,7,-1,-1,1,1>(a);   // b is (10,12,17,17, 0, 0,11,11)
+ *
+ * A lot of the code here is metaprogramming aiming to find the instructions
+ * that best fit the template parameters and instruction set. The metacode
+ * will be reduced out to leave only a few vector instructions in release
+ * mode with optimization on.
+ *****************************************************************************/
+
+// Permute vector of 4 64-bit integers.
+// Index -1 gives 0, index -256 means don't care.
+template <int i0, int i1, int i2, int i3>
+static inline Vec4q permute4q(Vec4q const &a)
+{
+  // Combine indexes into a single bitfield, with 8 bits for each
+  const int m1 = (i0 & 3) | (i1 & 3) << 8 | (i2 & 3) << 16 | (i3 & 3) << 24;
+
+  // Mask to zero out negative indexes
+  const int mz = (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF) << 8 | (i2 < 0 ? 0 : 0xFF) << 16 | (i3 < 0 ? 0 : 0xFF) << 24;
+
+  // zeroing needed
+  const bool dozero = ((i0 | i1 | i2 | i3) & 0x80) != 0;
+
+  if(((m1 ^ 0x03020100) & mz) == 0)
+    {
+      // no shuffling
+      if(dozero)
+        {
+          // zero some elements
+          const __m256i maskz = constant8i < i0 < 0 ? 0 : -1, i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i2 < 0 ? 0 : -1,
+                        i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, i3 < 0 ? 0 : -1 > ();
+          return _mm256_and_si256(a, maskz);
+        }
+      return a;  // do nothing
+    }
+
+  if(((m1 ^ 0x02020000) & 0x02020202 & mz) == 0)
+    {
+      // no exchange of data between low and high half
+
+      if(((m1 ^ (m1 >> 16)) & 0x0101 & mz & (mz >> 16)) == 0 && !dozero)
+        {
+          // same pattern in low and high half. use VPSHUFD
+          const int sd = (((i0 >= 0) ? (i0 & 1) : (i2 & 1)) * 10 + 4) | (((i1 >= 0) ? (i1 & 1) : (i3 & 1)) * 10 + 4) << 4;
+          return _mm256_shuffle_epi32(a, sd);
+        }
+
+      // use VPSHUFB
+      const __m256i mm = constant8i < i0 < 0 ? -1 : (i0 & 1) * 0x08080808 + 0x03020100,
+                    i0 < 0 ? -1 : (i0 & 1) * 0x08080808 + 0x07060504, i1 < 0 ? -1 : (i1 & 1) * 0x08080808 + 0x03020100,
+                    i1 < 0 ? -1 : (i1 & 1) * 0x08080808 + 0x07060504, i2 < 0 ? -1 : (i2 & 1) * 0x08080808 + 0x03020100,
+                    i2 < 0 ? -1 : (i2 & 1) * 0x08080808 + 0x07060504, i3 < 0 ? -1 : (i3 & 1) * 0x08080808 + 0x03020100,
+                    i3 < 0 ? -1 : (i3 & 1) * 0x08080808 + 0x07060504 > ();
+      return _mm256_shuffle_epi8(a, mm);
+    }
+
+  // general case. Use VPERMQ
+  const int ms = (i0 & 3) | (i1 & 3) << 2 | (i2 & 3) << 4 | (i3 & 3) << 6;
+  __m256i t1   = _mm256_permute4x64_epi64(a, ms);
+
+  if(dozero)
+    {
+      // zero some elements
+      const __m256i maskz = constant8i < i0 < 0 ? 0 : -1, i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i2 < 0 ? 0 : -1,
+                    i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, i3 < 0 ? 0 : -1 > ();
+      return _mm256_and_si256(t1, maskz);
+    }
+  return t1;
+}
+
+template <int i0, int i1, int i2, int i3>
+static inline Vec4uq permute4uq(Vec4uq const &a)
+{
+  return Vec4uq(permute4q<i0, i1, i2, i3>(a));
+}
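+
+// Illustrative usage sketch (added comment, not part of the upstream VCL code):
+//   Vec4q a(10, 11, 12, 13);
+//   Vec4q b = permute4q<3, 3, -1, 0>(a);   // b is (13, 13, 0, 10)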
+
+// Permute vector of 8 32-bit integers.
+// Index -1 gives 0, index -256 means don't care.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8i permute8i(Vec8i const &a)
+{
+  // Combine indexes into a single bitfield, with 4 bits for each
+  const int m1 =
+      (i0 & 7) | (i1 & 7) << 4 | (i2 & 7) << 8 | (i3 & 7) << 12 | (i4 & 7) << 16 | (i5 & 7) << 20 | (i6 & 7) << 24 | (i7 & 7) << 28;
+
+  // Mask to zero out negative indexes
+  const int mz = (i0 < 0 ? 0 : 0xF) | (i1 < 0 ? 0 : 0xF) << 4 | (i2 < 0 ? 0 : 0xF) << 8 | (i3 < 0 ? 0 : 0xF) << 12 |
+                 (i4 < 0 ? 0 : 0xF) << 16 | (i5 < 0 ? 0 : 0xF) << 20 | (i6 < 0 ? 0 : 0xF) << 24 | (i7 < 0 ? 0 : 0xF) << 28;
+
+  // zeroing needed
+  const bool dozero = ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) & 0x80) != 0;
+
+  __m256i t1, mask;
+
+  if(((m1 ^ 0x76543210) & mz) == 0)
+    {
+      // no shuffling
+      if(dozero)
+        {
+          // zero some elements
+          mask = constant8i < i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, i4 < 0 ? 0 : -1, i5 < 0 ? 0 : -1,
+          i6 < 0 ? 0 : -1, i7 < 0 ? 0 : -1 > ();
+          return _mm256_and_si256(a, mask);
+        }
+      return a;  // do nothing
+    }
+
+  // Check if we can use 64-bit permute. Even numbered indexes must be even and odd numbered
+  // indexes must be equal to the preceding index + 1, except for negative indexes.
+  if(((m1 ^ 0x10101010) & 0x11111111 & mz) == 0 && ((m1 ^ m1 >> 4) & 0x0E0E0E0E & mz & mz >> 4) == 0)
+    {
+      const bool partialzero = int((i0 ^ i1) | (i2 ^ i3) | (i4 ^ i5) | (i6 ^ i7)) < 0;  // part of a 64-bit block is zeroed
+      const int blank1       = partialzero ? -0x100 : -1;                               // ignore or zero
+      const int n0           = i0 > 0 ? i0 / 2 : i1 > 0 ? i1 / 2 : blank1;              // indexes for 64-bit permute
+      const int n1           = i2 > 0 ? i2 / 2 : i3 > 0 ? i3 / 2 : blank1;
+      const int n2           = i4 > 0 ? i4 / 2 : i5 > 0 ? i5 / 2 : blank1;
+      const int n3           = i6 > 0 ? i6 / 2 : i7 > 0 ? i7 / 2 : blank1;
+      // do 64-bit permute
+      t1 = permute4q<n0, n1, n2, n3>(Vec4q(a));
+      if(blank1 == -1 || !dozero)
+        {
+          return t1;
+        }
+      // need more zeroing
+      mask = constant8i < i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, i4 < 0 ? 0 : -1, i5 < 0 ? 0 : -1,
+      i6 < 0 ? 0 : -1, i7 < 0 ? 0 : -1 > ();
+      return _mm256_and_si256(t1, mask);
+    }
+
+  if(((m1 ^ 0x44440000) & 0x44444444 & mz) == 0)
+    {
+      // no exchange of data between low and high half
+
+      if(((m1 ^ (m1 >> 16)) & 0x3333 & mz & (mz >> 16)) == 0 && !dozero)
+        {
+          // same pattern in low and high half. use VPSHUFD
+          const int sd = ((i0 >= 0) ? (i0 & 3) : (i4 & 3)) | ((i1 >= 0) ? (i1 & 3) : (i5 & 3)) << 2 |
+                         ((i2 >= 0) ? (i2 & 3) : (i6 & 3)) << 4 | ((i3 >= 0) ? (i3 & 3) : (i7 & 3)) << 6;
+          return _mm256_shuffle_epi32(a, sd);
+        }
+
+      // use VPSHUFB
+      mask = constant8i < i0 < 0 ? -1 : (i0 & 3) * 0x04040404 + 0x03020100, i1 < 0 ? -1 : (i1 & 3) * 0x04040404 + 0x03020100,
+      i2 < 0 ? -1 : (i2 & 3) * 0x04040404 + 0x03020100, i3 < 0 ? -1 : (i3 & 3) * 0x04040404 + 0x03020100,
+      i4 < 0 ? -1 : (i4 & 3) * 0x04040404 + 0x03020100, i5 < 0 ? -1 : (i5 & 3) * 0x04040404 + 0x03020100,
+      i6 < 0 ? -1 : (i6 & 3) * 0x04040404 + 0x03020100, i7 < 0 ? -1 : (i7 & 3) * 0x04040404 + 0x03020100 > ();
+      return _mm256_shuffle_epi8(a, mask);
+    }
+
+  // general case. Use VPERMD
+  mask = constant8i < i0 < 0 ? -1 : (i0 & 7), i1 < 0 ? -1 : (i1 & 7), i2 < 0 ? -1 : (i2 & 7), i3 < 0 ? -1 : (i3 & 7),
+  i4 < 0 ? -1 : (i4 & 7), i5 < 0 ? -1 : (i5 & 7), i6 < 0 ? -1 : (i6 & 7), i7 < 0 ? -1 : (i7 & 7) > ();
+#if defined(_MSC_VER) && _MSC_VER < 1700 && !defined(__INTEL_COMPILER)
+  // bug in MS VS 11 beta: operands in wrong order. fixed in v. 11.0
+  t1 = _mm256_permutevar8x32_epi32(mask, a);  // ms
+#elif defined(GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__)
+  // Gcc 4.7.0 also has operands in wrong order. fixed in version 4.7.1
+  t1 = _mm256_permutevar8x32_epi32(mask, a);  // GCC
+#else
+  t1 = _mm256_permutevar8x32_epi32(a, mask);  // no-bug version
+#endif
+
+  if(dozero)
+    {
+      // zero some elements
+      mask = constant8i < i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, i4 < 0 ? 0 : -1, i5 < 0 ? 0 : -1,
+      i6 < 0 ? 0 : -1, i7 < 0 ? 0 : -1 > ();
+      return _mm256_and_si256(t1, mask);
+    }
+  return t1;
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8ui permute8ui(Vec8ui const &a)
+{
+  return Vec8ui(permute8i<i0, i1, i2, i3, i4, i5, i6, i7>(a));
+}
+
+// Permute vector of 16 16-bit integers.
+// Index -1 gives 0, index -256 means don't care.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16s permute16s(Vec16s const &a)
+{
+  // Combine indexes 0 - 7 into a single bitfield, with 4 bits for each
+  const int mlo = (i0 & 0xF) | (i1 & 0xF) << 4 | (i2 & 0xF) << 8 | (i3 & 0xF) << 12 | (i4 & 0xF) << 16 | (i5 & 0xF) << 20 |
+                  (i6 & 0xF) << 24 | (i7 & 0xF) << 28;
+
+  // Combine indexes 8 - 15 into a single bitfield, with 4 bits for each
+  const int mhi = (i8 & 0xF) | (i9 & 0xF) << 4 | (i10 & 0xF) << 8 | (i11 & 0xF) << 12 | (i12 & 0xF) << 16 | (i13 & 0xF) << 20 |
+                  (i14 & 0xF) << 24 | (i15 & 0xF) << 28;
+
+  // Mask to zero out negative indexes 0 - 7
+  const int zlo = (i0 < 0 ? 0 : 0xF) | (i1 < 0 ? 0 : 0xF) << 4 | (i2 < 0 ? 0 : 0xF) << 8 | (i3 < 0 ? 0 : 0xF) << 12 |
+                  (i4 < 0 ? 0 : 0xF) << 16 | (i5 < 0 ? 0 : 0xF) << 20 | (i6 < 0 ? 0 : 0xF) << 24 | (i7 < 0 ? 0 : 0xF) << 28;
+
+  // Mask to zero out negative indexes 8 - 15
+  const int zhi = (i8 < 0 ? 0 : 0xF) | (i9 < 0 ? 0 : 0xF) << 4 | (i10 < 0 ? 0 : 0xF) << 8 | (i11 < 0 ? 0 : 0xF) << 12 |
+                  (i12 < 0 ? 0 : 0xF) << 16 | (i13 < 0 ? 0 : 0xF) << 20 | (i14 < 0 ? 0 : 0xF) << 24 | (i15 < 0 ? 0 : 0xF) << 28;
+
+  // zeroing needed
+  const bool dozero = ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7 | i8 | i9 | i10 | i11 | i12 | i13 | i14 | i15) & 0x80) != 0;
+
+  __m256i t1, mask;
+
+  // special case: all zero
+  if(zlo == 0 && zhi == 0)
+    {
+      return _mm256_setzero_si256();
+    }
+
+  // special case: rotate 128 bits
+  if(i0 >= 0 && i0 < 16 && i1 == ((i0 + 1) & 7) && i2 == ((i0 + 2) & 7) && i3 == ((i0 + 3) & 7) && i4 == ((i0 + 4) & 7) &&
+     i5 == ((i0 + 5) & 7) && i6 == ((i0 + 6) & 7) && i7 == ((i0 + 7) & 7) && i8 == i0 + 8 && i9 == i1 + 8 && i10 == i2 + 8 &&
+     i11 == i3 + 8 && i12 == i4 + 8 && i13 == i5 + 8 && i14 == i6 + 8 && i15 == i7 + 8)
+    {
+      return _mm256_alignr_epi8(a, a, (i0 & 7) * 2);
+    }
+
+  // special case: rotate 256 bits
+  if(i0 >= 0 && i0 < 16 && i1 == ((i0 + 1) & 15) && i2 == ((i0 + 2) & 15) && i3 == ((i0 + 3) & 15) && i4 == ((i0 + 4) & 15) &&
+     i5 == ((i0 + 5) & 15) && i6 == ((i0 + 6) & 15) && i7 == ((i0 + 7) & 15) && i8 == ((i0 + 8) & 15) && i9 == ((i0 + 9) & 15) &&
+     i10 == ((i0 + 10) & 15) && i11 == ((i0 + 11) & 15) && i12 == ((i0 + 12) & 15) && i13 == ((i0 + 13) & 15) &&
+     i14 == ((i0 + 14) & 15) && i15 == ((i0 + 15) & 15))
+    {
+      t1 = _mm256_permute4x64_epi64(a, 0x4E);
+      return _mm256_alignr_epi8(a, t1, (i0 & 7) * 2);
+    }
+
+  // special case: no exchange of data between 64-bit sections, and same pattern in low and high 128 bits:
+  // can use VPSHUFLW or VPSHUFHW
+  if(((mlo ^ 0x44440000) & 0xCCCCCCCC & zlo) == 0 && ((mhi ^ 0xCCCC8888) & 0xCCCCCCCC & zhi) == 0 &&
+     ((mlo ^ mhi) & 0x33333333 & zlo & zhi) == 0)
+    {
+      const int slo = (i0 >= 0 ? (i0 & 3) : i8 >= 0 ? (i8 & 3) : 0) | (i1 >= 0 ? (i1 & 3) : i9 >= 0 ? (i9 & 3) : 1) << 2 |
+                      (i2 >= 0 ? (i2 & 3) : i10 >= 0 ? (i10 & 3) : 2) << 4 | (i3 >= 0 ? (i3 & 3) : i11 >= 0 ? (i11 & 3) : 3) << 6;
+
+      const int shi = (i4 >= 0 ? (i4 & 3) : i12 >= 0 ? (i12 & 3) : 0) | (i5 >= 0 ? (i5 & 3) : i13 >= 0 ? (i13 & 3) : 1) << 2 |
+                      (i6 >= 0 ? (i6 & 3) : i14 >= 0 ? (i14 & 3) : 2) << 4 | (i7 >= 0 ? (i7 & 3) : i15 >= 0 ? (i15 & 3) : 3) << 6;
+
+      if(shi == 0xE4 && slo == 0xE4)
+        {  // no permute
+          if(dozero)
+            {
+              // zero some elements
+              const __m256i maskz = constant8i<
+                  int((i0 < 0 ? 0 : 0xFFFF) | (i1 < 0 ? 0 : 0xFFFF0000)), int((i2 < 0 ? 0 : 0xFFFF) | (i3 < 0 ? 0 : 0xFFFF0000)),
+                  int((i4 < 0 ? 0 : 0xFFFF) | (i5 < 0 ? 0 : 0xFFFF0000)), int((i6 < 0 ? 0 : 0xFFFF) | (i7 < 0 ? 0 : 0xFFFF0000)),
+                  int((i8 < 0 ? 0 : 0xFFFF) | (i9 < 0 ? 0 : 0xFFFF0000)), int((i10 < 0 ? 0 : 0xFFFF) | (i11 < 0 ? 0 : 0xFFFF0000)),
+                  int((i12 < 0 ? 0 : 0xFFFF) | (i13 < 0 ? 0 : 0xFFFF0000)),
+                  int((i14 < 0 ? 0 : 0xFFFF) | (i15 < 0 ? 0 : 0xFFFF0000))>();
+              return _mm256_and_si256(a, maskz);
+            }
+          return a;  // do nothing
+        }
+      if(shi == 0xE4 && !dozero)
+        {
+          return _mm256_shufflelo_epi16(a, slo);  // low permute only
+        }
+      if(slo == 0xE4 && !dozero)
+        {
+          return _mm256_shufflehi_epi16(a, shi);  // high permute only
+        }
+    }
+
+  // Check if we can use 32-bit permute. Even numbered indexes must be even and odd numbered
+  // indexes must be equal to the preceding index + 1, except for negative indexes.
+  if(((mlo ^ 0x10101010) & 0x11111111 & zlo) == 0 && ((mlo ^ mlo >> 4) & 0x0E0E0E0E & zlo & zlo >> 4) == 0 &&
+     ((mhi ^ 0x10101010) & 0x11111111 & zhi) == 0 && ((mhi ^ mhi >> 4) & 0x0E0E0E0E & zhi & zhi >> 4) == 0)
+    {
+      const bool partialzero = int((i0 ^ i1) | (i2 ^ i3) | (i4 ^ i5) | (i6 ^ i7) | (i8 ^ i9) | (i10 ^ i11) | (i12 ^ i13) |
+                                   (i14 ^ i15)) < 0;                        // part of a 32-bit block is zeroed
+      const int blank1       = partialzero ? -0x100 : -1;                   // ignore or zero
+      const int n0           = i0 > 0 ? i0 / 2 : i1 > 0 ? i1 / 2 : blank1;  // indexes for 32-bit permute
+      const int n1           = i2 > 0 ? i2 / 2 : i3 > 0 ? i3 / 2 : blank1;
+      const int n2           = i4 > 0 ? i4 / 2 : i5 > 0 ? i5 / 2 : blank1;
+      const int n3           = i6 > 0 ? i6 / 2 : i7 > 0 ? i7 / 2 : blank1;
+      const int n4           = i8 > 0 ? i8 / 2 : i9 > 0 ? i9 / 2 : blank1;
+      const int n5           = i10 > 0 ? i10 / 2 : i11 > 0 ? i11 / 2 : blank1;
+      const int n6           = i12 > 0 ? i12 / 2 : i13 > 0 ? i13 / 2 : blank1;
+      const int n7           = i14 > 0 ? i14 / 2 : i15 > 0 ? i15 / 2 : blank1;
+      // do 32-bit permute
+      t1 = permute8i<n0, n1, n2, n3, n4, n5, n6, n7>(Vec8i(a));
+      if(blank1 == -1 || !dozero)
+        {
+          return t1;
+        }
+      // need more zeroing
+      mask =
+          constant8i<int((i0 < 0 ? 0 : 0xFFFF) | (i1 < 0 ? 0 : 0xFFFF0000)), int((i2 < 0 ? 0 : 0xFFFF) | (i3 < 0 ? 0 : 0xFFFF0000)),
+                     int((i4 < 0 ? 0 : 0xFFFF) | (i5 < 0 ? 0 : 0xFFFF0000)), int((i6 < 0 ? 0 : 0xFFFF) | (i7 < 0 ? 0 : 0xFFFF0000)),
+                     int((i8 < 0 ? 0 : 0xFFFF) | (i9 < 0 ? 0 : 0xFFFF0000)), int((i10 < 0 ? 0 : 0xFFFF) | (i11 < 0 ? 0 : 0xFFFF0000)),
+                     int((i12 < 0 ? 0 : 0xFFFF) | (i13 < 0 ? 0 : 0xFFFF0000)),
+                     int((i14 < 0 ? 0 : 0xFFFF) | (i15 < 0 ? 0 : 0xFFFF0000))>();
+      return _mm256_and_si256(t1, mask);
+    }
+
+  // special case: all elements from same half
+  if((mlo & 0x88888888 & zlo) == 0 && ((mhi ^ 0x88888888) & 0x88888888 & zhi) == 0)
+    {
+      mask = constant8i<(i0 < 0 ? 0xFFFF : (i0 & 7) * 0x202 + 0x100) | (i1 < 0 ? 0xFFFF : (i1 & 7) * 0x202 + 0x100) << 16,
+                        (i2 < 0 ? 0xFFFF : (i2 & 7) * 0x202 + 0x100) | (i3 < 0 ? 0xFFFF : (i3 & 7) * 0x202 + 0x100) << 16,
+                        (i4 < 0 ? 0xFFFF : (i4 & 7) * 0x202 + 0x100) | (i5 < 0 ? 0xFFFF : (i5 & 7) * 0x202 + 0x100) << 16,
+                        (i6 < 0 ? 0xFFFF : (i6 & 7) * 0x202 + 0x100) | (i7 < 0 ? 0xFFFF : (i7 & 7) * 0x202 + 0x100) << 16,
+                        (i8 < 0 ? 0xFFFF : (i8 & 7) * 0x202 + 0x100) | (i9 < 0 ? 0xFFFF : (i9 & 7) * 0x202 + 0x100) << 16,
+                        (i10 < 0 ? 0xFFFF : (i10 & 7) * 0x202 + 0x100) | (i11 < 0 ? 0xFFFF : (i11 & 7) * 0x202 + 0x100) << 16,
+                        (i12 < 0 ? 0xFFFF : (i12 & 7) * 0x202 + 0x100) | (i13 < 0 ? 0xFFFF : (i13 & 7) * 0x202 + 0x100) << 16,
+                        (i14 < 0 ? 0xFFFF : (i14 & 7) * 0x202 + 0x100) | (i15 < 0 ? 0xFFFF : (i15 & 7) * 0x202 + 0x100) << 16>();
+      return _mm256_shuffle_epi8(a, mask);
+    }
+
+  // special case: all elements from low half
+  if((mlo & 0x88888888 & zlo) == 0 && (mhi & 0x88888888 & zhi) == 0)
+    {
+      mask = constant8i<(i0 < 0 ? 0xFFFF : (i0 & 7) * 0x202 + 0x100) | (i1 < 0 ? 0xFFFF : (i1 & 7) * 0x202 + 0x100) << 16,
+                        (i2 < 0 ? 0xFFFF : (i2 & 7) * 0x202 + 0x100) | (i3 < 0 ? 0xFFFF : (i3 & 7) * 0x202 + 0x100) << 16,
+                        (i4 < 0 ? 0xFFFF : (i4 & 7) * 0x202 + 0x100) | (i5 < 0 ? 0xFFFF : (i5 & 7) * 0x202 + 0x100) << 16,
+                        (i6 < 0 ? 0xFFFF : (i6 & 7) * 0x202 + 0x100) | (i7 < 0 ? 0xFFFF : (i7 & 7) * 0x202 + 0x100) << 16,
+                        (i8 < 0 ? 0xFFFF : (i8 & 7) * 0x202 + 0x100) | (i9 < 0 ? 0xFFFF : (i9 & 7) * 0x202 + 0x100) << 16,
+                        (i10 < 0 ? 0xFFFF : (i10 & 7) * 0x202 + 0x100) | (i11 < 0 ? 0xFFFF : (i11 & 7) * 0x202 + 0x100) << 16,
+                        (i12 < 0 ? 0xFFFF : (i12 & 7) * 0x202 + 0x100) | (i13 < 0 ? 0xFFFF : (i13 & 7) * 0x202 + 0x100) << 16,
+                        (i14 < 0 ? 0xFFFF : (i14 & 7) * 0x202 + 0x100) | (i15 < 0 ? 0xFFFF : (i15 & 7) * 0x202 + 0x100) << 16>();
+      t1   = _mm256_inserti128_si256(a, _mm256_castsi256_si128(a), 1);  // low, low
+      return _mm256_shuffle_epi8(t1, mask);
+    }
+
+  // special case: all elements from high half
+  if(((mlo ^ 0x88888888) & 0x88888888 & zlo) == 0 && ((mhi ^ 0x88888888) & 0x88888888 & zhi) == 0)
+    {
+      mask = constant8i<(i0 < 0 ? 0xFFFF : (i0 & 7) * 0x202 + 0x100) | (i1 < 0 ? 0xFFFF : (i1 & 7) * 0x202 + 0x100) << 16,
+                        (i2 < 0 ? 0xFFFF : (i2 & 7) * 0x202 + 0x100) | (i3 < 0 ? 0xFFFF : (i3 & 7) * 0x202 + 0x100) << 16,
+                        (i4 < 0 ? 0xFFFF : (i4 & 7) * 0x202 + 0x100) | (i5 < 0 ? 0xFFFF : (i5 & 7) * 0x202 + 0x100) << 16,
+                        (i6 < 0 ? 0xFFFF : (i6 & 7) * 0x202 + 0x100) | (i7 < 0 ? 0xFFFF : (i7 & 7) * 0x202 + 0x100) << 16,
+                        (i8 < 0 ? 0xFFFF : (i8 & 7) * 0x202 + 0x100) | (i9 < 0 ? 0xFFFF : (i9 & 7) * 0x202 + 0x100) << 16,
+                        (i10 < 0 ? 0xFFFF : (i10 & 7) * 0x202 + 0x100) | (i11 < 0 ? 0xFFFF : (i11 & 7) * 0x202 + 0x100) << 16,
+                        (i12 < 0 ? 0xFFFF : (i12 & 7) * 0x202 + 0x100) | (i13 < 0 ? 0xFFFF : (i13 & 7) * 0x202 + 0x100) << 16,
+                        (i14 < 0 ? 0xFFFF : (i14 & 7) * 0x202 + 0x100) | (i15 < 0 ? 0xFFFF : (i15 & 7) * 0x202 + 0x100) << 16>();
+      t1   = _mm256_permute4x64_epi64(a, 0xEE);  // high, high
+      return _mm256_shuffle_epi8(t1, mask);
+    }
+
+  // special case: all elements from opposite half
+  if(((mlo ^ 0x88888888) & 0x88888888 & zlo) == 0 && (mhi & 0x88888888 & zhi) == 0)
+    {
+      mask = constant8i<(i0 < 0 ? 0xFFFF : (i0 & 7) * 0x202 + 0x100) | (i1 < 0 ? 0xFFFF : (i1 & 7) * 0x202 + 0x100) << 16,
+                        (i2 < 0 ? 0xFFFF : (i2 & 7) * 0x202 + 0x100) | (i3 < 0 ? 0xFFFF : (i3 & 7) * 0x202 + 0x100) << 16,
+                        (i4 < 0 ? 0xFFFF : (i4 & 7) * 0x202 + 0x100) | (i5 < 0 ? 0xFFFF : (i5 & 7) * 0x202 + 0x100) << 16,
+                        (i6 < 0 ? 0xFFFF : (i6 & 7) * 0x202 + 0x100) | (i7 < 0 ? 0xFFFF : (i7 & 7) * 0x202 + 0x100) << 16,
+                        (i8 < 0 ? 0xFFFF : (i8 & 7) * 0x202 + 0x100) | (i9 < 0 ? 0xFFFF : (i9 & 7) * 0x202 + 0x100) << 16,
+                        (i10 < 0 ? 0xFFFF : (i10 & 7) * 0x202 + 0x100) | (i11 < 0 ? 0xFFFF : (i11 & 7) * 0x202 + 0x100) << 16,
+                        (i12 < 0 ? 0xFFFF : (i12 & 7) * 0x202 + 0x100) | (i13 < 0 ? 0xFFFF : (i13 & 7) * 0x202 + 0x100) << 16,
+                        (i14 < 0 ? 0xFFFF : (i14 & 7) * 0x202 + 0x100) | (i15 < 0 ? 0xFFFF : (i15 & 7) * 0x202 + 0x100) << 16>();
+      t1   = _mm256_permute4x64_epi64(a, 0x4E);  // high, low
+      return _mm256_shuffle_epi8(t1, mask);
+    }
+
+  // general case: elements from both halves
+  const __m256i mmsame =
+      constant8i<((i0 ^ 8) < 8 ? 0xFFFF : (i0 & 7) * 0x202 + 0x100) | ((i1 ^ 8) < 8 ? 0xFFFF : (i1 & 7) * 0x202 + 0x100) << 16,
+                 ((i2 ^ 8) < 8 ? 0xFFFF : (i2 & 7) * 0x202 + 0x100) | ((i3 ^ 8) < 8 ? 0xFFFF : (i3 & 7) * 0x202 + 0x100) << 16,
+                 ((i4 ^ 8) < 8 ? 0xFFFF : (i4 & 7) * 0x202 + 0x100) | ((i5 ^ 8) < 8 ? 0xFFFF : (i5 & 7) * 0x202 + 0x100) << 16,
+                 ((i6 ^ 8) < 8 ? 0xFFFF : (i6 & 7) * 0x202 + 0x100) | ((i7 ^ 8) < 8 ? 0xFFFF : (i7 & 7) * 0x202 + 0x100) << 16,
+                 (i8 < 8 ? 0xFFFF : (i8 & 7) * 0x202 + 0x100) | (i9 < 8 ? 0xFFFF : (i9 & 7) * 0x202 + 0x100) << 16,
+                 (i10 < 8 ? 0xFFFF : (i10 & 7) * 0x202 + 0x100) | (i11 < 8 ? 0xFFFF : (i11 & 7) * 0x202 + 0x100) << 16,
+                 (i12 < 8 ? 0xFFFF : (i12 & 7) * 0x202 + 0x100) | (i13 < 8 ? 0xFFFF : (i13 & 7) * 0x202 + 0x100) << 16,
+                 (i14 < 8 ? 0xFFFF : (i14 & 7) * 0x202 + 0x100) | (i15 < 8 ? 0xFFFF : (i15 & 7) * 0x202 + 0x100) << 16>();
+
+  const __m256i mmopposite =
+      constant8i<(i0 < 8 ? 0xFFFF : (i0 & 7) * 0x202 + 0x100) | (i1 < 8 ? 0xFFFF : (i1 & 7) * 0x202 + 0x100) << 16,
+                 (i2 < 8 ? 0xFFFF : (i2 & 7) * 0x202 + 0x100) | (i3 < 8 ? 0xFFFF : (i3 & 7) * 0x202 + 0x100) << 16,
+                 (i4 < 8 ? 0xFFFF : (i4 & 7) * 0x202 + 0x100) | (i5 < 8 ? 0xFFFF : (i5 & 7) * 0x202 + 0x100) << 16,
+                 (i6 < 8 ? 0xFFFF : (i6 & 7) * 0x202 + 0x100) | (i7 < 8 ? 0xFFFF : (i7 & 7) * 0x202 + 0x100) << 16,
+                 ((i8 ^ 8) < 8 ? 0xFFFF : (i8 & 7) * 0x202 + 0x100) | ((i9 ^ 8) < 8 ? 0xFFFF : (i9 & 7) * 0x202 + 0x100) << 16,
+                 ((i10 ^ 8) < 8 ? 0xFFFF : (i10 & 7) * 0x202 + 0x100) | ((i11 ^ 8) < 8 ? 0xFFFF : (i11 & 7) * 0x202 + 0x100) << 16,
+                 ((i12 ^ 8) < 8 ? 0xFFFF : (i12 & 7) * 0x202 + 0x100) | ((i13 ^ 8) < 8 ? 0xFFFF : (i13 & 7) * 0x202 + 0x100) << 16,
+                 ((i14 ^ 8) < 8 ? 0xFFFF : (i14 & 7) * 0x202 + 0x100) | ((i15 ^ 8) < 8 ? 0xFFFF : (i15 & 7) * 0x202 + 0x100) << 16>();
+
+  __m256i topp = _mm256_permute4x64_epi64(a, 0x4E);  // high, low
+  __m256i r1   = _mm256_shuffle_epi8(topp, mmopposite);
+  __m256i r2   = _mm256_shuffle_epi8(a, mmsame);
+  return _mm256_or_si256(r1, r2);
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16us permute16us(Vec16us const &a)
+{
+  return Vec16us(permute16s<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15>(a));
+}
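+
+// Illustrative usage sketch (added comment, not part of the upstream VCL code):
+// broadcasting one element is simply a permute where every index is the same:
+//   Vec16s a(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+//   Vec16s b = permute16s<5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5>(a);   // all elements are 5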
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15, int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23, int i24, int i25, int i26, int i27, int i28,
+          int i29, int i30, int i31>
+static inline Vec32c permute32c(Vec32c const &a)
+{
+  // collect bit 4 of each index
+  const int m1 = (i0 & 16) >> 4 | (i1 & 16) >> 3 | (i2 & 16) >> 2 | (i3 & 16) >> 1 | (i4 & 16) | (i5 & 16) << 1 | (i6 & 16) << 2 |
+                 (i7 & 16) << 3 | (i8 & 16) << 4 | (i9 & 16) << 5 | (i10 & 16) << 6 | (i11 & 16) << 7 | (i12 & 16) << 8 |
+                 (i13 & 16) << 9 | (i14 & 16) << 10 | (i15 & 16) << 11 | (i16 & 16) << 12 | (i17 & 16) << 13 | (i18 & 16) << 14 |
+                 (i19 & 16) << 15 | (i20 & 16) << 16 | (i21 & 16) << 17 | (i22 & 16) << 18 | (i23 & 16) << 19 | (i24 & 16) << 20 |
+                 (i25 & 16) << 21 | (i26 & 16) << 22 | (i27 & 16) << 23 | (i28 & 16) << 24 | (i29 & 16) << 25 | (i30 & 16) << 26 |
+                 (i31 & 16) << 27;
+
+  // check which elements to set to zero
+  const int mz =
+      ~((i0 < 0) | (i1 < 0) << 1 | (i2 < 0) << 2 | (i3 < 0) << 3 | (i4 < 0) << 4 | (i5 < 0) << 5 | (i6 < 0) << 6 | (i7 < 0) << 7 |
+        (i8 < 0) << 8 | (i9 < 0) << 9 | (i10 < 0) << 10 | (i11 < 0) << 11 | (i12 < 0) << 12 | (i13 < 0) << 13 | (i14 < 0) << 14 |
+        (i15 < 0) << 15 | (i16 < 0) << 16 | (i17 < 0) << 17 | (i18 < 0) << 18 | (i19 < 0) << 19 | (i20 < 0) << 20 | (i21 < 0) << 21 |
+        (i22 < 0) << 22 | (i23 < 0) << 23 | (i24 < 0) << 24 | (i25 < 0) << 25 | (i26 < 0) << 26 | (i27 < 0) << 27 | (i28 < 0) << 28 |
+        (i29 < 0) << 29 | (i30 < 0) << 30 | (i31 < 0) << 31);
+
+  // Combine indexes 0-7, 8-15, 16-23, 24-31 into four bitfields, with 8 bits for each index
+  const uint64_t g0 = (i0 & 0x1F) | (i1 & 0x1F) << 8 | (i2 & 0x1F) << 16 | (i3 & 0x1F) << 24 | (i4 & 0x1FLL) << 32 |
+                      (i5 & 0x1FLL) << 40 | (i6 & 0x1FLL) << 48 | (i7 & 0x1FLL) << 56;
+  const uint64_t g1 = (i8 & 0x1F) | (i9 & 0x1F) << 8 | (i10 & 0x1F) << 16 | (i11 & 0x1F) << 24 | (i12 & 0x1FLL) << 32 |
+                      (i13 & 0x1FLL) << 40 | (i14 & 0x1FLL) << 48 | (i15 & 0x1FLL) << 56;
+  const uint64_t g2 = (i16 & 0x1F) | (i17 & 0x1F) << 8 | (i18 & 0x1F) << 16 | (i19 & 0x1F) << 24 | (i20 & 0x1FLL) << 32 |
+                      (i21 & 0x1FLL) << 40 | (i22 & 0x1FLL) << 48 | (i23 & 0x1FLL) << 56;
+  const uint64_t g3 = (i24 & 0x1F) | (i25 & 0x1F) << 8 | (i26 & 0x1F) << 16 | (i27 & 0x1F) << 24 | (i28 & 0x1FLL) << 32 |
+                      (i29 & 0x1FLL) << 40 | (i30 & 0x1FLL) << 48 | (i31 & 0x1FLL) << 56;
+
+  // Masks to zero out negative indexes
+  const uint64_t z0 = (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF) << 8 | (i2 < 0 ? 0 : 0xFF) << 16 | (i3 < 0 ? 0 : 0xFF) << 24 |
+                      (i4 < 0 ? 0 : 0xFFLL) << 32 | (i5 < 0 ? 0 : 0xFFLL) << 40 | (i6 < 0 ? 0 : 0xFFLL) << 48 |
+                      (i7 < 0 ? 0 : 0xFFLL) << 56;
+  const uint64_t z1 = (i8 < 0 ? 0 : 0xFF) | (i9 < 0 ? 0 : 0xFF) << 8 | (i10 < 0 ? 0 : 0xFF) << 16 | (i11 < 0 ? 0 : 0xFF) << 24 |
+                      (i12 < 0 ? 0 : 0xFFLL) << 32 | (i13 < 0 ? 0 : 0xFFLL) << 40 | (i14 < 0 ? 0 : 0xFFLL) << 48 |
+                      (i15 < 0 ? 0 : 0xFFLL) << 56;
+  const uint64_t z2 = (i16 < 0 ? 0 : 0xFF) | (i17 < 0 ? 0 : 0xFF) << 8 | (i18 < 0 ? 0 : 0xFF) << 16 | (i19 < 0 ? 0 : 0xFF) << 24 |
+                      (i20 < 0 ? 0 : 0xFFLL) << 32 | (i21 < 0 ? 0 : 0xFFLL) << 40 | (i22 < 0 ? 0 : 0xFFLL) << 48 |
+                      (i23 < 0 ? 0 : 0xFFLL) << 56;
+  const uint64_t z3 = (i24 < 0 ? 0 : 0xFF) | (i25 < 0 ? 0 : 0xFF) << 8 | (i26 < 0 ? 0 : 0xFF) << 16 | (i27 < 0 ? 0 : 0xFF) << 24 |
+                      (i28 < 0 ? 0 : 0xFFLL) << 32 | (i29 < 0 ? 0 : 0xFFLL) << 40 | (i30 < 0 ? 0 : 0xFFLL) << 48 |
+                      (i31 < 0 ? 0 : 0xFFLL) << 56;
+
+  // zeroing needed
+  const bool dozero = ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7 | i8 | i9 | i10 | i11 | i12 | i13 | i14 | i15 | i16 | i17 | i18 | i19 |
+                        i20 | i21 | i22 | i23 | i24 | i25 | i26 | i27 | i28 | i29 | i30 | i31) &
+                       0x80) != 0;
+
+  __m256i t1, mask;
+
+  // special case: all zero
+  if(mz == 0)
+    return _mm256_setzero_si256();
+
+  // special case: no permute
+  if((i0 < 0 || i0 == 0) && (i1 < 0 || i1 == 1) && (i2 < 0 || i2 == 2) && (i3 < 0 || i3 == 3) && (i4 < 0 || i4 == 4) &&
+     (i5 < 0 || i5 == 5) && (i6 < 0 || i6 == 6) && (i7 < 0 || i7 == 7) && (i8 < 0 || i8 == 8) && (i9 < 0 || i9 == 9) &&
+     (i10 < 0 || i10 == 10) && (i11 < 0 || i11 == 11) && (i12 < 0 || i12 == 12) && (i13 < 0 || i13 == 13) && (i14 < 0 || i14 == 14) &&
+     (i15 < 0 || i15 == 15) && (i16 < 0 || i16 == 16) && (i17 < 0 || i17 == 17) && (i18 < 0 || i18 == 18) && (i19 < 0 || i19 == 19) &&
+     (i20 < 0 || i20 == 20) && (i21 < 0 || i21 == 21) && (i22 < 0 || i22 == 22) && (i23 < 0 || i23 == 23) && (i24 < 0 || i24 == 24) &&
+     (i25 < 0 || i25 == 25) && (i26 < 0 || i26 == 26) && (i27 < 0 || i27 == 27) && (i28 < 0 || i28 == 28) && (i29 < 0 || i29 == 29) &&
+     (i30 < 0 || i30 == 30) && (i31 < 0 || i31 == 31))
+    {
+      if(dozero)
+        {
+          // zero some elements
+          mask =
+              constant8i<int((i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF00) | (i2 < 0 ? 0 : 0xFF0000) | (i3 < 0 ? 0 : 0xFF000000)),
+                         int((i4 < 0 ? 0 : 0xFF) | (i5 < 0 ? 0 : 0xFF00) | (i6 < 0 ? 0 : 0xFF0000) | (i7 < 0 ? 0 : 0xFF000000)),
+                         int((i8 < 0 ? 0 : 0xFF) | (i9 < 0 ? 0 : 0xFF00) | (i10 < 0 ? 0 : 0xFF0000) | (i11 < 0 ? 0 : 0xFF000000)),
+                         int((i12 < 0 ? 0 : 0xFF) | (i13 < 0 ? 0 : 0xFF00) | (i14 < 0 ? 0 : 0xFF0000) | (i15 < 0 ? 0 : 0xFF000000)),
+                         int((i16 < 0 ? 0 : 0xFF) | (i17 < 0 ? 0 : 0xFF00) | (i18 < 0 ? 0 : 0xFF0000) | (i19 < 0 ? 0 : 0xFF000000)),
+                         int((i20 < 0 ? 0 : 0xFF) | (i21 < 0 ? 0 : 0xFF00) | (i22 < 0 ? 0 : 0xFF0000) | (i23 < 0 ? 0 : 0xFF000000)),
+                         int((i24 < 0 ? 0 : 0xFF) | (i25 < 0 ? 0 : 0xFF00) | (i26 < 0 ? 0 : 0xFF0000) | (i27 < 0 ? 0 : 0xFF000000)),
+                         int((i28 < 0 ? 0 : 0xFF) | (i29 < 0 ? 0 : 0xFF00) | (i30 < 0 ? 0 : 0xFF0000) | (i31 < 0 ? 0 : 0xFF000000))>();
+          return _mm256_and_si256(a, mask);
+        }
+      return a;  // do nothing
+    }
+
+  // special case: rotate 128 bits
+  if(i0 >= 0 && i0 < 32 && i1 == ((i0 + 1) & 15) && i2 == ((i0 + 2) & 15) && i3 == ((i0 + 3) & 15) && i4 == ((i0 + 4) & 15) &&
+     i5 == ((i0 + 5) & 15) && i6 == ((i0 + 6) & 15) && i7 == ((i0 + 7) & 15) && i8 == ((i0 + 8) & 15) && i9 == ((i0 + 9) & 15) &&
+     i10 == ((i0 + 10) & 15) && i11 == ((i0 + 11) & 15) && i12 == ((i0 + 12) & 15) && i13 == ((i0 + 13) & 15) &&
+     i14 == ((i0 + 14) & 15) && i15 == ((i0 + 15) & 15) && i16 == i0 + 16 && i17 == i1 + 16 && i18 == i2 + 16 && i19 == i3 + 16 &&
+     i20 == i4 + 16 && i21 == i5 + 16 && i22 == i6 + 16 && i23 == i7 + 16 && i24 == i8 + 16 && i25 == i9 + 16 && i26 == i10 + 16 &&
+     i27 == i11 + 16 && i28 == i12 + 16 && i29 == i13 + 16 && i30 == i14 + 16 && i31 == i15 + 16)
+    {
+      return _mm256_alignr_epi8(a, a, i0 & 15);
+    }
+
+  // special case: rotate 256 bits
+  if(i0 >= 0 && i0 < 32 && i1 == ((i0 + 1) & 31) && i2 == ((i0 + 2) & 31) && i3 == ((i0 + 3) & 31) && i4 == ((i0 + 4) & 31) &&
+     i5 == ((i0 + 5) & 31) && i6 == ((i0 + 6) & 31) && i7 == ((i0 + 7) & 31) && i8 == ((i0 + 8) & 31) && i9 == ((i0 + 9) & 31) &&
+     i10 == ((i0 + 10) & 31) && i11 == ((i0 + 11) & 31) && i12 == ((i0 + 12) & 31) && i13 == ((i0 + 13) & 31) &&
+     i14 == ((i0 + 14) & 31) && i15 == ((i0 + 15) & 31) && i16 == ((i0 + 16) & 31) && i17 == ((i0 + 17) & 31) &&
+     i18 == ((i0 + 18) & 31) && i19 == ((i0 + 19) & 31) && i20 == ((i0 + 20) & 31) && i21 == ((i0 + 21) & 31) &&
+     i22 == ((i0 + 22) & 31) && i23 == ((i0 + 23) & 31) && i24 == ((i0 + 24) & 31) && i25 == ((i0 + 25) & 31) &&
+     i26 == ((i0 + 26) & 31) && i27 == ((i0 + 27) & 31) && i28 == ((i0 + 28) & 31) && i29 == ((i0 + 29) & 31) &&
+     i30 == ((i0 + 30) & 31) && i31 == ((i0 + 31) & 31))
+    {
+      t1 = _mm256_permute4x64_epi64(a, 0x4E);
+      return _mm256_alignr_epi8(a, t1, i0 & 15);
+    }
+
+  // Check if we can use 16-bit permute. Even numbered indexes must be even and odd numbered
+  // indexes must be equal to the preceding index + 1, except for negative indexes.
+  if(((g0 ^ 0x0100010001000100) & 0x0101010101010101 & z0) == 0 && ((g0 ^ g0 >> 8) & 0x00FE00FE00FE00FE & z0 & z0 >> 8) == 0 &&
+     ((g1 ^ 0x0100010001000100) & 0x0101010101010101 & z1) == 0 && ((g1 ^ g1 >> 8) & 0x00FE00FE00FE00FE & z1 & z1 >> 8) == 0 &&
+     ((g2 ^ 0x0100010001000100) & 0x0101010101010101 & z2) == 0 && ((g2 ^ g2 >> 8) & 0x00FE00FE00FE00FE & z2 & z2 >> 8) == 0 &&
+     ((g3 ^ 0x0100010001000100) & 0x0101010101010101 & z3) == 0 && ((g3 ^ g3 >> 8) & 0x00FE00FE00FE00FE & z3 & z3 >> 8) == 0)
+    {
+      const bool partialzero = int((i0 ^ i1) | (i2 ^ i3) | (i4 ^ i5) | (i6 ^ i7) | (i8 ^ i9) | (i10 ^ i11) | (i12 ^ i13) |
+                                   (i14 ^ i15) | (i16 ^ i17) | (i18 ^ i19) | (i20 ^ i21) | (i22 ^ i23) | (i24 ^ i25) | (i26 ^ i27) |
+                                   (i28 ^ i29) | (i30 ^ i31)) < 0;          // part of a 16-bit block is zeroed
+      const int blank1       = partialzero ? -0x100 : -1;                   // ignore or zero
+      const int n0           = i0 > 0 ? i0 / 2 : i1 > 0 ? i1 / 2 : blank1;  // indexes for 16-bit permute
+      const int n1           = i2 > 0 ? i2 / 2 : i3 > 0 ? i3 / 2 : blank1;
+      const int n2           = i4 > 0 ? i4 / 2 : i5 > 0 ? i5 / 2 : blank1;
+      const int n3           = i6 > 0 ? i6 / 2 : i7 > 0 ? i7 / 2 : blank1;
+      const int n4           = i8 > 0 ? i8 / 2 : i9 > 0 ? i9 / 2 : blank1;
+      const int n5           = i10 > 0 ? i10 / 2 : i11 > 0 ? i11 / 2 : blank1;
+      const int n6           = i12 > 0 ? i12 / 2 : i13 > 0 ? i13 / 2 : blank1;
+      const int n7           = i14 > 0 ? i14 / 2 : i15 > 0 ? i15 / 2 : blank1;
+      const int n8           = i16 > 0 ? i16 / 2 : i17 > 0 ? i17 / 2 : blank1;
+      const int n9           = i18 > 0 ? i18 / 2 : i19 > 0 ? i19 / 2 : blank1;
+      const int n10          = i20 > 0 ? i20 / 2 : i21 > 0 ? i21 / 2 : blank1;
+      const int n11          = i22 > 0 ? i22 / 2 : i23 > 0 ? i23 / 2 : blank1;
+      const int n12          = i24 > 0 ? i24 / 2 : i25 > 0 ? i25 / 2 : blank1;
+      const int n13          = i26 > 0 ? i26 / 2 : i27 > 0 ? i27 / 2 : blank1;
+      const int n14          = i28 > 0 ? i28 / 2 : i29 > 0 ? i29 / 2 : blank1;
+      const int n15          = i30 > 0 ? i30 / 2 : i31 > 0 ? i31 / 2 : blank1;
+      // do 16-bit permute
+      t1 = permute16s<n0, n1, n2, n3, n4, n5, n6, n7, n8, n9, n10, n11, n12, n13, n14, n15>(Vec16s(a));
+      if(blank1 == -1 || !dozero)
+        {
+          return t1;
+        }
+      // need more zeroing
+      mask = constant8i<int((i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF00) | (i2 < 0 ? 0 : 0xFF0000) | (i3 < 0 ? 0 : 0xFF000000)),
+                        int((i4 < 0 ? 0 : 0xFF) | (i5 < 0 ? 0 : 0xFF00) | (i6 < 0 ? 0 : 0xFF0000) | (i7 < 0 ? 0 : 0xFF000000)),
+                        int((i8 < 0 ? 0 : 0xFF) | (i9 < 0 ? 0 : 0xFF00) | (i10 < 0 ? 0 : 0xFF0000) | (i11 < 0 ? 0 : 0xFF000000)),
+                        int((i12 < 0 ? 0 : 0xFF) | (i13 < 0 ? 0 : 0xFF00) | (i14 < 0 ? 0 : 0xFF0000) | (i15 < 0 ? 0 : 0xFF000000)),
+                        int((i16 < 0 ? 0 : 0xFF) | (i17 < 0 ? 0 : 0xFF00) | (i18 < 0 ? 0 : 0xFF0000) | (i19 < 0 ? 0 : 0xFF000000)),
+                        int((i20 < 0 ? 0 : 0xFF) | (i21 < 0 ? 0 : 0xFF00) | (i22 < 0 ? 0 : 0xFF0000) | (i23 < 0 ? 0 : 0xFF000000)),
+                        int((i24 < 0 ? 0 : 0xFF) | (i25 < 0 ? 0 : 0xFF00) | (i26 < 0 ? 0 : 0xFF0000) | (i27 < 0 ? 0 : 0xFF000000)),
+                        int((i28 < 0 ? 0 : 0xFF) | (i29 < 0 ? 0 : 0xFF00) | (i30 < 0 ? 0 : 0xFF0000) | (i31 < 0 ? 0 : 0xFF000000))>();
+      return _mm256_and_si256(t1, mask);
+    }
+
+  // special case: all elements from same half
+  if(((m1 ^ 0xFFFF0000) & mz) == 0)
+    {
+      mask = constant8i<(i0 & 0xFF) | (i1 & 0xFF) << 8 | (i2 & 0xFF) << 16 | (i3 & 0xFF) << 24,
+                        (i4 & 0xFF) | (i5 & 0xFF) << 8 | (i6 & 0xFF) << 16 | (i7 & 0xFF) << 24,
+                        (i8 & 0xFF) | (i9 & 0xFF) << 8 | (i10 & 0xFF) << 16 | (i11 & 0xFF) << 24,
+                        (i12 & 0xFF) | (i13 & 0xFF) << 8 | (i14 & 0xFF) << 16 | (i15 & 0xFF) << 24,
+                        (i16 & 0xEF) | (i17 & 0xEF) << 8 | (i18 & 0xEF) << 16 | (i19 & 0xEF) << 24,
+                        (i20 & 0xEF) | (i21 & 0xEF) << 8 | (i22 & 0xEF) << 16 | (i23 & 0xEF) << 24,
+                        (i24 & 0xEF) | (i25 & 0xEF) << 8 | (i26 & 0xEF) << 16 | (i27 & 0xEF) << 24,
+                        (i28 & 0xEF) | (i29 & 0xEF) << 8 | (i30 & 0xEF) << 16 | (i31 & 0xEF) << 24>();
+      return _mm256_shuffle_epi8(a, mask);
+    }
+
+  // special case: all elements from low half
+  if((m1 & mz) == 0)
+    {
+      mask = constant8i<(i0 & 0xFF) | (i1 & 0xFF) << 8 | (i2 & 0xFF) << 16 | (i3 & 0xFF) << 24,
+                        (i4 & 0xFF) | (i5 & 0xFF) << 8 | (i6 & 0xFF) << 16 | (i7 & 0xFF) << 24,
+                        (i8 & 0xFF) | (i9 & 0xFF) << 8 | (i10 & 0xFF) << 16 | (i11 & 0xFF) << 24,
+                        (i12 & 0xFF) | (i13 & 0xFF) << 8 | (i14 & 0xFF) << 16 | (i15 & 0xFF) << 24,
+                        (i16 & 0xFF) | (i17 & 0xFF) << 8 | (i18 & 0xFF) << 16 | (i19 & 0xFF) << 24,
+                        (i20 & 0xFF) | (i21 & 0xFF) << 8 | (i22 & 0xFF) << 16 | (i23 & 0xFF) << 24,
+                        (i24 & 0xFF) | (i25 & 0xFF) << 8 | (i26 & 0xFF) << 16 | (i27 & 0xFF) << 24,
+                        (i28 & 0xFF) | (i29 & 0xFF) << 8 | (i30 & 0xFF) << 16 | (i31 & 0xFF) << 24>();
+      t1   = _mm256_inserti128_si256(a, _mm256_castsi256_si128(a), 1);  // low, low
+      return _mm256_shuffle_epi8(t1, mask);
+    }
+
+  // special case: all elements from high half
+  if(((m1 ^ 0xFFFFFFFF) & mz) == 0)
+    {
+      mask = constant8i<(i0 & 0xEF) | (i1 & 0xEF) << 8 | (i2 & 0xEF) << 16 | (i3 & 0xEF) << 24,
+                        (i4 & 0xEF) | (i5 & 0xEF) << 8 | (i6 & 0xEF) << 16 | (i7 & 0xEF) << 24,
+                        (i8 & 0xEF) | (i9 & 0xEF) << 8 | (i10 & 0xEF) << 16 | (i11 & 0xEF) << 24,
+                        (i12 & 0xEF) | (i13 & 0xEF) << 8 | (i14 & 0xEF) << 16 | (i15 & 0xEF) << 24,
+                        (i16 & 0xEF) | (i17 & 0xEF) << 8 | (i18 & 0xEF) << 16 | (i19 & 0xEF) << 24,
+                        (i20 & 0xEF) | (i21 & 0xEF) << 8 | (i22 & 0xEF) << 16 | (i23 & 0xEF) << 24,
+                        (i24 & 0xEF) | (i25 & 0xEF) << 8 | (i26 & 0xEF) << 16 | (i27 & 0xEF) << 24,
+                        (i28 & 0xEF) | (i29 & 0xEF) << 8 | (i30 & 0xEF) << 16 | (i31 & 0xEF) << 24>();
+      t1   = _mm256_permute4x64_epi64(a, 0xEE);  // high, high
+      return _mm256_shuffle_epi8(t1, mask);
+    }
+
+  // special case: all elements from opposite half
+  if(((m1 ^ 0x0000FFFF) & mz) == 0)
+    {
+      mask = constant8i<(i0 & 0xEF) | (i1 & 0xEF) << 8 | (i2 & 0xEF) << 16 | (i3 & 0xEF) << 24,
+                        (i4 & 0xEF) | (i5 & 0xEF) << 8 | (i6 & 0xEF) << 16 | (i7 & 0xEF) << 24,
+                        (i8 & 0xEF) | (i9 & 0xEF) << 8 | (i10 & 0xEF) << 16 | (i11 & 0xEF) << 24,
+                        (i12 & 0xEF) | (i13 & 0xEF) << 8 | (i14 & 0xEF) << 16 | (i15 & 0xEF) << 24,
+                        (i16 & 0xFF) | (i17 & 0xFF) << 8 | (i18 & 0xFF) << 16 | (i19 & 0xFF) << 24,
+                        (i20 & 0xFF) | (i21 & 0xFF) << 8 | (i22 & 0xFF) << 16 | (i23 & 0xFF) << 24,
+                        (i24 & 0xFF) | (i25 & 0xFF) << 8 | (i26 & 0xFF) << 16 | (i27 & 0xFF) << 24,
+                        (i28 & 0xFF) | (i29 & 0xFF) << 8 | (i30 & 0xFF) << 16 | (i31 & 0xFF) << 24>();
+
+      t1 = _mm256_permute4x64_epi64(a, 0x4E);  // high, low
+      return _mm256_shuffle_epi8(t1, mask);
+    }
+
+  // general case: elements from both halves
+  const __m256i mmsame =
+      constant8i<((i0 & 0xF0) ? 0xFF : (i0 & 15)) | ((i1 & 0xF0) ? 0xFF : (i1 & 15)) << 8 | ((i2 & 0xF0) ? 0xFF : (i2 & 15)) << 16 |
+                     ((i3 & 0xF0) ? 0xFF : (i3 & 15)) << 24,
+                 ((i4 & 0xF0) ? 0xFF : (i4 & 15)) | ((i5 & 0xF0) ? 0xFF : (i5 & 15)) << 8 | ((i6 & 0xF0) ? 0xFF : (i6 & 15)) << 16 |
+                     ((i7 & 0xF0) ? 0xFF : (i7 & 15)) << 24,
+                 ((i8 & 0xF0) ? 0xFF : (i8 & 15)) | ((i9 & 0xF0) ? 0xFF : (i9 & 15)) << 8 | ((i10 & 0xF0) ? 0xFF : (i10 & 15)) << 16 |
+                     ((i11 & 0xF0) ? 0xFF : (i11 & 15)) << 24,
+                 ((i12 & 0xF0) ? 0xFF : (i12 & 15)) | ((i13 & 0xF0) ? 0xFF : (i13 & 15)) << 8 |
+                     ((i14 & 0xF0) ? 0xFF : (i14 & 15)) << 16 | ((i15 & 0xF0) ? 0xFF : (i15 & 15)) << 24,
+                 ((i16 & 0xF0) != 0x10 ? 0xFF : (i16 & 15)) | ((i17 & 0xF0) != 0x10 ? 0xFF : (i17 & 15)) << 8 |
+                     ((i18 & 0xF0) != 0x10 ? 0xFF : (i18 & 15)) << 16 | ((i19 & 0xF0) != 0x10 ? 0xFF : (i19 & 15)) << 24,
+                 ((i20 & 0xF0) != 0x10 ? 0xFF : (i20 & 15)) | ((i21 & 0xF0) != 0x10 ? 0xFF : (i21 & 15)) << 8 |
+                     ((i22 & 0xF0) != 0x10 ? 0xFF : (i22 & 15)) << 16 | ((i23 & 0xF0) != 0x10 ? 0xFF : (i23 & 15)) << 24,
+                 ((i24 & 0xF0) != 0x10 ? 0xFF : (i24 & 15)) | ((i25 & 0xF0) != 0x10 ? 0xFF : (i25 & 15)) << 8 |
+                     ((i26 & 0xF0) != 0x10 ? 0xFF : (i26 & 15)) << 16 | ((i27 & 0xF0) != 0x10 ? 0xFF : (i27 & 15)) << 24,
+                 ((i28 & 0xF0) != 0x10 ? 0xFF : (i28 & 15)) | ((i29 & 0xF0) != 0x10 ? 0xFF : (i29 & 15)) << 8 |
+                     ((i30 & 0xF0) != 0x10 ? 0xFF : (i30 & 15)) << 16 | ((i31 & 0xF0) != 0x10 ? 0xFF : (i31 & 15)) << 24>();
+
+  const __m256i mmopposite =
+      constant8i<((i0 & 0xF0) != 0x10 ? 0xFF : (i0 & 15)) | ((i1 & 0xF0) != 0x10 ? 0xFF : (i1 & 15)) << 8 |
+                     ((i2 & 0xF0) != 0x10 ? 0xFF : (i2 & 15)) << 16 | ((i3 & 0xF0) != 0x10 ? 0xFF : (i3 & 15)) << 24,
+                 ((i4 & 0xF0) != 0x10 ? 0xFF : (i4 & 15)) | ((i5 & 0xF0) != 0x10 ? 0xFF : (i5 & 15)) << 8 |
+                     ((i6 & 0xF0) != 0x10 ? 0xFF : (i6 & 15)) << 16 | ((i7 & 0xF0) != 0x10 ? 0xFF : (i7 & 15)) << 24,
+                 ((i8 & 0xF0) != 0x10 ? 0xFF : (i8 & 15)) | ((i9 & 0xF0) != 0x10 ? 0xFF : (i9 & 15)) << 8 |
+                     ((i10 & 0xF0) != 0x10 ? 0xFF : (i10 & 15)) << 16 | ((i11 & 0xF0) != 0x10 ? 0xFF : (i11 & 15)) << 24,
+                 ((i12 & 0xF0) != 0x10 ? 0xFF : (i12 & 15)) | ((i13 & 0xF0) != 0x10 ? 0xFF : (i13 & 15)) << 8 |
+                     ((i14 & 0xF0) != 0x10 ? 0xFF : (i14 & 15)) << 16 | ((i15 & 0xF0) != 0x10 ? 0xFF : (i15 & 15)) << 24,
+                 ((i16 & 0xF0) ? 0xFF : (i16 & 15)) | ((i17 & 0xF0) ? 0xFF : (i17 & 15)) << 8 |
+                     ((i18 & 0xF0) ? 0xFF : (i18 & 15)) << 16 | ((i19 & 0xF0) ? 0xFF : (i19 & 15)) << 24,
+                 ((i20 & 0xF0) ? 0xFF : (i20 & 15)) | ((i21 & 0xF0) ? 0xFF : (i21 & 15)) << 8 |
+                     ((i22 & 0xF0) ? 0xFF : (i22 & 15)) << 16 | ((i23 & 0xF0) ? 0xFF : (i23 & 15)) << 24,
+                 ((i24 & 0xF0) ? 0xFF : (i24 & 15)) | ((i25 & 0xF0) ? 0xFF : (i25 & 15)) << 8 |
+                     ((i26 & 0xF0) ? 0xFF : (i26 & 15)) << 16 | ((i27 & 0xF0) ? 0xFF : (i27 & 15)) << 24,
+                 ((i28 & 0xF0) ? 0xFF : (i28 & 15)) | ((i29 & 0xF0) ? 0xFF : (i29 & 15)) << 8 |
+                     ((i30 & 0xF0) ? 0xFF : (i30 & 15)) << 16 | ((i31 & 0xF0) ? 0xFF : (i31 & 15)) << 24>();
+
+  __m256i topp = _mm256_permute4x64_epi64(a, 0x4E);  // high, low
+  __m256i r1   = _mm256_shuffle_epi8(topp, mmopposite);
+  __m256i r2   = _mm256_shuffle_epi8(a, mmsame);
+  return _mm256_or_si256(r1, r2);
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15, int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23, int i24, int i25, int i26, int i27, int i28,
+          int i29, int i30, int i31>
+static inline Vec32uc permute32uc(Vec32uc const &a)
+{
+  return Vec32uc(permute32c<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22,
+                            i23, i24, i25, i26, i27, i28, i29, i30, i31>(a));
+}
+
+/*****************************************************************************
+ *
+ *          Vector blend functions
+ *
+ ******************************************************************************
+ *
+ * These blend functions can mix elements from two different vectors and
+ * optionally set some elements to zero.
+ *
+ * The indexes are inserted as template parameters in <>. These indexes must be
+ * constants. Each template parameter is an index to the element you want to
+ * select, where higher indexes indicate an element from the second source
+ * vector. For example, if each vector has 8 elements, then indexes 0 - 7
+ * will select an element from the first vector and indexes 8 - 15 will select
+ * an element from the second vector. A negative index will generate zero.
+ *
+ * Example:
+ * Vec8i a(100,101,102,103,104,105,106,107); // a is (100, 101, 102, 103, 104, 105, 106, 107)
+ * Vec8i b(200,201,202,203,204,205,206,207); // b is (200, 201, 202, 203, 204, 205, 206, 207)
+ * Vec8i c;
+ * c = blend8i<1,0,9,8,7,-1,15,15> (a,b);    // c is (101, 100, 201, 200, 107,   0, 207, 207)
+ *
+ * A lot of the code here is metaprogramming aiming to find the instructions
+ * that best fit the template parameters and instruction set. The metacode
+ * will be reduced out to leave only a few vector instructions in release
+ * mode with optimization on.
+ *****************************************************************************/
+
+template <int i0, int i1, int i2, int i3>
+static inline Vec4q blend4q(Vec4q const &a, Vec4q const &b)
+{
+  // Combine indexes into a single bitfield, with 8 bits for each
+  const int m1 = (i0 & 7) | (i1 & 7) << 8 | (i2 & 7) << 16 | (i3 & 7) << 24;
+
+  // Mask to zero out negative indexes
+  const int mz = (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF) << 8 | (i2 < 0 ? 0 : 0xFF) << 16 | (i3 < 0 ? 0 : 0xFF) << 24;
+
+  // zeroing needed. An index of -0x100 means don't care
+  const bool dozero = ((i0 | i1 | i2 | i3) & 0x80) != 0;
+
+  __m256i t1, mask;
+
+  // special case: 128 bit blend/permute
+  if(((m1 ^ 0x01000100) & 0x01010101 & mz) == 0 && (((m1 + 0x00010001) ^ (m1 >> 8)) & 0x00FF00FF & mz & mz >> 8) == 0)
+    {
+      {
+        const int j0           = i0 >= 0 ? i0 / 2 : i1 >= 0 ? i1 / 2 : 4;  // index for low 128 bits
+        const int j1           = i2 >= 0 ? i2 / 2 : i3 >= 0 ? i3 / 2 : 4;  // index for high 128 bits
+        const bool partialzero = int((i0 ^ i1) | (i2 ^ i3)) < 0;           // part of a 128-bit block is zeroed
+
+        switch(j0 | j1 << 4)
+          {
+            case 0x00:
+              t1 = _mm256_inserti128_si256(a, _mm256_castsi256_si128(a), 1);
+              break;
+            case 0x02:
+              t1 = _mm256_inserti128_si256(b, _mm256_castsi256_si128(a), 1);
+              break;
+            case 0x04:
+              if(dozero && !partialzero)
+                return _mm256_inserti128_si256(_mm256_setzero_si256(), _mm256_castsi256_si128(a), 1);
+              t1 = _mm256_inserti128_si256(a, _mm256_castsi256_si128(a), 1);
+              break;
+            case 0x12:
+              t1 = _mm256_inserti128_si256(a, _mm256_castsi256_si128(b), 0);
+              break;
+            case 0x14:
+              if(dozero && !partialzero)
+                return _mm256_inserti128_si256(a, _mm_setzero_si128(), 0);
+              t1 = a;
+              break;
+            case 0x01:
+            case 0x10:
+            case 0x11:  // all from a
+              return permute4q<i0, i1, i2, i3>(a);
+            case 0x20:
+              t1 = _mm256_inserti128_si256(a, _mm256_castsi256_si128(b), 1);
+              break;
+            case 0x22:
+              t1 = _mm256_inserti128_si256(b, _mm256_castsi256_si128(b), 1);
+              break;
+            case 0x24:
+              if(dozero && !partialzero)
+                return _mm256_inserti128_si256(_mm256_setzero_si256(), _mm256_castsi256_si128(b), 1);
+              t1 = _mm256_inserti128_si256(b, _mm256_castsi256_si128(b), 1);
+              break;
+            case 0x30:
+              t1 = _mm256_inserti128_si256(b, _mm256_castsi256_si128(a), 0);
+              break;
+            case 0x34:
+              if(dozero && !partialzero)
+                return _mm256_inserti128_si256(b, _mm_setzero_si128(), 0);
+              t1 = b;
+              break;
+            case 0x23:
+            case 0x32:
+            case 0x33:  // all from b
+              return permute4q<i0 ^ 4, i1 ^ 4, i2 ^ 4, i3 ^ 4>(b);
+            case 0x40:
+              if(dozero && !partialzero)
+                return _mm256_castsi128_si256(_mm_and_si128(_mm256_castsi256_si128(a), _mm256_castsi256_si128(a)));
+              t1 = a;
+              break;
+            case 0x42:
+              if(dozero && !partialzero)
+                return _mm256_castsi128_si256(_mm_and_si128(_mm256_castsi256_si128(b), _mm256_castsi256_si128(b)));
+              t1 = b;
+              break;
+            case 0x44:
+              return _mm256_setzero_si256();
+            default:
+              t1 = _mm256_permute2x128_si256(a, b, (j0 & 0x0F) | (j1 & 0x0F) << 4);
+          }
+      }
+    RETURNORZERO:
+      if(dozero)
+        {
+          // zero some elements
+          const __m256i maskz = constant8i < i0 < 0 ? 0 : -1, i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i2 < 0 ? 0 : -1,
+                        i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, i3 < 0 ? 0 : -1 > ();
+          return _mm256_and_si256(t1, maskz);
+        }
+      return t1;
+    }
+
+  // special case: all from a
+  if((m1 & 0x04040404 & mz) == 0)
+    {
+      return permute4q<i0, i1, i2, i3>(a);
+    }
+
+  // special case: all from b
+  if((~m1 & 0x04040404 & mz) == 0)
+    {
+      return permute4q<i0 ^ 4, i1 ^ 4, i2 ^ 4, i3 ^ 4>(b);
+    }
+
+  // special case: blend without permute
+  if(((m1 ^ 0x03020100) & 0xFBFBFBFB & mz) == 0)
+    {
+      mask = constant8i < (i0 & 4) ? -1 : 0, (i0 & 4) ? -1 : 0, (i1 & 4) ? -1 : 0, (i1 & 4) ? -1 : 0, (i2 & 4) ? -1 : 0,
+      (i2 & 4) ? -1 : 0, (i3 & 4) ? -1 : 0, (i3 & 4) ? -1 : 0 > ();
+      t1 = _mm256_blendv_epi8(a, b, mask);  // blend
+      goto RETURNORZERO;
+    }
+
+  // special case: shift left
+  if(i0 > 0 && i0 < 4 && mz == -1 && (m1 ^ ((i0 & 3) * 0x01010101 + 0x03020100)) == 0)
+    {
+      t1 = _mm256_permute2x128_si256(a, b, 0x21);
+      if(i0 < 2)
+        return _mm256_alignr_epi8(t1, a, (i0 & 1) * 8);
+      else
+        return _mm256_alignr_epi8(b, t1, (i0 & 1) * 8);
+    }
+  // special case: shift right
+  if(i0 > 4 && i0 < 8 && mz == -1 && (m1 ^ 0x04040404 ^ ((i0 & 3) * 0x01010101 + 0x03020100)) == 0)
+    {
+      t1 = _mm256_permute2x128_si256(b, a, 0x21);
+      if(i0 < 6)
+        return _mm256_alignr_epi8(t1, b, (i0 & 1) * 8);
+      else
+        return _mm256_alignr_epi8(a, t1, (i0 & 1) * 8);
+    }
+  // special case: unpack low
+  if(((m1 ^ 0x06020400) & mz) == 0)
+    {
+      t1 = _mm256_unpacklo_epi64(a, b);
+      goto RETURNORZERO;
+    }
+  // special case: unpack low
+  if(((m1 ^ 0x02060004) & mz) == 0)
+    {
+      t1 = _mm256_unpacklo_epi64(b, a);
+      goto RETURNORZERO;
+    }
+  // special case: unpack high
+  if(((m1 ^ 0x07030501) & mz) == 0)
+    {
+      t1 = _mm256_unpackhi_epi64(a, b);
+      goto RETURNORZERO;
+    }
+  // special case: unpack high
+  if(((m1 ^ 0x03070105) & mz) == 0)
+    {
+      t1 = _mm256_unpackhi_epi64(b, a);
+      goto RETURNORZERO;
+    }
+
+  // general case: permute and blend and possibly zero
+  const int blank = dozero ? -1 : -0x100;  // ignore or zero
+
+  // permute and blend
+  __m256i ta = permute4q < (i0 & 4) ? blank : i0, (i1 & 4) ? blank : i1, (i2 & 4) ? blank : i2, (i3 & 4) ? blank : i3 > (a);
+
+  __m256i tb = permute4q < ((i0 ^ 4) & 4) ? blank : i0 ^ 4, ((i1 ^ 4) & 4) ? blank : i1 ^ 4, ((i2 ^ 4) & 4) ? blank : i2 ^ 4,
+          ((i3 ^ 4) & 4) ? blank : i3 ^ 4 > (b);
+
+  if(blank == -1)
+    {
+      // we have zeroed, need only to OR
+      return _mm256_or_si256(ta, tb);
+    }
+  // no zeroing, need to blend
+  mask = constant8i < (i0 & 4) ? -1 : 0, (i0 & 4) ? -1 : 0, (i1 & 4) ? -1 : 0, (i1 & 4) ? -1 : 0, (i2 & 4) ? -1 : 0, (i2 & 4) ? -1 : 0,
+  (i3 & 4) ? -1 : 0, (i3 & 4) ? -1 : 0 > ();
+
+  return _mm256_blendv_epi8(ta, tb, mask);  // blend
+}
+
+template <int i0, int i1, int i2, int i3>
+static inline Vec4uq blend4uq(Vec4uq const &a, Vec4uq const &b)
+{
+  return Vec4uq(blend4q<i0, i1, i2, i3>(a, b));
+}
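+
+// Illustrative usage sketch (added comment, not part of the upstream VCL code):
+// indexes 0-3 pick from a, indexes 4-7 pick from b, and -1 gives zero:
+//   Vec4q a(0, 1, 2, 3), b(4, 5, 6, 7);
+//   Vec4q c = blend4q<0, 4, -1, 7>(a, b);   // c is (0, 4, 0, 7)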
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8i blend8i(Vec8i const &a, Vec8i const &b)
+{
+  const int ior = i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7;  // OR indexes
+
+  // is zeroing needed
+  const bool do_zero = ior < 0 && (ior & 0x80);  // at least one index is negative, and not -0x100
+
+  // Combine all the indexes into a single bitfield, with 4 bits for each
+  const int m1 = (i0 & 0xF) | (i1 & 0xF) << 4 | (i2 & 0xF) << 8 | (i3 & 0xF) << 12 | (i4 & 0xF) << 16 | (i5 & 0xF) << 20 |
+                 (i6 & 0xF) << 24 | (i7 & 0xF) << 28;
+
+  // Mask to zero out negative indexes
+  const int mz = (i0 < 0 ? 0 : 0xF) | (i1 < 0 ? 0 : 0xF) << 4 | (i2 < 0 ? 0 : 0xF) << 8 | (i3 < 0 ? 0 : 0xF) << 12 |
+                 (i4 < 0 ? 0 : 0xF) << 16 | (i5 < 0 ? 0 : 0xF) << 20 | (i6 < 0 ? 0 : 0xF) << 24 | (i7 < 0 ? 0 : 0xF) << 28;
+
+  __m256i t1, mask;
+
+  if(mz == 0)
+    return _mm256_setzero_si256();  // all zero
+
+  // special case: 64 bit blend/permute
+  if(((m1 ^ 0x10101010) & 0x11111111 & mz) == 0 && ((m1 ^ (m1 >> 4)) & 0x0E0E0E0E & mz & mz >> 4) == 0)
+    {
+      // check if part of a 64-bit block is zeroed
+      const bool partialzero = int((i0 ^ i1) | (i2 ^ i3) | (i4 ^ i5) | (i6 ^ i7)) < 0;
+      const int blank1       = partialzero ? -0x100 : -1;  // ignore if zeroing later anyway
+      // indexes for 64 bit blend
+      const int j0 = i0 >= 0 ? i0 / 2 : i1 >= 0 ? i1 / 2 : blank1;
+      const int j1 = i2 >= 0 ? i2 / 2 : i3 >= 0 ? i3 / 2 : blank1;
+      const int j2 = i4 >= 0 ? i4 / 2 : i5 >= 0 ? i5 / 2 : blank1;
+      const int j3 = i6 >= 0 ? i6 / 2 : i7 >= 0 ? i7 / 2 : blank1;
+      // 64-bit blend and permute
+      t1 = blend4q<j0, j1, j2, j3>(Vec4q(a), Vec4q(b));
+      if(partialzero && do_zero)
+        {
+          // zero some elements
+          mask = constant8i < i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, i4 < 0 ? 0 : -1, i5 < 0 ? 0 : -1,
+          i6 < 0 ? 0 : -1, i7 < 0 ? 0 : -1 > ();
+          return _mm256_and_si256(t1, mask);
+        }
+      return t1;
+    }
+
+  if((m1 & 0x88888888 & mz) == 0)
+    {
+      // all from a
+      return permute8i<i0, i1, i2, i3, i4, i5, i6, i7>(a);
+    }
+
+  if(((m1 ^ 0x88888888) & 0x88888888 & mz) == 0)
+    {
+      // all from b
+      return permute8i<i0 & ~8, i1 & ~8, i2 & ~8, i3 & ~8, i4 & ~8, i5 & ~8, i6 & ~8, i7 & ~8>(b);
+    }
+
+  if((((m1 & 0x77777777) ^ 0x76543210) & mz) == 0)
+    {
+      // blend and zero, no permute
+      mask = constant8i < (i0 & 8) ? 0 : -1, (i1 & 8) ? 0 : -1, (i2 & 8) ? 0 : -1, (i3 & 8) ? 0 : -1, (i4 & 8) ? 0 : -1,
+      (i5 & 8) ? 0 : -1, (i6 & 8) ? 0 : -1, (i7 & 8) ? 0 : -1 > ();
+      t1 = select(mask, a, b);
+      if(!do_zero)
+        return t1;
+      // zero some elements
+      mask = constant8i < (i0 < 0 && (i0 & 8)) ? 0 : -1, (i1 < 0 && (i1 & 8)) ? 0 : -1, (i2 < 0 && (i2 & 8)) ? 0 : -1,
+      (i3 < 0 && (i3 & 8)) ? 0 : -1, (i4 < 0 && (i4 & 8)) ? 0 : -1, (i5 < 0 && (i5 & 8)) ? 0 : -1, (i6 < 0 && (i6 & 8)) ? 0 : -1,
+      (i7 < 0 && (i7 & 8)) ? 0 : -1 > ();
+      return _mm256_and_si256(t1, mask);
+    }
+
+  // special case: shift left
+  if(i0 > 0 && i0 < 8 && mz == -1 && (m1 ^ ((i0 & 7) * 0x11111111u + 0x76543210u)) == 0)
+    {
+      t1 = _mm256_permute2x128_si256(a, b, 0x21);
+      if(i0 < 4)
+        return _mm256_alignr_epi8(t1, a, (i0 & 3) * 4);
+      else
+        return _mm256_alignr_epi8(b, t1, (i0 & 3) * 4);
+    }
+  // special case: shift right
+  if(i0 > 8 && i0 < 16 && mz == -1 && (m1 ^ 0x88888888 ^ ((i0 & 7) * 0x11111111u + 0x76543210u)) == 0)
+    {
+      t1 = _mm256_permute2x128_si256(b, a, 0x21);
+      if(i0 < 12)
+        return _mm256_alignr_epi8(t1, b, (i0 & 3) * 4);
+      else
+        return _mm256_alignr_epi8(a, t1, (i0 & 3) * 4);
+    }
+
+  // general case: permute and blend and possibly zero
+  const int blank = do_zero ? -1 : -0x100;  // ignore or zero
+
+  Vec8i ta = permute8i < (uint32_t)i0 < 8 ? i0 : blank, (uint32_t)i1 < 8 ? i1 : blank, (uint32_t)i2 < 8 ? i2 : blank,
+        (uint32_t)i3 < 8 ? i3 : blank, (uint32_t)i4 < 8 ? i4 : blank, (uint32_t)i5 < 8 ? i5 : blank, (uint32_t)i6 < 8 ? i6 : blank,
+        (uint32_t)i7 < 8 ? i7 : blank > (a);
+  Vec8i tb = permute8i < (uint32_t)(i0 ^ 8) < 8 ? (i0 ^ 8) : blank, (uint32_t)(i1 ^ 8) < 8 ? (i1 ^ 8) : blank,
+        (uint32_t)(i2 ^ 8) < 8 ? (i2 ^ 8) : blank, (uint32_t)(i3 ^ 8) < 8 ? (i3 ^ 8) : blank,
+        (uint32_t)(i4 ^ 8) < 8 ? (i4 ^ 8) : blank, (uint32_t)(i5 ^ 8) < 8 ? (i5 ^ 8) : blank,
+        (uint32_t)(i6 ^ 8) < 8 ? (i6 ^ 8) : blank, (uint32_t)(i7 ^ 8) < 8 ? (i7 ^ 8) : blank > (b);
+  if(blank == -1)
+    {
+      return _mm256_or_si256(ta, tb);
+    }
+  // no zeroing, need to blend
+  const int maskb = ((i0 >> 3) & 1) | ((i1 >> 2) & 2) | ((i2 >> 1) & 4) | (i3 & 8) | ((i4 << 1) & 0x10) | ((i5 << 2) & 0x20) |
+                    ((i6 << 3) & 0x40) | ((i7 << 4) & 0x80);
+  return _mm256_blend_epi32(ta, tb, maskb);  // blend
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8ui blend8ui(Vec8ui const &a, Vec8ui const &b)
+{
+  return Vec8ui(blend8i<i0, i1, i2, i3, i4, i5, i6, i7>(a, b));
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16s blend16s(Vec16s const &a, Vec16s const &b)
+{
+  // collect bit 4 of each index
+  const int m1 = (i0 & 16) >> 4 | (i1 & 16) >> 3 | (i2 & 16) >> 2 | (i3 & 16) >> 1 | (i4 & 16) | (i5 & 16) << 1 | (i6 & 16) << 2 |
+                 (i7 & 16) << 3 | (i8 & 16) << 4 | (i9 & 16) << 5 | (i10 & 16) << 6 | (i11 & 16) << 7 | (i12 & 16) << 8 |
+                 (i13 & 16) << 9 | (i14 & 16) << 10 | (i15 & 16) << 11;
+
+  // check which elements to set to zero
+  const int mz = 0x0000FFFF ^ ((i0 < 0) | (i1 < 0) << 1 | (i2 < 0) << 2 | (i3 < 0) << 3 | (i4 < 0) << 4 | (i5 < 0) << 5 |
+                               (i6 < 0) << 6 | (i7 < 0) << 7 | (i8 < 0) << 8 | (i9 < 0) << 9 | (i10 < 0) << 10 | (i11 < 0) << 11 |
+                               (i12 < 0) << 12 | (i13 < 0) << 13 | (i14 < 0) << 14 | (i15 < 0) << 15);
+
+  __m256i t1, mask;
+
+  // special case: all zero
+  if(mz == 0)
+    return _mm256_setzero_si256();
+
+  // special case: all from a
+  if((m1 & mz) == 0)
+    {
+      return permute16s<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15>(a);
+    }
+
+  // special case: all from b
+  if(((m1 ^ 0xFFFF) & mz) == 0)
+    {
+      return permute16s<i0 ^ 16, i1 ^ 16, i2 ^ 16, i3 ^ 16, i4 ^ 16, i5 ^ 16, i6 ^ 16, i7 ^ 16, i8 ^ 16, i9 ^ 16, i10 ^ 16, i11 ^ 16,
+                        i12 ^ 16, i13 ^ 16, i14 ^ 16, i15 ^ 16>(b);
+    }
+
+  // special case: blend without permute
+  if((i0 < 0 || (i0 & 15) == 0) && (i1 < 0 || (i1 & 15) == 1) && (i2 < 0 || (i2 & 15) == 2) && (i3 < 0 || (i3 & 15) == 3) &&
+     (i4 < 0 || (i4 & 15) == 4) && (i5 < 0 || (i5 & 15) == 5) && (i6 < 0 || (i6 & 15) == 6) && (i7 < 0 || (i7 & 15) == 7) &&
+     (i8 < 0 || (i8 & 15) == 8) && (i9 < 0 || (i9 & 15) == 9) && (i10 < 0 || (i10 & 15) == 10) && (i11 < 0 || (i11 & 15) == 11) &&
+     (i12 < 0 || (i12 & 15) == 12) && (i13 < 0 || (i13 & 15) == 13) && (i14 < 0 || (i14 & 15) == 14) && (i15 < 0 || (i15 & 15) == 15))
+    {
+      mask = constant8i<
+          int(((i0 & 16) ? 0xFFFF : 0) | ((i1 & 16) ? 0xFFFF0000 : 0)), int(((i2 & 16) ? 0xFFFF : 0) | ((i3 & 16) ? 0xFFFF0000 : 0)),
+          int(((i4 & 16) ? 0xFFFF : 0) | ((i5 & 16) ? 0xFFFF0000 : 0)), int(((i6 & 16) ? 0xFFFF : 0) | ((i7 & 16) ? 0xFFFF0000 : 0)),
+          int(((i8 & 16) ? 0xFFFF : 0) | ((i9 & 16) ? 0xFFFF0000 : 0)), int(((i10 & 16) ? 0xFFFF : 0) | ((i11 & 16) ? 0xFFFF0000 : 0)),
+          int(((i12 & 16) ? 0xFFFF : 0) | ((i13 & 16) ? 0xFFFF0000 : 0)),
+          int(((i14 & 16) ? 0xFFFF : 0) | ((i15 & 16) ? 0xFFFF0000 : 0))>();
+
+      t1 = _mm256_blendv_epi8(a, b, mask);  // blend
+
+      if(mz != 0xFFFF)
+        {
+          // zero some elements
+          mask = constant8i<
+              int((i0 < 0 ? 0 : 0xFFFF) | (i1 < 0 ? 0 : 0xFFFF0000)), int((i2 < 0 ? 0 : 0xFFFF) | (i3 < 0 ? 0 : 0xFFFF0000)),
+              int((i4 < 0 ? 0 : 0xFFFF) | (i5 < 0 ? 0 : 0xFFFF0000)), int((i6 < 0 ? 0 : 0xFFFF) | (i7 < 0 ? 0 : 0xFFFF0000)),
+              int((i8 < 0 ? 0 : 0xFFFF) | (i9 < 0 ? 0 : 0xFFFF0000)), int((i10 < 0 ? 0 : 0xFFFF) | (i11 < 0 ? 0 : 0xFFFF0000)),
+              int((i12 < 0 ? 0 : 0xFFFF) | (i13 < 0 ? 0 : 0xFFFF0000)), int((i14 < 0 ? 0 : 0xFFFF) | (i15 < 0 ? 0 : 0xFFFF0000))>();
+          return _mm256_and_si256(t1, mask);
+        }
+      return t1;
+    }
+
+  // special case: shift left
+  const int slb = i0 > 0 ? i0 : i15 - 15;
+  if(slb > 0 && slb < 16 && (i0 == slb + 0 || i0 < 0) && (i1 == slb + 1 || i1 < 0) && (i2 == slb + 2 || i2 < 0) &&
+     (i3 == slb + 3 || i3 < 0) && (i4 == slb + 4 || i4 < 0) && (i5 == slb + 5 || i5 < 0) && (i6 == slb + 6 || i6 < 0) &&
+     (i7 == slb + 7 || i7 < 0) && (i8 == slb + 8 || i8 < 0) && (i9 == slb + 9 || i9 < 0) && (i10 == slb + 10 || i10 < 0) &&
+     (i11 == slb + 11 || i11 < 0) && (i12 == slb + 12 || i12 < 0) && (i13 == slb + 13 || i13 < 0) && (i14 == slb + 14 || i14 < 0) &&
+     (i15 == slb + 15 || i15 < 0))
+    {
+      t1 = _mm256_permute2x128_si256(a, b, 0x21);
+      if(slb < 8)
+        t1 = _mm256_alignr_epi8(t1, a, (slb & 7) * 2);
+      else
+        t1 = _mm256_alignr_epi8(b, t1, (slb & 7) * 2);
+      if(mz != 0xFFFF)
+        {
+          // zero some elements
+          mask = constant8i<
+              int((i0 < 0 ? 0 : 0xFFFF) | (i1 < 0 ? 0 : 0xFFFF0000)), int((i2 < 0 ? 0 : 0xFFFF) | (i3 < 0 ? 0 : 0xFFFF0000)),
+              int((i4 < 0 ? 0 : 0xFFFF) | (i5 < 0 ? 0 : 0xFFFF0000)), int((i6 < 0 ? 0 : 0xFFFF) | (i7 < 0 ? 0 : 0xFFFF0000)),
+              int((i8 < 0 ? 0 : 0xFFFF) | (i9 < 0 ? 0 : 0xFFFF0000)), int((i10 < 0 ? 0 : 0xFFFF) | (i11 < 0 ? 0 : 0xFFFF0000)),
+              int((i12 < 0 ? 0 : 0xFFFF) | (i13 < 0 ? 0 : 0xFFFF0000)), int((i14 < 0 ? 0 : 0xFFFF) | (i15 < 0 ? 0 : 0xFFFF0000))>();
+          return _mm256_and_si256(t1, mask);
+        }
+      return t1;
+    }
+  // special case: shift right
+  const int srb = i0 > 0 ? (i0 ^ 16) : (i15 ^ 16) - 15;
+  if(srb > 0 && srb < 16 && ((i0 ^ 16) == srb + 0 || i0 < 0) && ((i1 ^ 16) == srb + 1 || i1 < 0) && ((i2 ^ 16) == srb + 2 || i2 < 0) &&
+     ((i3 ^ 16) == srb + 3 || i3 < 0) && ((i4 ^ 16) == srb + 4 || i4 < 0) && ((i5 ^ 16) == srb + 5 || i5 < 0) &&
+     ((i6 ^ 16) == srb + 6 || i6 < 0) && ((i7 ^ 16) == srb + 7 || i7 < 0) && ((i8 ^ 16) == srb + 8 || i8 < 0) &&
+     ((i9 ^ 16) == srb + 9 || i9 < 0) && ((i10 ^ 16) == srb + 10 || i10 < 0) && ((i11 ^ 16) == srb + 11 || i11 < 0) &&
+     ((i12 ^ 16) == srb + 12 || i12 < 0) && ((i13 ^ 16) == srb + 13 || i13 < 0) && ((i14 ^ 16) == srb + 14 || i14 < 0) &&
+     ((i15 ^ 16) == srb + 15 || i15 < 0))
+    {
+      t1 = _mm256_permute2x128_si256(b, a, 0x21);
+      if(srb < 8)
+        t1 = _mm256_alignr_epi8(t1, b, (srb & 7) * 2);
+      else
+        t1 = _mm256_alignr_epi8(a, t1, (srb & 7) * 2);
+      if(mz != 0xFFFF)
+        {
+          // zero some elements
+          mask = constant8i<
+              int((i0 < 0 ? 0 : 0xFFFF) | (i1 < 0 ? 0 : 0xFFFF0000)), int((i2 < 0 ? 0 : 0xFFFF) | (i3 < 0 ? 0 : 0xFFFF0000)),
+              int((i4 < 0 ? 0 : 0xFFFF) | (i5 < 0 ? 0 : 0xFFFF0000)), int((i6 < 0 ? 0 : 0xFFFF) | (i7 < 0 ? 0 : 0xFFFF0000)),
+              int((i8 < 0 ? 0 : 0xFFFF) | (i9 < 0 ? 0 : 0xFFFF0000)), int((i10 < 0 ? 0 : 0xFFFF) | (i11 < 0 ? 0 : 0xFFFF0000)),
+              int((i12 < 0 ? 0 : 0xFFFF) | (i13 < 0 ? 0 : 0xFFFF0000)), int((i14 < 0 ? 0 : 0xFFFF) | (i15 < 0 ? 0 : 0xFFFF0000))>();
+          return _mm256_and_si256(t1, mask);
+        }
+      return t1;
+    }
+
+  // general case: permute and blend and possibly zero
+  const int blank = (mz == 0xFFFF) ? -0x100 : -1;  // ignore or zero
+
+  // permute and blend
+  __m256i ta = permute16s < (i0 & 16) ? blank : i0, (i1 & 16) ? blank : i1, (i2 & 16) ? blank : i2, (i3 & 16) ? blank : i3,
+          (i4 & 16) ? blank : i4, (i5 & 16) ? blank : i5, (i6 & 16) ? blank : i6, (i7 & 16) ? blank : i7, (i8 & 16) ? blank : i8,
+          (i9 & 16) ? blank : i9, (i10 & 16) ? blank : i10, (i11 & 16) ? blank : i11, (i12 & 16) ? blank : i12,
+          (i13 & 16) ? blank : i13, (i14 & 16) ? blank : i14, (i15 & 16) ? blank : i15 > (a);
+
+  __m256i tb = permute16s < ((i0 ^ 16) & 16) ? blank : i0 ^ 16, ((i1 ^ 16) & 16) ? blank : i1 ^ 16, ((i2 ^ 16) & 16) ? blank : i2 ^ 16,
+          ((i3 ^ 16) & 16) ? blank : i3 ^ 16, ((i4 ^ 16) & 16) ? blank : i4 ^ 16, ((i5 ^ 16) & 16) ? blank : i5 ^ 16,
+          ((i6 ^ 16) & 16) ? blank : i6 ^ 16, ((i7 ^ 16) & 16) ? blank : i7 ^ 16, ((i8 ^ 16) & 16) ? blank : i8 ^ 16,
+          ((i9 ^ 16) & 16) ? blank : i9 ^ 16, ((i10 ^ 16) & 16) ? blank : i10 ^ 16, ((i11 ^ 16) & 16) ? blank : i11 ^ 16,
+          ((i12 ^ 16) & 16) ? blank : i12 ^ 16, ((i13 ^ 16) & 16) ? blank : i13 ^ 16, ((i14 ^ 16) & 16) ? blank : i14 ^ 16,
+          ((i15 ^ 16) & 16) ? blank : i15 ^ 16 > (b);
+
+  if(blank == -1)
+    {
+      // we have zeroed, need only to OR
+      return _mm256_or_si256(ta, tb);
+    }
+  // no zeroing, need to blend
+  mask = constant8i<
+      int(((i0 & 16) ? 0xFFFF : 0) | ((i1 & 16) ? 0xFFFF0000 : 0)), int(((i2 & 16) ? 0xFFFF : 0) | ((i3 & 16) ? 0xFFFF0000 : 0)),
+      int(((i4 & 16) ? 0xFFFF : 0) | ((i5 & 16) ? 0xFFFF0000 : 0)), int(((i6 & 16) ? 0xFFFF : 0) | ((i7 & 16) ? 0xFFFF0000 : 0)),
+      int(((i8 & 16) ? 0xFFFF : 0) | ((i9 & 16) ? 0xFFFF0000 : 0)), int(((i10 & 16) ? 0xFFFF : 0) | ((i11 & 16) ? 0xFFFF0000 : 0)),
+      int(((i12 & 16) ? 0xFFFF : 0) | ((i13 & 16) ? 0xFFFF0000 : 0)),
+      int(((i14 & 16) ? 0xFFFF : 0) | ((i15 & 16) ? 0xFFFF0000 : 0))>();
+
+  return _mm256_blendv_epi8(ta, tb, mask);  // blend
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16us blend16us(Vec16us const &a, Vec16us const &b)
+{
+  return Vec16us(blend16s<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15>(a, b));
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15, int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23, int i24, int i25, int i26, int i27, int i28,
+          int i29, int i30, int i31>
+static inline Vec32c blend32c(Vec32c const &a, Vec32c const &b)
+{
+  // collect bit 5 of each index
+  const int m1 = (i0 & 32) >> 5 | (i1 & 32) >> 4 | (i2 & 32) >> 3 | (i3 & 32) >> 2 | (i4 & 32) >> 1 | (i5 & 32) | (i6 & 32) << 1 |
+                 (i7 & 32) << 2 | (i8 & 32) << 3 | (i9 & 32) << 4 | (i10 & 32) << 5 | (i11 & 32) << 6 | (i12 & 32) << 7 |
+                 (i13 & 32) << 8 | (i14 & 32) << 9 | (i15 & 32) << 10 | (i16 & 32) << 11 | (i17 & 32) << 12 | (i18 & 32) << 13 |
+                 (i19 & 32) << 14 | (i20 & 32) << 15 | (i21 & 32) << 16 | (i22 & 32) << 17 | (i23 & 32) << 18 | (i24 & 32) << 19 |
+                 (i25 & 32) << 20 | (i26 & 32) << 21 | (i27 & 32) << 22 | (i28 & 32) << 23 | (i29 & 32) << 24 | (i30 & 32) << 25 |
+                 (i31 & 32) << 26;
+
+  // check which elements to set to zero
+  const int mz =
+      ~((i0 < 0) | (i1 < 0) << 1 | (i2 < 0) << 2 | (i3 < 0) << 3 | (i4 < 0) << 4 | (i5 < 0) << 5 | (i6 < 0) << 6 | (i7 < 0) << 7 |
+        (i8 < 0) << 8 | (i9 < 0) << 9 | (i10 < 0) << 10 | (i11 < 0) << 11 | (i12 < 0) << 12 | (i13 < 0) << 13 | (i14 < 0) << 14 |
+        (i15 < 0) << 15 | (i16 < 0) << 16 | (i17 < 0) << 17 | (i18 < 0) << 18 | (i19 < 0) << 19 | (i20 < 0) << 20 | (i21 < 0) << 21 |
+        (i22 < 0) << 22 | (i23 < 0) << 23 | (i24 < 0) << 24 | (i25 < 0) << 25 | (i26 < 0) << 26 | (i27 < 0) << 27 | (i28 < 0) << 28 |
+        (i29 < 0) << 29 | (i30 < 0) << 30 | (i31 < 0) << 31);
+
+  __m256i t1, mask;
+
+  // special case: all zero
+  if(mz == 0)
+    return _mm256_setzero_si256();
+
+  // special case: all from a
+  if((m1 & mz) == 0)
+    {
+      return permute32c<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23,
+                        i24, i25, i26, i27, i28, i29, i30, i31>(a);
+    }
+
+  // special case: all from b
+  if((~m1 & mz) == 0)
+    {
+      return permute32c<i0 ^ 32, i1 ^ 32, i2 ^ 32, i3 ^ 32, i4 ^ 32, i5 ^ 32, i6 ^ 32, i7 ^ 32, i8 ^ 32, i9 ^ 32, i10 ^ 32, i11 ^ 32,
+                        i12 ^ 32, i13 ^ 32, i14 ^ 32, i15 ^ 32, i16 ^ 32, i17 ^ 32, i18 ^ 32, i19 ^ 32, i20 ^ 32, i21 ^ 32, i22 ^ 32,
+                        i23 ^ 32, i24 ^ 32, i25 ^ 32, i26 ^ 32, i27 ^ 32, i28 ^ 32, i29 ^ 32, i30 ^ 32, i31 ^ 32>(b);
+    }
+
+  // special case: blend without permute
+  if((i0 < 0 || (i0 & 31) == 0) && (i1 < 0 || (i1 & 31) == 1) && (i2 < 0 || (i2 & 31) == 2) && (i3 < 0 || (i3 & 31) == 3) &&
+     (i4 < 0 || (i4 & 31) == 4) && (i5 < 0 || (i5 & 31) == 5) && (i6 < 0 || (i6 & 31) == 6) && (i7 < 0 || (i7 & 31) == 7) &&
+     (i8 < 0 || (i8 & 31) == 8) && (i9 < 0 || (i9 & 31) == 9) && (i10 < 0 || (i10 & 31) == 10) && (i11 < 0 || (i11 & 31) == 11) &&
+     (i12 < 0 || (i12 & 31) == 12) && (i13 < 0 || (i13 & 31) == 13) && (i14 < 0 || (i14 & 31) == 14) &&
+     (i15 < 0 || (i15 & 31) == 15) && (i16 < 0 || (i16 & 31) == 16) && (i17 < 0 || (i17 & 31) == 17) &&
+     (i18 < 0 || (i18 & 31) == 18) && (i19 < 0 || (i19 & 31) == 19) && (i20 < 0 || (i20 & 31) == 20) &&
+     (i21 < 0 || (i21 & 31) == 21) && (i22 < 0 || (i22 & 31) == 22) && (i23 < 0 || (i23 & 31) == 23) &&
+     (i24 < 0 || (i24 & 31) == 24) && (i25 < 0 || (i25 & 31) == 25) && (i26 < 0 || (i26 & 31) == 26) &&
+     (i27 < 0 || (i27 & 31) == 27) && (i28 < 0 || (i28 & 31) == 28) && (i29 < 0 || (i29 & 31) == 29) &&
+     (i30 < 0 || (i30 & 31) == 30) && (i31 < 0 || (i31 & 31) == 31))
+    {
+      mask =
+          constant8i<int(((i0 << 2) & 0x80) | ((i1 << 10) & 0x8000) | ((i2 << 18) & 0x800000) | (uint32_t(i3 << 26) & 0x80000000)),
+                     int(((i4 << 2) & 0x80) | ((i5 << 10) & 0x8000) | ((i6 << 18) & 0x800000) | (uint32_t(i7 << 26) & 0x80000000)),
+                     int(((i8 << 2) & 0x80) | ((i9 << 10) & 0x8000) | ((i10 << 18) & 0x800000) | (uint32_t(i11 << 26) & 0x80000000)),
+                     int(((i12 << 2) & 0x80) | ((i13 << 10) & 0x8000) | ((i14 << 18) & 0x800000) | (uint32_t(i15 << 26) & 0x80000000)),
+                     int(((i16 << 2) & 0x80) | ((i17 << 10) & 0x8000) | ((i18 << 18) & 0x800000) | (uint32_t(i19 << 26) & 0x80000000)),
+                     int(((i20 << 2) & 0x80) | ((i21 << 10) & 0x8000) | ((i22 << 18) & 0x800000) | (uint32_t(i23 << 26) & 0x80000000)),
+                     int(((i24 << 2) & 0x80) | ((i25 << 10) & 0x8000) | ((i26 << 18) & 0x800000) | (uint32_t(i27 << 26) & 0x80000000)),
+                     int(((i28 << 2) & 0x80) | ((i29 << 10) & 0x8000) | ((i30 << 18) & 0x800000) |
+                         (uint32_t(i31 << 26) & 0x80000000))>();
+
+      t1 = _mm256_blendv_epi8(a, b, mask);  // blend
+
+      if(mz != -1)
+        {
+          // zero some elements
+          const __m256i maskz =
+              constant8i<int((i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF00) | (i2 < 0 ? 0 : 0xFF0000) | (i3 < 0 ? 0 : 0xFF000000)),
+                         int((i4 < 0 ? 0 : 0xFF) | (i5 < 0 ? 0 : 0xFF00) | (i6 < 0 ? 0 : 0xFF0000) | (i7 < 0 ? 0 : 0xFF000000)),
+                         int((i8 < 0 ? 0 : 0xFF) | (i9 < 0 ? 0 : 0xFF00) | (i10 < 0 ? 0 : 0xFF0000) | (i11 < 0 ? 0 : 0xFF000000)),
+                         int((i12 < 0 ? 0 : 0xFF) | (i13 < 0 ? 0 : 0xFF00) | (i14 < 0 ? 0 : 0xFF0000) | (i15 < 0 ? 0 : 0xFF000000)),
+                         int((i16 < 0 ? 0 : 0xFF) | (i17 < 0 ? 0 : 0xFF00) | (i18 < 0 ? 0 : 0xFF0000) | (i19 < 0 ? 0 : 0xFF000000)),
+                         int((i20 < 0 ? 0 : 0xFF) | (i21 < 0 ? 0 : 0xFF00) | (i22 < 0 ? 0 : 0xFF0000) | (i23 < 0 ? 0 : 0xFF000000)),
+                         int((i24 < 0 ? 0 : 0xFF) | (i25 < 0 ? 0 : 0xFF00) | (i26 < 0 ? 0 : 0xFF0000) | (i27 < 0 ? 0 : 0xFF000000)),
+                         int((i28 < 0 ? 0 : 0xFF) | (i29 < 0 ? 0 : 0xFF00) | (i30 < 0 ? 0 : 0xFF0000) | (i31 < 0 ? 0 : 0xFF000000))>();
+          return _mm256_and_si256(t1, maskz);
+        }
+      return t1;
+    }
+
+  // special case: shift left
+  const int slb = i0 > 0 ? i0 : i31 - 31;
+  if(slb > 0 && slb < 32 && (i0 == slb + 0 || i0 < 0) && (i1 == slb + 1 || i1 < 0) && (i2 == slb + 2 || i2 < 0) &&
+     (i3 == slb + 3 || i3 < 0) && (i4 == slb + 4 || i4 < 0) && (i5 == slb + 5 || i5 < 0) && (i6 == slb + 6 || i6 < 0) &&
+     (i7 == slb + 7 || i7 < 0) && (i8 == slb + 8 || i8 < 0) && (i9 == slb + 9 || i9 < 0) && (i10 == slb + 10 || i10 < 0) &&
+     (i11 == slb + 11 || i11 < 0) && (i12 == slb + 12 || i12 < 0) && (i13 == slb + 13 || i13 < 0) && (i14 == slb + 14 || i14 < 0) &&
+     (i15 == slb + 15 || i15 < 0) && (i16 == slb + 16 || i16 < 0) && (i17 == slb + 17 || i17 < 0) && (i18 == slb + 18 || i18 < 0) &&
+     (i19 == slb + 19 || i19 < 0) && (i20 == slb + 20 || i20 < 0) && (i21 == slb + 21 || i21 < 0) && (i22 == slb + 22 || i22 < 0) &&
+     (i23 == slb + 23 || i23 < 0) && (i24 == slb + 24 || i24 < 0) && (i25 == slb + 25 || i25 < 0) && (i26 == slb + 26 || i26 < 0) &&
+     (i27 == slb + 27 || i27 < 0) && (i28 == slb + 28 || i28 < 0) && (i29 == slb + 29 || i29 < 0) && (i30 == slb + 30 || i30 < 0) &&
+     (i31 == slb + 31 || i31 < 0))
+    {
+      t1 = _mm256_permute2x128_si256(a, b, 0x21);
+      if(slb < 16)
+        t1 = _mm256_alignr_epi8(t1, a, slb & 15);
+      else
+        t1 = _mm256_alignr_epi8(b, t1, slb & 15);
+      if(mz != -1)
+        {
+          // zero some elements
+          const __m256i maskz =
+              constant8i<int((i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF00) | (i2 < 0 ? 0 : 0xFF0000) | (i3 < 0 ? 0 : 0xFF000000)),
+                         int((i4 < 0 ? 0 : 0xFF) | (i5 < 0 ? 0 : 0xFF00) | (i6 < 0 ? 0 : 0xFF0000) | (i7 < 0 ? 0 : 0xFF000000)),
+                         int((i8 < 0 ? 0 : 0xFF) | (i9 < 0 ? 0 : 0xFF00) | (i10 < 0 ? 0 : 0xFF0000) | (i11 < 0 ? 0 : 0xFF000000)),
+                         int((i12 < 0 ? 0 : 0xFF) | (i13 < 0 ? 0 : 0xFF00) | (i14 < 0 ? 0 : 0xFF0000) | (i15 < 0 ? 0 : 0xFF000000)),
+                         int((i16 < 0 ? 0 : 0xFF) | (i17 < 0 ? 0 : 0xFF00) | (i18 < 0 ? 0 : 0xFF0000) | (i19 < 0 ? 0 : 0xFF000000)),
+                         int((i20 < 0 ? 0 : 0xFF) | (i21 < 0 ? 0 : 0xFF00) | (i22 < 0 ? 0 : 0xFF0000) | (i23 < 0 ? 0 : 0xFF000000)),
+                         int((i24 < 0 ? 0 : 0xFF) | (i25 < 0 ? 0 : 0xFF00) | (i26 < 0 ? 0 : 0xFF0000) | (i27 < 0 ? 0 : 0xFF000000)),
+                         int((i28 < 0 ? 0 : 0xFF) | (i29 < 0 ? 0 : 0xFF00) | (i30 < 0 ? 0 : 0xFF0000) | (i31 < 0 ? 0 : 0xFF000000))>();
+          return _mm256_and_si256(t1, maskz);
+        }
+      return t1;
+    }
+  // special case: shift right
+  const int srb = i0 > 0 ? (i0 ^ 32) : (i31 ^ 32) - 31;
+  if(srb > 0 && srb < 32 && ((i0 ^ 32) == srb + 0 || i0 < 0) && ((i1 ^ 32) == srb + 1 || i1 < 0) && ((i2 ^ 32) == srb + 2 || i2 < 0) &&
+     ((i3 ^ 32) == srb + 3 || i3 < 0) && ((i4 ^ 32) == srb + 4 || i4 < 0) && ((i5 ^ 32) == srb + 5 || i5 < 0) &&
+     ((i6 ^ 32) == srb + 6 || i6 < 0) && ((i7 ^ 32) == srb + 7 || i7 < 0) && ((i8 ^ 32) == srb + 8 || i8 < 0) &&
+     ((i9 ^ 32) == srb + 9 || i9 < 0) && ((i10 ^ 32) == srb + 10 || i10 < 0) && ((i11 ^ 32) == srb + 11 || i11 < 0) &&
+     ((i12 ^ 32) == srb + 12 || i12 < 0) && ((i13 ^ 32) == srb + 13 || i13 < 0) && ((i14 ^ 32) == srb + 14 || i14 < 0) &&
+     ((i15 ^ 32) == srb + 15 || i15 < 0) && ((i16 ^ 32) == srb + 16 || i16 < 0) && ((i17 ^ 32) == srb + 17 || i17 < 0) &&
+     ((i18 ^ 32) == srb + 18 || i18 < 0) && ((i19 ^ 32) == srb + 19 || i19 < 0) && ((i20 ^ 32) == srb + 20 || i20 < 0) &&
+     ((i21 ^ 32) == srb + 21 || i21 < 0) && ((i22 ^ 32) == srb + 22 || i22 < 0) && ((i23 ^ 32) == srb + 23 || i23 < 0) &&
+     ((i24 ^ 32) == srb + 24 || i24 < 0) && ((i25 ^ 32) == srb + 25 || i25 < 0) && ((i26 ^ 32) == srb + 26 || i26 < 0) &&
+     ((i27 ^ 32) == srb + 27 || i27 < 0) && ((i28 ^ 32) == srb + 28 || i28 < 0) && ((i29 ^ 32) == srb + 29 || i29 < 0) &&
+     ((i30 ^ 32) == srb + 30 || i30 < 0) && ((i31 ^ 32) == srb + 31 || i31 < 0))
+    {
+      t1 = _mm256_permute2x128_si256(b, a, 0x21);
+      if(srb < 16)
+        t1 = _mm256_alignr_epi8(t1, b, srb & 15);
+      else
+        t1 = _mm256_alignr_epi8(a, t1, srb & 15);
+      if(mz != -1)
+        {
+          // zero some elements
+          const __m256i maskz =
+              constant8i<int((i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF00) | (i2 < 0 ? 0 : 0xFF0000) | (i3 < 0 ? 0 : 0xFF000000)),
+                         int((i4 < 0 ? 0 : 0xFF) | (i5 < 0 ? 0 : 0xFF00) | (i6 < 0 ? 0 : 0xFF0000) | (i7 < 0 ? 0 : 0xFF000000)),
+                         int((i8 < 0 ? 0 : 0xFF) | (i9 < 0 ? 0 : 0xFF00) | (i10 < 0 ? 0 : 0xFF0000) | (i11 < 0 ? 0 : 0xFF000000)),
+                         int((i12 < 0 ? 0 : 0xFF) | (i13 < 0 ? 0 : 0xFF00) | (i14 < 0 ? 0 : 0xFF0000) | (i15 < 0 ? 0 : 0xFF000000)),
+                         int((i16 < 0 ? 0 : 0xFF) | (i17 < 0 ? 0 : 0xFF00) | (i18 < 0 ? 0 : 0xFF0000) | (i19 < 0 ? 0 : 0xFF000000)),
+                         int((i20 < 0 ? 0 : 0xFF) | (i21 < 0 ? 0 : 0xFF00) | (i22 < 0 ? 0 : 0xFF0000) | (i23 < 0 ? 0 : 0xFF000000)),
+                         int((i24 < 0 ? 0 : 0xFF) | (i25 < 0 ? 0 : 0xFF00) | (i26 < 0 ? 0 : 0xFF0000) | (i27 < 0 ? 0 : 0xFF000000)),
+                         int((i28 < 0 ? 0 : 0xFF) | (i29 < 0 ? 0 : 0xFF00) | (i30 < 0 ? 0 : 0xFF0000) | (i31 < 0 ? 0 : 0xFF000000))>();
+          return _mm256_and_si256(t1, maskz);
+        }
+      return t1;
+    }
+
+  // general case: permute and blend and possibly zero
+  const int blank = (mz == -1) ? -0x100 : -1;  // ignore or zero
+
+  // permute and blend
+  __m256i ta = permute32c < (i0 & 32) ? blank : i0, (i1 & 32) ? blank : i1, (i2 & 32) ? blank : i2, (i3 & 32) ? blank : i3,
+          (i4 & 32) ? blank : i4, (i5 & 32) ? blank : i5, (i6 & 32) ? blank : i6, (i7 & 32) ? blank : i7, (i8 & 32) ? blank : i8,
+          (i9 & 32) ? blank : i9, (i10 & 32) ? blank : i10, (i11 & 32) ? blank : i11, (i12 & 32) ? blank : i12,
+          (i13 & 32) ? blank : i13, (i14 & 32) ? blank : i14, (i15 & 32) ? blank : i15, (i16 & 32) ? blank : i16,
+          (i17 & 32) ? blank : i17, (i18 & 32) ? blank : i18, (i19 & 32) ? blank : i19, (i20 & 32) ? blank : i20,
+          (i21 & 32) ? blank : i21, (i22 & 32) ? blank : i22, (i23 & 32) ? blank : i23, (i24 & 32) ? blank : i24,
+          (i25 & 32) ? blank : i25, (i26 & 32) ? blank : i26, (i27 & 32) ? blank : i27, (i28 & 32) ? blank : i28,
+          (i29 & 32) ? blank : i29, (i30 & 32) ? blank : i30, (i31 & 32) ? blank : i31 > (a);
+
+  __m256i tb = permute32c < ((i0 ^ 32) & 32) ? blank : i0 ^ 32, ((i1 ^ 32) & 32) ? blank : i1 ^ 32, ((i2 ^ 32) & 32) ? blank : i2 ^ 32,
+          ((i3 ^ 32) & 32) ? blank : i3 ^ 32, ((i4 ^ 32) & 32) ? blank : i4 ^ 32, ((i5 ^ 32) & 32) ? blank : i5 ^ 32,
+          ((i6 ^ 32) & 32) ? blank : i6 ^ 32, ((i7 ^ 32) & 32) ? blank : i7 ^ 32, ((i8 ^ 32) & 32) ? blank : i8 ^ 32,
+          ((i9 ^ 32) & 32) ? blank : i9 ^ 32, ((i10 ^ 32) & 32) ? blank : i10 ^ 32, ((i11 ^ 32) & 32) ? blank : i11 ^ 32,
+          ((i12 ^ 32) & 32) ? blank : i12 ^ 32, ((i13 ^ 32) & 32) ? blank : i13 ^ 32, ((i14 ^ 32) & 32) ? blank : i14 ^ 32,
+          ((i15 ^ 32) & 32) ? blank : i15 ^ 32, ((i16 ^ 32) & 32) ? blank : i16 ^ 32, ((i17 ^ 32) & 32) ? blank : i17 ^ 32,
+          ((i18 ^ 32) & 32) ? blank : i18 ^ 32, ((i19 ^ 32) & 32) ? blank : i19 ^ 32, ((i20 ^ 32) & 32) ? blank : i20 ^ 32,
+          ((i21 ^ 32) & 32) ? blank : i21 ^ 32, ((i22 ^ 32) & 32) ? blank : i22 ^ 32, ((i23 ^ 32) & 32) ? blank : i23 ^ 32,
+          ((i24 ^ 32) & 32) ? blank : i24 ^ 32, ((i25 ^ 32) & 32) ? blank : i25 ^ 32, ((i26 ^ 32) & 32) ? blank : i26 ^ 32,
+          ((i27 ^ 32) & 32) ? blank : i27 ^ 32, ((i28 ^ 32) & 32) ? blank : i28 ^ 32, ((i29 ^ 32) & 32) ? blank : i29 ^ 32,
+          ((i30 ^ 32) & 32) ? blank : i30 ^ 32, ((i31 ^ 32) & 32) ? blank : i31 ^ 32 > (b);
+
+  if(blank == -1)
+    {
+      // we have zeroed, need only to OR
+      return _mm256_or_si256(ta, tb);
+    }
+  // no zeroing, need to blend
+  mask =
+      constant8i<int(((i0 << 2) & 0x80) | ((i1 << 10) & 0x8000) | ((i2 << 18) & 0x800000) | (uint32_t(i3 << 26) & 0x80000000)),
+                 int(((i4 << 2) & 0x80) | ((i5 << 10) & 0x8000) | ((i6 << 18) & 0x800000) | (uint32_t(i7 << 26) & 0x80000000)),
+                 int(((i8 << 2) & 0x80) | ((i9 << 10) & 0x8000) | ((i10 << 18) & 0x800000) | (uint32_t(i11 << 26) & 0x80000000)),
+                 int(((i12 << 2) & 0x80) | ((i13 << 10) & 0x8000) | ((i14 << 18) & 0x800000) | (uint32_t(i15 << 26) & 0x80000000)),
+                 int(((i16 << 2) & 0x80) | ((i17 << 10) & 0x8000) | ((i18 << 18) & 0x800000) | (uint32_t(i19 << 26) & 0x80000000)),
+                 int(((i20 << 2) & 0x80) | ((i21 << 10) & 0x8000) | ((i22 << 18) & 0x800000) | (uint32_t(i23 << 26) & 0x80000000)),
+                 int(((i24 << 2) & 0x80) | ((i25 << 10) & 0x8000) | ((i26 << 18) & 0x800000) | (uint32_t(i27 << 26) & 0x80000000)),
+                 int(((i28 << 2) & 0x80) | ((i29 << 10) & 0x8000) | ((i30 << 18) & 0x800000) | (uint32_t(i31 << 26) & 0x80000000))>();
+
+  return _mm256_blendv_epi8(ta, tb, mask);  // blend
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15, int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23, int i24, int i25, int i26, int i27, int i28,
+          int i29, int i30, int i31>
+static inline Vec32uc blend32uc(Vec32uc const &a, Vec32uc const &b)
+{
+  return Vec32uc(blend32c<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23,
+                          i24, i25, i26, i27, i28, i29, i30, i31>(a, b));
+}
+
+/*****************************************************************************
+ *
+ *          Vector lookup functions
+ *
+ ******************************************************************************
+ *
+ * These functions use vector elements as indexes into a table.
+ * The table is given as one or more vectors or as an array.
+ *
+ * This can be used for several purposes:
+ *  - table lookup
+ *  - permute or blend with variable indexes
+ *  - blend from more than two sources
+ *  - gather non-contiguous data
+ *
+ * An index out of range may produce any value - the actual value produced is
+ * implementation dependent and may be different for different instruction
+ * sets. An index out of range does not produce an error message or exception.
+ *
+ * Example:
+ * Vec8i a(2,0,0,6,4,3,5,0);                 // index a is (  2,   0,   0,   6,   4,   3,   5,   0)
+ * Vec8i b(100,101,102,103,104,105,106,107); // table b is (100, 101, 102, 103, 104, 105, 106, 107)
+ * Vec8i c;
+ * c = lookup8 (a,b);                        // c is       (102, 100, 100, 106, 104, 103, 105, 100)
+ *
+ *****************************************************************************/
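+
+// Usage sketch for an array-based table (illustrative values; the array tab and its contents are hypothetical):
+//   int32_t tab[16] = {0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150};
+//   Vec8i   idx(1, 0, 15, 7, 8, 3, 12, 2);
+//   Vec8i   r = lookup<16>(idx, tab);        // r is (10, 0, 150, 70, 80, 30, 120, 20)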
+
+static inline Vec32c lookup32(Vec32c const &index, Vec32c const &table)
+{
+#ifdef __XOP__  // AMD XOP instruction set. Use VPPERM
+  Vec16c t0 = _mm_perm_epi8(table.get_low(), table.get_high(), index.get_low());
+  Vec16c t1 = _mm_perm_epi8(table.get_low(), table.get_high(), index.get_high());
+  return Vec32c(t0, t1);
+#else
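+  // In each 128-bit lane, _mm256_shuffle_epi8 can only pick bytes from the same lane, so the lookup is done
+  // twice: once from table as-is and once from table with its two lanes swapped. After the XOR adjustment,
+  // adding 0x70 sets bit 7 of every byte index that belongs to the other source, which makes
+  // _mm256_shuffle_epi8 return zero for that byte; the two partial results are then ORed together.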
+  Vec32c f0 = constant8i<0, 0, 0, 0, 0x10101010, 0x10101010, 0x10101010, 0x10101010>();
+  Vec32c f1 = constant8i<0x10101010, 0x10101010, 0x10101010, 0x10101010, 0, 0, 0, 0>();
+  Vec32c tablef = _mm256_permute4x64_epi64(table, 0x4E);  // low and high parts swapped
+  Vec32c r0 = _mm256_shuffle_epi8(table, (index ^ f0) + 0x70);
+  Vec32c r1 = _mm256_shuffle_epi8(tablef, (index ^ f1) + 0x70);
+  return r0 | r1;
+#endif
+}
+
+template <int n>
+static inline Vec32c lookup(Vec32uc const &index, void const *table)
+{
+  if(n <= 0)
+    return 0;
+  if(n <= 16)
+    {
+      Vec16c tt = Vec16c().load(table);
+      Vec16c r0 = lookup16(index.get_low(), tt);
+      Vec16c r1 = lookup16(index.get_high(), tt);
+      return Vec32c(r0, r1);
+    }
+  if(n <= 32)
+    return lookup32(index, Vec32c().load(table));
+  // n > 32. Limit index
+  Vec32uc index1;
+  if((n & (n - 1)) == 0)
+    {
+      // n is a power of 2, make index modulo n
+      index1 = Vec32uc(index) & uint8_t(n - 1);
+    }
+  else
+    {
+      // n is not a power of 2, limit to n-1
+      index1 = min(Vec32uc(index), uint8_t(n - 1));
+    }
+  Vec8ui mask0 = Vec8ui(0x000000FF);                                                                         // mask 8 bits
+  Vec32c t0    = _mm256_i32gather_epi32((const int *)table, __m256i(mask0 & Vec8ui(index1)), 1);             // positions 0, 4, 8,  ...
+  Vec32c t1 = _mm256_i32gather_epi32((const int *)table, __m256i(mask0 & _mm256_srli_epi32(index1, 8)), 1);  // positions 1, 5, 9,  ...
+  Vec32c t2 =
+      _mm256_i32gather_epi32((const int *)table, __m256i(mask0 & _mm256_srli_epi32(index1, 16)), 1);  // positions 2, 6, 10, ...
+  Vec32c t3 = _mm256_i32gather_epi32((const int *)table, _mm256_srli_epi32(index1, 24), 1);           // positions 3, 7, 11, ...
+  t0        = t0 & mask0;
+  t1        = _mm256_slli_epi32(t1 & mask0, 8);
+  t2        = _mm256_slli_epi32(t2 & mask0, 16);
+  t3        = _mm256_slli_epi32(t3, 24);
+  return (t0 | t3) | (t1 | t2);
+}
+
+template <int n>
+static inline Vec32c lookup(Vec32c const &index, void const *table)
+{
+  return lookup<n>(Vec32uc(index), table);
+}
+
+static inline Vec16s lookup16(Vec16s const &index, Vec16s const &table)
+{
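+  // index * 0x202 + 0x100 turns each 16-bit element index i into the byte-index pair (2*i, 2*i+1),
+  // so the byte-wise lookup32 fetches both bytes of table element i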
+  return Vec16s(lookup32(Vec32c(index * 0x202 + 0x100), Vec32c(table)));
+}
+
+template <int n>
+static inline Vec16s lookup(Vec16s const &index, void const *table)
+{
+  if(n <= 0)
+    return 0;
+  if(n <= 8)
+    {
+      Vec8s table1 = Vec8s().load(table);
+      return Vec16s(lookup8(index.get_low(), table1), lookup8(index.get_high(), table1));
+    }
+  if(n <= 16)
+    return lookup16(index, Vec16s().load(table));
+  // n > 16. Limit index
+  Vec16us index1;
+  if((n & (n - 1)) == 0)
+    {
+      // n is a power of 2, make index modulo n
+      index1 = Vec16us(index) & (n - 1);
+    }
+  else
+    {
+      // n is not a power of 2, limit to n-1
+      index1 = min(Vec16us(index), n - 1);
+    }
+  Vec16s t1 = _mm256_i32gather_epi32((const int *)table, __m256i(Vec8ui(index1) & 0x0000FFFF), 2);  // even positions
+  Vec16s t2 = _mm256_i32gather_epi32((const int *)table, _mm256_srli_epi32(index1, 16), 2);         // odd  positions
+  return blend16s<0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30>(t1, t2);
+}
+
+static inline Vec8i lookup8(Vec8i const &index, Vec8i const &table) { return _mm256_permutevar8x32_epi32(table, index); }
+
+template <int n>
+static inline Vec8i lookup(Vec8i const &index, void const *table)
+{
+  if(n <= 0)
+    return 0;
+  if(n <= 8)
+    {
+      Vec8i table1 = Vec8i().load(table);
+      return lookup8(index, table1);
+    }
+  if(n <= 16)
+    {
+      Vec8i table1 = Vec8i().load(table);
+      Vec8i table2 = Vec8i().load((int32_t const *)table + 8);
+      Vec8i y1     = lookup8(index, table1);
+      Vec8i y2     = lookup8(index, table2);
+      Vec8ib s     = index > 7;
+      return select(s, y2, y1);
+    }
+  // n > 16. Limit index
+  Vec8ui index1;
+  if((n & (n - 1)) == 0)
+    {
+      // n is a power of 2, make index modulo n
+      index1 = Vec8ui(index) & (n - 1);
+    }
+  else
+    {
+      // n is not a power of 2, limit to n-1
+      index1 = min(Vec8ui(index), n - 1);
+    }
+  return _mm256_i32gather_epi32((const int *)table, index1, 4);
+}
+
+static inline Vec4q lookup4(Vec4q const &index, Vec4q const &table)
+{
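+  // index * 0x200000002 + 0x100000000 turns each 64-bit element index i into the dword-index pair (2*i, 2*i+1),
+  // so the 32-bit lookup8 fetches both halves of table element i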
+  return Vec4q(lookup8(Vec8i(index * 0x200000002ll + 0x100000000ll), Vec8i(table)));
+}
+
+template <int n>
+static inline Vec4q lookup(Vec4q const &index, int64_t const *table)
+{
+  if(n <= 0)
+    return 0;
+  // n > 0. Limit index
+  Vec4uq index1;
+  if((n & (n - 1)) == 0)
+    {
+      // n is a power of 2, make index modulo n
+      index1 = Vec4uq(index) & (n - 1);
+    }
+  else
+    {
+      // n is not a power of 2, limit to n-1.
+      // There is no 64-bit min instruction, but we can use the 32-bit unsigned min,
+      // since n is a 32-bit integer
+      index1 = Vec4uq(min(Vec8ui(index), constant8i<n - 1, 0, n - 1, 0, n - 1, 0, n - 1, 0>()));
+    }
+// old compilers can't agree on how to define a 64-bit integer. Intel and MS use __int64, gcc uses long long
+#if defined(__clang__) && CLANG_VERSION < 30400
+  // clang 3.3 uses const int * in accordance with the official Intel documentation, which is wrong; will be fixed
+  return _mm256_i64gather_epi64((const int *)table, index1, 8);
+#elif defined(_MSC_VER) && _MSC_VER < 1700 && !defined(__INTEL_COMPILER)
+  // Old MS and Intel use non-standard type __int64
+  return _mm256_i64gather_epi64((const int64_t *)table, index1, 8);
+#else
+  // Gnu, Clang 3.4, MS 11.0
+  return _mm256_i64gather_epi64((const long long *)table, index1, 8);
+#endif
+}
+
+/*****************************************************************************
+ *
+ *          Other permutations with variable indexes
+ *
+ *****************************************************************************/
+
+// Function shift_bytes_up: shift whole vector left by b bytes.
+// You may use a permute function instead if b is a compile-time constant
+static inline Vec32c shift_bytes_up(Vec32c const &a, int b)
+{
+  if(b < 16)
+    {
+      return Vec32c(shift_bytes_up(a.get_low(), b), shift_bytes_up(a.get_high(), b) | shift_bytes_down(a.get_low(), 16 - b));
+    }
+  else
+    {
+      return Vec32c(Vec16c(0), shift_bytes_up(a.get_high(), b - 16));
+    }
+}
+
+// Function shift_bytes_down: shift whole vector right by b bytes
+// You may use a permute function instead if b is a compile-time constant
+static inline Vec32c shift_bytes_down(Vec32c const &a, int b)
+{
+  if(b < 16)
+    {
+      return Vec32c(shift_bytes_down(a.get_low(), b) | shift_bytes_up(a.get_high(), 16 - b), shift_bytes_down(a.get_high(), b));
+    }
+  else
+    {
+      return Vec32c(shift_bytes_down(a.get_high(), b - 16), Vec16c(0));
+    }
+}
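+
+// Usage sketch (illustrative; the element values are hypothetical): with a = (0, 1, 2, ..., 31),
+//   shift_bytes_up(a, 3)   gives (0, 0, 0, 0, 1, 2, ..., 28)   (three zero bytes shifted in at the bottom)
+//   shift_bytes_down(a, 3) gives (3, 4, ..., 31, 0, 0, 0)      (three zero bytes shifted in at the top)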
+
+/*****************************************************************************
+ *
+ *          Gather functions with fixed indexes
+ *
+ *****************************************************************************/
+// Load elements from array a with indices i0, i1, i2, i3, i4, i5, i6, i7
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8i gather8i(void const *a)
+{
+  Static_error_check<(i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) >= 0> Negative_array_index;  // Error message if index is negative
+  const int i01min   = i0 < i1 ? i0 : i1;
+  const int i23min   = i2 < i3 ? i2 : i3;
+  const int i45min   = i4 < i5 ? i4 : i5;
+  const int i67min   = i6 < i7 ? i6 : i7;
+  const int i0123min = i01min < i23min ? i01min : i23min;
+  const int i4567min = i45min < i67min ? i45min : i67min;
+  const int imin     = i0123min < i4567min ? i0123min : i4567min;
+  const int i01max   = i0 > i1 ? i0 : i1;
+  const int i23max   = i2 > i3 ? i2 : i3;
+  const int i45max   = i4 > i5 ? i4 : i5;
+  const int i67max   = i6 > i7 ? i6 : i7;
+  const int i0123max = i01max > i23max ? i01max : i23max;
+  const int i4567max = i45max > i67max ? i45max : i67max;
+  const int imax     = i0123max > i4567max ? i0123max : i4567max;
+
+  if(imax - imin <= 7)
+    {
+      // load one contiguous block and permute
+      if(imax > 7)
+        {
+          // make sure we don't read past the end of the array
+          Vec8i b = Vec8i().load((int32_t const *)a + imax - 7);
+          return permute8i<i0 - imax + 7, i1 - imax + 7, i2 - imax + 7, i3 - imax + 7, i4 - imax + 7, i5 - imax + 7, i6 - imax + 7,
+                           i7 - imax + 7>(b);
+        }
+      else
+        {
+          Vec8i b = Vec8i().load((int32_t const *)a + imin);
+          return permute8i<i0 - imin, i1 - imin, i2 - imin, i3 - imin, i4 - imin, i5 - imin, i6 - imin, i7 - imin>(b);
+        }
+    }
+  if((i0 < imin + 8 || i0 > imax - 8) && (i1 < imin + 8 || i1 > imax - 8) && (i2 < imin + 8 || i2 > imax - 8) &&
+     (i3 < imin + 8 || i3 > imax - 8) && (i4 < imin + 8 || i4 > imax - 8) && (i5 < imin + 8 || i5 > imax - 8) &&
+     (i6 < imin + 8 || i6 > imax - 8) && (i7 < imin + 8 || i7 > imax - 8))
+    {
+      // load two contiguous blocks and blend
+      Vec8i b      = Vec8i().load((int32_t const *)a + imin);
+      Vec8i c      = Vec8i().load((int32_t const *)a + imax - 7);
+      const int j0 = i0 < imin + 8 ? i0 - imin : 15 - imax + i0;
+      const int j1 = i1 < imin + 8 ? i1 - imin : 15 - imax + i1;
+      const int j2 = i2 < imin + 8 ? i2 - imin : 15 - imax + i2;
+      const int j3 = i3 < imin + 8 ? i3 - imin : 15 - imax + i3;
+      const int j4 = i4 < imin + 8 ? i4 - imin : 15 - imax + i4;
+      const int j5 = i5 < imin + 8 ? i5 - imin : 15 - imax + i5;
+      const int j6 = i6 < imin + 8 ? i6 - imin : 15 - imax + i6;
+      const int j7 = i7 < imin + 8 ? i7 - imin : 15 - imax + i7;
+      return blend8i<j0, j1, j2, j3, j4, j5, j6, j7>(b, c);
+    }
+  // use AVX2 gather
+  return _mm256_i32gather_epi32((const int *)a, Vec8i(i0, i1, i2, i3, i4, i5, i6, i7), 4);
+}
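+
+// Usage sketch (illustrative; the array arr and its values are hypothetical):
+//   int32_t arr[12] = {0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110};
+//   Vec8i   g = gather8i<10, 2, 7, 0, 4, 4, 1, 9>(arr);   // g is (100, 20, 70, 0, 40, 40, 10, 90)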
+
+template <int i0, int i1, int i2, int i3>
+static inline Vec4q gather4q(void const *a)
+{
+  Static_error_check<(i0 | i1 | i2 | i3) >= 0> Negative_array_index;  // Error message if index is negative
+  const int i01min = i0 < i1 ? i0 : i1;
+  const int i23min = i2 < i3 ? i2 : i3;
+  const int imin   = i01min < i23min ? i01min : i23min;
+  const int i01max = i0 > i1 ? i0 : i1;
+  const int i23max = i2 > i3 ? i2 : i3;
+  const int imax   = i01max > i23max ? i01max : i23max;
+  if(imax - imin <= 3)
+    {
+      // load one contiguous block and permute
+      if(imax > 3)
+        {
+          // make sure we don't read past the end of the array
+          Vec4q b = Vec4q().load((int64_t const *)a + imax - 3);
+          return permute4q<i0 - imax + 3, i1 - imax + 3, i2 - imax + 3, i3 - imax + 3>(b);
+        }
+      else
+        {
+          Vec4q b = Vec4q().load((int64_t const *)a + imin);
+          return permute4q<i0 - imin, i1 - imin, i2 - imin, i3 - imin>(b);
+        }
+    }
+  if((i0 < imin + 4 || i0 > imax - 4) && (i1 < imin + 4 || i1 > imax - 4) && (i2 < imin + 4 || i2 > imax - 4) &&
+     (i3 < imin + 4 || i3 > imax - 4))
+    {
+      // load two contiguous blocks and blend
+      Vec4q b      = Vec4q().load((int64_t const *)a + imin);
+      Vec4q c      = Vec4q().load((int64_t const *)a + imax - 3);
+      const int j0 = i0 < imin + 4 ? i0 - imin : 7 - imax + i0;
+      const int j1 = i1 < imin + 4 ? i1 - imin : 7 - imax + i1;
+      const int j2 = i2 < imin + 4 ? i2 - imin : 7 - imax + i2;
+      const int j3 = i3 < imin + 4 ? i3 - imin : 7 - imax + i3;
+      return blend4q<j0, j1, j2, j3>(b, c);
+    }
+    // use AVX2 gather
+    // old compilers can't agree on how to define a 64-bit integer. Intel and MS use __int64, gcc uses long long
+#if defined(__clang__) && CLANG_VERSION < 30400
+  // clang 3.3 uses const int * in accordance with the official Intel documentation, which is wrong; will be fixed
+  return _mm256_i32gather_epi64((const int *)a, Vec4i(i0, i1, i2, i3), 8);
+#elif defined(_MSC_VER) && _MSC_VER < 1700 && !defined(__INTEL_COMPILER)
+  // Old MS and Intel use non-standard type __int64
+  return _mm256_i32gather_epi64((const int64_t *)a, Vec4i(i0, i1, i2, i3), 8);
+#else
+  // Gnu, Clang 3.4, MS 11.0
+  return _mm256_i32gather_epi64((const long long *)a, Vec4i(i0, i1, i2, i3), 8);
+#endif
+}
+
+/*****************************************************************************
+ *
+ *          Vector scatter functions
+ *
+ ******************************************************************************
+ *
+ * These functions write the elements of a vector to arbitrary positions in an
+ * array in memory. Each vector element is written to an array position
+ * determined by an index. An element is not written if the corresponding
+ * index is out of range.
+ * The indexes can be specified as constant template parameters or as an
+ * integer vector.
+ *
+ * The scatter functions are useful if the data are distributed in a sparse
+ * manner into the array. If the array is dense then it is more efficient
+ * to permute the data into the right positions and then write the whole
+ * permuted vector into the array.
+ *
+ * Example:
+ * Vec8q a(10,11,12,13,14,15,16,17);
+ * int64_t b[16] = {0};
+ * scatter<0,2,14,10,1,-1,5,9>(a,b);
+ * // Now, b = {10,14,11,0,0,16,0,0,0,17,13,0,0,0,12,0}
+ *
+ *****************************************************************************/
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline void scatter(Vec8i const &data, void *array)
+{
+#if defined(__AVX512VL__)
+  __m256i indx   = constant8i<i0, i1, i2, i3, i4, i5, i6, i7>();
+  __mmask16 mask = uint16_t(i0 >= 0 | (i1 >= 0) << 1 | (i2 >= 0) << 2 | (i3 >= 0) << 3 | (i4 >= 0) << 4 | (i5 >= 0) << 5 |
+                            (i6 >= 0) << 6 | (i7 >= 0) << 7);
+  _mm256_mask_i32scatter_epi32((int *)array, mask, indx, data, 4);
+#elif defined(__AVX512F__)
+  __m512i indx = _mm512_castsi256_si512(constant8i<i0, i1, i2, i3, i4, i5, i6, i7>());
+  __mmask16 mask = uint16_t(i0 >= 0 | (i1 >= 0) << 1 | (i2 >= 0) << 2 | (i3 >= 0) << 3 | (i4 >= 0) << 4 | (i5 >= 0) << 5 |
+                            (i6 >= 0) << 6 | (i7 >= 0) << 7);
+  _mm512_mask_i32scatter_epi32((int *)array, mask, indx, _mm512_castsi256_si512(data), 4);
+#else
+  int32_t *arr       = (int32_t *)array;
+  const int index[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
+  for(int i = 0; i < 8; i++)
+    {
+      if(index[i] >= 0)
+        arr[index[i]] = data[i];
+    }
+#endif
+}
+
+template <int i0, int i1, int i2, int i3>
+static inline void scatter(Vec4q const &data, void *array)
+{
+#if defined(__AVX512VL__)
+  __m128i indx   = constant4i<i0, i1, i2, i3>();
+  __mmask16 mask = uint16_t(i0 >= 0 | (i1 >= 0) << 1 | (i2 >= 0) << 2 | (i3 >= 0) << 3);
+  _mm256_mask_i32scatter_epi64((long long *)array, mask, indx, data, 8);
+#elif defined(__AVX512F__)
+  __m256i indx = _mm256_castsi128_si256(constant4i<i0, i1, i2, i3>());
+  __mmask16 mask = uint16_t(i0 >= 0 | (i1 >= 0) << 1 | (i2 >= 0) << 2 | (i3 >= 0) << 3);
+  _mm512_mask_i32scatter_epi64((long long *)array, mask, indx, _mm512_castsi256_si512(data), 8);
+#else
+  int64_t *arr       = (int64_t *)array;
+  const int index[4] = {i0, i1, i2, i3};
+  for(int i = 0; i < 4; i++)
+    {
+      if(index[i] >= 0)
+        arr[index[i]] = data[i];
+    }
+#endif
+}
+
+static inline void scatter(Vec8i const &index, uint32_t limit, Vec8i const &data, void *array)
+{
+#if defined(__AVX512VL__)
+  __mmask16 mask = _mm256_cmplt_epu32_mask(index, Vec8ui(limit));
+  _mm256_mask_i32scatter_epi32((int *)array, mask, index, data, 4);
+#elif defined(__AVX512F__)
+  // 16 bit mask. upper 8 bits are (0<0) = false
+  __mmask16 mask = _mm512_cmplt_epu32_mask(_mm512_castsi256_si512(index), _mm512_castsi256_si512(Vec8ui(limit)));
+  _mm512_mask_i32scatter_epi32((int *)array, mask, _mm512_castsi256_si512(index), _mm512_castsi256_si512(data), 4);
+#else
+  int32_t *arr = (int32_t *)array;
+  for(int i = 0; i < 8; i++)
+    {
+      if(uint32_t(index[i]) < limit)
+        arr[index[i]] = data[i];
+    }
+#endif
+}
+
+static inline void scatter(Vec4q const &index, uint32_t limit, Vec4q const &data, void *array)
+{
+#if defined(__AVX512VL__)
+  __mmask16 mask = _mm256_cmplt_epu64_mask(index, Vec4uq(uint64_t(limit)));
+  _mm256_mask_i64scatter_epi64((long long *)array, mask, index, data, 8);
+#elif defined(__AVX512F__)
+  // 16 bit mask. upper 8 bits are (0<0) = false
+  __mmask16 mask = _mm512_cmplt_epu64_mask(_mm512_castsi256_si512(index), _mm512_castsi256_si512(Vec4uq(uint64_t(limit))));
+  _mm512_mask_i64scatter_epi64((long long *)array, mask, _mm512_castsi256_si512(index), _mm512_castsi256_si512(data), 8);
+#else
+  int64_t *arr = (int64_t *)array;
+  for(int i = 0; i < 4; i++)
+    {
+      if(uint64_t(index[i]) < uint64_t(limit))
+        arr[index[i]] = data[i];
+    }
+#endif
+}
+
+static inline void scatter(Vec4i const &index, uint32_t limit, Vec4q const &data, void *array)
+{
+#if defined(__AVX512VL__)
+  __mmask16 mask = _mm_cmplt_epu32_mask(index, Vec4ui(limit));
+  _mm256_mask_i32scatter_epi64((long long *)array, mask, index, data, 8);
+#elif defined(__AVX512F__)
+  // 16 bit mask. upper 8 bits are (0<0) = false
+  __mmask16 mask = _mm512_cmplt_epu32_mask(_mm512_castsi128_si512(index), _mm512_castsi128_si512(Vec4ui(limit)));
+  _mm512_mask_i32scatter_epi64((long long *)array, mask, _mm256_castsi128_si256(index), _mm512_castsi256_si512(data), 8);
+#else
+  int64_t *arr = (int64_t *)array;
+  for(int i = 0; i < 4; i++)
+    {
+      if(uint32_t(index[i]) < limit)
+        arr[index[i]] = data[i];
+    }
+#endif
+}
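+
+// Usage sketch for the bounds-checked variant (illustrative; buf and the values are hypothetical):
+//   Vec8i   idx(0, 2, 14, 10, 1, 99, 5, 9);   // index 99 is >= limit, so that element is not written
+//   Vec8i   val(10, 11, 12, 13, 14, 15, 16, 17);
+//   int32_t buf[16] = {0};
+//   scatter(idx, 16, val, buf);               // buf becomes {10,14,11,0,0,16,0,0,0,17,13,0,0,0,12,0}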
+
+/*****************************************************************************
+ *
+ *          Functions for conversion between integer sizes
+ *
+ *****************************************************************************/
+
+// Extend 8-bit integers to 16-bit integers, signed and unsigned
+
+// Function extend_low : extends the low 16 elements to 16 bits with sign extension
+static inline Vec16s extend_low(Vec32c const &a)
+{
+  __m256i a2   = permute4q<0, -256, 1, -256>(Vec4q(a));          // get bits 64-127 to position 128-191
+  __m256i sign = _mm256_cmpgt_epi8(_mm256_setzero_si256(), a2);  // 0 > a2
+  return _mm256_unpacklo_epi8(a2, sign);                         // interleave with sign extensions
+}
+
+// Function extend_high : extends the high 16 elements to 16 bits with sign extension
+static inline Vec16s extend_high(Vec32c const &a)
+{
+  __m256i a2   = permute4q<-256, 2, -256, 3>(Vec4q(a));          // get bits 128-191 to position 64-127
+  __m256i sign = _mm256_cmpgt_epi8(_mm256_setzero_si256(), a2);  // 0 > a2
+  return _mm256_unpackhi_epi8(a2, sign);                         // interleave with sign extensions
+}
+
+// Function extend_low : extends the low 16 elements to 16 bits with zero extension
+static inline Vec16us extend_low(Vec32uc const &a)
+{
+  __m256i a2 = permute4q<0, -256, 1, -256>(Vec4q(a));       // get bits 64-127 to position 128-191
+  return _mm256_unpacklo_epi8(a2, _mm256_setzero_si256());  // interleave with zero extensions
+}
+
+// Function extend_high : extends the high 16 elements to 16 bits with zero extension
+static inline Vec16us extend_high(Vec32uc const &a)
+{
+  __m256i a2 = permute4q<-256, 2, -256, 3>(Vec4q(a));       // get bits 128-191 to position 64-127
+  return _mm256_unpackhi_epi8(a2, _mm256_setzero_si256());  // interleave with zero extensions
+}
+
+// Extend 16-bit integers to 32-bit integers, signed and unsigned
+
+// Function extend_low : extends the low 8 elements to 32 bits with sign extension
+static inline Vec8i extend_low(Vec16s const &a)
+{
+  __m256i a2   = permute4q<0, -256, 1, -256>(Vec4q(a));  // get bits 64-127 to position 128-191
+  __m256i sign = _mm256_srai_epi16(a2, 15);              // sign bit
+  return _mm256_unpacklo_epi16(a2, sign);                // interleave with sign extensions
+}
+
+// Function extend_high : extends the high 8 elements to 32 bits with sign extension
+static inline Vec8i extend_high(Vec16s const &a)
+{
+  __m256i a2   = permute4q<-256, 2, -256, 3>(Vec4q(a));  // get bits 128-191 to position 64-127
+  __m256i sign = _mm256_srai_epi16(a2, 15);              // sign bit
+  return _mm256_unpackhi_epi16(a2, sign);                // interleave with sign extensions
+}
+
+// Function extend_low : extends the low 8 elements to 32 bits with zero extension
+static inline Vec8ui extend_low(Vec16us const &a)
+{
+  __m256i a2 = permute4q<0, -256, 1, -256>(Vec4q(a));        // get bits 64-127 to position 128-191
+  return _mm256_unpacklo_epi16(a2, _mm256_setzero_si256());  // interleave with zero extensions
+}
+
+// Function extend_high : extends the high 8 elements to 32 bits with zero extension
+static inline Vec8ui extend_high(Vec16us const &a)
+{
+  __m256i a2 = permute4q<-256, 2, -256, 3>(Vec4q(a));        // get bits 128-191 to position 64-127
+  return _mm256_unpackhi_epi16(a2, _mm256_setzero_si256());  // interleave with zero extensions
+}
+
+// Extend 32-bit integers to 64-bit integers, signed and unsigned
+
+// Function extend_low : extends the low 4 elements to 64 bits with sign extension
+static inline Vec4q extend_low(Vec8i const &a)
+{
+  __m256i a2   = permute4q<0, -256, 1, -256>(Vec4q(a));  // get bits 64-127 to position 128-191
+  __m256i sign = _mm256_srai_epi32(a2, 31);              // sign bit
+  return _mm256_unpacklo_epi32(a2, sign);                // interleave with sign extensions
+}
+
+// Function extend_high : extends the high 4 elements to 64 bits with sign extension
+static inline Vec4q extend_high(Vec8i const &a)
+{
+  __m256i a2   = permute4q<-256, 2, -256, 3>(Vec4q(a));  // get bits 128-191 to position 64-127
+  __m256i sign = _mm256_srai_epi32(a2, 31);              // sign bit
+  return _mm256_unpackhi_epi32(a2, sign);                // interleave with sign extensions
+}
+
+// Function extend_low : extends the low 4 elements to 64 bits with zero extension
+static inline Vec4uq extend_low(Vec8ui const &a)
+{
+  __m256i a2 = permute4q<0, -256, 1, -256>(Vec4q(a));        // get bits 64-127 to position 128-191
+  return _mm256_unpacklo_epi32(a2, _mm256_setzero_si256());  // interleave with zero extensions
+}
+
+// Function extend_high : extends the high 4 elements to 64 bits with zero extension
+static inline Vec4uq extend_high(Vec8ui const &a)
+{
+  __m256i a2 = permute4q<-256, 2, -256, 3>(Vec4q(a));        // get bits 128-191 to position 64-127
+  return _mm256_unpackhi_epi32(a2, _mm256_setzero_si256());  // interleave with zero extensions
+}
+
+// Compress 16-bit integers to 8-bit integers, signed and unsigned, with and without saturation
+
+// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers
+// Overflow wraps around
+static inline Vec32c compress(Vec16s const &low, Vec16s const &high)
+{
+  __m256i mask  = _mm256_set1_epi32(0x00FF00FF);     // mask for low bytes
+  __m256i lowm  = _mm256_and_si256(low, mask);       // bytes of low
+  __m256i highm = _mm256_and_si256(high, mask);      // bytes of high
+  __m256i pk    = _mm256_packus_epi16(lowm, highm);  // unsigned pack
+  return _mm256_permute4x64_epi64(pk, 0xD8);         // put in right place
+}
+
+// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers
+// Signed, with saturation
+static inline Vec32c compress_saturated(Vec16s const &low, Vec16s const &high)
+{
+  __m256i pk = _mm256_packs_epi16(low, high);  // packed with signed saturation
+  return _mm256_permute4x64_epi64(pk, 0xD8);   // put in right place
+}
+
+// Function compress : packs two vectors of 16-bit integers to one vector of 8-bit integers
+// Unsigned, overflow wraps around
+static inline Vec32uc compress(Vec16us const &low, Vec16us const &high) { return Vec32uc(compress((Vec16s)low, (Vec16s)high)); }
+
+// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers
+// Unsigned, with saturation
+static inline Vec32uc compress_saturated(Vec16us const &low, Vec16us const &high)
+{
+  __m256i maxval = _mm256_set1_epi32(0x00FF00FF);     // maximum value
+  __m256i minval = _mm256_setzero_si256();            // minimum value = 0
+  __m256i low1   = _mm256_min_epu16(low, maxval);     // upper limit
+  __m256i high1  = _mm256_min_epu16(high, maxval);    // upper limit
+  __m256i low2   = _mm256_max_epu16(low1, minval);    // lower limit
+  __m256i high2  = _mm256_max_epu16(high1, minval);   // lower limit
+  __m256i pk     = _mm256_packus_epi16(low2, high2);  // this instruction saturates from signed 16 bit to unsigned 8 bit
+  return _mm256_permute4x64_epi64(pk, 0xD8);          // put in right place
+}
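+
+// Usage sketch (illustrative): compress is the inverse of extend_low/extend_high as long as the
+// intermediate 16-bit results still fit into 8 bits, e.g. for a Vec32c a holding the values -16..15:
+//   Vec16s lo = extend_low(a);      // lo holds a[0]..a[15]  as 16-bit values
+//   Vec16s hi = extend_high(a);     // hi holds a[16]..a[31] as 16-bit values
+//   Vec32c c  = compress(lo, hi);   // c equals a again (no wrap-around occurred)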
+
+// Compress 32-bit integers to 16-bit integers, signed and unsigned, with and without saturation
+
+// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers
+// Overflow wraps around
+static inline Vec16s compress(Vec8i const &low, Vec8i const &high)
+{
+  __m256i mask  = _mm256_set1_epi32(0x0000FFFF);     // mask for low words
+  __m256i lowm  = _mm256_and_si256(low, mask);       // low words of low
+  __m256i highm = _mm256_and_si256(high, mask);      // low words of high
+  __m256i pk    = _mm256_packus_epi32(lowm, highm);  // unsigned pack
+  return _mm256_permute4x64_epi64(pk, 0xD8);         // put in right place
+}
+
+// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers
+// Signed with saturation
+static inline Vec16s compress_saturated(Vec8i const &low, Vec8i const &high)
+{
+  __m256i pk = _mm256_packs_epi32(low, high);  // pack with signed saturation
+  return _mm256_permute4x64_epi64(pk, 0xD8);   // put in right place
+}
+
+// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers
+// Unsigned, overflow wraps around
+static inline Vec16us compress(Vec8ui const &low, Vec8ui const &high) { return Vec16us(compress((Vec8i)low, (Vec8i)high)); }
+
+// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers
+// Unsigned, with saturation
+static inline Vec16us compress_saturated(Vec8ui const &low, Vec8ui const &high)
+{
+  __m256i maxval = _mm256_set1_epi32(0x0000FFFF);     // maximum value
+  __m256i minval = _mm256_setzero_si256();            // minimum value = 0
+  __m256i low1   = _mm256_min_epu32(low, maxval);     // upper limit
+  __m256i high1  = _mm256_min_epu32(high, maxval);    // upper limit
+  __m256i low2   = _mm256_max_epu32(low1, minval);    // lower limit
+  __m256i high2  = _mm256_max_epu32(high1, minval);   // lower limit
+  __m256i pk     = _mm256_packus_epi32(low2, high2);  // this instruction saturates from signed 32 bit to unsigned 16 bit
+  return _mm256_permute4x64_epi64(pk, 0xD8);          // put in right place
+}
+
+// Compress 64-bit integers to 32-bit integers, signed and unsigned, with and without saturation
+
+// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers
+// Overflow wraps around
+static inline Vec8i compress(Vec4q const &low, Vec4q const &high)
+{
+  __m256i low2  = _mm256_shuffle_epi32(low, 0xD8);     // low dwords of low  to pos. 0 and 32
+  __m256i high2 = _mm256_shuffle_epi32(high, 0xD8);    // low dwords of high to pos. 0 and 32
+  __m256i pk    = _mm256_unpacklo_epi64(low2, high2);  // interleave
+  return _mm256_permute4x64_epi64(pk, 0xD8);           // put in right place
+}
+
+// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers
+// Signed, with saturation
+static inline Vec8i compress_saturated(Vec4q const &a, Vec4q const &b)
+{
+  Vec4q maxval = constant8ui<0x7FFFFFFF, 0, 0x7FFFFFFF, 0, 0x7FFFFFFF, 0, 0x7FFFFFFF, 0>();
+  Vec4q minval = constant8ui<0x80000000, 0xFFFFFFFF, 0x80000000, 0xFFFFFFFF, 0x80000000, 0xFFFFFFFF, 0x80000000, 0xFFFFFFFF>();
+  Vec4q a1     = min(a, maxval);
+  Vec4q b1     = min(b, maxval);
+  Vec4q a2     = max(a1, minval);
+  Vec4q b2     = max(b1, minval);
+  return compress(a2, b2);
+}
+
+// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers
+// Unsigned, overflow wraps around
+static inline Vec8ui compress(Vec4uq const &low, Vec4uq const &high) { return Vec8ui(compress((Vec4q)low, (Vec4q)high)); }
+
+// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers
+// Unsigned, with saturation
+static inline Vec8ui compress_saturated(Vec4uq const &low, Vec4uq const &high)
+{
+  __m256i zero     = _mm256_setzero_si256();            // 0
+  __m256i lowzero  = _mm256_cmpeq_epi32(low, zero);     // -1 in each dword of low that is zero
+  __m256i highzero = _mm256_cmpeq_epi32(high, zero);    // -1 in each dword of high that is zero
+  __m256i mone     = _mm256_set1_epi32(-1);             // FFFFFFFF
+  __m256i lownz    = _mm256_xor_si256(lowzero, mone);   // -1 in each dword of low that is nonzero
+  __m256i highnz   = _mm256_xor_si256(highzero, mone);  // -1 in each dword of high that is nonzero
+  __m256i lownz2   = _mm256_srli_epi64(lownz, 32);      // shift down to low dword
+  __m256i highnz2  = _mm256_srli_epi64(highnz, 32);     // shift down to low dword
+  __m256i lowsatur = _mm256_or_si256(low, lownz2);      // low, saturated
+  __m256i hisatur  = _mm256_or_si256(high, highnz2);    // high, saturated
+  return Vec8ui(compress(Vec4q(lowsatur), Vec4q(hisatur)));
+}
+
+/*****************************************************************************
+ *
+ *          Integer division operators
+ *
+ *          Please see the file vectori128.h for explanation.
+ *
+ *****************************************************************************/
+
+// vector operator / : divide each element by divisor
+
+// vector of 8 32-bit signed integers
+static inline Vec8i operator/(Vec8i const &a, Divisor_i const &d)
+{
+  __m256i m   = _mm256_broadcastq_epi64(d.getm());         // broadcast multiplier
+  __m256i sgn = _mm256_broadcastq_epi64(d.getsign());      // broadcast sign of d
+  __m256i t1  = _mm256_mul_epi32(a, m);                    // 32x32->64 bit signed multiplication of even elements of a
+  __m256i t2  = _mm256_srli_epi64(t1, 32);                 // high dword of even numbered results
+  __m256i t3  = _mm256_srli_epi64(a, 32);                  // get odd elements of a into position for multiplication
+  __m256i t4  = _mm256_mul_epi32(t3, m);                   // 32x32->64 bit signed multiplication of odd elements
+  __m256i t5  = constant8i<0, -1, 0, -1, 0, -1, 0, -1>();  // mask for odd elements
+  __m256i t7  = _mm256_blendv_epi8(t2, t4, t5);            // blend two results
+  __m256i t8  = _mm256_add_epi32(t7, a);                   // add
+  __m256i t9  = _mm256_sra_epi32(t8, d.gets1());           // shift right arithmetic
+  __m256i t10 = _mm256_srai_epi32(a, 31);                  // sign of a
+  __m256i t11 = _mm256_sub_epi32(t10, sgn);                // sign of a - sign of d
+  __m256i t12 = _mm256_sub_epi32(t9, t11);                 // + 1 if a < 0, -1 if d < 0
+  return _mm256_xor_si256(t12, sgn);                       // change sign if divisor negative
+}
+
+// vector of 8 32-bit unsigned integers
+static inline Vec8ui operator/(Vec8ui const &a, Divisor_ui const &d)
+{
+  __m256i m   = _mm256_broadcastq_epi64(d.getm());         // broadcast multiplier
+  __m256i t1  = _mm256_mul_epu32(a, m);                    // 32x32->64 bit unsigned multiplication of even elements of a
+  __m256i t2  = _mm256_srli_epi64(t1, 32);                 // high dword of even numbered results
+  __m256i t3  = _mm256_srli_epi64(a, 32);                  // get odd elements of a into position for multiplication
+  __m256i t4  = _mm256_mul_epu32(t3, m);                   // 32x32->64 bit unsigned multiplication of odd elements
+  __m256i t5  = constant8i<0, -1, 0, -1, 0, -1, 0, -1>();  // mask for odd elements
+  __m256i t7  = _mm256_blendv_epi8(t2, t4, t5);            // blend two results
+  __m256i t8  = _mm256_sub_epi32(a, t7);                   // subtract
+  __m256i t9  = _mm256_srl_epi32(t8, d.gets1());           // shift right logical
+  __m256i t10 = _mm256_add_epi32(t7, t9);                  // add
+  return _mm256_srl_epi32(t10, d.gets2());                 // shift right logical
+}
+
+// vector of 16 16-bit signed integers
+static inline Vec16s operator/(Vec16s const &a, Divisor_s const &d)
+{
+  __m256i m   = _mm256_broadcastq_epi64(d.getm());     // broadcast multiplier
+  __m256i sgn = _mm256_broadcastq_epi64(d.getsign());  // broadcast sign of d
+  __m256i t1  = _mm256_mulhi_epi16(a, m);              // multiply high signed words
+  __m256i t2  = _mm256_add_epi16(t1, a);               // + a
+  __m256i t3  = _mm256_sra_epi16(t2, d.gets1());       // shift right arithmetic
+  __m256i t4  = _mm256_srai_epi16(a, 15);              // sign of a
+  __m256i t5  = _mm256_sub_epi16(t4, sgn);             // sign of a - sign of d
+  __m256i t6  = _mm256_sub_epi16(t3, t5);              // + 1 if a < 0, -1 if d < 0
+  return _mm256_xor_si256(t6, sgn);                    // change sign if divisor negative
+}
+
+// vector of 16 16-bit unsigned integers
+static inline Vec16us operator/(Vec16us const &a, Divisor_us const &d)
+{
+  __m256i m  = _mm256_broadcastq_epi64(d.getm());  // broadcast multiplier
+  __m256i t1 = _mm256_mulhi_epu16(a, m);           // multiply high unsigned words
+  __m256i t2 = _mm256_sub_epi16(a, t1);            // subtract
+  __m256i t3 = _mm256_srl_epi16(t2, d.gets1());    // shift right logical
+  __m256i t4 = _mm256_add_epi16(t1, t3);           // add
+  return _mm256_srl_epi16(t4, d.gets2());          // shift right logical
+}
+
+// vector of 32 8-bit signed integers
+static inline Vec32c operator/(Vec32c const &a, Divisor_s const &d)
+{
+  // expand into two Vec16s
+  Vec16s low  = extend_low(a) / d;
+  Vec16s high = extend_high(a) / d;
+  return compress(low, high);
+}
+
+// vector of 32 8-bit unsigned integers
+static inline Vec32uc operator/(Vec32uc const &a, Divisor_us const &d)
+{
+  // expand into two Vec16s
+  Vec16us low  = extend_low(a) / d;
+  Vec16us high = extend_high(a) / d;
+  return compress(low, high);
+}
+
+// vector operator /= : divide
+static inline Vec8i &operator/=(Vec8i &a, Divisor_i const &d)
+{
+  a = a / d;
+  return a;
+}
+
+// vector operator /= : divide
+static inline Vec8ui &operator/=(Vec8ui &a, Divisor_ui const &d)
+{
+  a = a / d;
+  return a;
+}
+
+// vector operator /= : divide
+static inline Vec16s &operator/=(Vec16s &a, Divisor_s const &d)
+{
+  a = a / d;
+  return a;
+}
+
+// vector operator /= : divide
+static inline Vec16us &operator/=(Vec16us &a, Divisor_us const &d)
+{
+  a = a / d;
+  return a;
+}
+
+// vector operator /= : divide
+static inline Vec32c &operator/=(Vec32c &a, Divisor_s const &d)
+{
+  a = a / d;
+  return a;
+}
+
+// vector operator /= : divide
+static inline Vec32uc &operator/=(Vec32uc &a, Divisor_us const &d)
+{
+  a = a / d;
+  return a;
+}
+
+/*****************************************************************************
+ *
+ *          Integer division 2: divisor is a compile-time constant
+ *
+ *****************************************************************************/
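+
+// Usage sketch (illustrative only, assuming the const_int()/const_uint() helpers from vectori128.h):
+//   Vec8i  a(-25, -15, -5, 5, 15, 25, 35, 45);
+//   Vec8i  q = a / const_int(10);     // truncated signed division: {-2, -1, 0, 0, 1, 2, 3, 4}
+//   Vec8ui b(8, 16, 24, 32, 40, 48, 56, 64);
+//   Vec8ui r = b / const_uint(8);     // divisor is a power of 2, so this reduces to a shift: {1, 2, 3, 4, 5, 6, 7, 8}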
+
+// Divide Vec8i by compile-time constant
+template <int32_t d>
+static inline Vec8i divide_by_i(Vec8i const &x)
+{
+  Static_error_check<(d != 0)> Dividing_by_zero;  // Error message if dividing by zero
+  if(d == 1)
+    return x;
+  if(d == -1)
+    return -x;
+  if(uint32_t(d) == 0x80000000u)
+    return Vec8i(x == Vec8i(0x80000000)) & 1;  // prevent overflow when changing sign
+  const uint32_t d1 =
+      d > 0 ? uint32_t(d) : -uint32_t(d);  // compile-time abs(d). (force GCC compiler to treat d as 32 bits, not 64 bits)
+  if((d1 & (d1 - 1)) == 0)
+    {
+      // d1 is a power of 2. use shift
+      const int k = bit_scan_reverse_const(d1);
+      __m256i sign;
+      if(k > 1)
+        sign = _mm256_srai_epi32(x, k - 1);
+      else
+        sign = x;                                        // k copies of sign bit
+      __m256i bias   = _mm256_srli_epi32(sign, 32 - k);  // bias = x >= 0 ? 0 : k-1
+      __m256i xpbias = _mm256_add_epi32(x, bias);        // x + bias
+      __m256i q      = _mm256_srai_epi32(xpbias, k);     // (x + bias) >> k
+      if(d > 0)
+        return q;                                          // d > 0: return  q
+      return _mm256_sub_epi32(_mm256_setzero_si256(), q);  // d < 0: return -q
+    }
+  // general case
+  const int32_t sh   = bit_scan_reverse_const(uint32_t(d1) - 1);  // ceil(log2(d1)) - 1. (d1 < 2 handled by power of 2 case)
+  const int32_t mult = int(1 + (uint64_t(1) << (32 + sh)) / uint32_t(d1) - (int64_t(1) << 32));  // multiplier
+  const Divisor_i div(mult, sh, d < 0 ? -1 : 0);
+  return x / div;
+}
+
+// define Vec8i a / const_int(d)
+template <int32_t d>
+static inline Vec8i operator/(Vec8i const &a, Const_int_t<d>)
+{
+  return divide_by_i<d>(a);
+}
+
+// define Vec8i a / const_uint(d)
+template <uint32_t d>
+static inline Vec8i operator/(Vec8i const &a, Const_uint_t<d>)
+{
+  Static_error_check<(d < 0x80000000u)> Error_overflow_dividing_signed_by_unsigned;  // Error: dividing signed by overflowing unsigned
+  return divide_by_i<int32_t(d)>(a);                                                 // signed divide
+}
+
+// vector operator /= : divide
+template <int32_t d>
+static inline Vec8i &operator/=(Vec8i &a, Const_int_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec8i &operator/=(Vec8i &a, Const_uint_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// Divide Vec8ui by compile-time constant
+template <uint32_t d>
+static inline Vec8ui divide_by_ui(Vec8ui const &x)
+{
+  Static_error_check<(d != 0)> Dividing_by_zero;  // Error message if dividing by zero
+  if(d == 1)
+    return x;                               // divide by 1
+  const int b = bit_scan_reverse_const(d);  // floor(log2(d))
+  if((uint32_t(d) & (uint32_t(d) - 1)) == 0)
+    {
+      // d is a power of 2. use shift
+      return _mm256_srli_epi32(x, b);  // x >> b
+    }
+  // general case (d > 2)
+  uint32_t mult         = uint32_t((uint64_t(1) << (b + 32)) / d);         // multiplier = 2^(32+b) / d
+  const uint64_t rem    = (uint64_t(1) << (b + 32)) - uint64_t(d) * mult;  // remainder 2^(32+b) % d
+  const bool round_down = (2 * rem < d);                                   // check if fraction is less than 0.5
+  if(!round_down)
+    {
+      mult = mult + 1;  // round up mult
+    }
+  // do 32*32->64 bit unsigned multiplication and get high part of result
+  const __m256i multv = _mm256_set_epi32(0, mult, 0, mult, 0, mult, 0, mult);  // zero-extend mult and broadcast
+  __m256i t1          = _mm256_mul_epu32(x, multv);  // 32x32->64 bit unsigned multiplication of x[0] and x[2]
+  if(round_down)
+    {
+      t1 = _mm256_add_epi64(t1, multv);  // compensate for rounding error. (x+1)*m replaced by x*m+m to avoid overflow
+    }
+  __m256i t2 = _mm256_srli_epi64(t1, 32);    // high dword of result 0 and 2
+  __m256i t3 = _mm256_srli_epi64(x, 32);     // get x[1] and x[3] into position for multiplication
+  __m256i t4 = _mm256_mul_epu32(t3, multv);  // 32x32->64 bit unsigned multiplication of x[1] and x[3]
+  if(round_down)
+    {
+      t4 = _mm256_add_epi64(t4, multv);  // compensate for rounding error. (x+1)*m replaced by x*m+m to avoid overflow
+    }
+  __m256i t5 = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0);  // mask of dword 1 and 3
+  __m256i t7 = _mm256_blendv_epi8(t2, t4, t5);                // blend two results
+  Vec8ui q   = _mm256_srli_epi32(t7, b);                      // shift right by b
+  return q;                                                   // no overflow possible
+}
+
+// define Vec8ui a / const_uint(d)
+template <uint32_t d>
+static inline Vec8ui operator/(Vec8ui const &a, Const_uint_t<d>)
+{
+  return divide_by_ui<d>(a);
+}
+
+// define Vec8ui a / const_int(d)
+template <int32_t d>
+static inline Vec8ui operator/(Vec8ui const &a, Const_int_t<d>)
+{
+  Static_error_check<(d >= 0)> Error_dividing_unsigned_by_negative;  // Error: dividing unsigned by negative is ambiguous
+  return divide_by_ui<d>(a);                                         // unsigned divide
+}
+
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec8ui &operator/=(Vec8ui &a, Const_uint_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator /= : divide
+template <int32_t d>
+static inline Vec8ui &operator/=(Vec8ui &a, Const_int_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// Divide Vec16s by compile-time constant
+template <int d>
+static inline Vec16s divide_by_i(Vec16s const &x)
+{
+  const int16_t d0 = int16_t(d);                   // truncate d to 16 bits
+  Static_error_check<(d0 != 0)> Dividing_by_zero;  // Error message if dividing by zero
+  if(d0 == 1)
+    return x;  // divide by  1
+  if(d0 == -1)
+    return -x;  // divide by -1
+  if(uint16_t(d0) == 0x8000u)
+    return Vec16s(x == Vec16s(0x8000)) & 1;  // prevent overflow when changing sign
+  const uint16_t d1 = d0 > 0 ? d0 : -d0;     // compile-time abs(d0)
+  if((d1 & (d1 - 1)) == 0)
+    {
+      // d is a power of 2. use shift
+      const int k = bit_scan_reverse_const(uint32_t(d1));
+      __m256i sign;
+      if(k > 1)
+        sign = _mm256_srai_epi16(x, k - 1);
+      else
+        sign = x;                                        // k copies of sign bit
+      __m256i bias   = _mm256_srli_epi16(sign, 16 - k);  // bias = x >= 0 ? 0 : k-1
+      __m256i xpbias = _mm256_add_epi16(x, bias);        // x + bias
+      __m256i q      = _mm256_srai_epi16(xpbias, k);     // (x + bias) >> k
+      if(d0 > 0)
+        return q;                                          // d0 > 0: return  q
+      return _mm256_sub_epi16(_mm256_setzero_si256(), q);  // d0 < 0: return -q
+    }
+  // general case
+  const int L        = bit_scan_reverse_const(uint16_t(d1 - 1)) + 1;            // ceil(log2(d)). (d < 2 handled above)
+  const int16_t mult = int16_t(1 + (1u << (15 + L)) / uint32_t(d1) - 0x10000);  // multiplier
+  const int shift1   = L - 1;
+  const Divisor_s div(mult, shift1, d0 > 0 ? 0 : -1);
+  return x / div;
+}
+
+// define Vec16s a / const_int(d)
+template <int d>
+static inline Vec16s operator/(Vec16s const &a, Const_int_t<d>)
+{
+  return divide_by_i<d>(a);
+}
+
+// define Vec16s a / const_uint(d)
+template <uint32_t d>
+static inline Vec16s operator/(Vec16s const &a, Const_uint_t<d>)
+{
+  Static_error_check<(d < 0x8000u)> Error_overflow_dividing_signed_by_unsigned;  // Error: dividing signed by overflowing unsigned
+  return divide_by_i<int(d)>(a);                                                 // signed divide
+}
+
+// vector operator /= : divide
+template <int32_t d>
+static inline Vec16s &operator/=(Vec16s &a, Const_int_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec16s &operator/=(Vec16s &a, Const_uint_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// Divide Vec16us by compile-time constant
+template <uint32_t d>
+static inline Vec16us divide_by_ui(Vec16us const &x)
+{
+  const uint16_t d0 = uint16_t(d);                 // truncate d to 16 bits
+  Static_error_check<(d0 != 0)> Dividing_by_zero;  // Error message if dividing by zero
+  if(d0 == 1)
+    return x;                                // divide by 1
+  const int b = bit_scan_reverse_const(d0);  // floor(log2(d))
+  if((d0 & (d0 - 1)) == 0)
+    {
+      // d is a power of 2. use shift
+      return _mm256_srli_epi16(x, b);  // x >> b
+    }
+  // general case (d > 2)
+  uint16_t mult         = uint16_t((uint32_t(1) << (b + 16)) / d0);         // multiplier = 2^(16+b) / d
+  const uint32_t rem    = (uint32_t(1) << (b + 16)) - uint32_t(d0) * mult;  // remainder 2^(16+b) % d
+  const bool round_down = (2 * rem < d0);                                   // check if fraction is less than 0.5
+  Vec16us x1            = x;
+  if(round_down)
+    {
+      x1 = x1 + 1;  // round down mult and compensate by adding 1 to x
+    }
+  else
+    {
+      mult = mult + 1;  // round up mult. no compensation needed
+    }
+  const __m256i multv = _mm256_set1_epi16(mult);        // broadcast mult
+  __m256i xm          = _mm256_mulhi_epu16(x1, multv);  // high part of 16x16->32 bit unsigned multiplication
+  Vec16us q           = _mm256_srli_epi16(xm, b);       // shift right by b
+  if(round_down)
+    {
+      Vec16sb overfl = (x1 == Vec16us(_mm256_setzero_si256()));  // check for overflow of x+1
+      return select(overfl, Vec16us(mult >> b), q);              // deal with overflow (rarely needed)
+    }
+  else
+    {
+      return q;  // no overflow possible
+    }
+}
+
+// define Vec16us a / const_uint(d)
+template <uint32_t d>
+static inline Vec16us operator/(Vec16us const &a, Const_uint_t<d>)
+{
+  return divide_by_ui<d>(a);
+}
+
+// define Vec16us a / const_int(d)
+template <int d>
+static inline Vec16us operator/(Vec16us const &a, Const_int_t<d>)
+{
+  Static_error_check<(d >= 0)> Error_dividing_unsigned_by_negative;  // Error: dividing unsigned by negative is ambiguous
+  return divide_by_ui<d>(a);                                         // unsigned divide
+}
+
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec16us &operator/=(Vec16us &a, Const_uint_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator /= : divide
+template <int32_t d>
+static inline Vec16us &operator/=(Vec16us &a, Const_int_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// define Vec32c a / const_int(d)
+template <int d>
+static inline Vec32c operator/(Vec32c const &a, Const_int_t<d>)
+{
+  // expand into two Vec16s
+  Vec16s low  = extend_low(a) / Const_int_t<d>();
+  Vec16s high = extend_high(a) / Const_int_t<d>();
+  return compress(low, high);
+}
+
+// define Vec32c a / const_uint(d)
+template <uint32_t d>
+static inline Vec32c operator/(Vec32c const &a, Const_uint_t<d>)
+{
+  Static_error_check<(uint8_t(d) < 0x80u)>
+      Error_overflow_dividing_signed_by_unsigned;  // Error: dividing signed by overflowing unsigned
+  return a / Const_int_t<d>();                     // signed divide
+}
+
+// vector operator /= : divide
+template <int32_t d>
+static inline Vec32c &operator/=(Vec32c &a, Const_int_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec32c &operator/=(Vec32c &a, Const_uint_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// define Vec32uc a / const_uint(d)
+template <uint32_t d>
+static inline Vec32uc operator/(Vec32uc const &a, Const_uint_t<d>)
+{
+  // expand into two Vec16us
+  Vec16us low  = extend_low(a) / Const_uint_t<d>();
+  Vec16us high = extend_high(a) / Const_uint_t<d>();
+  return compress(low, high);
+}
+
+// define Vec32uc a / const_int(d)
+template <int d>
+static inline Vec32uc operator/(Vec32uc const &a, Const_int_t<d>)
+{
+  Static_error_check<(int8_t(d) >= 0)> Error_dividing_unsigned_by_negative;  // Error: dividing unsigned by negative is ambiguous
+  return a / Const_uint_t<d>();                                              // unsigned divide
+}
+
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec32uc &operator/=(Vec32uc &a, Const_uint_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator /= : divide
+template <int32_t d>
+static inline Vec32uc &operator/=(Vec32uc &a, Const_int_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+/*****************************************************************************
+ *
+ *          Horizontal scan functions
+ *
+ *****************************************************************************/
+
+// Get index to the first element that is true. Return -1 if all are false
+static inline int horizontal_find_first(Vec32cb const &x)
+{
+  uint32_t a = _mm256_movemask_epi8(x);
+  if(a == 0)
+    return -1;
+  int32_t b = bit_scan_forward(a);
+  return b;
+}
+
+static inline int horizontal_find_first(Vec16sb const &x) { return horizontal_find_first(Vec32cb(x)) >> 1; }
+
+static inline int horizontal_find_first(Vec8ib const &x) { return horizontal_find_first(Vec32cb(x)) >> 2; }
+
+static inline int horizontal_find_first(Vec4qb const &x) { return horizontal_find_first(Vec32cb(x)) >> 3; }
+
+// Count the number of elements that are true
+static inline uint32_t horizontal_count(Vec32cb const &x)
+{
+  uint32_t a = _mm256_movemask_epi8(x);
+  return vml_popcnt(a);
+}
+
+static inline uint32_t horizontal_count(Vec16sb const &x) { return horizontal_count(Vec32cb(x)) >> 1; }
+
+static inline uint32_t horizontal_count(Vec8ib const &x) { return horizontal_count(Vec32cb(x)) >> 2; }
+
+static inline uint32_t horizontal_count(Vec4qb const &x) { return horizontal_count(Vec32cb(x)) >> 3; }
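+
+// Usage sketch (illustrative only): find and count elements that satisfy a condition.
+//   Vec8i    a(3, 7, 1, 9, 4, 9, 2, 9);
+//   Vec8ib   m     = (a == 9);                  // boolean vector, true at indices 3, 5, 7
+//   int      first = horizontal_find_first(m);  // 3 (index of the first true element), -1 if none
+//   uint32_t n     = horizontal_count(m);       // 3 elements are true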
+
+/*****************************************************************************
+ *
+ *          Boolean <-> bitfield conversion functions
+ *
+ *****************************************************************************/
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint32_t to_bits(Vec32cb const &x) { return (uint32_t)_mm256_movemask_epi8(x); }
+
+// to_Vec16c: convert integer bitfield to boolean vector
+static inline Vec32cb to_Vec32cb(uint32_t x) { return Vec32cb(Vec32c(to_Vec16cb(uint16_t(x)), to_Vec16cb(uint16_t(x >> 16)))); }
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint16_t to_bits(Vec16sb const &x)
+{
+  __m128i a = _mm_packs_epi16(x.get_low(), x.get_high());  // 16-bit words to bytes
+  return (uint16_t)_mm_movemask_epi8(a);
+}
+
+// to_Vec16sb: convert integer bitfield to boolean vector
+static inline Vec16sb to_Vec16sb(uint16_t x) { return Vec16sb(Vec16s(to_Vec8sb(uint8_t(x)), to_Vec8sb(uint8_t(x >> 8)))); }
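+
+// Usage sketch (illustrative only): a boolean vector can round-trip through a compact bitfield.
+//   Vec16s   a(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+//   Vec16sb  m    = (a > 7);           // true for elements 8..15
+//   uint16_t bits = to_bits(m);        // 0xFF00: bit i is set where element i is true
+//   Vec16sb  m2   = to_Vec16sb(bits);  // reconstruct the boolean vector from the bitfield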
+
+#if INSTRSET < 9 || MAX_VECTOR_SIZE < 512
+// These functions are defined in Vectori512.h if AVX512 instruction set is used
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec8ib const &x)
+{
+  __m128i a = _mm_packs_epi32(x.get_low(), x.get_high());  // 32-bit dwords to 16-bit words
+  __m128i b = _mm_packs_epi16(a, a);                       // 16-bit words to bytes
+  return (uint8_t)_mm_movemask_epi8(b);
+}
+
+// to_Vec8ib: convert integer bitfield to boolean vector
+static inline Vec8ib to_Vec8ib(uint8_t x) { return Vec8ib(Vec8i(to_Vec4ib(x), to_Vec4ib(x >> 4))); }
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec4qb const &x)
+{
+  uint32_t a = _mm256_movemask_epi8(x);
+  return ((a & 1) | ((a >> 7) & 2)) | (((a >> 14) & 4) | ((a >> 21) & 8));
+}
+
+// to_Vec4qb: convert integer bitfield to boolean vector
+static inline Vec4qb to_Vec4qb(uint8_t x) { return Vec4qb(Vec4q(-(x & 1), -((x >> 1) & 1), -((x >> 2) & 1), -((x >> 3) & 1))); }
+
+#else  // function prototypes here only
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec8ib x);
+
+// to_Vec8ib: convert integer bitfield to boolean vector
+static inline Vec8ib to_Vec8ib(uint8_t x);
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec4qb x);
+
+// to_Vec4qb: convert integer bitfield to boolean vector
+static inline Vec4qb to_Vec4qb(uint8_t x);
+
+#endif  // INSTRSET < 9 || MAX_VECTOR_SIZE < 512
+
+#ifdef VCL_NAMESPACE
+}
+#endif
+
+#endif  // VECTORI256_H
diff --git a/src/vectorclass/vectori256e.h b/src/vectorclass/vectori256e.h
new file mode 100644
index 0000000000000000000000000000000000000000..16365914c7ff2fb5273cc574c005a025334186e4
--- /dev/null
+++ b/src/vectorclass/vectori256e.h
@@ -0,0 +1,4479 @@
+/****************************  vectori256e.h   *******************************
+ * Author:        Agner Fog
+ * Date created:  2012-05-30
+ * Last modified: 2017-02-19
+ * Version:       1.27
+ * Project:       vector classes
+ * Description:
+ * Header file defining 256-bit integer vector classes as interface
+ * to intrinsic functions. Emulated for processors without AVX2 instruction set.
+ *
+ * The following vector classes are defined here:
+ * Vec256b   Vector of 256  1-bit unsigned  integers or Booleans
+ * Vec32c    Vector of  32  8-bit signed    integers
+ * Vec32uc   Vector of  32  8-bit unsigned  integers
+ * Vec32cb   Vector of  32  Booleans for use with Vec32c and Vec32uc
+ * Vec16s    Vector of  16  16-bit signed   integers
+ * Vec16us   Vector of  16  16-bit unsigned integers
+ * Vec16sb   Vector of  16  Booleans for use with Vec16s and Vec16us
+ * Vec8i     Vector of   8  32-bit signed   integers
+ * Vec8ui    Vector of   8  32-bit unsigned integers
+ * Vec8ib    Vector of   8  Booleans for use with Vec8i and Vec8ui
+ * Vec4q     Vector of   4  64-bit signed   integers
+ * Vec4uq    Vector of   4  64-bit unsigned integers
+ * Vec4qb    Vector of   4  Booleans for use with Vec4q and Vec4uq
+ *
+ * For detailed instructions, see VectorClass.pdf
+ *
+ * (c) Copyright 2012-2017 GNU General Public License http://www.gnu.org/licenses
+ *****************************************************************************/
+
+// check combination of header files
+#if defined(VECTORI256_H)
+#if VECTORI256_H != 1
+#error Two different versions of vectori256.h included
+#endif
+#else
+#define VECTORI256_H 1
+
+#ifdef VECTORF256_H
+#error Please put header file vectori256.h or vectori256e.h before vectorf256e.h
+#endif
+
+#include "vectori128.h"
+
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE
+{
+#endif
+
+/*****************************************************************************
+ *
+ *          base class Vec256ie
+ *
+ *****************************************************************************/
+// base class to replace Vec256ie when AVX2 is not supported
+class Vec256ie
+{
+ protected:
+  __m128i y0;  // low half
+  __m128i y1;  // high half
+ public:
+  Vec256ie(void){};  // default constructor
+  Vec256ie(__m128i x0, __m128i x1)
+  {  // constructor to build from two __m128i
+    y0 = x0;
+    y1 = x1;
+  }
+  __m128i get_low() const
+  {  // get low half
+    return y0;
+  }
+  __m128i get_high() const
+  {  // get high half
+    return y1;
+  }
+};
+
+/*****************************************************************************
+ *
+ *          Vector of 256 1-bit unsigned integers or Booleans
+ *
+ *****************************************************************************/
+
+class Vec256b : public Vec256ie
+{
+ public:
+  // Default constructor:
+  Vec256b() {}
+  // Constructor to broadcast the same value into all elements
+  // Removed because of undesired implicit conversions
+  // Vec256b(int i) {
+  //    y1 = y0 = _mm_set1_epi32(-(i & 1));}
+
+  // Constructor to build from two Vec128b:
+  Vec256b(Vec128b const &a0, Vec128b const &a1)
+  {
+    y0 = a0;
+    y1 = a1;
+  }
+  // Constructor to convert from type Vec256ie
+  Vec256b(Vec256ie const &x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+  }
+  // Assignment operator to convert from type Vec256ie
+  Vec256b &operator=(Vec256ie const &x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+    return *this;
+  }
+  // Member function to load from array (unaligned)
+  Vec256b &load(void const *p)
+  {
+    y0 = _mm_loadu_si128((__m128i const *)p);
+    y1 = _mm_loadu_si128((__m128i const *)p + 1);
+    return *this;
+  }
+  // Member function to load from array, aligned by 32
+  // You may use load_a instead of load if you are certain that p points to an address
+  // divisible by 32, but there is hardly any speed advantage of load_a on modern processors
+  Vec256b &load_a(void const *p)
+  {
+    y0 = _mm_load_si128((__m128i const *)p);
+    y1 = _mm_load_si128((__m128i const *)p + 1);
+    return *this;
+  }
+  // Member function to store into array (unaligned)
+  void store(void *p) const
+  {
+    _mm_storeu_si128((__m128i *)p, y0);
+    _mm_storeu_si128((__m128i *)p + 1, y1);
+  }
+  // Member function to store into array, aligned by 32
+  // You may use store_a instead of store if you are certain that p points to an address
+  // divisible by 32, but there is hardly any speed advantage of store_a on modern processors
+  void store_a(void *p) const
+  {
+    _mm_store_si128((__m128i *)p, y0);
+    _mm_store_si128((__m128i *)p + 1, y1);
+  }
+  // Member function to change a single bit
+  // Note: This function is inefficient. Use load function if changing more than one bit
+  Vec256b const &set_bit(uint32_t index, int value)
+  {
+    if(index < 128)
+      {
+        y0 = Vec128b(y0).set_bit(index, value);
+      }
+    else
+      {
+        y1 = Vec128b(y1).set_bit(index - 128, value);
+      }
+    return *this;
+  }
+  // Member function to get a single bit
+  // Note: This function is inefficient. Use store function if reading more than one bit
+  int get_bit(uint32_t index) const
+  {
+    if(index < 128)
+      {
+        return Vec128b(y0).get_bit(index);
+      }
+    else
+      {
+        return Vec128b(y1).get_bit(index - 128);
+      }
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  bool operator[](uint32_t index) const { return get_bit(index) != 0; }
+  // Member functions to split into two Vec128b:
+  Vec128b get_low() const { return y0; }
+  Vec128b get_high() const { return y1; }
+  static int size() { return 256; }
+};
+
+// Define operators for this class
+
+// vector operator & : bitwise and
+static inline Vec256b operator&(Vec256b const &a, Vec256b const &b)
+{
+  return Vec256b(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+static inline Vec256b operator&&(Vec256b const &a, Vec256b const &b) { return a & b; }
+
+// vector operator | : bitwise or
+static inline Vec256b operator|(Vec256b const &a, Vec256b const &b)
+{
+  return Vec256b(a.get_low() | b.get_low(), a.get_high() | b.get_high());
+}
+static inline Vec256b operator||(Vec256b const &a, Vec256b const &b) { return a | b; }
+
+// vector operator ^ : bitwise xor
+static inline Vec256b operator^(Vec256b const &a, Vec256b const &b)
+{
+  return Vec256b(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
+}
+
+// vector operator ~ : bitwise not
+static inline Vec256b operator~(Vec256b const &a) { return Vec256b(~a.get_low(), ~a.get_high()); }
+
+// vector operator &= : bitwise and
+static inline Vec256b &operator&=(Vec256b &a, Vec256b const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator |= : bitwise or
+static inline Vec256b &operator|=(Vec256b &a, Vec256b const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^= : bitwise xor
+static inline Vec256b &operator^=(Vec256b &a, Vec256b const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// Define functions for this class
+
+// function andnot: a & ~ b
+static inline Vec256b andnot(Vec256b const &a, Vec256b const &b)
+{
+  return Vec256b(andnot(a.get_low(), b.get_low()), andnot(a.get_high(), b.get_high()));
+}
+
+/*****************************************************************************
+ *
+ *          Generate compile-time constant vector
+ *
+ *****************************************************************************/
+// Generate a constant vector of 8 integers stored in memory.
+// Can be converted to any integer vector type
+template <int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4, int32_t i5, int32_t i6, int32_t i7>
+static inline Vec256ie constant8i()
+{
+  static const union
+  {
+    int32_t i[8];
+    __m128i y[2];
+  } u = {{i0, i1, i2, i3, i4, i5, i6, i7}};
+  return Vec256ie(u.y[0], u.y[1]);
+}
+
+template <uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7>
+static inline Vec256ie constant8ui()
+{
+  return constant8i<int32_t(i0), int32_t(i1), int32_t(i2), int32_t(i3), int32_t(i4), int32_t(i5), int32_t(i6), int32_t(i7)>();
+}
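+
+// Usage sketch (illustrative only): a compile-time constant vector converts to any of the
+// 256-bit integer vector types defined below, e.g. a mask with all bits set in the odd dwords:
+//   Vec8i oddmask = constant8i<0, -1, 0, -1, 0, -1, 0, -1>();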
+
+/*****************************************************************************
+ *
+ *          selectb function
+ *
+ *****************************************************************************/
+// Select between two sources, byte by byte. Used in various functions and operators
+// Corresponds to this pseudocode:
+// for (int i = 0; i < 32; i++) result[i] = s[i] ? a[i] : b[i];
+// Each byte in s must be either 0 (false) or 0xFF (true). No other values are allowed.
+// Only bit 7 in each byte of s is checked.
+static inline Vec256ie selectb(Vec256ie const &s, Vec256ie const &a, Vec256ie const &b)
+{
+  return Vec256ie(selectb(s.get_low(), a.get_low(), b.get_low()), selectb(s.get_high(), a.get_high(), b.get_high()));
+}
+
+/*****************************************************************************
+ *
+ *          Horizontal Boolean functions
+ *
+ *****************************************************************************/
+
+// horizontal_and. Returns true if all bits are 1
+static inline bool horizontal_and(Vec256b const &a) { return horizontal_and(a.get_low() & a.get_high()); }
+
+// horizontal_or. Returns true if at least one bit is 1
+static inline bool horizontal_or(Vec256b const &a) { return horizontal_or(a.get_low() | a.get_high()); }
+
+/*****************************************************************************
+ *
+ *          Vector of 32 8-bit signed integers
+ *
+ *****************************************************************************/
+
+class Vec32c : public Vec256b
+{
+ public:
+  // Default constructor:
+  Vec32c() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec32c(int i) { y1 = y0 = _mm_set1_epi8((char)i); }
+  // Constructor to build from all elements:
+  Vec32c(int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6, int8_t i7, int8_t i8, int8_t i9, int8_t i10,
+         int8_t i11, int8_t i12, int8_t i13, int8_t i14, int8_t i15, int8_t i16, int8_t i17, int8_t i18, int8_t i19, int8_t i20,
+         int8_t i21, int8_t i22, int8_t i23, int8_t i24, int8_t i25, int8_t i26, int8_t i27, int8_t i28, int8_t i29, int8_t i30,
+         int8_t i31)
+  {
+    y0 = _mm_setr_epi8(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15);
+    y1 = _mm_setr_epi8(i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31);
+  }
+  // Constructor to build from two Vec16c:
+  Vec32c(Vec16c const &a0, Vec16c const &a1)
+  {
+    y0 = a0;
+    y1 = a1;
+  }
+  // Constructor to convert from type Vec256ie
+  Vec32c(Vec256ie const &x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+  }
+  // Assignment operator to convert from type Vec256ie
+  Vec32c &operator=(Vec256ie const &x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+    return *this;
+  }
+  // Member function to load from array (unaligned)
+  Vec32c &load(void const *p)
+  {
+    y0 = _mm_loadu_si128((__m128i const *)p);
+    y1 = _mm_loadu_si128((__m128i const *)p + 1);
+    return *this;
+  }
+  // Member function to load from array, aligned by 32
+  Vec32c &load_a(void const *p)
+  {
+    y0 = _mm_load_si128((__m128i const *)p);
+    y1 = _mm_load_si128((__m128i const *)p + 1);
+    return *this;
+  }
+  // Partial load. Load n elements and set the rest to 0
+  Vec32c &load_partial(int n, void const *p)
+  {
+    if(n <= 0)
+      {
+        *this = 0;
+      }
+    else if(n <= 16)
+      {
+        *this = Vec32c(Vec16c().load_partial(n, p), 0);
+      }
+    else if(n < 32)
+      {
+        *this = Vec32c(Vec16c().load(p), Vec16c().load_partial(n - 16, (char const *)p + 16));
+      }
+    else
+      {
+        load(p);
+      }
+    return *this;
+  }
+  // Partial store. Store n elements
+  void store_partial(int n, void *p) const
+  {
+    if(n <= 0)
+      {
+        return;
+      }
+    else if(n <= 16)
+      {
+        get_low().store_partial(n, p);
+      }
+    else if(n < 32)
+      {
+        get_low().store(p);
+        get_high().store_partial(n - 16, (char *)p + 16);
+      }
+    else
+      {
+        store(p);
+      }
+  }
+  // cut off vector to n elements. The last 32-n elements are set to zero
+  Vec32c &cutoff(int n)
+  {
+    if(uint32_t(n) >= 32)
+      return *this;
+    static const union
+    {
+      int32_t i[16];
+      char c[64];
+    } mask = {{-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}};
+    *this &= Vec32c().load(mask.c + 32 - n);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec32c const &insert(uint32_t index, int8_t value)
+  {
+    if(index < 16)
+      {
+        y0 = Vec16c(y0).insert(index, value);
+      }
+    else
+      {
+        y1 = Vec16c(y1).insert(index - 16, value);
+      }
+    return *this;
+  }
+  // Member function extract a single element from vector
+  int8_t extract(uint32_t index) const
+  {
+    if(index < 16)
+      {
+        return Vec16c(y0).extract(index);
+      }
+    else
+      {
+        return Vec16c(y1).extract(index - 16);
+      }
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  int8_t operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec16c:
+  Vec16c get_low() const { return y0; }
+  Vec16c get_high() const { return y1; }
+  static int size() { return 32; }
+};
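+
+// Usage sketch (illustrative only): partial load/store handle array tails that are not a
+// multiple of the vector length; in this emulated version each half is a 128-bit vector.
+//   int8_t tail[20];                   // the last 20 bytes of a longer array
+//   Vec32c v;
+//   v.load_partial(20, tail);          // elements 0..19 loaded, elements 20..31 set to zero
+//   v.store_partial(20, tail);         // write back only the 20 valid elements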
+
+/*****************************************************************************
+ *
+ *          Vec32cb: Vector of 32 Booleans for use with Vec32c and Vec32uc
+ *
+ *****************************************************************************/
+
+class Vec32cb : public Vec32c
+{
+ public:
+  // Default constructor:
+  Vec32cb() {}
+  // Constructor to build from all elements:
+  Vec32cb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7, bool x8, bool x9, bool x10, bool x11, bool x12,
+          bool x13, bool x14, bool x15, bool x16, bool x17, bool x18, bool x19, bool x20, bool x21, bool x22, bool x23, bool x24,
+          bool x25, bool x26, bool x27, bool x28, bool x29, bool x30, bool x31)
+      : Vec32c(-int8_t(x0), -int8_t(x1), -int8_t(x2), -int8_t(x3), -int8_t(x4), -int8_t(x5), -int8_t(x6), -int8_t(x7), -int8_t(x8),
+               -int8_t(x9), -int8_t(x10), -int8_t(x11), -int8_t(x12), -int8_t(x13), -int8_t(x14), -int8_t(x15), -int8_t(x16),
+               -int8_t(x17), -int8_t(x18), -int8_t(x19), -int8_t(x20), -int8_t(x21), -int8_t(x22), -int8_t(x23), -int8_t(x24),
+               -int8_t(x25), -int8_t(x26), -int8_t(x27), -int8_t(x28), -int8_t(x29), -int8_t(x30), -int8_t(x31))
+  {
+  }
+  // Constructor to convert from type Vec256ie
+  Vec32cb(Vec256ie const &x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+  }
+  // Assignment operator to convert from type Vec256ie
+  Vec32cb &operator=(Vec256ie const &x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+    return *this;
+  }
+  // Constructor to broadcast scalar value:
+  Vec32cb(bool b) : Vec32c(-int8_t(b)) {}
+  // Assignment operator to broadcast scalar value:
+  Vec32cb &operator=(bool b)
+  {
+    *this = Vec32cb(b);
+    return *this;
+  }
+
+ private:  // Prevent constructing from int, etc.
+  Vec32cb(int b);
+  Vec32cb &operator=(int x);
+
+ public:
+  // Member functions to split into two Vec16c:
+  Vec16cb get_low() const { return y0; }
+  Vec16cb get_high() const { return y1; }
+  Vec32cb &insert(int index, bool a)
+  {
+    Vec32c::insert(index, -(int)a);
+    return *this;
+  }
+  // Member function extract a single element from vector
+  bool extract(uint32_t index) const { return Vec32c::extract(index) != 0; }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  bool operator[](uint32_t index) const { return extract(index); }
+};
+
+/*****************************************************************************
+ *
+ *          Define operators for Vec32cb
+ *
+ *****************************************************************************/
+
+// vector operator & : bitwise and
+static inline Vec32cb operator&(Vec32cb const &a, Vec32cb const &b) { return Vec32cb(Vec256b(a) & Vec256b(b)); }
+static inline Vec32cb operator&&(Vec32cb const &a, Vec32cb const &b) { return a & b; }
+// vector operator &= : bitwise and
+static inline Vec32cb &operator&=(Vec32cb &a, Vec32cb const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec32cb operator|(Vec32cb const &a, Vec32cb const &b) { return Vec32cb(Vec256b(a) | Vec256b(b)); }
+static inline Vec32cb operator||(Vec32cb const &a, Vec32cb const &b) { return a | b; }
+// vector operator |= : bitwise or
+static inline Vec32cb &operator|=(Vec32cb &a, Vec32cb const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec32cb operator^(Vec32cb const &a, Vec32cb const &b) { return Vec32cb(Vec256b(a) ^ Vec256b(b)); }
+// vector operator ^= : bitwise xor
+static inline Vec32cb &operator^=(Vec32cb &a, Vec32cb const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec32cb operator~(Vec32cb const &a) { return Vec32cb(~Vec256b(a)); }
+
+// vector operator ! : element not
+static inline Vec32cb operator!(Vec32cb const &a) { return ~a; }
+
+// vector function andnot
+static inline Vec32cb andnot(Vec32cb const &a, Vec32cb const &b) { return Vec32cb(andnot(Vec256b(a), Vec256b(b))); }
+
+/*****************************************************************************
+ *
+ *          Operators for Vec32c
+ *
+ *****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec32c operator+(Vec32c const &a, Vec32c const &b)
+{
+  return Vec32c(a.get_low() + b.get_low(), a.get_high() + b.get_high());
+}
+
+// vector operator += : add
+static inline Vec32c &operator+=(Vec32c &a, Vec32c const &b)
+{
+  a = a + b;
+  return a;
+}
+
+// postfix operator ++
+static inline Vec32c operator++(Vec32c &a, int)
+{
+  Vec32c a0 = a;
+  a         = a + 1;
+  return a0;
+}
+
+// prefix operator ++
+static inline Vec32c &operator++(Vec32c &a)
+{
+  a = a + 1;
+  return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec32c operator-(Vec32c const &a, Vec32c const &b)
+{
+  return Vec32c(a.get_low() - b.get_low(), a.get_high() - b.get_high());
+}
+
+// vector operator - : unary minus
+static inline Vec32c operator-(Vec32c const &a) { return Vec32c(-a.get_low(), -a.get_high()); }
+
+// vector operator -= : subtract
+static inline Vec32c &operator-=(Vec32c &a, Vec32c const &b)
+{
+  a = a - b;
+  return a;
+}
+
+// postfix operator --
+static inline Vec32c operator--(Vec32c &a, int)
+{
+  Vec32c a0 = a;
+  a         = a - 1;
+  return a0;
+}
+
+// prefix operator --
+static inline Vec32c &operator--(Vec32c &a)
+{
+  a = a - 1;
+  return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec32c operator*(Vec32c const &a, Vec32c const &b)
+{
+  return Vec32c(a.get_low() * b.get_low(), a.get_high() * b.get_high());
+}
+
+// vector operator *= : multiply
+static inline Vec32c &operator*=(Vec32c &a, Vec32c const &b)
+{
+  a = a * b;
+  return a;
+}
+
+// vector of 32 8-bit signed integers
+static inline Vec32c operator/(Vec32c const &a, Divisor_s const &d) { return Vec32c(a.get_low() / d, a.get_high() / d); }
+
+// vector operator /= : divide
+static inline Vec32c &operator/=(Vec32c &a, Divisor_s const &d)
+{
+  a = a / d;
+  return a;
+}
+
+// vector operator << : shift left all elements
+static inline Vec32c operator<<(Vec32c const &a, int b) { return Vec32c(a.get_low() << b, a.get_high() << b); }
+
+// vector operator <<= : shift left
+static inline Vec32c &operator<<=(Vec32c &a, int b)
+{
+  a = a << b;
+  return a;
+}
+
+// vector operator >> : shift right arithmetic all elements
+static inline Vec32c operator>>(Vec32c const &a, int b) { return Vec32c(a.get_low() >> b, a.get_high() >> b); }
+
+// vector operator >>= : shift right arithmetic
+static inline Vec32c &operator>>=(Vec32c &a, int b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec32cb operator==(Vec32c const &a, Vec32c const &b)
+{
+  return Vec32c(a.get_low() == b.get_low(), a.get_high() == b.get_high());
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec32cb operator!=(Vec32c const &a, Vec32c const &b)
+{
+  return Vec32c(a.get_low() != b.get_low(), a.get_high() != b.get_high());
+}
+
+// vector operator > : returns true for elements for which a > b (signed)
+static inline Vec32cb operator>(Vec32c const &a, Vec32c const &b)
+{
+  return Vec32c(a.get_low() > b.get_low(), a.get_high() > b.get_high());
+}
+
+// vector operator < : returns true for elements for which a < b (signed)
+static inline Vec32cb operator<(Vec32c const &a, Vec32c const &b) { return b > a; }
+
+// vector operator >= : returns true for elements for which a >= b (signed)
+static inline Vec32cb operator>=(Vec32c const &a, Vec32c const &b)
+{
+  return Vec32c(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
+}
+
+// vector operator <= : returns true for elements for which a <= b (signed)
+static inline Vec32cb operator<=(Vec32c const &a, Vec32c const &b) { return b >= a; }
+
+// vector operator & : bitwise and
+static inline Vec32c operator&(Vec32c const &a, Vec32c const &b)
+{
+  return Vec32c(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+static inline Vec32c operator&&(Vec32c const &a, Vec32c const &b) { return a & b; }
+// vector operator &= : bitwise and
+static inline Vec32c &operator&=(Vec32c &a, Vec32c const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec32c operator|(Vec32c const &a, Vec32c const &b)
+{
+  return Vec32c(a.get_low() | b.get_low(), a.get_high() | b.get_high());
+}
+static inline Vec32c operator||(Vec32c const &a, Vec32c const &b) { return a | b; }
+// vector operator |= : bitwise or
+static inline Vec32c &operator|=(Vec32c &a, Vec32c const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec32c operator^(Vec32c const &a, Vec32c const &b)
+{
+  return Vec32c(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
+}
+// vector operator ^= : bitwise xor
+static inline Vec32c &operator^=(Vec32c &a, Vec32c const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec32c operator~(Vec32c const &a) { return Vec32c(~a.get_low(), ~a.get_high()); }
+
+// vector operator ! : logical not, returns true for elements == 0
+static inline Vec32cb operator!(Vec32c const &a) { return Vec32c(!a.get_low(), !a.get_high()); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 32; i++) result[i] = s[i] ? a[i] : b[i];
+// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed.
+static inline Vec32c select(Vec32cb const &s, Vec32c const &a, Vec32c const &b) { return selectb(s, a, b); }
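+
+// Usage sketch (illustrative only): given some Vec32c v, clamp negative elements to zero
+// (same result as max(v, Vec32c(0)), shown here to illustrate the mask/select idiom):
+//   Vec32c clamped = select(v > Vec32c(0), v, Vec32c(0));   // clamped[i] = v[i] > 0 ? v[i] : 0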
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec32c if_add(Vec32cb const &f, Vec32c const &a, Vec32c const &b) { return a + (Vec32c(f) & b); }
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline uint32_t horizontal_add(Vec32c const &a) { return horizontal_add(a.get_low() + a.get_high()); }
+
+// Horizontal add extended: Calculates the sum of all vector elements.
+// Each element is sign-extended before addition to avoid overflow
+static inline int32_t horizontal_add_x(Vec32c const &a) { return horizontal_add_x(a.get_low()) + horizontal_add_x(a.get_high()); }
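+
+// Usage sketch (illustrative only): with 32 elements of value 100 the true sum (3200) does not fit
+// in 8 bits, so the sign-extending variant is needed:
+//   Vec32c  c(100);                    // all 32 elements equal 100
+//   int32_t s = horizontal_add_x(c);   // 3200; horizontal_add(c) would wrap around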
+
+// function add_saturated: add element by element, signed with saturation
+static inline Vec32c add_saturated(Vec32c const &a, Vec32c const &b)
+{
+  return Vec32c(add_saturated(a.get_low(), b.get_low()), add_saturated(a.get_high(), b.get_high()));
+}
+
+// function sub_saturated: subtract element by element, signed with saturation
+static inline Vec32c sub_saturated(Vec32c const &a, Vec32c const &b)
+{
+  return Vec32c(sub_saturated(a.get_low(), b.get_low()), sub_saturated(a.get_high(), b.get_high()));
+}
+
+// function max: a > b ? a : b
+static inline Vec32c max(Vec32c const &a, Vec32c const &b)
+{
+  return Vec32c(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high()));
+}
+
+// function min: a < b ? a : b
+static inline Vec32c min(Vec32c const &a, Vec32c const &b)
+{
+  return Vec32c(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high()));
+}
+
+// function abs: a >= 0 ? a : -a
+static inline Vec32c abs(Vec32c const &a) { return Vec32c(abs(a.get_low()), abs(a.get_high())); }
+
+// function abs_saturated: same as abs, saturate if overflow
+static inline Vec32c abs_saturated(Vec32c const &a) { return Vec32c(abs_saturated(a.get_low()), abs_saturated(a.get_high())); }
+
+// function rotate_left all elements
+// Use negative count to rotate right
+static inline Vec32c rotate_left(Vec32c const &a, int b) { return Vec32c(rotate_left(a.get_low(), b), rotate_left(a.get_high(), b)); }
+
+/*****************************************************************************
+ *
+ *          Vector of 32 8-bit unsigned integers
+ *
+ *****************************************************************************/
+
+class Vec32uc : public Vec32c
+{
+ public:
+  // Default constructor:
+  Vec32uc() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec32uc(uint32_t i) { y1 = y0 = _mm_set1_epi8((char)i); }
+  // Constructor to build from all elements:
+  Vec32uc(uint8_t i0, uint8_t i1, uint8_t i2, uint8_t i3, uint8_t i4, uint8_t i5, uint8_t i6, uint8_t i7, uint8_t i8, uint8_t i9,
+          uint8_t i10, uint8_t i11, uint8_t i12, uint8_t i13, uint8_t i14, uint8_t i15, uint8_t i16, uint8_t i17, uint8_t i18,
+          uint8_t i19, uint8_t i20, uint8_t i21, uint8_t i22, uint8_t i23, uint8_t i24, uint8_t i25, uint8_t i26, uint8_t i27,
+          uint8_t i28, uint8_t i29, uint8_t i30, uint8_t i31)
+  {
+    y0 = _mm_setr_epi8(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15);
+    y1 = _mm_setr_epi8(i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31);
+  }
+  // Constructor to build from two Vec16uc:
+  Vec32uc(Vec16uc const &a0, Vec16uc const &a1)
+  {
+    y0 = a0;
+    y1 = a1;
+  }
+  // Constructor to convert from type Vec256ie
+  Vec32uc(Vec256ie const &x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+  }
+  // Assignment operator to convert from type Vec256ie
+  Vec32uc &operator=(Vec256ie const &x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+    return *this;
+  }
+  // Member function to load from array (unaligned)
+  Vec32uc &load(void const *p)
+  {
+    y0 = _mm_loadu_si128((__m128i const *)p);
+    y1 = _mm_loadu_si128((__m128i const *)p + 1);
+    return *this;
+  }
+  // Member function to load from array, aligned by 32
+  Vec32uc &load_a(void const *p)
+  {
+    y0 = _mm_load_si128((__m128i const *)p);
+    y1 = _mm_load_si128((__m128i const *)p + 1);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec32uc const &insert(uint32_t index, uint8_t value)
+  {
+    Vec32c::insert(index, value);
+    return *this;
+  }
+  // Member function extract a single element from vector
+  uint8_t extract(uint32_t index) const { return Vec32c::extract(index); }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  uint8_t operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec16uc:
+  Vec16uc get_low() const { return y0; }
+  Vec16uc get_high() const { return y1; }
+};
+
+// Define operators for this class
+
+// vector operator + : add
+static inline Vec32uc operator+(Vec32uc const &a, Vec32uc const &b)
+{
+  return Vec32uc(a.get_low() + b.get_low(), a.get_high() + b.get_high());
+}
+
+// vector operator - : subtract
+static inline Vec32uc operator-(Vec32uc const &a, Vec32uc const &b)
+{
+  return Vec32uc(a.get_low() - b.get_low(), a.get_high() - b.get_high());
+}
+
+// vector operator * : multiply
+static inline Vec32uc operator*(Vec32uc const &a, Vec32uc const &b)
+{
+  return Vec32uc(a.get_low() * b.get_low(), a.get_high() * b.get_high());
+}
+
+// vector operator / : divide
+static inline Vec32uc operator/(Vec32uc const &a, Divisor_us const &d) { return Vec32uc(a.get_low() / d, a.get_high() / d); }
+
+// vector operator /= : divide
+static inline Vec32uc &operator/=(Vec32uc &a, Divisor_us const &d)
+{
+  a = a / d;
+  return a;
+}
+
+// vector operator << : shift left all elements
+static inline Vec32uc operator<<(Vec32uc const &a, uint32_t b) { return Vec32uc(a.get_low() << b, a.get_high() << b); }
+
+// vector operator << : shift left all elements
+static inline Vec32uc operator<<(Vec32uc const &a, int32_t b) { return a << (uint32_t)b; }
+
+// vector operator >> : shift right logical all elements
+static inline Vec32uc operator>>(Vec32uc const &a, uint32_t b) { return Vec32uc(a.get_low() >> b, a.get_high() >> b); }
+
+// vector operator >> : shift right logical all elements
+static inline Vec32uc operator>>(Vec32uc const &a, int32_t b) { return a >> (uint32_t)b; }
+
+// vector operator >>= : shift right logical
+static inline Vec32uc &operator>>=(Vec32uc &a, uint32_t b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator >= : returns true for elements for which a >= b (unsigned)
+static inline Vec32cb operator>=(Vec32uc const &a, Vec32uc const &b)
+{
+  return Vec32c(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
+}
+
+// vector operator <= : returns true for elements for which a <= b (unsigned)
+static inline Vec32cb operator<=(Vec32uc const &a, Vec32uc const &b) { return b >= a; }
+
+// vector operator > : returns true for elements for which a > b (unsigned)
+static inline Vec32cb operator>(Vec32uc const &a, Vec32uc const &b)
+{
+  return Vec32c(a.get_low() > b.get_low(), a.get_high() > b.get_high());
+}
+
+// vector operator < : returns true for elements for which a < b (unsigned)
+static inline Vec32cb operator<(Vec32uc const &a, Vec32uc const &b) { return b > a; }
+
+// vector operator & : bitwise and
+static inline Vec32uc operator&(Vec32uc const &a, Vec32uc const &b)
+{
+  return Vec32uc(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+static inline Vec32uc operator&&(Vec32uc const &a, Vec32uc const &b) { return a & b; }
+
+// vector operator | : bitwise or
+static inline Vec32uc operator|(Vec32uc const &a, Vec32uc const &b)
+{
+  return Vec32uc(a.get_low() | b.get_low(), a.get_high() | b.get_high());
+}
+static inline Vec32uc operator||(Vec32uc const &a, Vec32uc const &b) { return a | b; }
+
+// vector operator ^ : bitwise xor
+static inline Vec32uc operator^(Vec32uc const &a, Vec32uc const &b)
+{
+  return Vec32uc(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
+}
+
+// vector operator ~ : bitwise not
+static inline Vec32uc operator~(Vec32uc const &a) { return Vec32uc(~a.get_low(), ~a.get_high()); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 32; i++) result[i] = s[i] ? a[i] : b[i];
+// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec32uc select(Vec32cb const &s, Vec32uc const &a, Vec32uc const &b) { return selectb(s, a, b); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec32uc if_add(Vec32cb const &f, Vec32uc const &a, Vec32uc const &b) { return a + (Vec32uc(f) & b); }
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+// (Note: horizontal_add_x(Vec32uc) is slightly faster)
+static inline uint32_t horizontal_add(Vec32uc const &a) { return horizontal_add(a.get_low() + a.get_high()); }
+
+// Horizontal add extended: Calculates the sum of all vector elements.
+// Each element is zero-extended before addition to avoid overflow
+static inline uint32_t horizontal_add_x(Vec32uc const &a) { return horizontal_add_x(a.get_low()) + horizontal_add_x(a.get_high()); }
+
+// function add_saturated: add element by element, unsigned with saturation
+static inline Vec32uc add_saturated(Vec32uc const &a, Vec32uc const &b)
+{
+  return Vec32uc(add_saturated(a.get_low(), b.get_low()), add_saturated(a.get_high(), b.get_high()));
+}
+
+// function sub_saturated: subtract element by element, unsigned with saturation
+static inline Vec32uc sub_saturated(Vec32uc const &a, Vec32uc const &b)
+{
+  return Vec32uc(sub_saturated(a.get_low(), b.get_low()), sub_saturated(a.get_high(), b.get_high()));
+}
+
+// function max: a > b ? a : b
+static inline Vec32uc max(Vec32uc const &a, Vec32uc const &b)
+{
+  return Vec32uc(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high()));
+}
+
+// function min: a < b ? a : b
+static inline Vec32uc min(Vec32uc const &a, Vec32uc const &b)
+{
+  return Vec32uc(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high()));
+}
+
+/*****************************************************************************
+ *
+ *          Vector of 16 16-bit signed integers
+ *
+ *****************************************************************************/
+
+class Vec16s : public Vec256b
+{
+ public:
+  // Default constructor:
+  Vec16s() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec16s(int i) { y1 = y0 = _mm_set1_epi16((int16_t)i); }
+  // Constructor to build from all elements:
+  Vec16s(int16_t i0, int16_t i1, int16_t i2, int16_t i3, int16_t i4, int16_t i5, int16_t i6, int16_t i7, int16_t i8, int16_t i9,
+         int16_t i10, int16_t i11, int16_t i12, int16_t i13, int16_t i14, int16_t i15)
+  {
+    y0 = _mm_setr_epi16(i0, i1, i2, i3, i4, i5, i6, i7);
+    y1 = _mm_setr_epi16(i8, i9, i10, i11, i12, i13, i14, i15);
+  }
+  // Constructor to build from two Vec8s:
+  Vec16s(Vec8s const &a0, Vec8s const &a1)
+  {
+    y0 = a0;
+    y1 = a1;
+  }
+  // Constructor to convert from type Vec256ie
+  Vec16s(Vec256ie const &x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+  }
+  // Assignment operator to convert from type Vec256ie
+  Vec16s &operator=(Vec256ie const &x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+    return *this;
+  }
+  // Member function to load from array (unaligned)
+  Vec16s &load(void const *p)
+  {
+    y0 = _mm_loadu_si128((__m128i const *)p);
+    y1 = _mm_loadu_si128((__m128i const *)p + 1);
+    return *this;
+  }
+  // Member function to load from array, aligned by 32
+  Vec16s &load_a(void const *p)
+  {
+    y0 = _mm_load_si128((__m128i const *)p);
+    y1 = _mm_load_si128((__m128i const *)p + 1);
+    return *this;
+  }
+  // Partial load. Load n elements and set the rest to 0
+  Vec16s &load_partial(int n, void const *p)
+  {
+    if(n <= 0)
+      {
+        *this = 0;
+      }
+    else if(n <= 8)
+      {
+        *this = Vec16s(Vec8s().load_partial(n, p), 0);
+      }
+    else if(n < 16)
+      {
+        *this = Vec16s(Vec8s().load(p), Vec8s().load_partial(n - 8, (int16_t const *)p + 8));
+      }
+    else
+      {
+        load(p);
+      }
+    return *this;
+  }
+  // Partial store. Store n elements
+  void store_partial(int n, void *p) const
+  {
+    if(n <= 0)
+      {
+        return;
+      }
+    else if(n <= 8)
+      {
+        get_low().store_partial(n, p);
+      }
+    else if(n < 16)
+      {
+        get_low().store(p);
+        get_high().store_partial(n - 8, (int16_t *)p + 8);
+      }
+    else
+      {
+        store(p);
+      }
+  }
+  // cut off vector to n elements. The last 16-n elements are set to zero
+  Vec16s &cutoff(int n)
+  {
+    *this = Vec32c(*this).cutoff(n * 2);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec16s const &insert(uint32_t index, int16_t value)
+  {
+    if(index < 8)
+      {
+        y0 = Vec8s(y0).insert(index, value);
+      }
+    else
+      {
+        y1 = Vec8s(y1).insert(index - 8, value);
+      }
+    return *this;
+  }
+  // Member function to extract a single element from vector
+  int16_t extract(uint32_t index) const
+  {
+    if(index < 8)
+      {
+        return Vec8s(y0).extract(index);
+      }
+    else
+      {
+        return Vec8s(y1).extract(index - 8);
+      }
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  int16_t operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec8s:
+  Vec8s get_low() const { return y0; }
+  Vec8s get_high() const { return y1; }
+  static int size() { return 16; }
+};
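+
+// Illustrative usage sketch (not part of the original header): constructing a
+// Vec16s and doing a partial load/store. Assumes the 128-bit types it builds
+// on (Vec8s etc.) are available; buffer names are hypothetical.
+//
+//   int16_t src[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+//   Vec16s v;
+//   v.load_partial(10, src);        // elements 10..15 are set to 0
+//   int16_t dst[10];
+//   v.store_partial(10, dst);       // writes back only the first 10 elements
+//   int16_t e = v[3];               // single-element read, e == 3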
+
+/*****************************************************************************
+ *
+ *          Vec16sb: Vector of 16 Booleans for use with Vec16s and Vec16us
+ *
+ *****************************************************************************/
+
+class Vec16sb : public Vec16s
+{
+ public:
+  // Default constructor:
+  Vec16sb() {}
+  // Constructor to build from all elements:
+  Vec16sb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7, bool x8, bool x9, bool x10, bool x11, bool x12,
+          bool x13, bool x14, bool x15)
+      : Vec16s(-int16_t(x0), -int16_t(x1), -int16_t(x2), -int16_t(x3), -int16_t(x4), -int16_t(x5), -int16_t(x6), -int16_t(x7),
+               -int16_t(x8), -int16_t(x9), -int16_t(x10), -int16_t(x11), -int16_t(x12), -int16_t(x13), -int16_t(x14), -int16_t(x15))
+  {
+  }
+  // Constructor to convert from type Vec256ie
+  Vec16sb(Vec256ie const &x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+  }
+  // Assignment operator to convert from type Vec256ie
+  Vec16sb &operator=(Vec256ie const &x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+    return *this;
+  }
+  // Constructor to broadcast scalar value:
+  Vec16sb(bool b) : Vec16s(-int16_t(b)) {}
+  // Assignment operator to broadcast scalar value:
+  Vec16sb &operator=(bool b)
+  {
+    *this = Vec16sb(b);
+    return *this;
+  }
+
+ private:  // Prevent constructing from int, etc.
+  Vec16sb(int b);
+  Vec16sb &operator=(int x);
+
+ public:
+  // Member functions to split into two Vec8s:
+  Vec8sb get_low() const { return y0; }
+  Vec8sb get_high() const { return y1; }
+  Vec16sb &insert(int index, bool a)
+  {
+    Vec16s::insert(index, -(int)a);
+    return *this;
+  }
+  // Member function to extract a single element from vector
+  bool extract(uint32_t index) const { return Vec16s::extract(index) != 0; }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  bool operator[](uint32_t index) const { return extract(index); }
+};
+
+/*****************************************************************************
+ *
+ *          Define operators for Vec16sb
+ *
+ *****************************************************************************/
+
+// vector operator & : bitwise and
+static inline Vec16sb operator&(Vec16sb const &a, Vec16sb const &b) { return Vec16sb(Vec256b(a) & Vec256b(b)); }
+static inline Vec16sb operator&&(Vec16sb const &a, Vec16sb const &b) { return a & b; }
+// vector operator &= : bitwise and
+static inline Vec16sb &operator&=(Vec16sb &a, Vec16sb const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec16sb operator|(Vec16sb const &a, Vec16sb const &b) { return Vec16sb(Vec256b(a) | Vec256b(b)); }
+static inline Vec16sb operator||(Vec16sb const &a, Vec16sb const &b) { return a | b; }
+// vector operator |= : bitwise or
+static inline Vec16sb &operator|=(Vec16sb &a, Vec16sb const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec16sb operator^(Vec16sb const &a, Vec16sb const &b) { return Vec16sb(Vec256b(a) ^ Vec256b(b)); }
+// vector operator ^= : bitwise xor
+static inline Vec16sb &operator^=(Vec16sb &a, Vec16sb const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec16sb operator~(Vec16sb const &a) { return Vec16sb(~Vec256b(a)); }
+
+// vector operator ! : element not
+static inline Vec16sb operator!(Vec16sb const &a) { return ~a; }
+
+// vector function andnot
+static inline Vec16sb andnot(Vec16sb const &a, Vec16sb const &b) { return Vec16sb(andnot(Vec256b(a), Vec256b(b))); }
+
+/*****************************************************************************
+ *
+ *          Operators for Vec16s
+ *
+ *****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec16s operator+(Vec16s const &a, Vec16s const &b)
+{
+  return Vec16s(a.get_low() + b.get_low(), a.get_high() + b.get_high());
+}
+
+// vector operator += : add
+static inline Vec16s &operator+=(Vec16s &a, Vec16s const &b)
+{
+  a = a + b;
+  return a;
+}
+
+// postfix operator ++
+static inline Vec16s operator++(Vec16s &a, int)
+{
+  Vec16s a0 = a;
+  a         = a + 1;
+  return a0;
+}
+
+// prefix operator ++
+static inline Vec16s &operator++(Vec16s &a)
+{
+  a = a + 1;
+  return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec16s operator-(Vec16s const &a, Vec16s const &b)
+{
+  return Vec16s(a.get_low() - b.get_low(), a.get_high() - b.get_high());
+}
+
+// vector operator - : unary minus
+static inline Vec16s operator-(Vec16s const &a) { return Vec16s(-a.get_low(), -a.get_high()); }
+
+// vector operator -= : subtract
+static inline Vec16s &operator-=(Vec16s &a, Vec16s const &b)
+{
+  a = a - b;
+  return a;
+}
+
+// postfix operator --
+static inline Vec16s operator--(Vec16s &a, int)
+{
+  Vec16s a0 = a;
+  a         = a - 1;
+  return a0;
+}
+
+// prefix operator --
+static inline Vec16s &operator--(Vec16s &a)
+{
+  a = a - 1;
+  return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec16s operator*(Vec16s const &a, Vec16s const &b)
+{
+  return Vec16s(a.get_low() * b.get_low(), a.get_high() * b.get_high());
+}
+
+// vector operator *= : multiply
+static inline Vec16s &operator*=(Vec16s &a, Vec16s const &b)
+{
+  a = a * b;
+  return a;
+}
+
+// vector operator / : divide all elements by same integer
+static inline Vec16s operator/(Vec16s const &a, Divisor_s const &d) { return Vec16s(a.get_low() / d, a.get_high() / d); }
+
+// vector operator /= : divide
+static inline Vec16s &operator/=(Vec16s &a, Divisor_s const &d)
+{
+  a = a / d;
+  return a;
+}
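+
+// Illustrative sketch (not part of the original header): dividing all elements
+// by the same integer. Divisor_s precomputes the reciprocal once and is
+// assumed here to be constructible from a plain int, as in the 128-bit header.
+//
+//   Vec16s v(100);
+//   Divisor_s d(7);                 // set up the divisor once
+//   v /= d;                         // every element becomes 100 / 7 = 14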
+
+// vector operator << : shift left
+static inline Vec16s operator<<(Vec16s const &a, int b) { return Vec16s(a.get_low() << b, a.get_high() << b); }
+
+// vector operator <<= : shift left
+static inline Vec16s &operator<<=(Vec16s &a, int b)
+{
+  a = a << b;
+  return a;
+}
+
+// vector operator >> : shift right arithmetic
+static inline Vec16s operator>>(Vec16s const &a, int b) { return Vec16s(a.get_low() >> b, a.get_high() >> b); }
+
+// vector operator >>= : shift right arithmetic
+static inline Vec16s &operator>>=(Vec16s &a, int b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec16sb operator==(Vec16s const &a, Vec16s const &b)
+{
+  return Vec16s(a.get_low() == b.get_low(), a.get_high() == b.get_high());
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec16sb operator!=(Vec16s const &a, Vec16s const &b)
+{
+  return Vec16s(a.get_low() != b.get_low(), a.get_high() != b.get_high());
+}
+
+// vector operator > : returns true for elements for which a > b
+static inline Vec16sb operator>(Vec16s const &a, Vec16s const &b)
+{
+  return Vec16s(a.get_low() > b.get_low(), a.get_high() > b.get_high());
+}
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec16sb operator<(Vec16s const &a, Vec16s const &b) { return b > a; }
+
+// vector operator >= : returns true for elements for which a >= b (signed)
+static inline Vec16sb operator>=(Vec16s const &a, Vec16s const &b)
+{
+  return Vec16s(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
+}
+
+// vector operator <= : returns true for elements for which a <= b (signed)
+static inline Vec16sb operator<=(Vec16s const &a, Vec16s const &b) { return b >= a; }
+
+// vector operator & : bitwise and
+static inline Vec16s operator&(Vec16s const &a, Vec16s const &b)
+{
+  return Vec16s(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+static inline Vec16s operator&&(Vec16s const &a, Vec16s const &b) { return a & b; }
+// vector operator &= : bitwise and
+static inline Vec16s &operator&=(Vec16s &a, Vec16s const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec16s operator|(Vec16s const &a, Vec16s const &b)
+{
+  return Vec16s(a.get_low() | b.get_low(), a.get_high() | b.get_high());
+}
+static inline Vec16s operator||(Vec16s const &a, Vec16s const &b) { return a | b; }
+// vector operator |= : bitwise or
+static inline Vec16s &operator|=(Vec16s &a, Vec16s const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec16s operator^(Vec16s const &a, Vec16s const &b)
+{
+  return Vec16s(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
+}
+// vector operator ^= : bitwise xor
+static inline Vec16s &operator^=(Vec16s &a, Vec16s const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec16s operator~(Vec16s const &a) { return Vec16s(~Vec256b(a)); }
+
+// vector operator ! : logical not, returns true for elements == 0
+static inline Vec16sb operator!(Vec16s const &a) { return Vec16s(!a.get_low(), !a.get_high()); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
+// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec16s select(Vec16sb const &s, Vec16s const &a, Vec16s const &b) { return selectb(s, a, b); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec16s if_add(Vec16sb const &f, Vec16s const &a, Vec16s const &b) { return a + (Vec16s(f) & b); }
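+
+// Illustrative sketch (not part of the original header): using the comparison
+// operators together with select and if_add. Variable names are hypothetical.
+//
+//   Vec16s a(3), b(5);
+//   Vec16sb m  = a > b;                    // per-element mask, all false here
+//   Vec16s  hi = select(m, a, b);          // element-wise maximum of a and b
+//   Vec16s  c  = if_add(m, b, Vec16s(1));  // adds 1 only where the mask is true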
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline int32_t horizontal_add(Vec16s const &a) { return horizontal_add(a.get_low() + a.get_high()); }
+
+// Horizontal add extended: Calculates the sum of all vector elements.
+// Elements are sign extended before adding to avoid overflow
+static inline int32_t horizontal_add_x(Vec16s const &a) { return horizontal_add_x(a.get_low()) + horizontal_add_x(a.get_high()); }
+
+// function add_saturated: add element by element, signed with saturation
+static inline Vec16s add_saturated(Vec16s const &a, Vec16s const &b)
+{
+  return Vec16s(add_saturated(a.get_low(), b.get_low()), add_saturated(a.get_high(), b.get_high()));
+}
+
+// function sub_saturated: subtract element by element, signed with saturation
+static inline Vec16s sub_saturated(Vec16s const &a, Vec16s const &b)
+{
+  return Vec16s(sub_saturated(a.get_low(), b.get_low()), sub_saturated(a.get_high(), b.get_high()));
+}
+
+// function max: a > b ? a : b
+static inline Vec16s max(Vec16s const &a, Vec16s const &b)
+{
+  return Vec16s(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high()));
+}
+
+// function min: a < b ? a : b
+static inline Vec16s min(Vec16s const &a, Vec16s const &b)
+{
+  return Vec16s(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high()));
+}
+
+// function abs: a >= 0 ? a : -a
+static inline Vec16s abs(Vec16s const &a) { return Vec16s(abs(a.get_low()), abs(a.get_high())); }
+
+// function abs_saturated: same as abs, saturate if overflow
+static inline Vec16s abs_saturated(Vec16s const &a) { return Vec16s(abs_saturated(a.get_low()), abs_saturated(a.get_high())); }
+
+// function rotate_left all elements
+// Use negative count to rotate right
+static inline Vec16s rotate_left(Vec16s const &a, int b) { return Vec16s(rotate_left(a.get_low(), b), rotate_left(a.get_high(), b)); }
+
+/*****************************************************************************
+ *
+ *          Vector of 16 16-bit unsigned integers
+ *
+ *****************************************************************************/
+
+class Vec16us : public Vec16s
+{
+ public:
+  // Default constructor:
+  Vec16us() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec16us(uint32_t i) { y1 = y0 = _mm_set1_epi16((int16_t)i); }
+  // Constructor to build from all elements:
+  Vec16us(uint16_t i0, uint16_t i1, uint16_t i2, uint16_t i3, uint16_t i4, uint16_t i5, uint16_t i6, uint16_t i7, uint16_t i8,
+          uint16_t i9, uint16_t i10, uint16_t i11, uint16_t i12, uint16_t i13, uint16_t i14, uint16_t i15)
+  {
+    y0 = _mm_setr_epi16(i0, i1, i2, i3, i4, i5, i6, i7);
+    y1 = _mm_setr_epi16(i8, i9, i10, i11, i12, i13, i14, i15);
+  }
+  // Constructor to build from two Vec8us:
+  Vec16us(Vec8us const &a0, Vec8us const &a1)
+  {
+    y0 = a0;
+    y1 = a1;
+  }
+  // Constructor to convert from type Vec256ie
+  Vec16us(Vec256ie const &x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+  }
+  // Assignment operator to convert from type Vec256ie
+  Vec16us &operator=(Vec256ie const &x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+    return *this;
+  }
+  // Member function to load from array (unaligned)
+  Vec16us &load(void const *p)
+  {
+    y0 = _mm_loadu_si128((__m128i const *)p);
+    y1 = _mm_loadu_si128((__m128i const *)p + 1);
+    return *this;
+  }
+  // Member function to load from array, aligned by 32
+  Vec16us &load_a(void const *p)
+  {
+    y0 = _mm_load_si128((__m128i const *)p);
+    y1 = _mm_load_si128((__m128i const *)p + 1);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec16us const &insert(uint32_t index, uint16_t value)
+  {
+    Vec16s::insert(index, value);
+    return *this;
+  }
+  // Member function to extract a single element from vector
+  uint16_t extract(uint32_t index) const { return Vec16s::extract(index); }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  uint16_t operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec8us:
+  Vec8us get_low() const { return y0; }
+  Vec8us get_high() const { return y1; }
+};
+
+// Define operators for this class
+
+// vector operator + : add
+static inline Vec16us operator+(Vec16us const &a, Vec16us const &b)
+{
+  return Vec16us(a.get_low() + b.get_low(), a.get_high() + b.get_high());
+}
+
+// vector operator - : subtract
+static inline Vec16us operator-(Vec16us const &a, Vec16us const &b)
+{
+  return Vec16us(a.get_low() - b.get_low(), a.get_high() - b.get_high());
+}
+
+// vector operator * : multiply
+static inline Vec16us operator*(Vec16us const &a, Vec16us const &b)
+{
+  return Vec16us(a.get_low() * b.get_low(), a.get_high() * b.get_high());
+}
+
+// vector operator / : divide
+static inline Vec16us operator/(Vec16us const &a, Divisor_us const &d) { return Vec16us(a.get_low() / d, a.get_high() / d); }
+
+// vector operator /= : divide
+static inline Vec16us &operator/=(Vec16us &a, Divisor_us const &d)
+{
+  a = a / d;
+  return a;
+}
+
+// vector operator >> : shift right logical all elements
+static inline Vec16us operator>>(Vec16us const &a, uint32_t b) { return Vec16us(a.get_low() >> b, a.get_high() >> b); }
+
+// vector operator >> : shift right logical all elements
+static inline Vec16us operator>>(Vec16us const &a, int b) { return a >> (uint32_t)b; }
+
+// vector operator >>= : shift right logical
+static inline Vec16us &operator>>=(Vec16us &a, uint32_t b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator << : shift left all elements
+static inline Vec16us operator<<(Vec16us const &a, uint32_t b) { return Vec16us(a.get_low() << b, a.get_high() << b); }
+
+// vector operator << : shift left all elements
+static inline Vec16us operator<<(Vec16us const &a, int32_t b) { return a << (uint32_t)b; }
+
+// vector operator >= : returns true for elements for which a >= b (unsigned)
+static inline Vec16sb operator>=(Vec16us const &a, Vec16us const &b)
+{
+  return Vec16s(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
+}
+
+// vector operator <= : returns true for elements for which a <= b (unsigned)
+static inline Vec16sb operator<=(Vec16us const &a, Vec16us const &b) { return b >= a; }
+
+// vector operator > : returns true for elements for which a > b (unsigned)
+static inline Vec16sb operator>(Vec16us const &a, Vec16us const &b)
+{
+  return Vec16s(a.get_low() > b.get_low(), a.get_high() > b.get_high());
+}
+
+// vector operator < : returns true for elements for which a < b (unsigned)
+static inline Vec16sb operator<(Vec16us const &a, Vec16us const &b) { return b > a; }
+
+// vector operator & : bitwise and
+static inline Vec16us operator&(Vec16us const &a, Vec16us const &b)
+{
+  return Vec16us(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+static inline Vec16us operator&&(Vec16us const &a, Vec16us const &b) { return a & b; }
+
+// vector operator | : bitwise or
+static inline Vec16us operator|(Vec16us const &a, Vec16us const &b)
+{
+  return Vec16us(a.get_low() | b.get_low(), a.get_high() | b.get_high());
+}
+static inline Vec16us operator||(Vec16us const &a, Vec16us const &b) { return a | b; }
+
+// vector operator ^ : bitwise xor
+static inline Vec16us operator^(Vec16us const &a, Vec16us const &b)
+{
+  return Vec16us(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
+}
+
+// vector operator ~ : bitwise not
+static inline Vec16us operator~(Vec16us const &a) { return Vec16us(~Vec256b(a)); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
+// Each word in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec16us select(Vec16sb const &s, Vec16us const &a, Vec16us const &b) { return selectb(s, a, b); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec16us if_add(Vec16sb const &f, Vec16us const &a, Vec16us const &b) { return a + (Vec16us(f) & b); }
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline uint32_t horizontal_add(Vec16us const &a) { return horizontal_add(a.get_low() + a.get_high()); }
+
+// Horizontal add extended: Calculates the sum of all vector elements.
+// Each element is zero-extended before addition to avoid overflow
+static inline uint32_t horizontal_add_x(Vec16us const &a) { return horizontal_add_x(a.get_low()) + horizontal_add_x(a.get_high()); }
+
+// function add_saturated: add element by element, unsigned with saturation
+static inline Vec16us add_saturated(Vec16us const &a, Vec16us const &b)
+{
+  return Vec16us(add_saturated(a.get_low(), b.get_low()), add_saturated(a.get_high(), b.get_high()));
+}
+
+// function sub_saturated: subtract element by element, unsigned with saturation
+static inline Vec16us sub_saturated(Vec16us const &a, Vec16us const &b)
+{
+  return Vec16us(sub_saturated(a.get_low(), b.get_low()), sub_saturated(a.get_high(), b.get_high()));
+}
+
+// function max: a > b ? a : b
+static inline Vec16us max(Vec16us const &a, Vec16us const &b)
+{
+  return Vec16us(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high()));
+}
+
+// function min: a < b ? a : b
+static inline Vec16us min(Vec16us const &a, Vec16us const &b)
+{
+  return Vec16us(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high()));
+}
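+
+// Illustrative sketch (not part of the original header): unsigned saturating
+// arithmetic avoids the wrap-around of the ordinary + and - operators.
+//
+//   Vec16us a(65000), b(1000);
+//   Vec16us s = add_saturated(a, b);   // every element clamps to 65535
+//   Vec16us d = sub_saturated(b, a);   // every element clamps to 0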
+
+/*****************************************************************************
+ *
+ *          Vector of 8 32-bit signed integers
+ *
+ *****************************************************************************/
+
+class Vec8i : public Vec256b
+{
+ public:
+  // Default constructor:
+  Vec8i() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec8i(int i) { y1 = y0 = _mm_set1_epi32(i); }
+  // Constructor to build from all elements:
+  Vec8i(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4, int32_t i5, int32_t i6, int32_t i7)
+  {
+    y0 = _mm_setr_epi32(i0, i1, i2, i3);
+    y1 = _mm_setr_epi32(i4, i5, i6, i7);
+  }
+  // Constructor to build from two Vec4i:
+  Vec8i(Vec4i const &a0, Vec4i const &a1)
+  {
+    y0 = a0;
+    y1 = a1;
+  }
+  // Constructor to convert from type Vec256ie
+  Vec8i(Vec256ie const &x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+  }
+  // Assignment operator to convert from type Vec256ie
+  Vec8i &operator=(Vec256ie const &x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+    return *this;
+  }
+  // Member function to load from array (unaligned)
+  Vec8i &load(void const *p)
+  {
+    y0 = _mm_loadu_si128((__m128i const *)p);
+    y1 = _mm_loadu_si128((__m128i const *)p + 1);
+    return *this;
+  }
+  // Member function to load from array, aligned by 32
+  Vec8i &load_a(void const *p)
+  {
+    y0 = _mm_load_si128((__m128i const *)p);
+    y1 = _mm_load_si128((__m128i const *)p + 1);
+    return *this;
+  }
+  // Partial load. Load n elements and set the rest to 0
+  Vec8i &load_partial(int n, void const *p)
+  {
+    if(n <= 0)
+      {
+        *this = 0;
+      }
+    else if(n <= 4)
+      {
+        *this = Vec8i(Vec4i().load_partial(n, p), 0);
+      }
+    else if(n < 8)
+      {
+        *this = Vec8i(Vec4i().load(p), Vec4i().load_partial(n - 4, (int32_t const *)p + 4));
+      }
+    else
+      {
+        load(p);
+      }
+    return *this;
+  }
+  // Partial store. Store n elements
+  void store_partial(int n, void *p) const
+  {
+    if(n <= 0)
+      {
+        return;
+      }
+    else if(n <= 4)
+      {
+        get_low().store_partial(n, p);
+      }
+    else if(n < 8)
+      {
+        get_low().store(p);
+        get_high().store_partial(n - 4, (int32_t *)p + 4);
+      }
+    else
+      {
+        store(p);
+      }
+  }
+  // cut off vector to n elements. The last 8-n elements are set to zero
+  Vec8i &cutoff(int n)
+  {
+    *this = Vec32c(*this).cutoff(n * 4);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec8i const &insert(uint32_t index, int32_t value)
+  {
+    if(index < 4)
+      {
+        y0 = Vec4i(y0).insert(index, value);
+      }
+    else
+      {
+        y1 = Vec4i(y1).insert(index - 4, value);
+      }
+    return *this;
+  }
+  // Member function to extract a single element from vector
+  // Note: This function is inefficient. Use store function if extracting more than one element
+  int32_t extract(uint32_t index) const
+  {
+    if(index < 4)
+      {
+        return Vec4i(y0).extract(index);
+      }
+    else
+      {
+        return Vec4i(y1).extract(index - 4);
+      }
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  int32_t operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec4i:
+  Vec4i get_low() const { return y0; }
+  Vec4i get_high() const { return y1; }
+  static int size() { return 8; }
+};
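+
+// Illustrative sketch (not part of the original header): single-element access
+// on a Vec8i. insert/extract take the scalar path, so they are only meant for
+// occasional single-element changes.
+//
+//   Vec8i v(10, 11, 12, 13, 14, 15, 16, 17);
+//   v.insert(2, 99);                // v is now (10, 11, 99, 13, 14, 15, 16, 17)
+//   int32_t x = v.extract(6);       // x == 16
+//   v.cutoff(5);                    // elements 5..7 are zeroed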
+
+/*****************************************************************************
+ *
+ *          Vec8ib: Vector of 8 Booleans for use with Vec8i and Vec8ui
+ *
+ *****************************************************************************/
+
+class Vec8ib : public Vec8i
+{
+ public:
+  // Default constructor:
+  Vec8ib() {}
+  // Constructor to build from all elements:
+  Vec8ib(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7)
+      : Vec8i(-int32_t(x0), -int32_t(x1), -int32_t(x2), -int32_t(x3), -int32_t(x4), -int32_t(x5), -int32_t(x6), -int32_t(x7))
+  {
+  }
+  // Constructor to convert from type Vec256ie
+  Vec8ib(Vec256ie const &x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+  }
+  // Assignment operator to convert from type Vec256ie
+  Vec8ib &operator=(Vec256ie const &x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+    return *this;
+  }
+  // Constructor to broadcast scalar value:
+  Vec8ib(bool b) : Vec8i(-int32_t(b)) {}
+  // Assignment operator to broadcast scalar value:
+  Vec8ib &operator=(bool b)
+  {
+    *this = Vec8ib(b);
+    return *this;
+  }
+
+ private:  // Prevent constructing from int, etc.
+  Vec8ib(int b);
+  Vec8ib &operator=(int x);
+
+ public:
+  // Member functions to split into two Vec4i:
+  Vec4ib get_low() const { return y0; }
+  Vec4ib get_high() const { return y1; }
+  Vec8ib &insert(int index, bool a)
+  {
+    Vec8i::insert(index, -(int)a);
+    return *this;
+  }
+  // Member function to extract a single element from vector
+  // Note: This function is inefficient. Use store function if extracting more than one element
+  bool extract(uint32_t index) const { return Vec8i::extract(index) != 0; }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  bool operator[](uint32_t index) const { return extract(index); }
+};
+
+/*****************************************************************************
+ *
+ *          Define operators for Vec8ib
+ *
+ *****************************************************************************/
+
+// vector operator & : bitwise and
+static inline Vec8ib operator&(Vec8ib const &a, Vec8ib const &b) { return Vec8ib(Vec256b(a) & Vec256b(b)); }
+static inline Vec8ib operator&&(Vec8ib const &a, Vec8ib const &b) { return a & b; }
+// vector operator &= : bitwise and
+static inline Vec8ib &operator&=(Vec8ib &a, Vec8ib const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec8ib operator|(Vec8ib const &a, Vec8ib const &b) { return Vec8ib(Vec256b(a) | Vec256b(b)); }
+static inline Vec8ib operator||(Vec8ib const &a, Vec8ib const &b) { return a | b; }
+// vector operator |= : bitwise or
+static inline Vec8ib &operator|=(Vec8ib &a, Vec8ib const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec8ib operator^(Vec8ib const &a, Vec8ib const &b) { return Vec8ib(Vec256b(a) ^ Vec256b(b)); }
+// vector operator ^= : bitwise xor
+static inline Vec8ib &operator^=(Vec8ib &a, Vec8ib const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec8ib operator~(Vec8ib const &a) { return Vec8ib(~Vec256b(a)); }
+
+// vector operator ! : element not
+static inline Vec8ib operator!(Vec8ib const &a) { return ~a; }
+
+// vector function andnot
+static inline Vec8ib andnot(Vec8ib const &a, Vec8ib const &b) { return Vec8ib(andnot(Vec256b(a), Vec256b(b))); }
+
+/*****************************************************************************
+ *
+ *          Operators for Vec8i
+ *
+ *****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec8i operator+(Vec8i const &a, Vec8i const &b) { return Vec8i(a.get_low() + b.get_low(), a.get_high() + b.get_high()); }
+
+// vector operator += : add
+static inline Vec8i &operator+=(Vec8i &a, Vec8i const &b)
+{
+  a = a + b;
+  return a;
+}
+
+// postfix operator ++
+static inline Vec8i operator++(Vec8i &a, int)
+{
+  Vec8i a0 = a;
+  a        = a + 1;
+  return a0;
+}
+
+// prefix operator ++
+static inline Vec8i &operator++(Vec8i &a)
+{
+  a = a + 1;
+  return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec8i operator-(Vec8i const &a, Vec8i const &b) { return Vec8i(a.get_low() - b.get_low(), a.get_high() - b.get_high()); }
+
+// vector operator - : unary minus
+static inline Vec8i operator-(Vec8i const &a) { return Vec8i(-a.get_low(), -a.get_high()); }
+
+// vector operator -= : subtract
+static inline Vec8i &operator-=(Vec8i &a, Vec8i const &b)
+{
+  a = a - b;
+  return a;
+}
+
+// postfix operator --
+static inline Vec8i operator--(Vec8i &a, int)
+{
+  Vec8i a0 = a;
+  a        = a - 1;
+  return a0;
+}
+
+// prefix operator --
+static inline Vec8i &operator--(Vec8i &a)
+{
+  a = a - 1;
+  return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec8i operator*(Vec8i const &a, Vec8i const &b) { return Vec8i(a.get_low() * b.get_low(), a.get_high() * b.get_high()); }
+
+// vector operator *= : multiply
+static inline Vec8i &operator*=(Vec8i &a, Vec8i const &b)
+{
+  a = a * b;
+  return a;
+}
+
+// vector operator / : divide all elements by same integer
+static inline Vec8i operator/(Vec8i const &a, Divisor_i const &d) { return Vec8i(a.get_low() / d, a.get_high() / d); }
+
+// vector operator /= : divide
+static inline Vec8i &operator/=(Vec8i &a, Divisor_i const &d)
+{
+  a = a / d;
+  return a;
+}
+
+// vector operator << : shift left
+static inline Vec8i operator<<(Vec8i const &a, int32_t b) { return Vec8i(a.get_low() << b, a.get_high() << b); }
+
+// vector operator <<= : shift left
+static inline Vec8i &operator<<=(Vec8i &a, int32_t b)
+{
+  a = a << b;
+  return a;
+}
+
+// vector operator >> : shift right arithmetic
+static inline Vec8i operator>>(Vec8i const &a, int32_t b) { return Vec8i(a.get_low() >> b, a.get_high() >> b); }
+
+// vector operator >>= : shift right arithmetic
+static inline Vec8i &operator>>=(Vec8i &a, int32_t b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec8ib operator==(Vec8i const &a, Vec8i const &b)
+{
+  return Vec8i(a.get_low() == b.get_low(), a.get_high() == b.get_high());
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec8ib operator!=(Vec8i const &a, Vec8i const &b)
+{
+  return Vec8i(a.get_low() != b.get_low(), a.get_high() != b.get_high());
+}
+
+// vector operator > : returns true for elements for which a > b
+static inline Vec8ib operator>(Vec8i const &a, Vec8i const &b)
+{
+  return Vec8i(a.get_low() > b.get_low(), a.get_high() > b.get_high());
+}
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec8ib operator<(Vec8i const &a, Vec8i const &b) { return b > a; }
+
+// vector operator >= : returns true for elements for which a >= b (signed)
+static inline Vec8ib operator>=(Vec8i const &a, Vec8i const &b)
+{
+  return Vec8i(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
+}
+
+// vector operator <= : returns true for elements for which a <= b (signed)
+static inline Vec8ib operator<=(Vec8i const &a, Vec8i const &b) { return b >= a; }
+
+// vector operator & : bitwise and
+static inline Vec8i operator&(Vec8i const &a, Vec8i const &b) { return Vec8i(a.get_low() & b.get_low(), a.get_high() & b.get_high()); }
+static inline Vec8i operator&&(Vec8i const &a, Vec8i const &b) { return a & b; }
+// vector operator &= : bitwise and
+static inline Vec8i &operator&=(Vec8i &a, Vec8i const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec8i operator|(Vec8i const &a, Vec8i const &b) { return Vec8i(a.get_low() | b.get_low(), a.get_high() | b.get_high()); }
+static inline Vec8i operator||(Vec8i const &a, Vec8i const &b) { return a | b; }
+// vector operator |= : bitwise or
+static inline Vec8i &operator|=(Vec8i &a, Vec8i const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec8i operator^(Vec8i const &a, Vec8i const &b) { return Vec8i(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); }
+// vector operator ^= : bitwise xor
+static inline Vec8i &operator^=(Vec8i &a, Vec8i const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec8i operator~(Vec8i const &a) { return Vec8i(~a.get_low(), ~a.get_high()); }
+
+// vector operator ! : returns true for elements == 0
+static inline Vec8ib operator!(Vec8i const &a) { return Vec8i(!a.get_low(), !a.get_high()); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec8i select(Vec8ib const &s, Vec8i const &a, Vec8i const &b) { return selectb(s, a, b); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec8i if_add(Vec8ib const &f, Vec8i const &a, Vec8i const &b) { return a + (Vec8i(f) & b); }
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline int32_t horizontal_add(Vec8i const &a) { return horizontal_add(a.get_low() + a.get_high()); }
+
+// Horizontal add extended: Calculates the sum of all vector elements.
+// Elements are sign extended before adding to avoid overflow
+static inline int64_t horizontal_add_x(Vec8i const &a) { return horizontal_add_x(a.get_low()) + horizontal_add_x(a.get_high()); }
+
+// function add_saturated: add element by element, signed with saturation
+static inline Vec8i add_saturated(Vec8i const &a, Vec8i const &b)
+{
+  return Vec8i(add_saturated(a.get_low(), b.get_low()), add_saturated(a.get_high(), b.get_high()));
+}
+
+// function sub_saturated: subtract element by element, signed with saturation
+static inline Vec8i sub_saturated(Vec8i const &a, Vec8i const &b)
+{
+  return Vec8i(sub_saturated(a.get_low(), b.get_low()), sub_saturated(a.get_high(), b.get_high()));
+}
+
+// function max: a > b ? a : b
+static inline Vec8i max(Vec8i const &a, Vec8i const &b)
+{
+  return Vec8i(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high()));
+}
+
+// function min: a < b ? a : b
+static inline Vec8i min(Vec8i const &a, Vec8i const &b)
+{
+  return Vec8i(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high()));
+}
+
+// function abs: a >= 0 ? a : -a
+static inline Vec8i abs(Vec8i const &a) { return Vec8i(abs(a.get_low()), abs(a.get_high())); }
+
+// function abs_saturated: same as abs, saturate if overflow
+static inline Vec8i abs_saturated(Vec8i const &a) { return Vec8i(abs_saturated(a.get_low()), abs_saturated(a.get_high())); }
+
+// function rotate_left all elements
+// Use negative count to rotate right
+static inline Vec8i rotate_left(Vec8i const &a, int b) { return Vec8i(rotate_left(a.get_low(), b), rotate_left(a.get_high(), b)); }
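+
+// Illustrative sketch (not part of the original header): horizontal reductions
+// on a Vec8i. horizontal_add may wrap around; horizontal_add_x widens first.
+//
+//   Vec8i a(1, -2, 3, -4, 5, -6, 7, -8);
+//   Vec8i clipped = max(a, Vec8i(0));     // negative elements clamped to 0
+//   int32_t s32 = horizontal_add(a);      // -4, may wrap for large inputs
+//   int64_t s64 = horizontal_add_x(a);    // sign-extended sum, cannot overflow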
+
+/*****************************************************************************
+ *
+ *          Vector of 8 32-bit unsigned integers
+ *
+ *****************************************************************************/
+
+class Vec8ui : public Vec8i
+{
+ public:
+  // Default constructor:
+  Vec8ui() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec8ui(uint32_t i) { y1 = y0 = _mm_set1_epi32((int32_t)i); }
+  // Constructor to build from all elements:
+  Vec8ui(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7)
+  {
+    y0 = _mm_setr_epi32(i0, i1, i2, i3);
+    y1 = _mm_setr_epi32(i4, i5, i6, i7);
+  }
+  // Constructor to build from two Vec4ui:
+  Vec8ui(Vec4ui const &a0, Vec4ui const &a1)
+  {
+    y0 = a0;
+    y1 = a1;
+  }
+  // Constructor to convert from type Vec256ie
+  Vec8ui(Vec256ie const &x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+  }
+  // Assignment operator to convert from type Vec256ie
+  Vec8ui &operator=(Vec256ie const &x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+    return *this;
+  }
+  // Member function to load from array (unaligned)
+  Vec8ui &load(void const *p)
+  {
+    y0 = _mm_loadu_si128((__m128i const *)p);
+    y1 = _mm_loadu_si128((__m128i const *)p + 1);
+    return *this;
+  }
+  // Member function to load from array, aligned by 32
+  Vec8ui &load_a(void const *p)
+  {
+    y0 = _mm_load_si128((__m128i const *)p);
+    y1 = _mm_load_si128((__m128i const *)p + 1);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec8ui const &insert(uint32_t index, uint32_t value)
+  {
+    Vec8i::insert(index, value);
+    return *this;
+  }
+  // Member function to extract a single element from vector
+  uint32_t extract(uint32_t index) const { return Vec8i::extract(index); }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  uint32_t operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec4ui:
+  Vec4ui get_low() const { return y0; }
+  Vec4ui get_high() const { return y1; }
+};
+
+// Define operators for this class
+
+// vector operator + : add
+static inline Vec8ui operator+(Vec8ui const &a, Vec8ui const &b) { return Vec8ui(Vec8i(a) + Vec8i(b)); }
+
+// vector operator - : subtract
+static inline Vec8ui operator-(Vec8ui const &a, Vec8ui const &b) { return Vec8ui(Vec8i(a) - Vec8i(b)); }
+
+// vector operator * : multiply
+static inline Vec8ui operator*(Vec8ui const &a, Vec8ui const &b) { return Vec8ui(Vec8i(a) * Vec8i(b)); }
+
+// vector operator / : divide all elements by same integer
+static inline Vec8ui operator/(Vec8ui const &a, Divisor_ui const &d) { return Vec8ui(a.get_low() / d, a.get_high() / d); }
+
+// vector operator /= : divide
+static inline Vec8ui &operator/=(Vec8ui &a, Divisor_ui const &d)
+{
+  a = a / d;
+  return a;
+}
+
+// vector operator >> : shift right logical all elements
+static inline Vec8ui operator>>(Vec8ui const &a, uint32_t b) { return Vec8ui(a.get_low() >> b, a.get_high() >> b); }
+
+// vector operator >> : shift right logical all elements
+static inline Vec8ui operator>>(Vec8ui const &a, int32_t b) { return a >> (uint32_t)b; }
+
+// vector operator >>= : shift right logical
+static inline Vec8ui &operator>>=(Vec8ui &a, uint32_t b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator >>= : shift right logical
+static inline Vec8ui &operator>>=(Vec8ui &a, int32_t b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator << : shift left all elements
+static inline Vec8ui operator<<(Vec8ui const &a, uint32_t b) { return Vec8ui((Vec8i)a << (int32_t)b); }
+
+// vector operator << : shift left all elements
+static inline Vec8ui operator<<(Vec8ui const &a, int32_t b) { return Vec8ui((Vec8i)a << (int32_t)b); }
+
+// vector operator > : returns true for elements for which a > b (unsigned)
+static inline Vec8ib operator>(Vec8ui const &a, Vec8ui const &b)
+{
+  return Vec8i(a.get_low() > b.get_low(), a.get_high() > b.get_high());
+}
+
+// vector operator < : returns true for elements for which a < b (unsigned)
+static inline Vec8ib operator<(Vec8ui const &a, Vec8ui const &b) { return b > a; }
+
+// vector operator >= : returns true for elements for which a >= b (unsigned)
+static inline Vec8ib operator>=(Vec8ui const &a, Vec8ui const &b)
+{
+  return Vec8i(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
+}
+
+// vector operator <= : returns true for elements for which a <= b (unsigned)
+static inline Vec8ib operator<=(Vec8ui const &a, Vec8ui const &b) { return b >= a; }
+
+// vector operator & : bitwise and
+static inline Vec8ui operator&(Vec8ui const &a, Vec8ui const &b)
+{
+  return Vec8ui(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+static inline Vec8ui operator&&(Vec8ui const &a, Vec8ui const &b) { return a & b; }
+
+// vector operator | : bitwise or
+static inline Vec8ui operator|(Vec8ui const &a, Vec8ui const &b)
+{
+  return Vec8ui(a.get_low() | b.get_low(), a.get_high() | b.get_high());
+}
+static inline Vec8ui operator||(Vec8ui const &a, Vec8ui const &b) { return a | b; }
+
+// vector operator ^ : bitwise xor
+static inline Vec8ui operator^(Vec8ui const &a, Vec8ui const &b)
+{
+  return Vec8ui(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
+}
+
+// vector operator ~ : bitwise not
+static inline Vec8ui operator~(Vec8ui const &a) { return Vec8ui(~a.get_low(), ~a.get_high()); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+// Each word in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec8ui select(Vec8ib const &s, Vec8ui const &a, Vec8ui const &b) { return selectb(s, a, b); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec8ui if_add(Vec8ib const &f, Vec8ui const &a, Vec8ui const &b) { return a + (Vec8ui(f) & b); }
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline uint32_t horizontal_add(Vec8ui const &a) { return horizontal_add((Vec8i)a); }
+
+// Horizontal add extended: Calculates the sum of all vector elements.
+// Elements are zero extended before adding to avoid overflow
+static inline uint64_t horizontal_add_x(Vec8ui const &a) { return horizontal_add_x(a.get_low()) + horizontal_add_x(a.get_high()); }
+
+// function add_saturated: add element by element, unsigned with saturation
+static inline Vec8ui add_saturated(Vec8ui const &a, Vec8ui const &b)
+{
+  return Vec8ui(add_saturated(a.get_low(), b.get_low()), add_saturated(a.get_high(), b.get_high()));
+}
+
+// function sub_saturated: subtract element by element, unsigned with saturation
+static inline Vec8ui sub_saturated(Vec8ui const &a, Vec8ui const &b)
+{
+  return Vec8ui(sub_saturated(a.get_low(), b.get_low()), sub_saturated(a.get_high(), b.get_high()));
+}
+
+// function max: a > b ? a : b
+static inline Vec8ui max(Vec8ui const &a, Vec8ui const &b)
+{
+  return Vec8ui(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high()));
+}
+
+// function min: a < b ? a : b
+static inline Vec8ui min(Vec8ui const &a, Vec8ui const &b)
+{
+  return Vec8ui(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high()));
+}
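+
+// Illustrative sketch (not part of the original header): the unsigned type
+// shares storage with Vec8i but uses logical right shifts and unsigned
+// comparisons.
+//
+//   Vec8ui u(0x80000000u);
+//   Vec8ui lsr = u >> 4;            // logical shift:    0x08000000 per element
+//   Vec8i  asr = Vec8i(u) >> 4;     // arithmetic shift: 0xf8000000 per element
+//   Vec8ib gt  = u > Vec8ui(1);     // true: elements compared as unsigned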
+
+/*****************************************************************************
+ *
+ *          Vector of 4 64-bit signed integers
+ *
+ *****************************************************************************/
+
+class Vec4q : public Vec256b
+{
+ public:
+  // Default constructor:
+  Vec4q() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec4q(int64_t i) { y0 = y1 = Vec2q(i); }
+  // Constructor to build from all elements:
+  Vec4q(int64_t i0, int64_t i1, int64_t i2, int64_t i3)
+  {
+    y0 = Vec2q(i0, i1);
+    y1 = Vec2q(i2, i3);
+  }
+  // Constructor to build from two Vec2q:
+  Vec4q(Vec2q const &a0, Vec2q const &a1)
+  {
+    y0 = a0;
+    y1 = a1;
+  }
+  // Constructor to convert from type Vec256ie
+  Vec4q(Vec256ie const &x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+  }
+  // Assignment operator to convert from type Vec256ie
+  Vec4q &operator=(Vec256ie const &x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+    return *this;
+  }
+  // Member function to load from array (unaligned)
+  Vec4q &load(void const *p)
+  {
+    y0 = _mm_loadu_si128((__m128i const *)p);
+    y1 = _mm_loadu_si128((__m128i const *)p + 1);
+    return *this;
+  }
+  // Member function to load from array, aligned by 32
+  Vec4q &load_a(void const *p)
+  {
+    y0 = _mm_load_si128((__m128i const *)p);
+    y1 = _mm_load_si128((__m128i const *)p + 1);
+    return *this;
+  }
+  // Partial load. Load n elements and set the rest to 0
+  Vec4q &load_partial(int n, void const *p)
+  {
+    if(n <= 0)
+      {
+        *this = 0;
+      }
+    else if(n <= 2)
+      {
+        *this = Vec4q(Vec2q().load_partial(n, p), 0);
+      }
+    else if(n < 4)
+      {
+        *this = Vec4q(Vec2q().load(p), Vec2q().load_partial(n - 2, (int64_t const *)p + 2));
+      }
+    else
+      {
+        load(p);
+      }
+    return *this;
+  }
+  // Partial store. Store n elements
+  void store_partial(int n, void *p) const
+  {
+    if(n <= 0)
+      {
+        return;
+      }
+    else if(n <= 2)
+      {
+        get_low().store_partial(n, p);
+      }
+    else if(n < 4)
+      {
+        get_low().store(p);
+        get_high().store_partial(n - 2, (int64_t *)p + 2);
+      }
+    else
+      {
+        store(p);
+      }
+  }
+  // cut off vector to n elements. The last 4-n elements are set to zero
+  Vec4q &cutoff(int n)
+  {
+    *this = Vec32c(*this).cutoff(n * 8);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec4q const &insert(uint32_t index, int64_t value)
+  {
+    if(index < 2)
+      {
+        y0 = Vec2q(y0).insert(index, value);
+      }
+    else
+      {
+        y1 = Vec2q(y1).insert(index - 2, value);
+      }
+    return *this;
+  }
+  // Member function to extract a single element from vector
+  // Note: This function is inefficient. Use store function if extracting more than one element
+  int64_t extract(uint32_t index) const
+  {
+    if(index < 2)
+      {
+        return Vec2q(y0).extract(index);
+      }
+    else
+      {
+        return Vec2q(y1).extract(index - 2);
+      }
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  int64_t operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec2q:
+  Vec2q get_low() const { return y0; }
+  Vec2q get_high() const { return y1; }
+  static int size() { return 4; }
+};
+
+/*****************************************************************************
+ *
+ *          Vec4qb: Vector of 4 Booleans for use with Vec4q and Vec4uq
+ *
+ *****************************************************************************/
+
+class Vec4qb : public Vec4q
+{
+ public:
+  // Default constructor:
+  Vec4qb() {}
+  // Constructor to build from all elements:
+  Vec4qb(bool x0, bool x1, bool x2, bool x3) : Vec4q(-int64_t(x0), -int64_t(x1), -int64_t(x2), -int64_t(x3)) {}
+  // Constructor to convert from type Vec256ie
+  Vec4qb(Vec256ie const &x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+  }
+  // Assignment operator to convert from type Vec256ie
+  Vec4qb &operator=(Vec256ie const &x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+    return *this;
+  }
+  // Constructor to broadcast scalar value:
+  Vec4qb(bool b) : Vec4q(-int64_t(b)) {}
+  // Assignment operator to broadcast scalar value:
+  Vec4qb &operator=(bool b)
+  {
+    *this = Vec4qb(b);
+    return *this;
+  }
+
+ private:  // Prevent constructing from int, etc.
+  Vec4qb(int b);
+  Vec4qb &operator=(int x);
+
+ public:
+  // Member functions to split into two Vec2qb:
+  Vec2qb get_low() const { return y0; }
+  Vec2qb get_high() const { return y1; }
+  Vec4qb &insert(int index, bool a)
+  {
+    Vec4q::insert(index, -(int64_t)a);
+    return *this;
+  }
+  // Member function to extract a single element from vector
+  // Note: This function is inefficient. Use store function if extracting more than one element
+  bool extract(uint32_t index) const { return Vec4q::extract(index) != 0; }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  bool operator[](uint32_t index) const { return extract(index); }
+};
+
+/*****************************************************************************
+ *
+ *          Define operators for Vec4qb
+ *
+ *****************************************************************************/
+
+// vector operator & : bitwise and
+static inline Vec4qb operator&(Vec4qb const &a, Vec4qb const &b) { return Vec4qb(Vec256b(a) & Vec256b(b)); }
+static inline Vec4qb operator&&(Vec4qb const &a, Vec4qb const &b) { return a & b; }
+// vector operator &= : bitwise and
+static inline Vec4qb &operator&=(Vec4qb &a, Vec4qb const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec4qb operator|(Vec4qb const &a, Vec4qb const &b) { return Vec4qb(Vec256b(a) | Vec256b(b)); }
+static inline Vec4qb operator||(Vec4qb const &a, Vec4qb const &b) { return a | b; }
+// vector operator |= : bitwise or
+static inline Vec4qb &operator|=(Vec4qb &a, Vec4qb const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec4qb operator^(Vec4qb const &a, Vec4qb const &b) { return Vec4qb(Vec256b(a) ^ Vec256b(b)); }
+// vector operator ^= : bitwise xor
+static inline Vec4qb &operator^=(Vec4qb &a, Vec4qb const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec4qb operator~(Vec4qb const &a) { return Vec4qb(~Vec256b(a)); }
+
+// vector operator ! : element not
+static inline Vec4qb operator!(Vec4qb const &a) { return ~a; }
+
+// vector function andnot
+static inline Vec4qb andnot(Vec4qb const &a, Vec4qb const &b) { return Vec4qb(andnot(Vec256b(a), Vec256b(b))); }
+
+/*****************************************************************************
+ *
+ *          Operators for Vec4q
+ *
+ *****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec4q operator+(Vec4q const &a, Vec4q const &b) { return Vec4q(a.get_low() + b.get_low(), a.get_high() + b.get_high()); }
+
+// vector operator += : add
+static inline Vec4q &operator+=(Vec4q &a, Vec4q const &b)
+{
+  a = a + b;
+  return a;
+}
+
+// postfix operator ++
+static inline Vec4q operator++(Vec4q &a, int)
+{
+  Vec4q a0 = a;
+  a        = a + 1;
+  return a0;
+}
+
+// prefix operator ++
+static inline Vec4q &operator++(Vec4q &a)
+{
+  a = a + 1;
+  return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec4q operator-(Vec4q const &a, Vec4q const &b) { return Vec4q(a.get_low() - b.get_low(), a.get_high() - b.get_high()); }
+
+// vector operator - : unary minus
+static inline Vec4q operator-(Vec4q const &a) { return Vec4q(-a.get_low(), -a.get_high()); }
+
+// vector operator -= : subtract
+static inline Vec4q &operator-=(Vec4q &a, Vec4q const &b)
+{
+  a = a - b;
+  return a;
+}
+
+// postfix operator --
+static inline Vec4q operator--(Vec4q &a, int)
+{
+  Vec4q a0 = a;
+  a        = a - 1;
+  return a0;
+}
+
+// prefix operator --
+static inline Vec4q &operator--(Vec4q &a)
+{
+  a = a - 1;
+  return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec4q operator*(Vec4q const &a, Vec4q const &b) { return Vec4q(a.get_low() * b.get_low(), a.get_high() * b.get_high()); }
+
+// vector operator *= : multiply
+static inline Vec4q &operator*=(Vec4q &a, Vec4q const &b)
+{
+  a = a * b;
+  return a;
+}
+
+// vector operator << : shift left
+static inline Vec4q operator<<(Vec4q const &a, int32_t b) { return Vec4q(a.get_low() << b, a.get_high() << b); }
+
+// vector operator <<= : shift left
+static inline Vec4q &operator<<=(Vec4q &a, int32_t b)
+{
+  a = a << b;
+  return a;
+}
+
+// vector operator >> : shift right arithmetic
+static inline Vec4q operator>>(Vec4q const &a, int32_t b) { return Vec4q(a.get_low() >> b, a.get_high() >> b); }
+
+// vector operator >>= : shift right arithmetic
+static inline Vec4q &operator>>=(Vec4q &a, int32_t b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec4qb operator==(Vec4q const &a, Vec4q const &b)
+{
+  return Vec4q(a.get_low() == b.get_low(), a.get_high() == b.get_high());
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec4qb operator!=(Vec4q const &a, Vec4q const &b)
+{
+  return Vec4q(a.get_low() != b.get_low(), a.get_high() != b.get_high());
+}
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec4qb operator<(Vec4q const &a, Vec4q const &b)
+{
+  return Vec4q(a.get_low() < b.get_low(), a.get_high() < b.get_high());
+}
+
+// vector operator > : returns true for elements for which a > b
+static inline Vec4qb operator>(Vec4q const &a, Vec4q const &b) { return b < a; }
+
+// vector operator >= : returns true for elements for which a >= b (signed)
+static inline Vec4qb operator>=(Vec4q const &a, Vec4q const &b)
+{
+  return Vec4q(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
+}
+
+// vector operator <= : returns true for elements for which a <= b (signed)
+static inline Vec4qb operator<=(Vec4q const &a, Vec4q const &b) { return b >= a; }
+
+// vector operator & : bitwise and
+static inline Vec4q operator&(Vec4q const &a, Vec4q const &b) { return Vec4q(a.get_low() & b.get_low(), a.get_high() & b.get_high()); }
+static inline Vec4q operator&&(Vec4q const &a, Vec4q const &b) { return a & b; }
+// vector operator &= : bitwise and
+static inline Vec4q &operator&=(Vec4q &a, Vec4q const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec4q operator|(Vec4q const &a, Vec4q const &b) { return Vec4q(a.get_low() | b.get_low(), a.get_high() | b.get_high()); }
+static inline Vec4q operator||(Vec4q const &a, Vec4q const &b) { return a | b; }
+// vector operator |= : bitwise or
+static inline Vec4q &operator|=(Vec4q &a, Vec4q const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec4q operator^(Vec4q const &a, Vec4q const &b) { return Vec4q(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); }
+// vector operator ^= : bitwise xor
+static inline Vec4q &operator^=(Vec4q &a, Vec4q const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec4q operator~(Vec4q const &a) { return Vec4q(~a.get_low(), ~a.get_high()); }
+
+// vector operator ! : logical not, returns true for elements == 0
+static inline Vec4qb operator!(Vec4q const &a) { return Vec4q(!a.get_low(), !a.get_high()); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
+// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec4q select(Vec4qb const &s, Vec4q const &a, Vec4q const &b) { return selectb(s, a, b); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec4q if_add(Vec4qb const &f, Vec4q const &a, Vec4q const &b) { return a + (Vec4q(f) & b); }
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline int64_t horizontal_add(Vec4q const &a) { return horizontal_add(a.get_low() + a.get_high()); }
+
+// function max: a > b ? a : b
+static inline Vec4q max(Vec4q const &a, Vec4q const &b)
+{
+  return Vec4q(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high()));
+}
+
+// function min: a < b ? a : b
+static inline Vec4q min(Vec4q const &a, Vec4q const &b)
+{
+  return Vec4q(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high()));
+}
+
+// function abs: a >= 0 ? a : -a
+static inline Vec4q abs(Vec4q const &a) { return Vec4q(abs(a.get_low()), abs(a.get_high())); }
+
+// function abs_saturated: same as abs, saturate if overflow
+static inline Vec4q abs_saturated(Vec4q const &a) { return Vec4q(abs_saturated(a.get_low()), abs_saturated(a.get_high())); }
+
+// function rotate_left all elements
+// Use negative count to rotate right
+static inline Vec4q rotate_left(Vec4q const &a, int b) { return Vec4q(rotate_left(a.get_low(), b), rotate_left(a.get_high(), b)); }
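+
+// Illustrative sketch (not part of the original header): the 64-bit lanes
+// behave like the narrower types, just with four elements (and no integer
+// division operator defined in this section).
+//
+//   Vec4q a(1, 2, 3, 4);
+//   Vec4q b = (a << 32) | Vec4q(5);    // pack values into high/low halves
+//   Vec4qb m = a >= Vec4q(3);          // (false, false, true, true)
+//   int64_t s = horizontal_add(a);     // 10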
+
+/*****************************************************************************
+ *
+ *          Vector of 4 64-bit unsigned integers
+ *
+ *****************************************************************************/
+
+class Vec4uq : public Vec4q
+{
+ public:
+  // Default constructor:
+  Vec4uq() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec4uq(uint64_t i) { y1 = y0 = Vec2q(i); }
+  // Constructor to build from all elements:
+  Vec4uq(uint64_t i0, uint64_t i1, uint64_t i2, uint64_t i3)
+  {
+    y0 = Vec2q(i0, i1);
+    y1 = Vec2q(i2, i3);
+  }
+  // Constructor to build from two Vec2uq:
+  Vec4uq(Vec2uq const &a0, Vec2uq const &a1)
+  {
+    y0 = a0;
+    y1 = a1;
+  }
+  // Constructor to convert from type Vec256ie
+  Vec4uq(Vec256ie const &x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+  }
+  // Assignment operator to convert from type Vec256ie
+  Vec4uq &operator=(Vec256ie const &x)
+  {
+    y0 = x.get_low();
+    y1 = x.get_high();
+    return *this;
+  }
+  // Member function to load from array (unaligned)
+  Vec4uq &load(void const *p)
+  {
+    y0 = _mm_loadu_si128((__m128i const *)p);
+    y1 = _mm_loadu_si128((__m128i const *)p + 1);
+    return *this;
+  }
+  // Member function to load from array, aligned by 32
+  Vec4uq &load_a(void const *p)
+  {
+    y0 = _mm_load_si128((__m128i const *)p);
+    y1 = _mm_load_si128((__m128i const *)p + 1);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec4uq const &insert(uint32_t index, uint64_t value)
+  {
+    Vec4q::insert(index, value);
+    return *this;
+  }
+  // Member function to extract a single element from vector
+  uint64_t extract(uint32_t index) const { return Vec4q::extract(index); }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  uint64_t operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec2uq:
+  Vec2uq get_low() const { return y0; }
+  Vec2uq get_high() const { return y1; }
+};
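+
+// Usage sketch (editor's illustrative example, not part of the original library):
+// Vec4uq a(5);                // broadcast constructor: a is (5, 5, 5, 5)
+// Vec4uq b(10, 11, 12, 13);   // element-wise constructor
+// uint64_t x = b[2];          // extract: x is 12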
+
+// Define operators for this class
+
+// vector operator + : add
+static inline Vec4uq operator+(Vec4uq const &a, Vec4uq const &b) { return Vec4uq(Vec4q(a) + Vec4q(b)); }
+
+// vector operator - : subtract
+static inline Vec4uq operator-(Vec4uq const &a, Vec4uq const &b) { return Vec4uq(Vec4q(a) - Vec4q(b)); }
+
+// vector operator * : multiply element by element
+static inline Vec4uq operator*(Vec4uq const &a, Vec4uq const &b) { return Vec4uq(Vec4q(a) * Vec4q(b)); }
+
+// vector operator >> : shift right logical all elements
+static inline Vec4uq operator>>(Vec4uq const &a, uint32_t b) { return Vec4uq(a.get_low() >> b, a.get_high() >> b); }
+
+// vector operator >> : shift right logical all elements
+static inline Vec4uq operator>>(Vec4uq const &a, int32_t b) { return a >> (uint32_t)b; }
+
+// vector operator >>= : shift right logical
+static inline Vec4uq &operator>>=(Vec4uq &a, uint32_t b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator << : shift left all elements
+static inline Vec4uq operator<<(Vec4uq const &a, uint32_t b) { return Vec4uq((Vec4q)a << (int32_t)b); }
+
+// vector operator << : shift left all elements
+static inline Vec4uq operator<<(Vec4uq const &a, int32_t b) { return Vec4uq((Vec4q)a << b); }
+
+// vector operator > : returns true for elements for which a > b (unsigned)
+static inline Vec4qb operator>(Vec4uq const &a, Vec4uq const &b)
+{
+  return Vec4q(a.get_low() > b.get_low(), a.get_high() > b.get_high());
+}
+
+// vector operator < : returns true for elements for which a < b (unsigned)
+static inline Vec4qb operator<(Vec4uq const &a, Vec4uq const &b) { return b > a; }
+
+// vector operator >= : returns true for elements for which a >= b (unsigned)
+static inline Vec4qb operator>=(Vec4uq const &a, Vec4uq const &b)
+{
+  return Vec4q(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
+}
+
+// vector operator <= : returns true for elements for which a <= b (unsigned)
+static inline Vec4qb operator<=(Vec4uq const &a, Vec4uq const &b) { return b >= a; }
+
+// vector operator & : bitwise and
+static inline Vec4uq operator&(Vec4uq const &a, Vec4uq const &b)
+{
+  return Vec4uq(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+static inline Vec4uq operator&&(Vec4uq const &a, Vec4uq const &b) { return a & b; }
+
+// vector operator | : bitwise or
+static inline Vec4uq operator|(Vec4uq const &a, Vec4uq const &b)
+{
+  return Vec4uq(a.get_low() | b.get_low(), a.get_high() | b.get_high());
+}
+static inline Vec4uq operator||(Vec4uq const &a, Vec4uq const &b) { return a | b; }
+
+// vector operator ^ : bitwise xor
+static inline Vec4uq operator^(Vec4uq const &a, Vec4uq const &b)
+{
+  return Vec4uq(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
+}
+
+// vector operator ~ : bitwise not
+static inline Vec4uq operator~(Vec4uq const &a) { return Vec4uq(~a.get_low(), ~a.get_high()); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
+// Each word in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec4uq select(Vec4qb const &s, Vec4uq const &a, Vec4uq const &b) { return selectb(s, a, b); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec4uq if_add(Vec4qb const &f, Vec4uq const &a, Vec4uq const &b) { return a + (Vec4uq(f) & b); }
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline uint64_t horizontal_add(Vec4uq const &a) { return horizontal_add((Vec4q)a); }
+
+// function max: a > b ? a : b
+static inline Vec4uq max(Vec4uq const &a, Vec4uq const &b)
+{
+  return Vec4uq(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high()));
+}
+
+// function min: a < b ? a : b
+static inline Vec4uq min(Vec4uq const &a, Vec4uq const &b)
+{
+  return Vec4uq(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high()));
+}
+
+/*****************************************************************************
+ *
+ *          Vector permute functions
+ *
+ ******************************************************************************
+ *
+ * These permute functions can reorder the elements of a vector and optionally
+ * set some elements to zero.
+ *
+ * The indexes are inserted as template parameters in <>. These indexes must be
+ * constants. Each template parameter is an index to the element you want to select.
+ * An index of -1 will generate zero. An index of -256 means don't care.
+ *
+ * Example:
+ * Vec8i a(10,11,12,13,14,15,16,17);      // a is (10,11,12,13,14,15,16,17)
+ * Vec8i b;
+ * b = permute8i<0,2,7,7,-1,-1,1,1>(a);   // b is (10,12,17,17, 0, 0,11,11)
+ *
+ * A lot of the code here is metaprogramming aiming to find the instructions
+ * that best fit the template parameters and instruction set. The metacode
+ * will be reduced out to leave only a few vector instructions in release
+ * mode with optimization on.
+ *****************************************************************************/
+
+// Shuffle vector of 4 64-bit integers.
+// Index -1 gives 0, index -256 means don't care.
+template <int i0, int i1, int i2, int i3>
+static inline Vec4q permute4q(Vec4q const &a)
+{
+  return Vec4q(blend2q<i0, i1>(a.get_low(), a.get_high()), blend2q<i2, i3>(a.get_low(), a.get_high()));
+}
+
+template <int i0, int i1, int i2, int i3>
+static inline Vec4uq permute4uq(Vec4uq const &a)
+{
+  return Vec4uq(permute4q<i0, i1, i2, i3>(a));
+}
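+
+// Usage sketch for the 64-bit permute (editor's illustrative example, not part of the original library):
+// Vec4q a(10, 11, 12, 13);
+// Vec4q b = permute4q<3, 2, 1, 0>(a);    // b is (13, 12, 11, 10)
+// Vec4q c = permute4q<1, 1, -1, 2>(a);   // c is (11, 11, 0, 12), index -1 gives zero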
+
+// Shuffle vector of 8 32-bit integers.
+// Index -1 gives 0, index -256 means don't care.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8i permute8i(Vec8i const &a)
+{
+  return Vec8i(blend4i<i0, i1, i2, i3>(a.get_low(), a.get_high()), blend4i<i4, i5, i6, i7>(a.get_low(), a.get_high()));
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8ui permute8ui(Vec8ui const &a)
+{
+  return Vec8ui(permute8i<i0, i1, i2, i3, i4, i5, i6, i7>(a));
+}
+
+// Shuffle vector of 16 16-bit integers.
+// Index -1 gives 0, index -256 means don't care.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16s permute16s(Vec16s const &a)
+{
+  return Vec16s(blend8s<i0, i1, i2, i3, i4, i5, i6, i7>(a.get_low(), a.get_high()),
+                blend8s<i8, i9, i10, i11, i12, i13, i14, i15>(a.get_low(), a.get_high()));
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16us permute16us(Vec16us const &a)
+{
+  return Vec16us(permute16s<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15>(a));
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15, int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23, int i24, int i25, int i26, int i27, int i28,
+          int i29, int i30, int i31>
+static inline Vec32c permute32c(Vec32c const &a)
+{
+  return Vec32c(blend16c<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15>(a.get_low(), a.get_high()),
+                blend16c<i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31>(a.get_low(), a.get_high()));
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15, int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23, int i24, int i25, int i26, int i27, int i28,
+          int i29, int i30, int i31>
+static inline Vec32uc permute32uc(Vec32uc const &a)
+{
+  return Vec32uc(permute32c<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22,
+                            i23, i24, i25, i26, i27, i28, i29, i30, i31>(a));
+}
+
+/*****************************************************************************
+ *
+ *          Vector blend functions
+ *
+ ******************************************************************************
+ *
+ * These blend functions can mix elements from two different vectors and
+ * optionally set some elements to zero.
+ *
+ * The indexes are inserted as template parameters in <>. These indexes must be
+ * constants. Each template parameter is an index to the element you want to
+ * select, where higher indexes indicate an element from the second source
+ * vector. For example, if each vector has 8 elements, then indexes 0 - 7
+ * will select an element from the first vector and indexes 8 - 15 will select
+ * an element from the second vector. A negative index will generate zero.
+ *
+ * Example:
+ * Vec8i a(100,101,102,103,104,105,106,107); // a is (100, 101, 102, 103, 104, 105, 106, 107)
+ * Vec8i b(200,201,202,203,204,205,206,207); // b is (200, 201, 202, 203, 204, 205, 206, 207)
+ * Vec8i c;
+ * c = blend8i<1,0,9,8,7,-1,15,15> (a,b);    // c is (101, 100, 201, 200, 107,   0, 207, 207)
+ *
+ * A lot of the code here is metaprogramming aiming to find the instructions
+ * that best fit the template parameters and instruction set. The metacode
+ * will be reduced out to leave only a few vector instructions in release
+ * mode with optimization on.
+ *****************************************************************************/
+
+// helper function used below
+template <int n>
+static inline Vec2q select4(Vec4q const &a, Vec4q const &b)
+{
+  switch(n)
+    {
+      case 0:
+        return a.get_low();
+      case 1:
+        return a.get_high();
+      case 2:
+        return b.get_low();
+      case 3:
+        return b.get_high();
+    }
+  return _mm_setzero_si128();
+}
+
+// blend vectors Vec4q
+template <int i0, int i1, int i2, int i3>
+static inline Vec4q blend4q(Vec4q const &a, Vec4q const &b)
+{
+  const int j0 = i0 >= 0 ? i0 / 2 : i0;
+  const int j1 = i1 >= 0 ? i1 / 2 : i1;
+  const int j2 = i2 >= 0 ? i2 / 2 : i2;
+  const int j3 = i3 >= 0 ? i3 / 2 : i3;
+  Vec2q x0, x1;
+
+  if(j0 == j1 || i0 < 0 || i1 < 0)
+    {  // both from same
+      const int k0 = j0 >= 0 ? j0 : j1;
+      x0           = permute2q<i0 & -7, i1 & -7>(select4<k0>(a, b));
+    }
+  else
+    {
+      x0 = blend2q<i0 & -7, (i1 & -7) | 2>(select4<j0>(a, b), select4<j1>(a, b));
+    }
+  if(j2 == j3 || i2 < 0 || i3 < 0)
+    {  // both from same
+      const int k1 = j2 >= 0 ? j2 : j3;
+      x1           = permute2q<i2 & -7, i3 & -7>(select4<k1>(a, b));
+    }
+  else
+    {
+      x1 = blend2q<i2 & -7, (i3 & -7) | 2>(select4<j2>(a, b), select4<j3>(a, b));
+    }
+  return Vec4q(x0, x1);
+}
+
+template <int i0, int i1, int i2, int i3>
+static inline Vec4uq blend4uq(Vec4uq const &a, Vec4uq const &b)
+{
+  return Vec4uq(blend4q<i0, i1, i2, i3>(a, b));
+}
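+
+// Usage sketch for the 64-bit blend (editor's illustrative example, not part of the original library):
+// Vec4q a(10, 11, 12, 13);
+// Vec4q b(20, 21, 22, 23);
+// Vec4q c = blend4q<0, 4, -1, 7>(a, b);  // c is (10, 20, 0, 23): indexes 0-3 select from a, 4-7 from b, -1 gives zero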
+
+// helper function used below
+template <int n>
+static inline Vec4i select4(Vec8i const &a, Vec8i const &b)
+{
+  switch(n)
+    {
+      case 0:
+        return a.get_low();
+      case 1:
+        return a.get_high();
+      case 2:
+        return b.get_low();
+      case 3:
+        return b.get_high();
+    }
+  return _mm_setzero_si128();
+}
+
+// blend vectors Vec8i
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8i blend8i(Vec8i const &a, Vec8i const &b)
+{
+  const int j0 = i0 >= 0 ? i0 / 4 : i0;
+  const int j1 = i1 >= 0 ? i1 / 4 : i1;
+  const int j2 = i2 >= 0 ? i2 / 4 : i2;
+  const int j3 = i3 >= 0 ? i3 / 4 : i3;
+  const int j4 = i4 >= 0 ? i4 / 4 : i4;
+  const int j5 = i5 >= 0 ? i5 / 4 : i5;
+  const int j6 = i6 >= 0 ? i6 / 4 : i6;
+  const int j7 = i7 >= 0 ? i7 / 4 : i7;
+  Vec4i x0, x1;
+
+  const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2 >= 0 ? j2 : j3;
+  const int r1 = j4 >= 0 ? j4 : j5 >= 0 ? j5 : j6 >= 0 ? j6 : j7;
+  const int s0 = (j1 >= 0 && j1 != r0) ? j1 : (j2 >= 0 && j2 != r0) ? j2 : j3;
+  const int s1 = (j5 >= 0 && j5 != r1) ? j5 : (j6 >= 0 && j6 != r1) ? j6 : j7;
+
+  // Combine all the indexes into a single bitfield, with 4 bits for each
+  const int m1 = (i0 & 0xF) | (i1 & 0xF) << 4 | (i2 & 0xF) << 8 | (i3 & 0xF) << 12 | (i4 & 0xF) << 16 | (i5 & 0xF) << 20 |
+                 (i6 & 0xF) << 24 | (i7 & 0xF) << 28;
+
+  // Mask to zero out negative indexes
+  const int mz = (i0 < 0 ? 0 : 0xF) | (i1 < 0 ? 0 : 0xF) << 4 | (i2 < 0 ? 0 : 0xF) << 8 | (i3 < 0 ? 0 : 0xF) << 12 |
+                 (i4 < 0 ? 0 : 0xF) << 16 | (i5 < 0 ? 0 : 0xF) << 20 | (i6 < 0 ? 0 : 0xF) << 24 | (i7 < 0 ? 0 : 0xF) << 28;
+
+  if(r0 < 0)
+    {
+      x0 = _mm_setzero_si128();
+    }
+  else if(((m1 ^ r0 * 0x4444) & 0xCCCC & mz) == 0)
+    {
+      // i0 - i3 all from same source
+      x0 = permute4i<i0 & -13, i1 & -13, i2 & -13, i3 & -13>(select4<r0>(a, b));
+    }
+  else if((j2 < 0 || j2 == r0 || j2 == s0) && (j3 < 0 || j3 == r0 || j3 == s0))
+    {
+      // i0 - i3 all from two sources
+      const int k0 = i0 >= 0 ? i0 & 3 : i0;
+      const int k1 = (i1 >= 0 ? i1 & 3 : i1) | (j1 == s0 ? 4 : 0);
+      const int k2 = (i2 >= 0 ? i2 & 3 : i2) | (j2 == s0 ? 4 : 0);
+      const int k3 = (i3 >= 0 ? i3 & 3 : i3) | (j3 == s0 ? 4 : 0);
+      x0           = blend4i<k0, k1, k2, k3>(select4<r0>(a, b), select4<s0>(a, b));
+    }
+  else
+    {
+      // i0 - i3 from three or four different sources
+      x0 = blend4i<0, 1, 6, 7>(blend4i<i0 & -13, (i1 & -13) | 4, -0x100, -0x100>(select4<j0>(a, b), select4<j1>(a, b)),
+                               blend4i<-0x100, -0x100, i2 & -13, (i3 & -13) | 4>(select4<j2>(a, b), select4<j3>(a, b)));
+    }
+
+  if(r1 < 0)
+    {
+      x1 = _mm_setzero_si128();
+    }
+  else if(((m1 ^ uint32_t(r1) * 0x44440000u) & 0xCCCC0000 & mz) == 0)
+    {
+      // i4 - i7 all from same source
+      x1 = permute4i<i4 & -13, i5 & -13, i6 & -13, i7 & -13>(select4<r1>(a, b));
+    }
+  else if((j6 < 0 || j6 == r1 || j6 == s1) && (j7 < 0 || j7 == r1 || j7 == s1))
+    {
+      // i4 - i7 all from two sources
+      const int k4 = i4 >= 0 ? i4 & 3 : i4;
+      const int k5 = (i5 >= 0 ? i5 & 3 : i5) | (j5 == s1 ? 4 : 0);
+      const int k6 = (i6 >= 0 ? i6 & 3 : i6) | (j6 == s1 ? 4 : 0);
+      const int k7 = (i7 >= 0 ? i7 & 3 : i7) | (j7 == s1 ? 4 : 0);
+      x1           = blend4i<k4, k5, k6, k7>(select4<r1>(a, b), select4<s1>(a, b));
+    }
+  else
+    {
+      // i4 - i7 from three or four different sources
+      x1 = blend4i<0, 1, 6, 7>(blend4i<i4 & -13, (i5 & -13) | 4, -0x100, -0x100>(select4<j4>(a, b), select4<j5>(a, b)),
+                               blend4i<-0x100, -0x100, i6 & -13, (i7 & -13) | 4>(select4<j6>(a, b), select4<j7>(a, b)));
+    }
+
+  return Vec8i(x0, x1);
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8ui blend8ui(Vec8ui const &a, Vec8ui const &b)
+{
+  return Vec8ui(blend8i<i0, i1, i2, i3, i4, i5, i6, i7>(a, b));
+}
+
+// helper function used below
+template <int n>
+static inline Vec8s select4(Vec16s const &a, Vec16s const &b)
+{
+  switch(n)
+    {
+      case 0:
+        return a.get_low();
+      case 1:
+        return a.get_high();
+      case 2:
+        return b.get_low();
+      case 3:
+        return b.get_high();
+    }
+  return _mm_setzero_si128();
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16s blend16s(Vec16s const &a, Vec16s const &b)
+{
+  const int j0  = i0 >= 0 ? i0 / 8 : i0;
+  const int j1  = i1 >= 0 ? i1 / 8 : i1;
+  const int j2  = i2 >= 0 ? i2 / 8 : i2;
+  const int j3  = i3 >= 0 ? i3 / 8 : i3;
+  const int j4  = i4 >= 0 ? i4 / 8 : i4;
+  const int j5  = i5 >= 0 ? i5 / 8 : i5;
+  const int j6  = i6 >= 0 ? i6 / 8 : i6;
+  const int j7  = i7 >= 0 ? i7 / 8 : i7;
+  const int j8  = i8 >= 0 ? i8 / 8 : i8;
+  const int j9  = i9 >= 0 ? i9 / 8 : i9;
+  const int j10 = i10 >= 0 ? i10 / 8 : i10;
+  const int j11 = i11 >= 0 ? i11 / 8 : i11;
+  const int j12 = i12 >= 0 ? i12 / 8 : i12;
+  const int j13 = i13 >= 0 ? i13 / 8 : i13;
+  const int j14 = i14 >= 0 ? i14 / 8 : i14;
+  const int j15 = i15 >= 0 ? i15 / 8 : i15;
+
+  Vec8s x0, x1;
+
+  const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2 >= 0 ? j2 : j3 >= 0 ? j3 : j4 >= 0 ? j4 : j5 >= 0 ? j5 : j6 >= 0 ? j6 : j7;
+  const int r1 =
+      j8 >= 0 ? j8 : j9 >= 0 ? j9 : j10 >= 0 ? j10 : j11 >= 0 ? j11 : j12 >= 0 ? j12 : j13 >= 0 ? j13 : j14 >= 0 ? j14 : j15;
+  const int s0 = (j1 >= 0 && j1 != r0)
+                     ? j1
+                     : (j2 >= 0 && j2 != r0)
+                           ? j2
+                           : (j3 >= 0 && j3 != r0)
+                                 ? j3
+                                 : (j4 >= 0 && j4 != r0) ? j4 : (j5 >= 0 && j5 != r0) ? j5 : (j6 >= 0 && j6 != r0) ? j6 : j7;
+  const int s1 = (j9 >= 0 && j9 != r1)
+                     ? j9
+                     : (j10 >= 0 && j10 != r1)
+                           ? j10
+                           : (j11 >= 0 && j11 != r1)
+                                 ? j11
+                                 : (j12 >= 0 && j12 != r1) ? j12 : (j13 >= 0 && j13 != r1) ? j13 : (j14 >= 0 && j14 != r1) ? j14 : j15;
+
+  if(r0 < 0)
+    {
+      x0 = _mm_setzero_si128();
+    }
+  else if(r0 == s0)
+    {
+      // i0 - i7 all from same source
+      x0 = permute8s<i0 & -25, i1 & -25, i2 & -25, i3 & -25, i4 & -25, i5 & -25, i6 & -25, i7 & -25>(select4<r0>(a, b));
+    }
+  else if((j2 < 0 || j2 == r0 || j2 == s0) && (j3 < 0 || j3 == r0 || j3 == s0) && (j4 < 0 || j4 == r0 || j4 == s0) &&
+          (j5 < 0 || j5 == r0 || j5 == s0) && (j6 < 0 || j6 == r0 || j6 == s0) && (j7 < 0 || j7 == r0 || j7 == s0))
+    {
+      // i0 - i7 all from two sources
+      const int k0 = i0 >= 0 ? i0 & 7 : i0;
+      const int k1 = (i1 >= 0 ? i1 & 7 : i1) | (j1 == s0 ? 8 : 0);
+      const int k2 = (i2 >= 0 ? i2 & 7 : i2) | (j2 == s0 ? 8 : 0);
+      const int k3 = (i3 >= 0 ? i3 & 7 : i3) | (j3 == s0 ? 8 : 0);
+      const int k4 = (i4 >= 0 ? i4 & 7 : i4) | (j4 == s0 ? 8 : 0);
+      const int k5 = (i5 >= 0 ? i5 & 7 : i5) | (j5 == s0 ? 8 : 0);
+      const int k6 = (i6 >= 0 ? i6 & 7 : i6) | (j6 == s0 ? 8 : 0);
+      const int k7 = (i7 >= 0 ? i7 & 7 : i7) | (j7 == s0 ? 8 : 0);
+      x0           = blend8s<k0, k1, k2, k3, k4, k5, k6, k7>(select4<r0>(a, b), select4<s0>(a, b));
+    }
+  else
+    {
+      // i0 - i7 from three or four different sources
+      const int n0 = j0 >= 0 ? j0 / 2 * 8 + 0 : j0;
+      const int n1 = j1 >= 0 ? j1 / 2 * 8 + 1 : j1;
+      const int n2 = j2 >= 0 ? j2 / 2 * 8 + 2 : j2;
+      const int n3 = j3 >= 0 ? j3 / 2 * 8 + 3 : j3;
+      const int n4 = j4 >= 0 ? j4 / 2 * 8 + 4 : j4;
+      const int n5 = j5 >= 0 ? j5 / 2 * 8 + 5 : j5;
+      const int n6 = j6 >= 0 ? j6 / 2 * 8 + 6 : j6;
+      const int n7 = j7 >= 0 ? j7 / 2 * 8 + 7 : j7;
+      x0           = blend8s<n0, n1, n2, n3, n4, n5, n6, n7>(
+          blend8s < j0 & 2 ? -256 : i0 & 15, j1 & 2 ? -256 : i1 & 15, j2 & 2 ? -256 : i2 & 15, j3 & 2 ? -256 : i3 & 15,
+          j4 & 2 ? -256 : i4 & 15, j5 & 2 ? -256 : i5 & 15, j6 & 2 ? -256 : i6 & 15,
+          j7 & 2 ? -256 : i7 & 15 > (a.get_low(), a.get_high()), blend8s < (j0 ^ 2) & 6 ? -256 : i0 & 15,
+          (j1 ^ 2) & 6 ? -256 : i1 & 15, (j2 ^ 2) & 6 ? -256 : i2 & 15, (j3 ^ 2) & 6 ? -256 : i3 & 15, (j4 ^ 2) & 6 ? -256 : i4 & 15,
+          (j5 ^ 2) & 6 ? -256 : i5 & 15, (j6 ^ 2) & 6 ? -256 : i6 & 15, (j7 ^ 2) & 6 ? -256 : i7 & 15 > (b.get_low(), b.get_high()));
+    }
+
+  if(r1 < 0)
+    {
+      x1 = _mm_setzero_si128();
+    }
+  else if(r1 == s1)
+    {
+      // i8 - i15 all from same source
+      x1 = permute8s<i8 & -25, i9 & -25, i10 & -25, i11 & -25, i12 & -25, i13 & -25, i14 & -25, i15 & -25>(select4<r1>(a, b));
+    }
+  else if((j10 < 0 || j10 == r1 || j10 == s1) && (j11 < 0 || j11 == r1 || j11 == s1) && (j12 < 0 || j12 == r1 || j12 == s1) &&
+          (j13 < 0 || j13 == r1 || j13 == s1) && (j14 < 0 || j14 == r1 || j14 == s1) && (j15 < 0 || j15 == r1 || j15 == s1))
+    {
+      // i8 - i15 all from two sources
+      const int k8  = i8 >= 0 ? i8 & 7 : i8;
+      const int k9  = (i9 >= 0 ? i9 & 7 : i9) | (j9 == s1 ? 8 : 0);
+      const int k10 = (i10 >= 0 ? i10 & 7 : i10) | (j10 == s1 ? 8 : 0);
+      const int k11 = (i11 >= 0 ? i11 & 7 : i11) | (j11 == s1 ? 8 : 0);
+      const int k12 = (i12 >= 0 ? i12 & 7 : i12) | (j12 == s1 ? 8 : 0);
+      const int k13 = (i13 >= 0 ? i13 & 7 : i13) | (j13 == s1 ? 8 : 0);
+      const int k14 = (i14 >= 0 ? i14 & 7 : i14) | (j14 == s1 ? 8 : 0);
+      const int k15 = (i15 >= 0 ? i15 & 7 : i15) | (j15 == s1 ? 8 : 0);
+      x1            = blend8s<k8, k9, k10, k11, k12, k13, k14, k15>(select4<r1>(a, b), select4<s1>(a, b));
+    }
+  else
+    {
+      // i8 - i15 from three or four different sources
+      const int n8  = j8 >= 0 ? j8 / 2 * 8 + 0 : j8;
+      const int n9  = j9 >= 0 ? j9 / 2 * 8 + 1 : j9;
+      const int n10 = j10 >= 0 ? j10 / 2 * 8 + 2 : j10;
+      const int n11 = j11 >= 0 ? j11 / 2 * 8 + 3 : j11;
+      const int n12 = j12 >= 0 ? j12 / 2 * 8 + 4 : j12;
+      const int n13 = j13 >= 0 ? j13 / 2 * 8 + 5 : j13;
+      const int n14 = j14 >= 0 ? j14 / 2 * 8 + 6 : j14;
+      const int n15 = j15 >= 0 ? j15 / 2 * 8 + 7 : j15;
+      x1            = blend8s<n8, n9, n10, n11, n12, n13, n14, n15>(
+          blend8s < j8 & 2 ? -256 : i8 & 15, j9 & 2 ? -256 : i9 & 15, j10 & 2 ? -256 : i10 & 15, j11 & 2 ? -256 : i11 & 15,
+          j12 & 2 ? -256 : i12 & 15, j13 & 2 ? -256 : i13 & 15, j14 & 2 ? -256 : i14 & 15,
+          j15 & 2 ? -256 : i15 & 15 > (a.get_low(), a.get_high()), blend8s < (j8 ^ 2) & 6 ? -256 : i8 & 15,
+          (j9 ^ 2) & 6 ? -256 : i9 & 15, (j10 ^ 2) & 6 ? -256 : i10 & 15, (j11 ^ 2) & 6 ? -256 : i11 & 15,
+          (j12 ^ 2) & 6 ? -256 : i12 & 15, (j13 ^ 2) & 6 ? -256 : i13 & 15, (j14 ^ 2) & 6 ? -256 : i14 & 15,
+          (j15 ^ 2) & 6 ? -256 : i15 & 15 > (b.get_low(), b.get_high()));
+    }
+  return Vec16s(x0, x1);
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16us blend16us(Vec16us const &a, Vec16us const &b)
+{
+  return Vec16us(blend16s<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15>(a, b));
+}
+
+// helper function used below
+template <int n>
+static inline Vec16c select4(Vec32c const &a, Vec32c const &b)
+{
+  switch(n)
+    {
+      case 0:
+        return a.get_low();
+      case 1:
+        return a.get_high();
+      case 2:
+        return b.get_low();
+      case 3:
+        return b.get_high();
+    }
+  return _mm_setzero_si128();
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15, int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23, int i24, int i25, int i26, int i27, int i28,
+          int i29, int i30, int i31>
+static inline Vec32c blend32c(Vec32c const &a, Vec32c const &b)
+{
+  // j0 - j31 indicate one of four 16-byte sources
+  const int j0  = i0 >= 0 ? i0 / 16 : i0;
+  const int j1  = i1 >= 0 ? i1 / 16 : i1;
+  const int j2  = i2 >= 0 ? i2 / 16 : i2;
+  const int j3  = i3 >= 0 ? i3 / 16 : i3;
+  const int j4  = i4 >= 0 ? i4 / 16 : i4;
+  const int j5  = i5 >= 0 ? i5 / 16 : i5;
+  const int j6  = i6 >= 0 ? i6 / 16 : i6;
+  const int j7  = i7 >= 0 ? i7 / 16 : i7;
+  const int j8  = i8 >= 0 ? i8 / 16 : i8;
+  const int j9  = i9 >= 0 ? i9 / 16 : i9;
+  const int j10 = i10 >= 0 ? i10 / 16 : i10;
+  const int j11 = i11 >= 0 ? i11 / 16 : i11;
+  const int j12 = i12 >= 0 ? i12 / 16 : i12;
+  const int j13 = i13 >= 0 ? i13 / 16 : i13;
+  const int j14 = i14 >= 0 ? i14 / 16 : i14;
+  const int j15 = i15 >= 0 ? i15 / 16 : i15;
+  const int j16 = i16 >= 0 ? i16 / 16 : i16;
+  const int j17 = i17 >= 0 ? i17 / 16 : i17;
+  const int j18 = i18 >= 0 ? i18 / 16 : i18;
+  const int j19 = i19 >= 0 ? i19 / 16 : i19;
+  const int j20 = i20 >= 0 ? i20 / 16 : i20;
+  const int j21 = i21 >= 0 ? i21 / 16 : i21;
+  const int j22 = i22 >= 0 ? i22 / 16 : i22;
+  const int j23 = i23 >= 0 ? i23 / 16 : i23;
+  const int j24 = i24 >= 0 ? i24 / 16 : i24;
+  const int j25 = i25 >= 0 ? i25 / 16 : i25;
+  const int j26 = i26 >= 0 ? i26 / 16 : i26;
+  const int j27 = i27 >= 0 ? i27 / 16 : i27;
+  const int j28 = i28 >= 0 ? i28 / 16 : i28;
+  const int j29 = i29 >= 0 ? i29 / 16 : i29;
+  const int j30 = i30 >= 0 ? i30 / 16 : i30;
+  const int j31 = i31 >= 0 ? i31 / 16 : i31;
+
+  Vec16c x0, x1;
+
+  // r0, s0 = first two sources of low  destination (i0  - i15)
+  // r1, s1 = first two sources of high destination (i16 - i31)
+  const int r0 =
+      j0 >= 0
+          ? j0
+          : j1 >= 0
+                ? j1
+                : j2 >= 0
+                      ? j2
+                      : j3 >= 0
+                            ? j3
+                            : j4 >= 0
+                                  ? j4
+                                  : j5 >= 0
+                                        ? j5
+                                        : j6 >= 0
+                                              ? j6
+                                              : j7 >= 0
+                                                    ? j7
+                                                    : j8 >= 0 ? j8
+                                                              : j9 >= 0 ? j9
+                                                                        : j10 >= 0
+                                                                              ? j10
+                                                                              : j11 >= 0
+                                                                                    ? j11
+                                                                                    : j12 >= 0 ? j12
+                                                                                               : j13 >= 0 ? j13 : j14 >= 0 ? j14 : j15;
+  const int r1 =
+      j16 >= 0
+          ? j16
+          : j17 >= 0
+                ? j17
+                : j18 >= 0
+                      ? j18
+                      : j19 >= 0
+                            ? j19
+                            : j20 >= 0
+                                  ? j20
+                                  : j21 >= 0
+                                        ? j21
+                                        : j22 >= 0
+                                              ? j22
+                                              : j23 >= 0
+                                                    ? j23
+                                                    : j24 >= 0
+                                                          ? j24
+                                                          : j25 >= 0
+                                                                ? j25
+                                                                : j26 >= 0
+                                                                      ? j26
+                                                                      : j27 >= 0
+                                                                            ? j27
+                                                                            : j28 >= 0 ? j28 : j29 >= 0 ? j29 : j30 >= 0 ? j30 : j31;
+  const int s0 = (j1 >= 0 && j1 != r0)
+                     ? j1
+                     : (j2 >= 0 && j2 != r0)
+                           ? j2
+                           : (j3 >= 0 && j3 != r0)
+                                 ? j3
+                                 : (j4 >= 0 && j4 != r0)
+                                       ? j4
+                                       : (j5 >= 0 && j5 != r0)
+                                             ? j5
+                                             : (j6 >= 0 && j6 != r0)
+                                                   ? j6
+                                                   : (j7 >= 0 && j7 != r0)
+                                                         ? j7
+                                                         : (j8 >= 0 && j8 != r0)
+                                                               ? j8
+                                                               : (j9 >= 0 && j9 != r0)
+                                                                     ? j9
+                                                                     : (j10 >= 0 && j10 != r0)
+                                                                           ? j10
+                                                                           : (j11 >= 0 && j11 != r0)
+                                                                                 ? j11
+                                                                                 : (j12 >= 0 && j12 != r0)
+                                                                                       ? j12
+                                                                                       : (j13 >= 0 && j13 != r0)
+                                                                                             ? j13
+                                                                                             : (j14 >= 0 && j14 != r0) ? j14 : j15;
+  const int s1 = (j17 >= 0 && j17 != r1)
+                     ? j17
+                     : (j18 >= 0 && j18 != r1)
+                           ? j18
+                           : (j19 >= 0 && j19 != r1)
+                                 ? j19
+                                 : (j20 >= 0 && j20 != r1)
+                                       ? j20
+                                       : (j21 >= 0 && j21 != r1)
+                                             ? j21
+                                             : (j22 >= 0 && j22 != r1)
+                                                   ? j22
+                                                   : (j23 >= 0 && j23 != r1)
+                                                         ? j23
+                                                         : (j24 >= 0 && j24 != r1)
+                                                               ? j24
+                                                               : (j25 >= 0 && j25 != r1)
+                                                                     ? j25
+                                                                     : (j26 >= 0 && j26 != r1)
+                                                                           ? j26
+                                                                           : (j27 >= 0 && j27 != r1)
+                                                                                 ? j27
+                                                                                 : (j28 >= 0 && j28 != r1)
+                                                                                       ? j28
+                                                                                       : (j29 >= 0 && j29 != r1)
+                                                                                             ? j29
+                                                                                             : (j30 >= 0 && j30 != r1) ? j30 : j31;
+
+  if(r0 < 0)
+    {
+      x0 = _mm_setzero_si128();
+    }
+  else if(r0 == s0)
+    {
+      // i0 - i15 all from same source
+      x0 = permute16c<i0 & -49, i1 & -49, i2 & -49, i3 & -49, i4 & -49, i5 & -49, i6 & -49, i7 & -49, i8 & -49, i9 & -49, i10 & -49,
+                      i11 & -49, i12 & -49, i13 & -49, i14 & -49, i15 & -49>(select4<r0>(a, b));
+    }
+  else if((j2 < 0 || j2 == r0 || j2 == s0) && (j3 < 0 || j3 == r0 || j3 == s0) && (j4 < 0 || j4 == r0 || j4 == s0) &&
+          (j5 < 0 || j5 == r0 || j5 == s0) && (j6 < 0 || j6 == r0 || j6 == s0) && (j7 < 0 || j7 == r0 || j7 == s0) &&
+          (j8 < 0 || j8 == r0 || j8 == s0) && (j9 < 0 || j9 == r0 || j9 == s0) && (j10 < 0 || j10 == r0 || j10 == s0) &&
+          (j11 < 0 || j11 == r0 || j11 == s0) && (j12 < 0 || j12 == r0 || j12 == s0) && (j13 < 0 || j13 == r0 || j13 == s0) &&
+          (j14 < 0 || j14 == r0 || j14 == s0) && (j15 < 0 || j15 == r0 || j15 == s0))
+    {
+      // i0 - i15 all from two sources
+      const int k0  = i0 >= 0 ? i0 & 15 : i0;
+      const int k1  = (i1 >= 0 ? i1 & 15 : i1) | (j1 == s0 ? 16 : 0);
+      const int k2  = (i2 >= 0 ? i2 & 15 : i2) | (j2 == s0 ? 16 : 0);
+      const int k3  = (i3 >= 0 ? i3 & 15 : i3) | (j3 == s0 ? 16 : 0);
+      const int k4  = (i4 >= 0 ? i4 & 15 : i4) | (j4 == s0 ? 16 : 0);
+      const int k5  = (i5 >= 0 ? i5 & 15 : i5) | (j5 == s0 ? 16 : 0);
+      const int k6  = (i6 >= 0 ? i6 & 15 : i6) | (j6 == s0 ? 16 : 0);
+      const int k7  = (i7 >= 0 ? i7 & 15 : i7) | (j7 == s0 ? 16 : 0);
+      const int k8  = (i8 >= 0 ? i8 & 15 : i8) | (j8 == s0 ? 16 : 0);
+      const int k9  = (i9 >= 0 ? i9 & 15 : i9) | (j9 == s0 ? 16 : 0);
+      const int k10 = (i10 >= 0 ? i10 & 15 : i10) | (j10 == s0 ? 16 : 0);
+      const int k11 = (i11 >= 0 ? i11 & 15 : i11) | (j11 == s0 ? 16 : 0);
+      const int k12 = (i12 >= 0 ? i12 & 15 : i12) | (j12 == s0 ? 16 : 0);
+      const int k13 = (i13 >= 0 ? i13 & 15 : i13) | (j13 == s0 ? 16 : 0);
+      const int k14 = (i14 >= 0 ? i14 & 15 : i14) | (j14 == s0 ? 16 : 0);
+      const int k15 = (i15 >= 0 ? i15 & 15 : i15) | (j15 == s0 ? 16 : 0);
+      x0 = blend16c<k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14, k15>(select4<r0>(a, b), select4<s0>(a, b));
+    }
+  else
+    {
+      // i0 - i15 from three or four different sources
+      const int n0  = j0 >= 0 ? j0 / 2 * 16 + 0 : j0;
+      const int n1  = j1 >= 0 ? j1 / 2 * 16 + 1 : j1;
+      const int n2  = j2 >= 0 ? j2 / 2 * 16 + 2 : j2;
+      const int n3  = j3 >= 0 ? j3 / 2 * 16 + 3 : j3;
+      const int n4  = j4 >= 0 ? j4 / 2 * 16 + 4 : j4;
+      const int n5  = j5 >= 0 ? j5 / 2 * 16 + 5 : j5;
+      const int n6  = j6 >= 0 ? j6 / 2 * 16 + 6 : j6;
+      const int n7  = j7 >= 0 ? j7 / 2 * 16 + 7 : j7;
+      const int n8  = j8 >= 0 ? j8 / 2 * 16 + 8 : j8;
+      const int n9  = j9 >= 0 ? j9 / 2 * 16 + 9 : j9;
+      const int n10 = j10 >= 0 ? j10 / 2 * 16 + 10 : j10;
+      const int n11 = j11 >= 0 ? j11 / 2 * 16 + 11 : j11;
+      const int n12 = j12 >= 0 ? j12 / 2 * 16 + 12 : j12;
+      const int n13 = j13 >= 0 ? j13 / 2 * 16 + 13 : j13;
+      const int n14 = j14 >= 0 ? j14 / 2 * 16 + 14 : j14;
+      const int n15 = j15 >= 0 ? j15 / 2 * 16 + 15 : j15;
+
+      Vec16c x0a = blend16c < j0 & 2 ? -256 : i0 & 31, j1 & 2 ? -256 : i1 & 31, j2 & 2 ? -256 : i2 & 31, j3 & 2 ? -256 : i3 & 31,
+             j4 & 2 ? -256 : i4 & 31, j5 & 2 ? -256 : i5 & 31, j6 & 2 ? -256 : i6 & 31, j7 & 2 ? -256 : i7 & 31,
+             j8 & 2 ? -256 : i8 & 31, j9 & 2 ? -256 : i9 & 31, j10 & 2 ? -256 : i10 & 31, j11 & 2 ? -256 : i11 & 31,
+             j12 & 2 ? -256 : i12 & 31, j13 & 2 ? -256 : i13 & 31, j14 & 2 ? -256 : i14 & 31,
+             j15 & 2 ? -256 : i15 & 31 > (a.get_low(), a.get_high());
+      Vec16c x0b = blend16c < (j0 ^ 2) & 6 ? -256 : i0 & 31, (j1 ^ 2) & 6 ? -256 : i1 & 31, (j2 ^ 2) & 6 ? -256 : i2 & 31,
+             (j3 ^ 2) & 6 ? -256 : i3 & 31, (j4 ^ 2) & 6 ? -256 : i4 & 31, (j5 ^ 2) & 6 ? -256 : i5 & 31,
+             (j6 ^ 2) & 6 ? -256 : i6 & 31, (j7 ^ 2) & 6 ? -256 : i7 & 31, (j8 ^ 2) & 6 ? -256 : i8 & 31,
+             (j9 ^ 2) & 6 ? -256 : i9 & 31, (j10 ^ 2) & 6 ? -256 : i10 & 31, (j11 ^ 2) & 6 ? -256 : i11 & 31,
+             (j12 ^ 2) & 6 ? -256 : i12 & 31, (j13 ^ 2) & 6 ? -256 : i13 & 31, (j14 ^ 2) & 6 ? -256 : i14 & 31,
+             (j15 ^ 2) & 6 ? -256 : i15 & 31 > (b.get_low(), b.get_high());
+      x0 = blend16c<n0, n1, n2, n3, n4, n5, n6, n7, n8, n9, n10, n11, n12, n13, n14, n15>(x0a, x0b);
+    }
+
+  if(r1 < 0)
+    {
+      x1 = _mm_setzero_si128();
+    }
+  else if(r1 == s1)
+    {
+      // i16 - i31 all from same source
+      x1 = permute16c<i16 & -49, i17 & -49, i18 & -49, i19 & -49, i20 & -49, i21 & -49, i22 & -49, i23 & -49, i24 & -49, i25 & -49,
+                      i26 & -49, i27 & -49, i28 & -49, i29 & -49, i30 & -49, i31 & -49>(select4<r1>(a, b));
+    }
+  else if((j18 < 0 || j18 == r1 || j18 == s1) && (j19 < 0 || j19 == r1 || j19 == s1) && (j20 < 0 || j20 == r1 || j20 == s1) &&
+          (j21 < 0 || j21 == r1 || j21 == s1) && (j22 < 0 || j22 == r1 || j22 == s1) && (j23 < 0 || j23 == r1 || j23 == s1) &&
+          (j24 < 0 || j24 == r1 || j24 == s1) && (j25 < 0 || j25 == r1 || j25 == s1) && (j26 < 0 || j26 == r1 || j26 == s1) &&
+          (j27 < 0 || j27 == r1 || j27 == s1) && (j28 < 0 || j28 == r1 || j28 == s1) && (j29 < 0 || j29 == r1 || j29 == s1) &&
+          (j30 < 0 || j30 == r1 || j30 == s1) && (j31 < 0 || j31 == r1 || j31 == s1))
+    {
+      // i16 - i31 all from two sources
+      const int k16 = i16 >= 0 ? i16 & 15 : i16;
+      const int k17 = (i17 >= 0 ? i17 & 15 : i17) | (j17 == s1 ? 16 : 0);
+      const int k18 = (i18 >= 0 ? i18 & 15 : i18) | (j18 == s1 ? 16 : 0);
+      const int k19 = (i19 >= 0 ? i19 & 15 : i19) | (j19 == s1 ? 16 : 0);
+      const int k20 = (i20 >= 0 ? i20 & 15 : i20) | (j20 == s1 ? 16 : 0);
+      const int k21 = (i21 >= 0 ? i21 & 15 : i21) | (j21 == s1 ? 16 : 0);
+      const int k22 = (i22 >= 0 ? i22 & 15 : i22) | (j22 == s1 ? 16 : 0);
+      const int k23 = (i23 >= 0 ? i23 & 15 : i23) | (j23 == s1 ? 16 : 0);
+      const int k24 = (i24 >= 0 ? i24 & 15 : i24) | (j24 == s1 ? 16 : 0);
+      const int k25 = (i25 >= 0 ? i25 & 15 : i25) | (j25 == s1 ? 16 : 0);
+      const int k26 = (i26 >= 0 ? i26 & 15 : i26) | (j26 == s1 ? 16 : 0);
+      const int k27 = (i27 >= 0 ? i27 & 15 : i27) | (j27 == s1 ? 16 : 0);
+      const int k28 = (i28 >= 0 ? i28 & 15 : i28) | (j28 == s1 ? 16 : 0);
+      const int k29 = (i29 >= 0 ? i29 & 15 : i29) | (j29 == s1 ? 16 : 0);
+      const int k30 = (i30 >= 0 ? i30 & 15 : i30) | (j30 == s1 ? 16 : 0);
+      const int k31 = (i31 >= 0 ? i31 & 15 : i31) | (j31 == s1 ? 16 : 0);
+      x1            = blend16c<k16, k17, k18, k19, k20, k21, k22, k23, k24, k25, k26, k27, k28, k29, k30, k31>(select4<r1>(a, b),
+                                                                                                    select4<s1>(a, b));
+    }
+  else
+    {
+      // i16 - i31 from three or four different sources
+      const int n16 = j16 >= 0 ? j16 / 2 * 16 + 0 : j16;
+      const int n17 = j17 >= 0 ? j17 / 2 * 16 + 1 : j17;
+      const int n18 = j18 >= 0 ? j18 / 2 * 16 + 2 : j18;
+      const int n19 = j19 >= 0 ? j19 / 2 * 16 + 3 : j19;
+      const int n20 = j20 >= 0 ? j20 / 2 * 16 + 4 : j20;
+      const int n21 = j21 >= 0 ? j21 / 2 * 16 + 5 : j21;
+      const int n22 = j22 >= 0 ? j22 / 2 * 16 + 6 : j22;
+      const int n23 = j23 >= 0 ? j23 / 2 * 16 + 7 : j23;
+      const int n24 = j24 >= 0 ? j24 / 2 * 16 + 8 : j24;
+      const int n25 = j25 >= 0 ? j25 / 2 * 16 + 9 : j25;
+      const int n26 = j26 >= 0 ? j26 / 2 * 16 + 10 : j26;
+      const int n27 = j27 >= 0 ? j27 / 2 * 16 + 11 : j27;
+      const int n28 = j28 >= 0 ? j28 / 2 * 16 + 12 : j28;
+      const int n29 = j29 >= 0 ? j29 / 2 * 16 + 13 : j29;
+      const int n30 = j30 >= 0 ? j30 / 2 * 16 + 14 : j30;
+      const int n31 = j31 >= 0 ? j31 / 2 * 16 + 15 : j31;
+      x1            = blend16c<n16, n17, n18, n19, n20, n21, n22, n23, n24, n25, n26, n27, n28, n29, n30, n31>(
+          blend16c < j16 & 2 ? -256 : i16 & 31, j17 & 2 ? -256 : i17 & 31, j18 & 2 ? -256 : i18 & 31, j19 & 2 ? -256 : i19 & 31,
+          j20 & 2 ? -256 : i20 & 31, j21 & 2 ? -256 : i21 & 31, j22 & 2 ? -256 : i22 & 31, j23 & 2 ? -256 : i23 & 31,
+          j24 & 2 ? -256 : i24 & 31, j25 & 2 ? -256 : i25 & 31, j26 & 2 ? -256 : i26 & 31, j27 & 2 ? -256 : i27 & 31,
+          j28 & 2 ? -256 : i28 & 31, j29 & 2 ? -256 : i29 & 31, j30 & 2 ? -256 : i30 & 31,
+          j31 & 2 ? -256 : i31 & 31 > (a.get_low(), a.get_high()), blend16c < (j16 ^ 2) & 6 ? -256 : i16 & 31,
+          (j17 ^ 2) & 6 ? -256 : i17 & 31, (j18 ^ 2) & 6 ? -256 : i18 & 31, (j19 ^ 2) & 6 ? -256 : i19 & 31,
+          (j20 ^ 2) & 6 ? -256 : i20 & 31, (j21 ^ 2) & 6 ? -256 : i21 & 31, (j22 ^ 2) & 6 ? -256 : i22 & 31,
+          (j23 ^ 2) & 6 ? -256 : i23 & 31, (j24 ^ 2) & 6 ? -256 : i24 & 31, (j25 ^ 2) & 6 ? -256 : i25 & 31,
+          (j26 ^ 2) & 6 ? -256 : i26 & 31, (j27 ^ 2) & 6 ? -256 : i27 & 31, (j28 ^ 2) & 6 ? -256 : i28 & 31,
+          (j29 ^ 2) & 6 ? -256 : i29 & 31, (j30 ^ 2) & 6 ? -256 : i30 & 31,
+          (j31 ^ 2) & 6 ? -256 : i31 & 31 > (b.get_low(), b.get_high()));
+    }
+  return Vec32c(x0, x1);
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15, int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23, int i24, int i25, int i26, int i27, int i28,
+          int i29, int i30, int i31>
+static inline Vec32uc blend32uc(Vec32uc const &a, Vec32uc const &b)
+{
+  return Vec32uc(blend32c<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23,
+                          i24, i25, i26, i27, i28, i29, i30, i31>(a, b));
+}
+
+/*****************************************************************************
+ *
+ *          Vector lookup functions
+ *
+ ******************************************************************************
+ *
+ * These functions use vector elements as indexes into a table.
+ * The table is given as one or more vectors or as an array.
+ *
+ * This can be used for several purposes:
+ *  - table lookup
+ *  - permute or blend with variable indexes
+ *  - blend from more than two sources
+ *  - gather non-contiguous data
+ *
+ * An index out of range may produce any value - the actual value produced is
+ * implementation dependent and may be different for different instruction
+ * sets. An index out of range does not produce an error message or exception.
+ *
+ * Example:
+ * Vec8i a(2,0,0,6,4,3,5,0);                 // index a is (  2,   0,   0,   6,   4,   3,   5,   0)
+ * Vec8i b(100,101,102,103,104,105,106,107); // table b is (100, 101, 102, 103, 104, 105, 106, 107)
+ * Vec8i c;
+ * c = lookup8 (a,b);                        // c is       (102, 100, 100, 106, 104, 103, 105, 100)
+ *
+ *****************************************************************************/
+
+static inline Vec32c lookup32(Vec32c const &index, Vec32c const &table)
+{
+#if defined(__XOP__)  // AMD XOP instruction set. Use VPPERM
+  Vec16c t0 = _mm_perm_epi8(table.get_low(), table.get_high(), index.get_low());
+  Vec16c t1 = _mm_perm_epi8(table.get_low(), table.get_high(), index.get_high());
+  return Vec32c(t0, t1);
+#else
+  Vec16c t0 = lookup32(index.get_low(), table.get_low(), table.get_high());
+  Vec16c t1 = lookup32(index.get_high(), table.get_low(), table.get_high());
+  return Vec32c(t0, t1);
+#endif
+}
+
+template <int n>
+static inline Vec32c lookup(Vec32uc const &index, void const *table)
+{
+  if(n <= 0)
+    return 0;
+  if(n <= 16)
+    {
+      Vec16c tt = Vec16c().load(table);
+      Vec16c r0 = lookup16(index.get_low(), tt);
+      Vec16c r1 = lookup16(index.get_high(), tt);
+      return Vec32c(r0, r1);
+    }
+  if(n <= 32)
+    return lookup32(index, Vec32c().load(table));
+  // n > 32. Limit index
+  Vec32uc index1;
+  if((n & (n - 1)) == 0)
+    {
+      // n is a power of 2, make index modulo n
+      index1 = Vec32uc(index) & uint8_t(n - 1);
+    }
+  else
+    {
+      // n is not a power of 2, limit to n-1
+      index1 = min(Vec32uc(index), uint8_t(n - 1));
+    }
+  uint8_t ii[32];
+  index1.store(ii);
+  int8_t rr[32];
+  for(int j = 0; j < 32; j++)
+    {
+      rr[j] = ((int8_t *)table)[ii[j]];
+    }
+  return Vec32c().load(rr);
+}
+
+template <int n>
+static inline Vec32c lookup(Vec32c const &index, void const *table)
+{
+  return lookup<n>(Vec32uc(index), table);
+}
+
+static inline Vec16s lookup16(Vec16s const &index, Vec16s const &table)
+{
+  Vec8s t0 = lookup16(index.get_low(), table.get_low(), table.get_high());
+  Vec8s t1 = lookup16(index.get_high(), table.get_low(), table.get_high());
+  return Vec16s(t0, t1);
+}
+
+template <int n>
+static inline Vec16s lookup(Vec16s const &index, void const *table)
+{
+  if(n <= 0)
+    return 0;
+  if(n <= 8)
+    {
+      Vec8s table1 = Vec8s().load(table);
+      return Vec16s(lookup8(index.get_low(), table1), lookup8(index.get_high(), table1));
+    }
+  if(n <= 16)
+    return lookup16(index, Vec16s().load(table));
+  // n > 16. Limit index
+  Vec16us i1;
+  if((n & (n - 1)) == 0)
+    {
+      // n is a power of 2, make index modulo n
+      i1 = Vec16us(index) & (n - 1);
+    }
+  else
+    {
+      // n is not a power of 2, limit to n-1
+      i1 = min(Vec16us(index), n - 1);
+    }
+  int16_t const *t = (int16_t const *)table;
+  return Vec16s(t[i1[0]], t[i1[1]], t[i1[2]], t[i1[3]], t[i1[4]], t[i1[5]], t[i1[6]], t[i1[7]], t[i1[8]], t[i1[9]], t[i1[10]],
+                t[i1[11]], t[i1[12]], t[i1[13]], t[i1[14]], t[i1[15]]);
+}
+
+static inline Vec8i lookup8(Vec8i const &index, Vec8i const &table)
+{
+  Vec4i t0 = lookup8(index.get_low(), table.get_low(), table.get_high());
+  Vec4i t1 = lookup8(index.get_high(), table.get_low(), table.get_high());
+  return Vec8i(t0, t1);
+}
+
+template <int n>
+static inline Vec8i lookup(Vec8i const &index, void const *table)
+{
+  if(n <= 0)
+    return 0;
+  if(n <= 4)
+    {
+      Vec4i table1 = Vec4i().load(table);
+      return Vec8i(lookup4(index.get_low(), table1), lookup4(index.get_high(), table1));
+    }
+  if(n <= 8)
+    {
+      return lookup8(index, Vec8i().load(table));
+    }
+  // n > 8. Limit index
+  Vec8ui i1;
+  if((n & (n - 1)) == 0)
+    {
+      // n is a power of 2, make index modulo n
+      i1 = Vec8ui(index) & (n - 1);
+    }
+  else
+    {
+      // n is not a power of 2, limit to n-1
+      i1 = min(Vec8ui(index), n - 1);
+    }
+  int32_t const *t = (int32_t const *)table;
+  return Vec8i(t[i1[0]], t[i1[1]], t[i1[2]], t[i1[3]], t[i1[4]], t[i1[5]], t[i1[6]], t[i1[7]]);
+}
+
+static inline Vec4q lookup4(Vec4q const &index, Vec4q const &table)
+{
+  return lookup8(Vec8i(index * 0x200000002ll + 0x100000000ll), Vec8i(table));
+}
+
+template <int n>
+static inline Vec4q lookup(Vec4q const &index, void const *table)
+{
+  if(n <= 0)
+    return 0;
+  // n > 0. Limit index
+  Vec4uq index1;
+  if((n & (n - 1)) == 0)
+    {
+      // n is a power of 2, make index modulo n
+      index1 = Vec4uq(index) & (n - 1);
+    }
+  else
+    {
+      // n is not a power of 2, limit to n-1.
+      // There is no 64-bit min instruction, but we can use the 32-bit unsigned min,
+      // since n is a 32-bit integer
+      index1 = Vec4uq(min(Vec8ui(index), constant8i<n - 1, 0, n - 1, 0, n - 1, 0, n - 1, 0>()));
+    }
+  uint32_t ii[8];
+  index1.store(ii);  // use only lower 32 bits of each index
+  int64_t const *tt = (int64_t const *)table;
+  return Vec4q(tt[ii[0]], tt[ii[2]], tt[ii[4]], tt[ii[6]]);
+}
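+
+// Usage sketch for table lookup with 64-bit indexes (editor's illustrative example, not part of the original library):
+// int64_t table[8] = {0, 10, 20, 30, 40, 50, 60, 70};
+// Vec4q   index(2, 0, 7, 5);
+// Vec4q   r = lookup<8>(index, table);   // r is (20, 0, 70, 50)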
+
+/*****************************************************************************
+ *
+ *          Other permutations with variable indexes
+ *
+ *****************************************************************************/
+
+// Function shift_bytes_up: shift whole vector left by b bytes.
+// You may use a permute function instead if b is a compile-time constant
+static inline Vec32c shift_bytes_up(Vec32c const &a, int b)
+{
+  if(b < 16)
+    {
+      return Vec32c(shift_bytes_up(a.get_low(), b), shift_bytes_up(a.get_high(), b) | shift_bytes_down(a.get_low(), 16 - b));
+    }
+  else
+    {
+      return Vec32c(Vec16c(0), shift_bytes_up(a.get_high(), b - 16));
+    }
+}
+
+// Function shift_bytes_down: shift whole vector right by b bytes
+// You may use a permute function instead if b is a compile-time constant
+static inline Vec32c shift_bytes_down(Vec32c const &a, int b)
+{
+  if(b < 16)
+    {
+      return Vec32c(shift_bytes_down(a.get_low(), b) | shift_bytes_up(a.get_high(), 16 - b), shift_bytes_down(a.get_high(), b));
+    }
+  else
+    {
+      return Vec32c(shift_bytes_down(a.get_high(), b - 16), Vec16c(0));
+    }
+}
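+
+// Usage sketch (editor's illustrative example, not part of the original library):
+// For a shift count b within range, shift_bytes_up gives result[i] = (i >= b) ? a[i-b] : 0,
+// and shift_bytes_down gives result[i] = (i+b < 32) ? a[i+b] : 0. For example:
+// Vec32c a, r;
+// r = shift_bytes_up(a, 4);   // r[0..3] are 0, r[4..31] are a[0..27]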
+
+/*****************************************************************************
+ *
+ *          Gather functions with fixed indexes
+ *
+ *****************************************************************************/
+// Load elements from array a with indices i0, i1, i2, i3, i4, i5, i6, i7
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8i gather8i(void const *a)
+{
+  Static_error_check<(i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) >= 0> Negative_array_index;  // Error message if index is negative
+  const int i01min   = i0 < i1 ? i0 : i1;
+  const int i23min   = i2 < i3 ? i2 : i3;
+  const int i45min   = i4 < i5 ? i4 : i5;
+  const int i67min   = i6 < i7 ? i6 : i7;
+  const int i0123min = i01min < i23min ? i01min : i23min;
+  const int i4567min = i45min < i67min ? i45min : i67min;
+  const int imin     = i0123min < i4567min ? i0123min : i4567min;
+  const int i01max   = i0 > i1 ? i0 : i1;
+  const int i23max   = i2 > i3 ? i2 : i3;
+  const int i45max   = i4 > i5 ? i4 : i5;
+  const int i67max   = i6 > i7 ? i6 : i7;
+  const int i0123max = i01max > i23max ? i01max : i23max;
+  const int i4567max = i45max > i67max ? i45max : i67max;
+  const int imax     = i0123max > i4567max ? i0123max : i4567max;
+
+  if(imax - imin <= 7)
+    {
+      // load one contiguous block and permute
+      if(imax > 7)
+        {
+          // make sure we don't read past the end of the array
+          Vec8i b = Vec8i().load((int32_t const *)a + imax - 7);
+          return permute8i<i0 - imax + 7, i1 - imax + 7, i2 - imax + 7, i3 - imax + 7, i4 - imax + 7, i5 - imax + 7, i6 - imax + 7,
+                           i7 - imax + 7>(b);
+        }
+      else
+        {
+          Vec8i b = Vec8i().load((int32_t const *)a + imin);
+          return permute8i<i0 - imin, i1 - imin, i2 - imin, i3 - imin, i4 - imin, i5 - imin, i6 - imin, i7 - imin>(b);
+        }
+    }
+  if((i0 < imin + 8 || i0 > imax - 8) && (i1 < imin + 8 || i1 > imax - 8) && (i2 < imin + 8 || i2 > imax - 8) &&
+     (i3 < imin + 8 || i3 > imax - 8) && (i4 < imin + 8 || i4 > imax - 8) && (i5 < imin + 8 || i5 > imax - 8) &&
+     (i6 < imin + 8 || i6 > imax - 8) && (i7 < imin + 8 || i7 > imax - 8))
+    {
+      // load two contiguous blocks and blend
+      Vec8i b      = Vec8i().load((int32_t const *)a + imin);
+      Vec8i c      = Vec8i().load((int32_t const *)a + imax - 7);
+      const int j0 = i0 < imin + 8 ? i0 - imin : 15 - imax + i0;
+      const int j1 = i1 < imin + 8 ? i1 - imin : 15 - imax + i1;
+      const int j2 = i2 < imin + 8 ? i2 - imin : 15 - imax + i2;
+      const int j3 = i3 < imin + 8 ? i3 - imin : 15 - imax + i3;
+      const int j4 = i4 < imin + 8 ? i4 - imin : 15 - imax + i4;
+      const int j5 = i5 < imin + 8 ? i5 - imin : 15 - imax + i5;
+      const int j6 = i6 < imin + 8 ? i6 - imin : 15 - imax + i6;
+      const int j7 = i7 < imin + 8 ? i7 - imin : 15 - imax + i7;
+      return blend8i<j0, j1, j2, j3, j4, j5, j6, j7>(b, c);
+    }
+  // use lookup function
+  return lookup<imax + 1>(Vec8i(i0, i1, i2, i3, i4, i5, i6, i7), a);
+}
+
+template <int i0, int i1, int i2, int i3>
+static inline Vec4q gather4q(void const *a)
+{
+  Static_error_check<(i0 | i1 | i2 | i3) >= 0> Negative_array_index;  // Error message if index is negative
+  const int i01min = i0 < i1 ? i0 : i1;
+  const int i23min = i2 < i3 ? i2 : i3;
+  const int imin   = i01min < i23min ? i01min : i23min;
+  const int i01max = i0 > i1 ? i0 : i1;
+  const int i23max = i2 > i3 ? i2 : i3;
+  const int imax   = i01max > i23max ? i01max : i23max;
+  if(imax - imin <= 3)
+    {
+      // load one contiguous block and permute
+      if(imax > 3)
+        {
+          // make sure we don't read past the end of the array
+          Vec4q b = Vec4q().load((int64_t const *)a + imax - 3);
+          return permute4q<i0 - imax + 3, i1 - imax + 3, i2 - imax + 3, i3 - imax + 3>(b);
+        }
+      else
+        {
+          Vec4q b = Vec4q().load((int64_t const *)a + imin);
+          return permute4q<i0 - imin, i1 - imin, i2 - imin, i3 - imin>(b);
+        }
+    }
+  if((i0 < imin + 4 || i0 > imax - 4) && (i1 < imin + 4 || i1 > imax - 4) && (i2 < imin + 4 || i2 > imax - 4) &&
+     (i3 < imin + 4 || i3 > imax - 4))
+    {
+      // load two contiguous blocks and blend
+      Vec4q b      = Vec4q().load((int64_t const *)a + imin);
+      Vec4q c      = Vec4q().load((int64_t const *)a + imax - 3);
+      const int j0 = i0 < imin + 4 ? i0 - imin : 7 - imax + i0;
+      const int j1 = i1 < imin + 4 ? i1 - imin : 7 - imax + i1;
+      const int j2 = i2 < imin + 4 ? i2 - imin : 7 - imax + i2;
+      const int j3 = i3 < imin + 4 ? i3 - imin : 7 - imax + i3;
+      return blend4q<j0, j1, j2, j3>(b, c);
+    }
+  // use lookup function
+  return lookup<imax + 1>(Vec4q(i0, i1, i2, i3), a);
+}
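+
+// Usage sketch for the fixed-index gather (editor's illustrative example, not part of the original library):
+// int32_t arr[8] = {100, 101, 102, 103, 104, 105, 106, 107};
+// Vec8i v = gather8i<0, 2, 2, 7, 5, 3, 1, 6>(arr);   // v is (100, 102, 102, 107, 105, 103, 101, 106)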
+
+/*****************************************************************************
+ *
+ *          Vector scatter functions
+ *
+ ******************************************************************************
+ *
+ * These functions write the elements of a vector to arbitrary positions in an
+ * array in memory. Each vector element is written to an array position
+ * determined by an index. An element is not written if the corresponding
+ * index is out of range.
+ * The indexes can be specified as constant template parameters or as an
+ * integer vector.
+ *
+ * The scatter functions are useful if the data are distributed in a sparse
+ * manner into the array. If the array is dense then it is more efficient
+ * to permute the data into the right positions and then write the whole
+ * permuted vector into the array.
+ *
+ * Example:
+ * Vec8q a(10,11,12,13,14,15,16,17);
+ * int64_t b[16] = {0};
+ * scatter<0,2,14,10,1,-1,5,9>(a,b);
+ * // Now, b = {10,14,11,0,0,16,0,0,0,17,13,0,0,0,12,0}
+ *
+ *****************************************************************************/
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline void scatter(Vec8i const &data, void *array)
+{
+  int32_t *arr       = (int32_t *)array;
+  const int index[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
+  for(int i = 0; i < 8; i++)
+    {
+      if(index[i] >= 0)
+        arr[index[i]] = data[i];
+    }
+}
+
+template <int i0, int i1, int i2, int i3>
+static inline void scatter(Vec4q const &data, void *array)
+{
+  int64_t *arr       = (int64_t *)array;
+  const int index[4] = {i0, i1, i2, i3};
+  for(int i = 0; i < 4; i++)
+    {
+      if(index[i] >= 0)
+        arr[index[i]] = data[i];
+    }
+}
+
+static inline void scatter(Vec8i const &index, uint32_t limit, Vec8i const &data, void *array)
+{
+  int32_t *arr = (int32_t *)array;
+  for(int i = 0; i < 8; i++)
+    {
+      if(uint32_t(index[i]) < limit)
+        arr[index[i]] = data[i];
+    }
+}
+
+static inline void scatter(Vec4q const &index, uint32_t limit, Vec4q const &data, void *array)
+{
+  int64_t *arr = (int64_t *)array;
+  for(int i = 0; i < 4; i++)
+    {
+      if(uint64_t(index[i]) < uint64_t(limit))
+        arr[index[i]] = data[i];
+    }
+}
+
+static inline void scatter(Vec4i const &index, uint32_t limit, Vec4q const &data, void *array)
+{
+  int64_t *arr = (int64_t *)array;
+  for(int i = 0; i < 4; i++)
+    {
+      if(uint32_t(index[i]) < limit)
+        arr[index[i]] = data[i];
+    }
+}
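+
+// Usage sketch for scatter with a runtime index vector and limit (editor's illustrative example, not part of the original library):
+// int32_t arr[8] = {0};
+// Vec8i   idx (0, 2, 9, 4, -1, 5, 7, 3);
+// Vec8i   data(10, 11, 12, 13, 14, 15, 16, 17);
+// scatter(idx, 8, data, arr);   // indexes 9 and -1 are out of range and skipped;
+//                               // arr is now {10, 0, 11, 17, 13, 15, 0, 16}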
+
+/*****************************************************************************
+ *
+ *          Functions for conversion between integer sizes
+ *
+ *****************************************************************************/
+
+// Extend 8-bit integers to 16-bit integers, signed and unsigned
+
+// Function extend_low : extends the low 16 elements to 16 bits with sign extension
+static inline Vec16s extend_low(Vec32c const &a) { return Vec16s(extend_low(a.get_low()), extend_high(a.get_low())); }
+
+// Function extend_high : extends the high 16 elements to 16 bits with sign extension
+static inline Vec16s extend_high(Vec32c const &a) { return Vec16s(extend_low(a.get_high()), extend_high(a.get_high())); }
+
+// Function extend_low : extends the low 16 elements to 16 bits with zero extension
+static inline Vec16us extend_low(Vec32uc const &a) { return Vec16us(extend_low(a.get_low()), extend_high(a.get_low())); }
+
+// Function extend_high : extends the high 16 elements to 16 bits with zero extension
+static inline Vec16us extend_high(Vec32uc const &a) { return Vec16us(extend_low(a.get_high()), extend_high(a.get_high())); }
+
+// Extend 16-bit integers to 32-bit integers, signed and unsigned
+
+// Function extend_low : extends the low 8 elements to 32 bits with sign extension
+static inline Vec8i extend_low(Vec16s const &a) { return Vec8i(extend_low(a.get_low()), extend_high(a.get_low())); }
+
+// Function extend_high : extends the high 8 elements to 32 bits with sign extension
+static inline Vec8i extend_high(Vec16s const &a) { return Vec8i(extend_low(a.get_high()), extend_high(a.get_high())); }
+
+// Function extend_low : extends the low 8 elements to 32 bits with zero extension
+static inline Vec8ui extend_low(Vec16us const &a) { return Vec8ui(extend_low(a.get_low()), extend_high(a.get_low())); }
+
+// Function extend_high : extends the high 8 elements to 32 bits with zero extension
+static inline Vec8ui extend_high(Vec16us const &a) { return Vec8ui(extend_low(a.get_high()), extend_high(a.get_high())); }
+
+// Extend 32-bit integers to 64-bit integers, signed and unsigned
+
+// Function extend_low : extends the low 4 elements to 64 bits with sign extension
+static inline Vec4q extend_low(Vec8i const &a) { return Vec4q(extend_low(a.get_low()), extend_high(a.get_low())); }
+
+// Function extend_high : extends the high 4 elements to 64 bits with sign extension
+static inline Vec4q extend_high(Vec8i const &a) { return Vec4q(extend_low(a.get_high()), extend_high(a.get_high())); }
+
+// Function extend_low : extends the low 4 elements to 64 bits with zero extension
+static inline Vec4uq extend_low(Vec8ui const &a) { return Vec4uq(extend_low(a.get_low()), extend_high(a.get_low())); }
+
+// Function extend_high : extends the high 4 elements to 64 bits with zero extension
+static inline Vec4uq extend_high(Vec8ui const &a) { return Vec4uq(extend_low(a.get_high()), extend_high(a.get_high())); }
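+
+// Example (illustrative): widen a Vec8i to two Vec4q before accumulating in 64 bits
+// to avoid 32-bit overflow.
+//
+//   Vec8i a(1, 2, 3, 4, 5, 6, 7, 8);
+//   Vec4q lo = extend_low(a);     // (1, 2, 3, 4)
+//   Vec4q hi = extend_high(a);    // (5, 6, 7, 8)
+//   Vec4q s  = lo + hi;           // 64-bit element sums (6, 8, 10, 12)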
+
+// Compress 16-bit integers to 8-bit integers, signed and unsigned, with and without saturation
+
+// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers
+// Overflow wraps around
+static inline Vec32c compress(Vec16s const &low, Vec16s const &high)
+{
+  return Vec32c(compress(low.get_low(), low.get_high()), compress(high.get_low(), high.get_high()));
+}
+
+// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers
+// Signed, with saturation
+static inline Vec32c compress_saturated(Vec16s const &low, Vec16s const &high)
+{
+  return Vec32c(compress_saturated(low.get_low(), low.get_high()), compress_saturated(high.get_low(), high.get_high()));
+}
+
+// Function compress : packs two vectors of 16-bit integers to one vector of 8-bit integers
+// Unsigned, overflow wraps around
+static inline Vec32uc compress(Vec16us const &low, Vec16us const &high)
+{
+  return Vec32uc(compress(low.get_low(), low.get_high()), compress(high.get_low(), high.get_high()));
+}
+
+// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers
+// Unsigned, with saturation
+static inline Vec32uc compress_saturated(Vec16us const &low, Vec16us const &high)
+{
+  return Vec32uc(compress_saturated(low.get_low(), low.get_high()), compress_saturated(high.get_low(), high.get_high()));
+}
+
+// Compress 32-bit integers to 16-bit integers, signed and unsigned, with and without saturation
+
+// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers
+// Overflow wraps around
+static inline Vec16s compress(Vec8i const &low, Vec8i const &high)
+{
+  return Vec16s(compress(low.get_low(), low.get_high()), compress(high.get_low(), high.get_high()));
+}
+
+// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers
+// Signed with saturation
+static inline Vec16s compress_saturated(Vec8i const &low, Vec8i const &high)
+{
+  return Vec16s(compress_saturated(low.get_low(), low.get_high()), compress_saturated(high.get_low(), high.get_high()));
+}
+
+// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers
+// Unsigned, overflow wraps around
+static inline Vec16us compress(Vec8ui const &low, Vec8ui const &high)
+{
+  return Vec16us(compress(low.get_low(), low.get_high()), compress(high.get_low(), high.get_high()));
+}
+
+// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers
+// Unsigned, with saturation
+static inline Vec16us compress_saturated(Vec8ui const &low, Vec8ui const &high)
+{
+  return Vec16us(compress_saturated(low.get_low(), low.get_high()), compress_saturated(high.get_low(), high.get_high()));
+}
+
+// Compress 64-bit integers to 32-bit integers, signed and unsigned, with and without saturation
+
+// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers
+// Overflow wraps around
+static inline Vec8i compress(Vec4q const &low, Vec4q const &high)
+{
+  return Vec8i(compress(low.get_low(), low.get_high()), compress(high.get_low(), high.get_high()));
+}
+
+// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers
+// Signed, with saturation
+static inline Vec8i compress_saturated(Vec4q const &low, Vec4q const &high)
+{
+  return Vec8i(compress_saturated(low.get_low(), low.get_high()), compress_saturated(high.get_low(), high.get_high()));
+}
+
+// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers
+// Unsigned, overflow wraps around
+static inline Vec8ui compress(Vec4uq const &low, Vec4uq const &high) { return Vec8ui(compress((Vec4q)low, (Vec4q)high)); }
+
+// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers
+// Unsigned, with saturation
+static inline Vec8ui compress_saturated(Vec4uq const &low, Vec4uq const &high)
+{
+  return Vec8ui(compress_saturated(low.get_low(), low.get_high()), compress_saturated(high.get_low(), high.get_high()));
+}
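+
+// Example (illustrative): narrow two Vec8i to one Vec16s. compress() keeps only the low
+// 16 bits of each element, while compress_saturated() clips to the 16-bit range.
+//
+//   Vec8i  lo(0, 1, 2, 3, 4, 5, 6, 70000);
+//   Vec8i  hi(8, 9, 10, 11, 12, 13, 14, 15);
+//   Vec16s wrapped   = compress(lo, hi);            // 70000 wraps to 4464
+//   Vec16s saturated = compress_saturated(lo, hi);  // 70000 clips to 32767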
+
+/*****************************************************************************
+ *
+ *          Integer division 2: divisor is a compile-time constant
+ *
+ *****************************************************************************/
+
+// Divide Vec8i by compile-time constant
+template <int32_t d>
+static inline Vec8i divide_by_i(Vec8i const &a)
+{
+  return Vec8i(divide_by_i<d>(a.get_low()), divide_by_i<d>(a.get_high()));
+}
+
+// define Vec8i a / const_int(d)
+template <int32_t d>
+static inline Vec8i operator/(Vec8i const &a, Const_int_t<d>)
+{
+  return divide_by_i<d>(a);
+}
+
+// define Vec8i a / const_uint(d)
+template <uint32_t d>
+static inline Vec8i operator/(Vec8i const &a, Const_uint_t<d>)
+{
+  Static_error_check<(d < 0x80000000u)> Error_overflow_dividing_signed_by_unsigned;  // Error: dividing signed by overflowing unsigned
+  return divide_by_i<int32_t(d)>(a);                                                 // signed divide
+}
+
+// vector operator /= : divide
+template <int32_t d>
+static inline Vec8i &operator/=(Vec8i &a, Const_int_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec8i &operator/=(Vec8i &a, Const_uint_t<d> b)
+{
+  a = a / b;
+  return a;
+}
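+
+// Example (illustrative, assuming the const_int()/const_uint() helper macros built on
+// Const_int_t/Const_uint_t elsewhere in the library): the divisor must be a compile-time
+// constant, which allows the division to be replaced by multiply/shift sequences.
+//
+//   Vec8i a(10, 20, 30, 40, 50, 60, 70, 80);
+//   Vec8i q = a / const_int(10);   // (1, 2, 3, 4, 5, 6, 7, 8)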
+
+// Divide Vec8ui by compile-time constant
+template <uint32_t d>
+static inline Vec8ui divide_by_ui(Vec8ui const &a)
+{
+  return Vec8ui(divide_by_ui<d>(a.get_low()), divide_by_ui<d>(a.get_high()));
+}
+
+// define Vec8ui a / const_uint(d)
+template <uint32_t d>
+static inline Vec8ui operator/(Vec8ui const &a, Const_uint_t<d>)
+{
+  return divide_by_ui<d>(a);
+}
+
+// define Vec8ui a / const_int(d)
+template <int32_t d>
+static inline Vec8ui operator/(Vec8ui const &a, Const_int_t<d>)
+{
+  Static_error_check<(d >= 0)> Error_dividing_unsigned_by_negative;  // Error: dividing unsigned by negative is ambiguous
+  return divide_by_ui<d>(a);                                         // unsigned divide
+}
+
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec8ui &operator/=(Vec8ui &a, Const_uint_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator /= : divide
+template <int32_t d>
+static inline Vec8ui &operator/=(Vec8ui &a, Const_int_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// Divide Vec16s by compile-time constant
+template <int d>
+static inline Vec16s divide_by_i(Vec16s const &a)
+{
+  return Vec16s(divide_by_i<d>(a.get_low()), divide_by_i<d>(a.get_high()));
+}
+
+// define Vec16s a / const_int(d)
+template <int d>
+static inline Vec16s operator/(Vec16s const &a, Const_int_t<d>)
+{
+  return divide_by_i<d>(a);
+}
+
+// define Vec16s a / const_uint(d)
+template <uint32_t d>
+static inline Vec16s operator/(Vec16s const &a, Const_uint_t<d>)
+{
+  Static_error_check<(d < 0x8000u)> Error_overflow_dividing_signed_by_unsigned;  // Error: dividing signed by overflowing unsigned
+  return divide_by_i<int(d)>(a);                                                 // signed divide
+}
+
+// vector operator /= : divide
+template <int32_t d>
+static inline Vec16s &operator/=(Vec16s &a, Const_int_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec16s &operator/=(Vec16s &a, Const_uint_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// Divide Vec16us by compile-time constant
+template <uint32_t d>
+static inline Vec16us divide_by_ui(Vec16us const &a)
+{
+  return Vec16us(divide_by_ui<d>(a.get_low()), divide_by_ui<d>(a.get_high()));
+}
+
+// define Vec16us a / const_uint(d)
+template <uint32_t d>
+static inline Vec16us operator/(Vec16us const &a, Const_uint_t<d>)
+{
+  return divide_by_ui<d>(a);
+}
+
+// define Vec16us a / const_int(d)
+template <int d>
+static inline Vec16us operator/(Vec16us const &a, Const_int_t<d>)
+{
+  Static_error_check<(d >= 0)> Error_dividing_unsigned_by_negative;  // Error: dividing unsigned by negative is ambiguous
+  return divide_by_ui<d>(a);                                         // unsigned divide
+}
+
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec16us &operator/=(Vec16us &a, Const_uint_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator /= : divide
+template <int32_t d>
+static inline Vec16us &operator/=(Vec16us &a, Const_int_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// define Vec32c a / const_int(d)
+template <int d>
+static inline Vec32c operator/(Vec32c const &a, Const_int_t<d>)
+{
+  // expand into two Vec16s
+  Vec16s low  = extend_low(a) / Const_int_t<d>();
+  Vec16s high = extend_high(a) / Const_int_t<d>();
+  return compress(low, high);
+}
+
+// define Vec32c a / const_uint(d)
+template <uint32_t d>
+static inline Vec32c operator/(Vec32c const &a, Const_uint_t<d>)
+{
+  Static_error_check<(uint8_t(d) < 0x80u)>
+      Error_overflow_dividing_signed_by_unsigned;  // Error: dividing signed by overflowing unsigned
+  return a / Const_int_t<d>();                     // signed divide
+}
+
+// vector operator /= : divide
+template <int32_t d>
+static inline Vec32c &operator/=(Vec32c &a, Const_int_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec32c &operator/=(Vec32c &a, Const_uint_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// define Vec32uc a / const_uint(d)
+template <uint32_t d>
+static inline Vec32uc operator/(Vec32uc const &a, Const_uint_t<d>)
+{
+  // expand into two Vec16us
+  Vec16us low  = extend_low(a) / Const_uint_t<d>();
+  Vec16us high = extend_high(a) / Const_uint_t<d>();
+  return compress(low, high);
+}
+
+// define Vec32uc a / const_int(d)
+template <int d>
+static inline Vec32uc operator/(Vec32uc const &a, Const_int_t<d>)
+{
+  Static_error_check<(int8_t(d) >= 0)> Error_dividing_unsigned_by_negative;  // Error: dividing unsigned by negative is ambiguous
+  return a / Const_uint_t<d>();                                              // unsigned divide
+}
+
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec32uc &operator/=(Vec32uc &a, Const_uint_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator /= : divide
+template <int32_t d>
+static inline Vec32uc &operator/=(Vec32uc &a, Const_int_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+/*****************************************************************************
+ *
+ *          Horizontal scan functions
+ *
+ *****************************************************************************/
+
+// Get index to the first element that is true. Return -1 if all are false
+static inline int horizontal_find_first(Vec32cb const &x)
+{
+  int a1 = horizontal_find_first(x.get_low());
+  if(a1 >= 0)
+    return a1;
+  int a2 = horizontal_find_first(x.get_high());
+  if(a2 < 0)
+    return a2;
+  return a2 + 16;
+}
+
+static inline int horizontal_find_first(Vec16sb const &x) { return horizontal_find_first(Vec32cb(x)) >> 1; }
+
+static inline int horizontal_find_first(Vec8ib const &x) { return horizontal_find_first(Vec32cb(x)) >> 2; }
+
+static inline int horizontal_find_first(Vec4qb const &x) { return horizontal_find_first(Vec32cb(x)) >> 3; }
+
+// Count the number of elements that are true
+static inline uint32_t horizontal_count(Vec32cb const &x) { return horizontal_count(x.get_low()) + horizontal_count(x.get_high()); }
+
+static inline uint32_t horizontal_count(Vec16sb const &x) { return horizontal_count(Vec32cb(x)) >> 1; }
+
+static inline uint32_t horizontal_count(Vec8ib const &x) { return horizontal_count(Vec32cb(x)) >> 2; }
+
+static inline uint32_t horizontal_count(Vec4qb const &x) { return horizontal_count(Vec32cb(x)) >> 3; }
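+
+// Example (illustrative): locate and count elements that satisfy a comparison.
+//
+//   Vec8i  a(3, 1, 4, 1, 5, 9, 2, 6);
+//   Vec8ib m = a > 4;                            // (F, F, F, F, T, T, F, T)
+//   int      first = horizontal_find_first(m);  // 4
+//   uint32_t n     = horizontal_count(m);       // 3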
+
+/*****************************************************************************
+ *
+ *          Boolean <-> bitfield conversion functions
+ *
+ *****************************************************************************/
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint32_t to_bits(Vec32cb const &x) { return to_bits(x.get_low()) | (uint32_t)to_bits(x.get_high()) << 16; }
+
+// to_Vec32cb: convert integer bitfield to boolean vector
+static inline Vec32cb to_Vec32cb(uint32_t x) { return Vec32c(to_Vec16cb(uint16_t(x)), to_Vec16cb(uint16_t(x >> 16))); }
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint16_t to_bits(Vec16sb const &x) { return to_bits(x.get_low()) | (uint16_t)to_bits(x.get_high()) << 8; }
+
+// to_Vec16sb: convert integer bitfield to boolean vector
+static inline Vec16sb to_Vec16sb(uint16_t x) { return Vec16s(to_Vec8sb(uint8_t(x)), to_Vec8sb(uint8_t(x >> 8))); }
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec8ib const &x) { return to_bits(x.get_low()) | (uint8_t)to_bits(x.get_high()) << 4; }
+
+// to_Vec8ib: convert integer bitfield to boolean vector
+static inline Vec8ib to_Vec8ib(uint8_t x) { return Vec8i(to_Vec4ib(x), to_Vec4ib(x >> 4)); }
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec4qb const &x) { return to_bits(x.get_low()) | to_bits(x.get_high()) << 2; }
+
+// to_Vec4qb: convert integer bitfield to boolean vector
+static inline Vec4qb to_Vec4qb(uint8_t x) { return Vec4q(to_Vec2qb(x), to_Vec2qb(x >> 2)); }
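+
+// Example (illustrative): round-trip a boolean vector through an integer bitfield.
+// Bit i of the bitfield corresponds to element i of the vector.
+//
+//   Vec8ib  m(true, false, true, false, false, false, false, true);
+//   uint8_t bits = to_bits(m);       // 0x85
+//   Vec8ib  back = to_Vec8ib(bits);  // same mask again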
+
+#ifdef VCL_NAMESPACE
+}
+#endif
+
+#endif  // VECTORI256_H
diff --git a/src/vectorclass/vectori512.h b/src/vectorclass/vectori512.h
new file mode 100644
index 0000000000000000000000000000000000000000..5b716dfffee15f0be522b17b71fb9479da13c897
--- /dev/null
+++ b/src/vectorclass/vectori512.h
@@ -0,0 +1,2668 @@
+/****************************  vectori512.h   *******************************
+ * Author:        Agner Fog
+ * Date created:  2014-07-23
+ * Last modified: 2017-02-19
+ * Version:       1.27
+ * Project:       vector classes
+ * Description:
+ * Header file defining integer vector classes as interface to intrinsic
+ * functions in x86 microprocessors with AVX512 and later instruction sets.
+ *
+ * Instructions:
+ * Use Gnu, Intel or Microsoft C++ compiler. Compile for the desired
+ * instruction set, which must be at least AVX512.
+ *
+ * The following vector classes are defined here:
+ * Vec16i    Vector of  16  32-bit signed   integers
+ * Vec16ui   Vector of  16  32-bit unsigned integers
+ * Vec16ib   Vector of  16  Booleans for use with Vec16i and Vec16ui
+ * Vec8q     Vector of   8  64-bit signed   integers
+ * Vec8uq    Vector of   8  64-bit unsigned integers
+ * Vec8qb    Vector of   8  Booleans for use with Vec8q and Vec8uq
+ *
+ * Each vector object is represented internally in the CPU as a 512-bit register.
+ * This header file defines operators and functions for these vectors.
+ *
+ * For detailed instructions, see VectorClass.pdf
+ *
+ * (c) Copyright 2014-2017 GNU General Public License http://www.gnu.org/licenses
+ *****************************************************************************/
+
+// check combination of header files
+#if defined(VECTORI512_H)
+#if VECTORI512_H != 2
+#error Two different versions of vectori512.h included
+#endif
+#else
+#define VECTORI512_H 2
+
+#ifdef VECTORF512_H
+#error Please put header file vectori512.h before vectorf512.h
+#endif
+
+#if INSTRSET < 9  // AVX512 required
+#error Wrong instruction set for vectori512.h, AVX512 required or use vectori512e.h
+#endif
+
+#include "vectori256.h"
+
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE
+{
+#endif
+
+// Bug fix for missing intrinsics:
+// _mm512_cmpgt_epu32_mask, _mm512_cmpgt_epu64_mask
+// all typecast intrinsics
+// Fix expected in GCC version 4.9.3 or 5.0. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61878
+
+// questionable
+// _mm512_mask_mov_epi32 check select(). Doc at https://software.intel.com/en-us/node/513888 is wrong. Bug report filed
+
+#if defined(GCC_VERSION) && GCC_VERSION < 50000 && !defined(__INTEL_COMPILER) && !defined(__clang__)
+
+static inline __m512i _mm512_castsi256_si512(__m256i x)
+{
+  union
+  {
+    __m512i a;
+    __m256i b;
+  } u;
+  u.b = x;
+  return u.a;
+}
+
+static inline __m256i _mm512_castsi512_si256(__m512i x)
+{
+  union
+  {
+    __m512i a;
+    __m256i b;
+  } u;
+  u.a = x;
+  return u.b;
+}
+
+static inline __m512i _mm512_castsi128_si512(__m128i x)
+{
+  union
+  {
+    __m128i a;
+    __m512i b;
+  } u;
+  u.a = x;
+  return u.b;
+}
+
+static inline __m128i _mm512_castsi512_si128(__m512i x)
+{
+  union
+  {
+    __m512i a;
+    __m128i b;
+  } u;
+  u.a = x;
+  return u.b;
+}
+
+#endif
+
+/*****************************************************************************
+ *
+ *          Generate compile-time constant vector
+ *
+ *****************************************************************************/
+// Generate a constant vector of 16 integers stored in memory.
+// Can be converted to any integer vector type
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline __m512i constant16i()
+{
+  static const union
+  {
+    int32_t i[16];
+    __m512i zmm;
+  } u = {{i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15}};
+  return u.zmm;
+}
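+
+// Example (illustrative):
+//   __m512i ones = constant16i<1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>();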
+
+/*****************************************************************************
+ *
+ *          Boolean vector base classes for AVX512
+ *
+ *****************************************************************************/
+
+class Vec16b
+{
+ protected:
+  __mmask16 m16;  // Boolean vector
+ public:
+  // Default constructor:
+  Vec16b() {}
+  // Constructor to convert from type __mmask16 used in intrinsics:
+  Vec16b(__mmask16 x) { m16 = x; }
+  // Constructor to build from all elements:
+  Vec16b(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7, bool b8, bool b9, bool b10, bool b11, bool b12,
+         bool b13, bool b14, bool b15)
+  {
+    m16 = uint16_t(b0 | b1 << 1 | b2 << 2 | b3 << 3 | b4 << 4 | b5 << 5 | b6 << 6 | b7 << 7 | b8 << 8 | b9 << 9 | b10 << 10 |
+                   b11 << 11 | b12 << 12 | b13 << 13 | b14 << 14 | b15 << 15);
+  }
+  // Constructor to broadcast single value:
+  Vec16b(bool b) { m16 = __mmask16(-int16_t(b)); }
+
+ private:  // Prevent constructing from int, etc.
+  Vec16b(int b);
+
+ public:
+  // Constructor to make from two halves
+  Vec16b(Vec8ib const &x0, Vec8ib const &x1)
+  {
+    // = Vec16i(x0,x1) != 0;  (not defined yet)
+    __m512i z = _mm512_inserti64x4(_mm512_castsi256_si512(x0), x1, 1);
+    m16       = _mm512_cmpneq_epi32_mask(z, _mm512_setzero_epi32());
+  }
+  // Assignment operator to convert from type __mmask16 used in intrinsics:
+  Vec16b &operator=(__mmask16 x)
+  {
+    m16 = x;
+    return *this;
+  }
+  // Assignment operator to broadcast scalar value:
+  Vec16b &operator=(bool b)
+  {
+    m16 = Vec16b(b);
+    return *this;
+  }
+
+ private:  // Prevent assigning int because of ambiguity
+  Vec16b &operator=(int x);
+
+ public:
+  // Type cast operator to convert to __mmask16 used in intrinsics
+  operator __mmask16() const { return m16; }
+  // split into two halves
+  Vec8ib get_low() const { return to_Vec8ib((uint8_t)m16); }
+  Vec8ib get_high() const { return to_Vec8ib((uint16_t)m16 >> 8); }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec16b const &insert(uint32_t index, bool value)
+  {
+    m16 = __mmask16(((uint16_t)m16 & ~(1 << index)) | (int)value << index);
+    return *this;
+  }
+  // Member function extract a single element from vector
+  bool extract(uint32_t index) const { return ((uint32_t)m16 >> index) & 1; }
+  // Extract a single element. Operator [] can only read an element, not write.
+  bool operator[](uint32_t index) const { return extract(index); }
+  static int size() { return 16; }
+};
+
+// Define operators for this class
+
+// vector operator & : bitwise and
+static inline Vec16b operator&(Vec16b a, Vec16b b) { return _mm512_kand(a, b); }
+static inline Vec16b operator&&(Vec16b a, Vec16b b) { return a & b; }
+
+// vector operator | : bitwise or
+static inline Vec16b operator|(Vec16b a, Vec16b b) { return _mm512_kor(a, b); }
+static inline Vec16b operator||(Vec16b a, Vec16b b) { return a | b; }
+
+// vector operator ^ : bitwise xor
+static inline Vec16b operator^(Vec16b a, Vec16b b) { return _mm512_kxor(a, b); }
+
+// vector operator ~ : bitwise not
+static inline Vec16b operator~(Vec16b a) { return _mm512_knot(a); }
+
+// vector operator ! : element not
+static inline Vec16b operator!(Vec16b a) { return ~a; }
+
+// vector operator &= : bitwise and
+static inline Vec16b &operator&=(Vec16b &a, Vec16b b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator |= : bitwise or
+static inline Vec16b &operator|=(Vec16b &a, Vec16b b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^= : bitwise xor
+static inline Vec16b &operator^=(Vec16b &a, Vec16b b)
+{
+  a = a ^ b;
+  return a;
+}
+
+/*****************************************************************************
+ *
+ *          Functions for boolean vectors
+ *
+ *****************************************************************************/
+
+// function andnot: a & ~ b
+static inline Vec16b andnot(Vec16b a, Vec16b b) { return _mm512_kandn(b, a); }
+
+// horizontal_and. Returns true if all bits are 1
+static inline bool horizontal_and(Vec16b const &a) { return (uint16_t)(__mmask16)a == 0xFFFF; }
+
+// horizontal_or. Returns true if at least one bit is 1
+static inline bool horizontal_or(Vec16b const &a) { return (uint16_t)(__mmask16)a != 0; }
+
+/*****************************************************************************
+ *
+ *          Vec16ib: Vector of 16 Booleans for use with Vec16i and Vec16ui
+ *
+ *****************************************************************************/
+
+class Vec16ib : public Vec16b
+{
+ public:
+  // Default constructor:
+  Vec16ib() {}
+  Vec16ib(Vec16b x) { m16 = x; }
+  // Constructor to build from all elements:
+  Vec16ib(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7, bool x8, bool x9, bool x10, bool x11, bool x12,
+          bool x13, bool x14, bool x15)
+      : Vec16b(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15)
+  {
+  }
+  // Constructor to convert from type __mmask16 used in intrinsics:
+  Vec16ib(__mmask16 x) { m16 = x; }
+  // Constructor to broadcast single value:
+  Vec16ib(bool b) : Vec16b(b) {}
+
+ private:  // Prevent constructing from int, etc.
+  Vec16ib(int b);
+
+ public:
+  // Constructor to make from two halves
+  Vec16ib(Vec8ib const &x0, Vec8ib const &x1) { m16 = Vec16b(x0, x1); }
+  // Assignment operator to convert from type __mmask16 used in intrinsics:
+  Vec16ib &operator=(__mmask16 x)
+  {
+    m16 = x;
+    return *this;
+  }
+  // Assignment operator to broadcast scalar value:
+  Vec16ib &operator=(bool b)
+  {
+    m16 = Vec16b(b);
+    return *this;
+  }
+
+ private:  // Prevent assigning int because of ambiguity
+  Vec16ib &operator=(int x);
+
+ public:
+};
+
+// Define operators for Vec16ib
+
+// vector operator & : bitwise and
+static inline Vec16ib operator&(Vec16ib a, Vec16ib b) { return Vec16b(a) & Vec16b(b); }
+static inline Vec16ib operator&&(Vec16ib a, Vec16ib b) { return a & b; }
+
+// vector operator | : bitwise or
+static inline Vec16ib operator|(Vec16ib a, Vec16ib b) { return Vec16b(a) | Vec16b(b); }
+static inline Vec16ib operator||(Vec16ib a, Vec16ib b) { return a | b; }
+
+// vector operator ^ : bitwise xor
+static inline Vec16ib operator^(Vec16ib a, Vec16ib b) { return Vec16b(a) ^ Vec16b(b); }
+
+// vector operator ~ : bitwise not
+static inline Vec16ib operator~(Vec16ib a) { return ~Vec16b(a); }
+
+// vector operator ! : element not
+static inline Vec16ib operator!(Vec16ib a) { return ~a; }
+
+// vector operator &= : bitwise and
+static inline Vec16ib &operator&=(Vec16ib &a, Vec16ib b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator |= : bitwise or
+static inline Vec16ib &operator|=(Vec16ib &a, Vec16ib b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^= : bitwise xor
+static inline Vec16ib &operator^=(Vec16ib &a, Vec16ib b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector function andnot
+static inline Vec16ib andnot(Vec16ib a, Vec16ib b) { return Vec16ib(andnot(Vec16b(a), Vec16b(b))); }
+
+/*****************************************************************************
+ *
+ *          Vec8b: Base class vector of 8 Booleans
+ *
+ *****************************************************************************/
+
+class Vec8b : public Vec16b
+{
+ public:
+  // Default constructor:
+  Vec8b() {}
+  // Constructor to convert from type __mmask16 used in intrinsics:
+  Vec8b(__mmask16 x) { m16 = x; }
+  // Constructor to build from all elements:
+  Vec8b(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7)
+  {
+    m16 = uint16_t(b0 | b1 << 1 | b2 << 2 | b3 << 3 | b4 << 4 | b5 << 5 | b6 << 6 | b7 << 7);
+  }
+  Vec8b(Vec16b const &x) { m16 = __mmask16(x); }
+  // Constructor to broadcast single value:
+  Vec8b(bool b) { m16 = __mmask16(-int8_t(b)); }
+  // Assignment operator to convert from type __mmask16 used in intrinsics:
+  Vec8b &operator=(__mmask16 x)
+  {
+    m16 = x;
+    return *this;
+  }
+
+ private:  // Prevent constructing from int etc. because of ambiguity
+  Vec8b(int b);
+  Vec8b &operator=(int x);
+
+ public:
+  // split into two halves
+  Vec4qb get_low() const { return Vec4qb(Vec4q(_mm512_castsi512_si256(_mm512_maskz_set1_epi64(__mmask16(m16), -1LL)))); }
+  Vec4qb get_high() const { return Vec8b(__mmask16(m16 >> 4)).get_low(); }
+  static int size() { return 8; }
+};
+
+/*****************************************************************************
+ *
+ *          Functions for boolean vectors
+ *
+ *****************************************************************************/
+
+// function andnot: a & ~ b
+static inline Vec8b andnot(Vec8b a, Vec8b b) { return _mm512_kandn(b, a); }
+
+// horizontal_and. Returns true if all bits are 1
+static inline bool horizontal_and(Vec8b const &a) { return (uint8_t)(__mmask16)a == 0xFF; }
+
+// horizontal_or. Returns true if at least one bit is 1
+static inline bool horizontal_or(Vec8b const &a) { return (uint8_t)(__mmask16)a != 0; }
+
+/*****************************************************************************
+ *
+ *          Vec8qb: Vector of 8 Booleans for use with Vec8q and Vec8uq
+ *
+ *****************************************************************************/
+
+class Vec8qb : public Vec8b
+{
+ public:
+  // Default constructor:
+  Vec8qb() {}
+  Vec8qb(Vec16b x) { m16 = x; }
+  // Constructor to build from all elements:
+  Vec8qb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7) : Vec8b(x0, x1, x2, x3, x4, x5, x6, x7) {}
+  // Constructor to convert from type __mmask8 used in intrinsics:
+  Vec8qb(__mmask8 x) { m16 = (__mmask16)x; }
+  // Constructor to convert from type __mmask16 used in intrinsics:
+  Vec8qb(__mmask16 x) { m16 = x; }
+  // Assignment operator to convert from type __mmask16 used in intrinsics:
+  Vec8qb &operator=(__mmask16 x)
+  {
+    m16 = x;
+    return *this;
+  }
+  // Constructor to broadcast single value:
+  Vec8qb(bool b) : Vec8b(b) {}
+  // Assignment operator to broadcast scalar:
+  Vec8qb &operator=(bool b)
+  {
+    m16 = Vec8b(b);
+    return *this;
+  }
+
+ private:  // Prevent constructing from int, etc.
+  Vec8qb(int b);
+  Vec8qb &operator=(int x);
+
+ public:
+  // Constructor to make from two halves
+  Vec8qb(Vec4qb const &x0, Vec4qb const &x1)
+  {
+    // = Vec8q(x0,x1) != 0;  (not defined yet)
+    __m512i z = _mm512_inserti64x4(_mm512_castsi256_si512(x0), x1, 1);
+    m16       = _mm512_cmpneq_epi64_mask(z, _mm512_setzero_si512());
+  }
+};
+
+// Define operators for Vec8qb
+
+// vector operator & : bitwise and
+static inline Vec8qb operator&(Vec8qb a, Vec8qb b) { return Vec16b(a) & Vec16b(b); }
+static inline Vec8qb operator&&(Vec8qb a, Vec8qb b) { return a & b; }
+
+// vector operator | : bitwise or
+static inline Vec8qb operator|(Vec8qb a, Vec8qb b) { return Vec16b(a) | Vec16b(b); }
+static inline Vec8qb operator||(Vec8qb a, Vec8qb b) { return a | b; }
+
+// vector operator ^ : bitwise xor
+static inline Vec8qb operator^(Vec8qb a, Vec8qb b) { return Vec16b(a) ^ Vec16b(b); }
+
+// vector operator ~ : bitwise not
+static inline Vec8qb operator~(Vec8qb a) { return ~Vec16b(a); }
+
+// vector operator ! : element not
+static inline Vec8qb operator!(Vec8qb a) { return ~a; }
+
+// vector operator &= : bitwise and
+static inline Vec8qb &operator&=(Vec8qb &a, Vec8qb b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator |= : bitwise or
+static inline Vec8qb &operator|=(Vec8qb &a, Vec8qb b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^= : bitwise xor
+static inline Vec8qb &operator^=(Vec8qb &a, Vec8qb b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// to_bits: convert to integer bitfield
+static inline uint32_t to_bits(Vec8qb a) { return (uint8_t)(__mmask16)a; }
+
+// vector function andnot
+static inline Vec8qb andnot(Vec8qb a, Vec8qb b) { return Vec8qb(andnot(Vec16b(a), Vec16b(b))); }
+
+/*****************************************************************************
+ *
+ *          Vector of 512 1-bit unsigned integers (base class for Vec16i and Vec8q)
+ *
+ *****************************************************************************/
+class Vec512b
+{
+ protected:
+  __m512i zmm;  // Integer vector
+ public:
+  // Default constructor:
+  Vec512b() {}
+  // Constructor to build from two Vec256b:
+  Vec512b(Vec256b const &a0, Vec256b const &a1) { zmm = _mm512_inserti64x4(_mm512_castsi256_si512(a0), a1, 1); }
+  // Constructor to convert from type __m512i used in intrinsics:
+  Vec512b(__m512i const &x) { zmm = x; }
+  // Assignment operator to convert from type __m512i used in intrinsics:
+  Vec512b &operator=(__m512i const &x)
+  {
+    zmm = x;
+    return *this;
+  }
+  // Type cast operator to convert to __m512i used in intrinsics
+  operator __m512i() const { return zmm; }
+  // Member function to load from array (unaligned)
+  Vec512b &load(void const *p)
+  {
+    zmm = _mm512_loadu_si512(p);
+    return *this;
+  }
+  // Member function to load from array, aligned by 64
+  // You may use load_a instead of load if you are certain that p points to an address
+  // divisible by 64, but there is hardly any speed advantage of load_a on modern processors
+  Vec512b &load_a(void const *p)
+  {
+    zmm = _mm512_load_si512(p);
+    return *this;
+  }
+  // Member function to store into array (unaligned)
+  void store(void *p) const { _mm512_storeu_si512(p, zmm); }
+  // Member function to store into array, aligned by 64
+  // You may use store_a instead of store if you are certain that p points to an address
+  // divisible by 64, but there is hardly any speed advantage of store_a on modern processors
+  void store_a(void *p) const { _mm512_store_si512(p, zmm); }
+  // Member function to change a single bit, mainly for test purposes
+  // Note: This function is inefficient. Use load function if changing more than one bit
+  Vec512b const &set_bit(uint32_t index, int value)
+  {
+    static uint64_t m[16] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0};
+    int wi                = (index >> 6) & 7;  // qword index
+    int bi                = index & 0x3F;      // bit index within qword w
+
+    __m512i mask = Vec512b().load(m + 8 - wi);                     // 1 in qword number wi
+    mask         = _mm512_sll_epi64(mask, _mm_cvtsi32_si128(bi));  // mask with bit number b set
+    if(value & 1)
+      {
+        zmm = _mm512_or_si512(mask, zmm);
+      }
+    else
+      {
+        zmm = _mm512_andnot_si512(mask, zmm);
+      }
+    return *this;
+  }
+  // Member function to get a single bit, mainly for test purposes
+  // Note: This function is inefficient. Use store function if reading more than one bit
+  int get_bit(uint32_t index) const
+  {
+    union
+    {
+      __m512i z;
+      uint8_t i[64];
+    } u;
+    u.z    = zmm;
+    int wi = (index >> 3) & 0x3F;  // byte index
+    int bi = index & 7;            // bit index within byte w
+    return (u.i[wi] >> bi) & 1;
+  }
+  // Member functions to split into two Vec256b:
+  Vec256b get_low() const { return _mm512_castsi512_si256(zmm); }
+  Vec256b get_high() const { return _mm512_extracti64x4_epi64(zmm, 1); }
+  static int size() { return 512; }
+};
+
+// Define operators for this class
+
+// vector operator & : bitwise and
+static inline Vec512b operator&(Vec512b const &a, Vec512b const &b) { return _mm512_and_epi32(a, b); }
+static inline Vec512b operator&&(Vec512b const &a, Vec512b const &b) { return a & b; }
+
+// vector operator | : bitwise or
+static inline Vec512b operator|(Vec512b const &a, Vec512b const &b) { return _mm512_or_epi32(a, b); }
+static inline Vec512b operator||(Vec512b const &a, Vec512b const &b) { return a | b; }
+
+// vector operator ^ : bitwise xor
+static inline Vec512b operator^(Vec512b const &a, Vec512b const &b) { return _mm512_xor_epi32(a, b); }
+
+// vector operator ~ : bitwise not
+static inline Vec512b operator~(Vec512b const &a) { return _mm512_xor_epi32(a, _mm512_set1_epi32(-1)); }
+
+// vector operator &= : bitwise and
+static inline Vec512b &operator&=(Vec512b &a, Vec512b const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator |= : bitwise or
+static inline Vec512b &operator|=(Vec512b &a, Vec512b const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^= : bitwise xor
+static inline Vec512b &operator^=(Vec512b &a, Vec512b const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// Define functions for this class
+
+// function andnot: a & ~ b
+static inline Vec512b andnot(Vec512b const &a, Vec512b const &b) { return _mm512_andnot_epi32(b, a); }
+
+/*****************************************************************************
+ *
+ *          Vector of 16 32-bit signed integers
+ *
+ *****************************************************************************/
+
+class Vec16i : public Vec512b
+{
+ public:
+  // Default constructor:
+  Vec16i(){};
+  // Constructor to broadcast the same value into all elements:
+  Vec16i(int i) { zmm = _mm512_set1_epi32(i); };
+  // Constructor to build from all elements:
+  Vec16i(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4, int32_t i5, int32_t i6, int32_t i7, int32_t i8, int32_t i9,
+         int32_t i10, int32_t i11, int32_t i12, int32_t i13, int32_t i14, int32_t i15)
+  {
+    zmm = _mm512_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15);
+  };
+  // Constructor to build from two Vec8i:
+  Vec16i(Vec8i const &a0, Vec8i const &a1) { zmm = _mm512_inserti64x4(_mm512_castsi256_si512(a0), a1, 1); }
+  // Constructor to convert from type __m512i used in intrinsics:
+  Vec16i(__m512i const &x) { zmm = x; };
+  // Assignment operator to convert from type __m512i used in intrinsics:
+  Vec16i &operator=(__m512i const &x)
+  {
+    zmm = x;
+    return *this;
+  };
+  // Type cast operator to convert to __m512i used in intrinsics
+  operator __m512i() const { return zmm; };
+  // Member function to load from array (unaligned)
+  Vec16i &load(void const *p)
+  {
+    zmm = _mm512_loadu_si512(p);
+    return *this;
+  }
+  // Member function to load from array, aligned by 64
+  Vec16i &load_a(void const *p)
+  {
+    zmm = _mm512_load_si512(p);
+    return *this;
+  }
+  // Partial load. Load n elements and set the rest to 0
+  Vec16i &load_partial(int n, void const *p)
+  {
+    zmm = _mm512_maskz_loadu_epi32(__mmask16((1 << n) - 1), p);
+    return *this;
+  }
+  // Partial store. Store n elements
+  void store_partial(int n, void *p) const { _mm512_mask_storeu_epi32(p, __mmask16((1 << n) - 1), zmm); }
+  // cut off vector to n elements. The last 16-n elements are set to zero
+  Vec16i &cutoff(int n)
+  {
+    zmm = _mm512_maskz_mov_epi32(__mmask16((1 << n) - 1), zmm);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  Vec16i const &insert(uint32_t index, int32_t value)
+  {
+    zmm = _mm512_mask_set1_epi32(zmm, __mmask16(1 << index), value);
+    return *this;
+  };
+  // Member function extract a single element from vector
+  int32_t extract(uint32_t index) const
+  {
+    int32_t a[16];
+    store(a);
+    return a[index & 15];
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  int32_t operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec8i:
+  Vec8i get_low() const { return _mm512_castsi512_si256(zmm); }
+  Vec8i get_high() const { return _mm512_extracti64x4_epi64(zmm, 1); }
+  static int size() { return 16; }
+};
+
+// Define operators for Vec16i
+
+// vector operator + : add element by element
+static inline Vec16i operator+(Vec16i const &a, Vec16i const &b) { return _mm512_add_epi32(a, b); }
+
+// vector operator += : add
+static inline Vec16i &operator+=(Vec16i &a, Vec16i const &b)
+{
+  a = a + b;
+  return a;
+}
+
+// postfix operator ++
+static inline Vec16i operator++(Vec16i &a, int)
+{
+  Vec16i a0 = a;
+  a         = a + 1;
+  return a0;
+}
+
+// prefix operator ++
+static inline Vec16i &operator++(Vec16i &a)
+{
+  a = a + 1;
+  return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec16i operator-(Vec16i const &a, Vec16i const &b) { return _mm512_sub_epi32(a, b); }
+
+// vector operator - : unary minus
+static inline Vec16i operator-(Vec16i const &a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a); }
+
+// vector operator -= : subtract
+static inline Vec16i &operator-=(Vec16i &a, Vec16i const &b)
+{
+  a = a - b;
+  return a;
+}
+
+// postfix operator --
+static inline Vec16i operator--(Vec16i &a, int)
+{
+  Vec16i a0 = a;
+  a         = a - 1;
+  return a0;
+}
+
+// prefix operator --
+static inline Vec16i &operator--(Vec16i &a)
+{
+  a = a - 1;
+  return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec16i operator*(Vec16i const &a, Vec16i const &b) { return _mm512_mullo_epi32(a, b); }
+
+// vector operator *= : multiply
+static inline Vec16i &operator*=(Vec16i &a, Vec16i const &b)
+{
+  a = a * b;
+  return a;
+}
+
+// vector operator / : divide all elements by same integer
+// See bottom of file
+
+// vector operator << : shift left
+static inline Vec16i operator<<(Vec16i const &a, int32_t b) { return _mm512_sll_epi32(a, _mm_cvtsi32_si128(b)); }
+
+// vector operator <<= : shift left
+static inline Vec16i &operator<<=(Vec16i &a, int32_t b)
+{
+  a = a << b;
+  return a;
+}
+
+// vector operator >> : shift right arithmetic
+static inline Vec16i operator>>(Vec16i const &a, int32_t b) { return _mm512_sra_epi32(a, _mm_cvtsi32_si128(b)); }
+
+// vector operator >>= : shift right arithmetic
+static inline Vec16i &operator>>=(Vec16i &a, int32_t b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec16ib operator==(Vec16i const &a, Vec16i const &b) { return _mm512_cmpeq_epi32_mask(a, b); }
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec16ib operator!=(Vec16i const &a, Vec16i const &b) { return _mm512_cmpneq_epi32_mask(a, b); }
+
+// vector operator > : returns true for elements for which a > b
+static inline Vec16ib operator>(Vec16i const &a, Vec16i const &b) { return _mm512_cmpgt_epi32_mask(a, b); }
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec16ib operator<(Vec16i const &a, Vec16i const &b) { return b > a; }
+
+// vector operator >= : returns true for elements for which a >= b (signed)
+static inline Vec16ib operator>=(Vec16i const &a, Vec16i const &b) { return _mm512_cmpge_epi32_mask(a, b); }
+
+// vector operator <= : returns true for elements for which a <= b (signed)
+static inline Vec16ib operator<=(Vec16i const &a, Vec16i const &b) { return b >= a; }
+
+// vector operator & : bitwise and
+static inline Vec16i operator&(Vec16i const &a, Vec16i const &b) { return _mm512_and_epi32(a, b); }
+
+// vector operator &= : bitwise and
+static inline Vec16i &operator&=(Vec16i &a, Vec16i const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec16i operator|(Vec16i const &a, Vec16i const &b) { return _mm512_or_epi32(a, b); }
+
+// vector operator |= : bitwise or
+static inline Vec16i &operator|=(Vec16i &a, Vec16i const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec16i operator^(Vec16i const &a, Vec16i const &b) { return _mm512_xor_epi32(a, b); }
+
+// vector operator ^= : bitwise xor
+static inline Vec16i &operator^=(Vec16i &a, Vec16i const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec16i operator~(Vec16i const &a) { return a ^ Vec16i(-1); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
+static inline Vec16i select(Vec16ib const &s, Vec16i const &a, Vec16i const &b)
+{
+  return _mm512_mask_mov_epi32(b, s, a);  // conditional move may be optimized better by the compiler than blend
+                                          // return _mm512_mask_blend_epi32(s, b, a);
+}
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec16i if_add(Vec16ib const &f, Vec16i const &a, Vec16i const &b) { return _mm512_mask_add_epi32(a, f, a, b); }
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline int32_t horizontal_add(Vec16i const &a)
+{
+#if defined(__INTEL_COMPILER)
+  return _mm512_reduce_add_epi32(a);
+#else
+  return horizontal_add(a.get_low() + a.get_high());
+#endif
+}
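+
+// Example (illustrative): masked accumulation with if_add() and horizontal_add().
+//
+//   Vec16i a(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+//   Vec16i acc(0);
+//   acc = if_add(a > 8, acc, a);           // add only the elements larger than 8
+//   int32_t total = horizontal_add(acc);   // 9 + 10 + ... + 16 = 100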
+
+// function add_saturated: add element by element, signed with saturation
+// (is it faster to up-convert to 64 bit integers, and then downconvert the sum with saturation?)
+static inline Vec16i add_saturated(Vec16i const &a, Vec16i const &b)
+{
+  __m512i sum    = _mm512_add_epi32(a, b);                                  // a + b
+  __m512i axb    = _mm512_xor_epi32(a, b);                                  // check if a and b have different sign
+  __m512i axs    = _mm512_xor_epi32(a, sum);                                // check if a and sum have different sign
+  __m512i ovf1   = _mm512_andnot_epi32(axb, axs);                           // check if sum has wrong sign
+  __m512i ovf2   = _mm512_srai_epi32(ovf1, 31);                             // -1 if overflow
+  __mmask16 ovf3 = _mm512_cmpneq_epi32_mask(ovf2, _mm512_setzero_epi32());  // same, as mask
+  __m512i asign  = _mm512_srli_epi32(a, 31);                                // 1  if a < 0
+  __m512i sat1   = _mm512_srli_epi32(ovf2, 1);                              // 7FFFFFFF if overflow
+  __m512i sat2   = _mm512_add_epi32(sat1, asign);   // 7FFFFFFF if positive overflow 80000000 if negative overflow
+  return _mm512_mask_blend_epi32(ovf3, sum, sat2);  // sum if not overflow, else sat2
+}
+
+// function sub_saturated: subtract element by element, signed with saturation
+static inline Vec16i sub_saturated(Vec16i const &a, Vec16i const &b)
+{
+  __m512i diff   = _mm512_sub_epi32(a, b);                                  // a - b
+  __m512i axb    = _mm512_xor_si512(a, b);                                  // check if a and b have different sign
+  __m512i axs    = _mm512_xor_si512(a, diff);                               // check if a and diff have different sign
+  __m512i ovf1   = _mm512_and_si512(axb, axs);                              // check if diff has wrong sign
+  __m512i ovf2   = _mm512_srai_epi32(ovf1, 31);                             // -1 if overflow
+  __mmask16 ovf3 = _mm512_cmpneq_epi32_mask(ovf2, _mm512_setzero_epi32());  // same, as mask
+  __m512i asign  = _mm512_srli_epi32(a, 31);                                // 1  if a < 0
+  __m512i sat1   = _mm512_srli_epi32(ovf2, 1);                              // 7FFFFFFF if overflow
+  __m512i sat2   = _mm512_add_epi32(sat1, asign);    // 7FFFFFFF if positive overflow 80000000 if negative overflow
+  return _mm512_mask_blend_epi32(ovf3, diff, sat2);  // sum if not overflow, else sat2
+}
+
+// function max: a > b ? a : b
+static inline Vec16i max(Vec16i const &a, Vec16i const &b) { return _mm512_max_epi32(a, b); }
+
+// function min: a < b ? a : b
+static inline Vec16i min(Vec16i const &a, Vec16i const &b) { return _mm512_min_epi32(a, b); }
+
+// function abs: a >= 0 ? a : -a
+static inline Vec16i abs(Vec16i const &a) { return _mm512_abs_epi32(a); }
+
+// function abs_saturated: same as abs, saturate if overflow
+static inline Vec16i abs_saturated(Vec16i const &a) { return _mm512_min_epu32(abs(a), Vec16i(0x7FFFFFFF)); }
+
+// function rotate_left all elements
+// Use negative count to rotate right
+static inline Vec16i rotate_left(Vec16i const &a, int b) { return _mm512_rolv_epi32(a, Vec16i(b)); }
+
+/*****************************************************************************
+ *
+ *          Vector of 16 32-bit unsigned integers
+ *
+ *****************************************************************************/
+
+class Vec16ui : public Vec16i
+{
+ public:
+  // Default constructor:
+  Vec16ui(){};
+  // Constructor to broadcast the same value into all elements:
+  Vec16ui(uint32_t i) { zmm = _mm512_set1_epi32(i); };
+  // Constructor to build from all elements:
+  Vec16ui(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7, uint32_t i8,
+          uint32_t i9, uint32_t i10, uint32_t i11, uint32_t i12, uint32_t i13, uint32_t i14, uint32_t i15)
+  {
+    zmm = _mm512_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15);
+  };
+  // Constructor to build from two Vec8ui:
+  Vec16ui(Vec8ui const &a0, Vec8ui const &a1) { zmm = Vec16i(Vec8i(a0), Vec8i(a1)); }
+  // Constructor to convert from type __m512i used in intrinsics:
+  Vec16ui(__m512i const &x) { zmm = x; };
+  // Assignment operator to convert from type __m512i used in intrinsics:
+  Vec16ui &operator=(__m512i const &x)
+  {
+    zmm = x;
+    return *this;
+  };
+  // Member function to load from array (unaligned)
+  Vec16ui &load(void const *p)
+  {
+    Vec16i::load(p);
+    return *this;
+  }
+  // Member function to load from array, aligned by 64
+  Vec16ui &load_a(void const *p)
+  {
+    Vec16i::load_a(p);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec16ui const &insert(uint32_t index, uint32_t value)
+  {
+    Vec16i::insert(index, value);
+    return *this;
+  }
+  // Member function extract a single element from vector
+  uint32_t extract(uint32_t index) const { return Vec16i::extract(index); }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  uint32_t operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec8ui:
+  Vec8ui get_low() const { return Vec8ui(Vec16i::get_low()); }
+  Vec8ui get_high() const { return Vec8ui(Vec16i::get_high()); }
+};
+
+// Define operators for this class
+
+// vector operator + : add
+static inline Vec16ui operator+(Vec16ui const &a, Vec16ui const &b) { return Vec16ui(Vec16i(a) + Vec16i(b)); }
+
+// vector operator - : subtract
+static inline Vec16ui operator-(Vec16ui const &a, Vec16ui const &b) { return Vec16ui(Vec16i(a) - Vec16i(b)); }
+
+// vector operator * : multiply
+static inline Vec16ui operator*(Vec16ui const &a, Vec16ui const &b) { return Vec16ui(Vec16i(a) * Vec16i(b)); }
+
+// vector operator / : divide
+// See bottom of file
+
+// vector operator >> : shift right logical all elements
+static inline Vec16ui operator>>(Vec16ui const &a, uint32_t b) { return _mm512_srl_epi32(a, _mm_cvtsi32_si128(b)); }
+
+// vector operator >> : shift right logical all elements
+static inline Vec16ui operator>>(Vec16ui const &a, int32_t b) { return a >> (uint32_t)b; }
+
+// vector operator >>= : shift right logical
+static inline Vec16ui &operator>>=(Vec16ui &a, uint32_t b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator >>= : shift right logical
+static inline Vec16ui &operator>>=(Vec16ui &a, int32_t b)
+{
+  a = a >> uint32_t(b);
+  return a;
+}
+
+// vector operator << : shift left all elements
+static inline Vec16ui operator<<(Vec16ui const &a, uint32_t b) { return Vec16ui((Vec16i)a << (int32_t)b); }
+
+// vector operator << : shift left all elements
+static inline Vec16ui operator<<(Vec16ui const &a, int32_t b) { return Vec16ui((Vec16i)a << (int32_t)b); }
+
+// vector operator < : returns true for elements for which a < b (unsigned)
+static inline Vec16ib operator<(Vec16ui const &a, Vec16ui const &b) { return _mm512_cmplt_epu32_mask(a, b); }
+
+// vector operator > : returns true for elements for which a > b (unsigned)
+static inline Vec16ib operator>(Vec16ui const &a, Vec16ui const &b) { return b < a; }
+
+// vector operator >= : returns true for elements for which a >= b (unsigned)
+static inline Vec16ib operator>=(Vec16ui const &a, Vec16ui const &b) { return _mm512_cmpge_epu32_mask(a, b); }
+
+// vector operator <= : returns true for elements for which a <= b (unsigned)
+static inline Vec16ib operator<=(Vec16ui const &a, Vec16ui const &b) { return b >= a; }
+
+// vector operator & : bitwise and
+static inline Vec16ui operator&(Vec16ui const &a, Vec16ui const &b) { return Vec16ui(Vec16i(a) & Vec16i(b)); }
+
+// vector operator | : bitwise or
+static inline Vec16ui operator|(Vec16ui const &a, Vec16ui const &b) { return Vec16ui(Vec16i(a) | Vec16i(b)); }
+
+// vector operator ^ : bitwise xor
+static inline Vec16ui operator^(Vec16ui const &a, Vec16ui const &b) { return Vec16ui(Vec16i(a) ^ Vec16i(b)); }
+
+// vector operator ~ : bitwise not
+static inline Vec16ui operator~(Vec16ui const &a) { return Vec16ui(~Vec16i(a)); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
+static inline Vec16ui select(Vec16ib const &s, Vec16ui const &a, Vec16ui const &b) { return Vec16ui(select(s, Vec16i(a), Vec16i(b))); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec16ui if_add(Vec16ib const &f, Vec16ui const &a, Vec16ui const &b) { return Vec16ui(if_add(f, Vec16i(a), Vec16i(b))); }
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline uint32_t horizontal_add(Vec16ui const &a) { return horizontal_add((Vec16i)a); }
+
+// horizontal_add_x: Horizontal add extended: Calculates the sum of all vector elements. Defined later in this file
+
+// function add_saturated: add element by element, unsigned with saturation
+static inline Vec16ui add_saturated(Vec16ui const &a, Vec16ui const &b)
+{
+  Vec16ui sum      = a + b;
+  Vec16ib overflow = sum < (a | b);                  // overflow if (a + b) < (a | b)
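+  // (without wrap-around a + b = (a | b) + (a & b) >= (a | b), while a wrapped sum is
+  //  smaller than either operand and therefore smaller than a | b)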
+  return _mm512_mask_set1_epi32(sum, overflow, -1);  // 0xFFFFFFFF if overflow
+}
+
+// function sub_saturated: subtract element by element, unsigned with saturation
+static inline Vec16ui sub_saturated(Vec16ui const &a, Vec16ui const &b)
+{
+  Vec16ui diff = a - b;
+  return _mm512_maskz_mov_epi32(diff <= a, diff);  // underflow if diff > a gives zero
+}
+
+// function max: a > b ? a : b
+static inline Vec16ui max(Vec16ui const &a, Vec16ui const &b) { return _mm512_max_epu32(a, b); }
+
+// function min: a < b ? a : b
+static inline Vec16ui min(Vec16ui const &a, Vec16ui const &b) { return _mm512_min_epu32(a, b); }
+
+/*****************************************************************************
+ *
+ *          Vector of 8 64-bit signed integers
+ *
+ *****************************************************************************/
+
+class Vec8q : public Vec512b
+{
+ public:
+  // Default constructor:
+  Vec8q() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec8q(int64_t i) { zmm = _mm512_set1_epi64(i); }
+  // Constructor to build from all elements:
+  Vec8q(int64_t i0, int64_t i1, int64_t i2, int64_t i3, int64_t i4, int64_t i5, int64_t i6, int64_t i7)
+  {
+    zmm = _mm512_setr_epi64(i0, i1, i2, i3, i4, i5, i6, i7);
+  }
+  // Constructor to build from two Vec4q:
+  Vec8q(Vec4q const &a0, Vec4q const &a1) { zmm = _mm512_inserti64x4(_mm512_castsi256_si512(a0), a1, 1); }
+  // Constructor to convert from type __m512i used in intrinsics:
+  Vec8q(__m512i const &x) { zmm = x; }
+  // Assignment operator to convert from type __m512i used in intrinsics:
+  Vec8q &operator=(__m512i const &x)
+  {
+    zmm = x;
+    return *this;
+  }
+  // Type cast operator to convert to __m512i used in intrinsics
+  operator __m512i() const { return zmm; }
+  // Member function to load from array (unaligned)
+  Vec8q &load(void const *p)
+  {
+    zmm = _mm512_loadu_si512(p);
+    return *this;
+  }
+  // Member function to load from array, aligned by 64
+  Vec8q &load_a(void const *p)
+  {
+    zmm = _mm512_load_si512(p);
+    return *this;
+  }
+  // Partial load. Load n elements and set the rest to 0
+  Vec8q &load_partial(int n, void const *p)
+  {
+    zmm = _mm512_maskz_loadu_epi64(__mmask16((1 << n) - 1), p);
+    return *this;
+  }
+  // Partial store. Store n elements
+  void store_partial(int n, void *p) const { _mm512_mask_storeu_epi64(p, __mmask16((1 << n) - 1), zmm); }
+  // cut off vector to n elements. The last 8-n elements are set to zero
+  Vec8q &cutoff(int n)
+  {
+    zmm = _mm512_maskz_mov_epi64(__mmask16((1 << n) - 1), zmm);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec8q const &insert(uint32_t index, int64_t value)
+  {
+    zmm = _mm512_mask_set1_epi64(zmm, __mmask16(1 << index), value);
+    // zmm = _mm512_mask_blend_epi64(__mmask16(1 << index), zmm, _mm512_set1_epi64(value));
+    return *this;
+  }
+  // Member function extract a single element from vector
+  int64_t extract(uint32_t index) const
+  {
+    int64_t a[8];
+    store(a);
+    return a[index & 7];
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  int64_t operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec4q:
+  Vec4q get_low() const { return _mm512_castsi512_si256(zmm); }
+  Vec4q get_high() const { return _mm512_extracti64x4_epi64(zmm, 1); }
+  static int size() { return 8; }
+};
+
+// Define operators for Vec8q
+
+// vector operator + : add element by element
+static inline Vec8q operator+(Vec8q const &a, Vec8q const &b) { return _mm512_add_epi64(a, b); }
+
+// vector operator += : add
+static inline Vec8q &operator+=(Vec8q &a, Vec8q const &b)
+{
+  a = a + b;
+  return a;
+}
+
+// postfix operator ++
+static inline Vec8q operator++(Vec8q &a, int)
+{
+  Vec8q a0 = a;
+  a        = a + 1;
+  return a0;
+}
+
+// prefix operator ++
+static inline Vec8q &operator++(Vec8q &a)
+{
+  a = a + 1;
+  return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec8q operator-(Vec8q const &a, Vec8q const &b) { return _mm512_sub_epi64(a, b); }
+
+// vector operator - : unary minus
+static inline Vec8q operator-(Vec8q const &a) { return _mm512_sub_epi64(_mm512_setzero_epi32(), a); }
+
+// vector operator -= : subtract
+static inline Vec8q &operator-=(Vec8q &a, Vec8q const &b)
+{
+  a = a - b;
+  return a;
+}
+
+// postfix operator --
+static inline Vec8q operator--(Vec8q &a, int)
+{
+  Vec8q a0 = a;
+  a        = a - 1;
+  return a0;
+}
+
+// prefix operator --
+static inline Vec8q &operator--(Vec8q &a)
+{
+  a = a - 1;
+  return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec8q operator*(Vec8q const &a, Vec8q const &b)
+{
+#ifdef __AVX512DQ__
+  return _mm512_mullo_epi64(a, b);
+#elif defined(__INTEL_COMPILER)
+  return _mm512_mullox_epi64(a, b);  // _mm512_mullox_epi64 missing in gcc
+#else
+  // instruction does not exist. Split into 32-bit multiplies
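+  // (2^32*ah + al)*(2^32*bh + bl) mod 2^64 = al*bl + 2^32*(ah*bl + al*bh); the 2^64*ah*bh term wraps away,
+  // so three 32x32->64 bit multiplies and two additions reproduce the low 64 bits of the product.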
+  //__m512i ahigh = _mm512_shuffle_epi32(a, 0xB1);       // swap H<->L
+  __m512i ahigh   = _mm512_srli_epi64(a, 32);            // high 32 bits of each a
+  __m512i bhigh   = _mm512_srli_epi64(b, 32);            // high 32 bits of each b
+  __m512i prodahb = _mm512_mul_epu32(ahigh, b);          // ahigh*b
+  __m512i prodbha = _mm512_mul_epu32(bhigh, a);          // bhigh*a
+  __m512i prodhl  = _mm512_add_epi64(prodahb, prodbha);  // sum of high*low products
+  __m512i prodhi  = _mm512_slli_epi64(prodhl, 32);       // same, shifted high
+  __m512i prodll  = _mm512_mul_epu32(a, b);              // alow*blow = 64 bit unsigned products
+  __m512i prod    = _mm512_add_epi64(prodll, prodhi);    // low*low+(high*low)<<32
+  return prod;
+#endif
+}
+
+// vector operator *= : multiply
+static inline Vec8q &operator*=(Vec8q &a, Vec8q const &b)
+{
+  a = a * b;
+  return a;
+}
+
+// vector operator << : shift left
+static inline Vec8q operator<<(Vec8q const &a, int32_t b) { return _mm512_sll_epi64(a, _mm_cvtsi32_si128(b)); }
+
+// vector operator <<= : shift left
+static inline Vec8q &operator<<=(Vec8q &a, int32_t b)
+{
+  a = a << b;
+  return a;
+}
+
+// vector operator >> : shift right arithmetic
+static inline Vec8q operator>>(Vec8q const &a, int32_t b) { return _mm512_sra_epi64(a, _mm_cvtsi32_si128(b)); }
+
+// vector operator >>= : shift right arithmetic
+static inline Vec8q &operator>>=(Vec8q &a, int32_t b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec8qb operator==(Vec8q const &a, Vec8q const &b) { return Vec8qb(_mm512_cmpeq_epi64_mask(a, b)); }
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec8qb operator!=(Vec8q const &a, Vec8q const &b) { return Vec8qb(_mm512_cmpneq_epi64_mask(a, b)); }
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec8qb operator<(Vec8q const &a, Vec8q const &b) { return Vec8qb(_mm512_cmplt_epi64_mask(a, b)); }
+
+// vector operator > : returns true for elements for which a > b
+static inline Vec8qb operator>(Vec8q const &a, Vec8q const &b) { return b < a; }
+
+// vector operator >= : returns true for elements for which a >= b (signed)
+static inline Vec8qb operator>=(Vec8q const &a, Vec8q const &b) { return Vec8qb(_mm512_cmpge_epi64_mask(a, b)); }
+
+// vector operator <= : returns true for elements for which a <= b (signed)
+static inline Vec8qb operator<=(Vec8q const &a, Vec8q const &b) { return b >= a; }
+
+// vector operator & : bitwise and
+static inline Vec8q operator&(Vec8q const &a, Vec8q const &b) { return _mm512_and_epi32(a, b); }
+
+// vector operator &= : bitwise and
+static inline Vec8q &operator&=(Vec8q &a, Vec8q const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec8q operator|(Vec8q const &a, Vec8q const &b) { return _mm512_or_epi32(a, b); }
+
+// vector operator |= : bitwise or
+static inline Vec8q &operator|=(Vec8q &a, Vec8q const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec8q operator^(Vec8q const &a, Vec8q const &b) { return _mm512_xor_epi32(a, b); }
+// vector operator ^= : bitwise xor
+static inline Vec8q &operator^=(Vec8q &a, Vec8q const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec8q operator~(Vec8q const &a) { return Vec8q(~Vec16i(a)); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+static inline Vec8q select(Vec8qb const &s, Vec8q const &a, Vec8q const &b)
+{
+  return _mm512_mask_mov_epi64(b, s, a);
+  // return _mm512_mask_blend_epi64(s, b, a);
+}
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec8q if_add(Vec8qb const &f, Vec8q const &a, Vec8q const &b) { return _mm512_mask_add_epi64(a, f, a, b); }
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline int64_t horizontal_add(Vec8q const &a)
+{
+#if defined(__INTEL_COMPILER)
+  return _mm512_reduce_add_epi64(a);
+#else
+  return horizontal_add(a.get_low() + a.get_high());
+#endif
+}
+
+// Horizontal add extended: Calculates the sum of all vector elements
+// Elements are sign extended before adding to avoid overflow
+static inline int64_t horizontal_add_x(Vec16i const &x)
+{
+  Vec8q a = _mm512_cvtepi32_epi64(x.get_low());
+  Vec8q b = _mm512_cvtepi32_epi64(x.get_high());
+  return horizontal_add(a + b);
+}
+
+// Horizontal add extended: Calculates the sum of all vector elements
+// Elements are zero extended before adding to avoid overflow
+static inline uint64_t horizontal_add_x(Vec16ui const &x)
+{
+  Vec8q a = _mm512_cvtepu32_epi64(x.get_low());
+  Vec8q b = _mm512_cvtepu32_epi64(x.get_high());
+  return horizontal_add(a + b);
+}
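+
+// Illustrative usage sketch (not part of the original library; the helper name example_sum16ui_wide is made up):
+// horizontal_add on Vec16ui wraps at 32 bits, while horizontal_add_x widens to 64 bits first, so sums of
+// large element values come out correctly.
+static inline uint64_t example_sum16ui_wide(uint32_t const *p)
+{
+  Vec16ui v;
+  v.load(p);                   // load 16 unsigned 32-bit values (unaligned)
+  return horizontal_add_x(v);  // 64-bit result, no 32-bit wrap-around
+}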
+
+// function max: a > b ? a : b
+static inline Vec8q max(Vec8q const &a, Vec8q const &b) { return _mm512_max_epi64(a, b); }
+
+// function min: a < b ? a : b
+static inline Vec8q min(Vec8q const &a, Vec8q const &b) { return _mm512_min_epi64(a, b); }
+
+// function abs: a >= 0 ? a : -a
+static inline Vec8q abs(Vec8q const &a) { return _mm512_abs_epi64(a); }
+
+// function abs_saturated: same as abs, saturate if overflow
+static inline Vec8q abs_saturated(Vec8q const &a) { return _mm512_min_epu64(abs(a), Vec8q(0x7FFFFFFFFFFFFFFF)); }
+
+// function rotate_left all elements
+// Use negative count to rotate right
+static inline Vec8q rotate_left(Vec8q const &a, int b) { return _mm512_rolv_epi64(a, Vec8q(b)); }
+
+/*****************************************************************************
+ *
+ *          Vector of 8 64-bit unsigned integers
+ *
+ *****************************************************************************/
+
+class Vec8uq : public Vec8q
+{
+ public:
+  // Default constructor:
+  Vec8uq() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec8uq(uint64_t i) { zmm = Vec8q(i); }
+  // Constructor to convert from Vec8q:
+  Vec8uq(Vec8q const &x) { zmm = x; }
+  // Constructor to convert from type __m512i used in intrinsics:
+  Vec8uq(__m512i const &x) { zmm = x; }
+  // Constructor to build from all elements:
+  Vec8uq(uint64_t i0, uint64_t i1, uint64_t i2, uint64_t i3, uint64_t i4, uint64_t i5, uint64_t i6, uint64_t i7)
+  {
+    zmm = Vec8q(i0, i1, i2, i3, i4, i5, i6, i7);
+  }
+  // Constructor to build from two Vec4uq:
+  Vec8uq(Vec4uq const &a0, Vec4uq const &a1) { zmm = Vec8q(Vec4q(a0), Vec4q(a1)); }
+  // Assignment operator to convert from Vec8q:
+  Vec8uq &operator=(Vec8q const &x)
+  {
+    zmm = x;
+    return *this;
+  }
+  // Assignment operator to convert from type __m512i used in intrinsics:
+  Vec8uq &operator=(__m512i const &x)
+  {
+    zmm = x;
+    return *this;
+  }
+  // Member function to load from array (unaligned)
+  Vec8uq &load(void const *p)
+  {
+    Vec8q::load(p);
+    return *this;
+  }
+  // Member function to load from array, aligned by 64
+  Vec8uq &load_a(void const *p)
+  {
+    Vec8q::load_a(p);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec8uq const &insert(uint32_t index, uint64_t value)
+  {
+    Vec8q::insert(index, value);
+    return *this;
+  }
+  // Member function extract a single element from vector
+  uint64_t extract(uint32_t index) const { return Vec8q::extract(index); }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  uint64_t operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec4uq:
+  Vec4uq get_low() const { return Vec4uq(Vec8q::get_low()); }
+  Vec4uq get_high() const { return Vec4uq(Vec8q::get_high()); }
+};
+
+// Define operators for this class
+
+// vector operator + : add
+static inline Vec8uq operator+(Vec8uq const &a, Vec8uq const &b) { return Vec8uq(Vec8q(a) + Vec8q(b)); }
+
+// vector operator - : subtract
+static inline Vec8uq operator-(Vec8uq const &a, Vec8uq const &b) { return Vec8uq(Vec8q(a) - Vec8q(b)); }
+
+// vector operator * : multiply element by element
+static inline Vec8uq operator*(Vec8uq const &a, Vec8uq const &b) { return Vec8uq(Vec8q(a) * Vec8q(b)); }
+
+// vector operator >> : shift right logical all elements
+static inline Vec8uq operator>>(Vec8uq const &a, uint32_t b) { return _mm512_srl_epi64(a, _mm_cvtsi32_si128(b)); }
+
+// vector operator >> : shift right logical all elements
+static inline Vec8uq operator>>(Vec8uq const &a, int32_t b) { return a >> (uint32_t)b; }
+
+// vector operator >>= : shift right logical
+static inline Vec8uq &operator>>=(Vec8uq &a, uint32_t b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator >>= : shift right logical
+static inline Vec8uq &operator>>=(Vec8uq &a, int32_t b)
+{
+  a = a >> uint32_t(b);
+  return a;
+}
+
+// vector operator << : shift left all elements
+static inline Vec8uq operator<<(Vec8uq const &a, uint32_t b) { return Vec8uq((Vec8q)a << (int32_t)b); }
+
+// vector operator << : shift left all elements
+static inline Vec8uq operator<<(Vec8uq const &a, int32_t b) { return Vec8uq((Vec8q)a << b); }
+
+// vector operator < : returns true for elements for which a < b (unsigned)
+static inline Vec8qb operator<(Vec8uq const &a, Vec8uq const &b) { return _mm512_cmplt_epu64_mask(a, b); }
+
+// vector operator > : returns true for elements for which a > b (unsigned)
+static inline Vec8qb operator>(Vec8uq const &a, Vec8uq const &b) { return b < a; }
+
+// vector operator >= : returns true for elements for which a >= b (unsigned)
+static inline Vec8qb operator>=(Vec8uq const &a, Vec8uq const &b) { return _mm512_cmpge_epu64_mask(a, b); }
+
+// vector operator <= : returns true for elements for which a <= b (unsigned)
+static inline Vec8qb operator<=(Vec8uq const &a, Vec8uq const &b) { return b >= a; }
+
+// vector operator & : bitwise and
+static inline Vec8uq operator&(Vec8uq const &a, Vec8uq const &b) { return Vec8uq(Vec8q(a) & Vec8q(b)); }
+
+// vector operator | : bitwise or
+static inline Vec8uq operator|(Vec8uq const &a, Vec8uq const &b) { return Vec8uq(Vec8q(a) | Vec8q(b)); }
+
+// vector operator ^ : bitwise xor
+static inline Vec8uq operator^(Vec8uq const &a, Vec8uq const &b) { return Vec8uq(Vec8q(a) ^ Vec8q(b)); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+static inline Vec8uq select(Vec8qb const &s, Vec8uq const &a, Vec8uq const &b) { return Vec8uq(select(s, Vec8q(a), Vec8q(b))); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec8uq if_add(Vec8qb const &f, Vec8uq const &a, Vec8uq const &b) { return _mm512_mask_add_epi64(a, f, a, b); }
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline uint64_t horizontal_add(Vec8uq const &a) { return horizontal_add(Vec8q(a)); }
+
+// function max: a > b ? a : b
+static inline Vec8uq max(Vec8uq const &a, Vec8uq const &b) { return _mm512_max_epu64(a, b); }
+
+// function min: a < b ? a : b
+static inline Vec8uq min(Vec8uq const &a, Vec8uq const &b) { return _mm512_min_epu64(a, b); }
+
+/*****************************************************************************
+ *
+ *          Vector permute functions
+ *
+ ******************************************************************************
+ *
+ * These permute functions can reorder the elements of a vector and optionally
+ * set some elements to zero.
+ *
+ * The indexes are inserted as template parameters in <>. These indexes must be
+ * constants. Each template parameter is an index to the element you want to select.
+ * An index of -1 will generate zero. An index of -256 means don't care.
+ *
+ * Example:
+ * Vec8q a(10,11,12,13,14,15,16,17);      // a is (10,11,12,13,14,15,16,17)
+ * Vec8q b;
+ * b = permute8q<0,2,7,7,-1,-1,1,1>(a);   // b is (10,12,17,17, 0, 0,11,11)
+ *
+ * A lot of the code here is metaprogramming aiming to find the instructions
+ * that best fit the template parameters and instruction set. The metacode
+ * will be reduced out to leave only a few vector instructions in release
+ * mode with optimization on.
+ *****************************************************************************/
+
+// Permute vector of 8 64-bit integers.
+// Index -1 gives 0, index -256 means don't care.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8q permute8q(Vec8q const &a)
+{
+  // Combine indexes into a single bitfield, with 4 bits for each
+  const int m1 =
+      (i0 & 7) | (i1 & 7) << 4 | (i2 & 7) << 8 | (i3 & 7) << 12 | (i4 & 7) << 16 | (i5 & 7) << 20 | (i6 & 7) << 24 | (i7 & 7) << 28;
+
+  // Mask to zero out negative indexes
+  const int mz = (i0 < 0 ? 0 : 0xF) | (i1 < 0 ? 0 : 0xF0) | (i2 < 0 ? 0 : 0xF00) | (i3 < 0 ? 0 : 0xF000) | (i4 < 0 ? 0 : 0xF0000) |
+                 (i5 < 0 ? 0 : 0xF00000) | (i6 < 0 ? 0 : 0xF000000) | (i7 < 0 ? 0 : 0xF0000000);
+  const int m2 = m1 & mz;
+
+  // zeroing needed
+  const bool dozero = ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) & 0x80) != 0;
+
+  // special case: all zero
+  if(mz == 0)
+    return _mm512_setzero_epi32();
+
+  // mask for elements not zeroed
+  const __mmask16 z = __mmask16((i0 >= 0) << 0 | (i1 >= 0) << 1 | (i2 >= 0) << 2 | (i3 >= 0) << 3 | (i4 >= 0) << 4 | (i5 >= 0) << 5 |
+                                (i6 >= 0) << 6 | (i7 >= 0) << 7);
+  // same with 2 bits for each element
+  const __mmask16 zz = __mmask16((i0 >= 0 ? 3 : 0) | (i1 >= 0 ? 0xC : 0) | (i2 >= 0 ? 0x30 : 0) | (i3 >= 0 ? 0xC0 : 0) |
+                                 (i4 >= 0 ? 0x300 : 0) | (i5 >= 0 ? 0xC00 : 0) | (i6 >= 0 ? 0x3000 : 0) | (i7 >= 0 ? 0xC000 : 0));
+
+  if(((m1 ^ 0x76543210) & mz) == 0)
+    {
+      // no shuffling
+      if(dozero)
+        {
+          // zero some elements
+          return _mm512_maskz_mov_epi64(z, a);
+        }
+      return a;  // do nothing
+    }
+
+  if(((m1 ^ 0x66442200) & 0x66666666 & mz) == 0)
+    {
+      // no exchange of data between the four 128-bit lanes
+      const int pat   = ((m2 | m2 >> 8 | m2 >> 16 | m2 >> 24) & 0x11) * 0x01010101;
+      const int pmask = ((pat & 1) * 10 + 4) | ((((pat >> 4) & 1) * 10 + 4) << 4);
+      if(((m1 ^ pat) & mz & 0x11111111) == 0)
+        {
+          // same permute pattern in all lanes
+          if(dozero)
+            {  // permute within lanes and zero
+              return _mm512_maskz_shuffle_epi32(zz, a, (_MM_PERM_ENUM)pmask);
+            }
+          else
+            {  // permute within lanes
+              return _mm512_shuffle_epi32(a, (_MM_PERM_ENUM)pmask);
+            }
+        }
+      // different permute patterns in each lane. It's faster to do a full permute than four masked permutes within lanes
+    }
+  if((((m1 ^ 0x10101010) & 0x11111111 & mz) == 0) && ((m1 ^ (m1 >> 4)) & 0x06060606 & mz & (mz >> 4)) == 0)
+    {
+      // permute lanes only. no permutation within each lane
+      const int m3 = m2 | (m2 >> 4);
+      const int s  = ((m3 >> 1) & 3) | (((m3 >> 9) & 3) << 2) | (((m3 >> 17) & 3) << 4) | (((m3 >> 25) & 3) << 6);
+      if(dozero)
+        {
+          // permute lanes and zero some 64-bit elements
+          return _mm512_maskz_shuffle_i64x2(z, a, a, (_MM_PERM_ENUM)s);
+        }
+      else
+        {
+          // permute lanes
+          return _mm512_shuffle_i64x2(a, a, (_MM_PERM_ENUM)s);
+        }
+    }
+  // full permute needed
+  const __m512i pmask = constant16i<i0 & 7, 0, i1 & 7, 0, i2 & 7, 0, i3 & 7, 0, i4 & 7, 0, i5 & 7, 0, i6 & 7, 0, i7 & 7, 0>();
+  if(dozero)
+    {
+      // full permute and zeroing
+      // Note: documentation of the operand order is inconsistent; the order used here is (mask, index, source).
+      return _mm512_maskz_permutexvar_epi64(z, pmask, a);
+    }
+  else
+    {
+      return _mm512_permutexvar_epi64(pmask, a);
+    }
+}
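+
+// Illustrative usage sketch (not part of the original library; example_reverse_lanes is a made-up name):
+// reversing the four 128-bit lanes satisfies the "permute lanes only" test above, so with optimization
+// enabled this should reduce to a single _mm512_shuffle_i64x2 instruction.
+static inline Vec8q example_reverse_lanes(Vec8q const &a) { return permute8q<6, 7, 4, 5, 2, 3, 0, 1>(a); }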
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8uq permute8uq(Vec8uq const &a)
+{
+  return Vec8uq(permute8q<i0, i1, i2, i3, i4, i5, i6, i7>(a));
+}
+
+// Permute vector of 16 32-bit integers.
+// Index -1 gives 0, index -256 means don't care.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16i permute16i(Vec16i const &a)
+{
+  // Combine indexes into a single bitfield, with 4 bits for each
+  const uint64_t m1 = (i0 & 15) | (i1 & 15) << 4 | (i2 & 15) << 8 | (i3 & 15) << 12 | (i4 & 15) << 16 | (i5 & 15) << 20 |
+                      (i6 & 15) << 24 | (i7 & 15LL) << 28  // 15LL avoids sign extension of (int32_t | int64_t)
+                      | (i8 & 15LL) << 32 | (i9 & 15LL) << 36 | (i10 & 15LL) << 40 | (i11 & 15LL) << 44 | (i12 & 15LL) << 48 |
+                      (i13 & 15LL) << 52 | (i14 & 15LL) << 56 | (i15 & 15LL) << 60;
+
+  // Mask to zero out negative indexes
+  const uint64_t mz = (i0 < 0 ? 0 : 0xF) | (i1 < 0 ? 0 : 0xF0) | (i2 < 0 ? 0 : 0xF00) | (i3 < 0 ? 0 : 0xF000) |
+                      (i4 < 0 ? 0 : 0xF0000) | (i5 < 0 ? 0 : 0xF00000) | (i6 < 0 ? 0 : 0xF000000) | (i7 < 0 ? 0 : 0xF0000000ULL) |
+                      (i8 < 0 ? 0 : 0xF00000000) | (i9 < 0 ? 0 : 0xF000000000) | (i10 < 0 ? 0 : 0xF0000000000) |
+                      (i11 < 0 ? 0 : 0xF00000000000) | (i12 < 0 ? 0 : 0xF000000000000) | (i13 < 0 ? 0 : 0xF0000000000000) |
+                      (i14 < 0 ? 0 : 0xF00000000000000) | (i15 < 0 ? 0 : 0xF000000000000000);
+
+  const uint64_t m2 = m1 & mz;
+
+  // zeroing needed
+  const bool dozero = ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7 | i8 | i9 | i10 | i11 | i12 | i13 | i14 | i15) & 0x80) != 0;
+
+  // special case: all zero
+  if(mz == 0)
+    return _mm512_setzero_epi32();
+
+  // mask for elements not zeroed
+  const __mmask16 z = __mmask16((i0 >= 0) << 0 | (i1 >= 0) << 1 | (i2 >= 0) << 2 | (i3 >= 0) << 3 | (i4 >= 0) << 4 | (i5 >= 0) << 5 |
+                                (i6 >= 0) << 6 | (i7 >= 0) << 7 | (i8 >= 0) << 8 | (i9 >= 0) << 9 | (i10 >= 0) << 10 |
+                                (i11 >= 0) << 11 | (i12 >= 0) << 12 | (i13 >= 0) << 13 | (i14 >= 0) << 14 | (i15 >= 0) << 15);
+
+  if(((m1 ^ 0xFEDCBA9876543210) & mz) == 0)
+    {
+      // no shuffling
+      if(dozero)
+        {
+          // zero some elements
+          return _mm512_maskz_mov_epi32(z, a);
+        }
+      return a;  // do nothing
+    }
+
+  if(((m1 ^ 0xCCCC888844440000) & 0xCCCCCCCCCCCCCCCC & mz) == 0)
+    {
+      // no exchange of data between the four 128-bit lanes
+      const uint64_t pat = ((m2 | (m2 >> 16) | (m2 >> 32) | (m2 >> 48)) & 0x3333) * 0x0001000100010001;
+      const int pmask    = (pat & 3) | (((pat >> 4) & 3) << 2) | (((pat >> 8) & 3) << 4) | (((pat >> 12) & 3) << 6);
+      if(((m1 ^ pat) & 0x3333333333333333 & mz) == 0)
+        {
+          // same permute pattern in all lanes
+          if(dozero)
+            {  // permute within lanes and zero
+              return _mm512_maskz_shuffle_epi32(z, a, (_MM_PERM_ENUM)pmask);
+            }
+          else
+            {  // permute within lanes
+              return _mm512_shuffle_epi32(a, (_MM_PERM_ENUM)pmask);
+            }
+        }
+      // different permute patterns in each lane. It's faster to do a full permute than four masked permutes within lanes
+    }
+  const uint64_t lane = (m2 | m2 >> 4 | m2 >> 8 | m2 >> 12) & 0x000C000C000C000C;
+  if((((m1 ^ 0x3210321032103210) & 0x3333333333333333 & mz) == 0) && ((m1 ^ (lane * 0x1111)) & 0xCCCCCCCCCCCCCCCC & mz) == 0)
+    {
+      // permute lanes only. no permutation within each lane
+      const uint64_t s = ((lane >> 2) & 3) | (((lane >> 18) & 3) << 2) | (((lane >> 34) & 3) << 4) | (((lane >> 50) & 3) << 6);
+      if(dozero)
+        {
+          // permute lanes and zero some 64-bit elements
+          return _mm512_maskz_shuffle_i32x4(z, a, a, (_MM_PERM_ENUM)s);
+        }
+      else
+        {
+          // permute lanes
+          return _mm512_shuffle_i32x4(a, a, (_MM_PERM_ENUM)s);
+        }
+    }
+  // full permute needed
+  const __m512i pmask = constant16i<i0 & 15, i1 & 15, i2 & 15, i3 & 15, i4 & 15, i5 & 15, i6 & 15, i7 & 15, i8 & 15, i9 & 15, i10 & 15,
+                                    i11 & 15, i12 & 15, i13 & 15, i14 & 15, i15 & 15>();
+  if(dozero)
+    {
+      // full permute and zeroing
+      return _mm512_maskz_permutexvar_epi32(z, pmask, a);
+    }
+  else
+    {
+      return _mm512_permutexvar_epi32(pmask, a);
+    }
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16ui permute16ui(Vec16ui const &a)
+{
+  return Vec16ui(permute16i<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15>(a));
+}
+
+/*****************************************************************************
+ *
+ *          Vector blend functions
+ *
+ ******************************************************************************
+ *
+ * These blend functions can mix elements from two different vectors and
+ * optionally set some elements to zero.
+ *
+ * The indexes are inserted as template parameters in <>. These indexes must be
+ * constants. Each template parameter is an index to the element you want to
+ * select, where higher indexes indicate an element from the second source
+ * vector. For example, if each vector has 8 elements, then indexes 0 - 7
+ * will select an element from the first vector and indexes 8 - 15 will select
+ * an element from the second vector. A negative index will generate zero.
+ *
+ * Example:
+ * Vec8q a(100,101,102,103,104,105,106,107); // a is (100, 101, 102, 103, 104, 105, 106, 107)
+ * Vec8q b(200,201,202,203,204,205,206,207); // b is (200, 201, 202, 203, 204, 205, 206, 207)
+ * Vec8q c;
+ * c = blend8q<1,0,9,8,7,-1,15,15> (a,b);    // c is (101, 100, 201, 200, 107,   0, 207, 207)
+ *
+ * A lot of the code here is metaprogramming aiming to find the instructions
+ * that best fit the template parameters and instruction set. The metacode
+ * will be reduced out to leave only a few vector instructions in release
+ * mode with optimization on.
+ *****************************************************************************/
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8q blend8q(Vec8q const &a, Vec8q const &b)
+{
+  // Combine indexes into a single bitfield, with 4 bits for each
+  const int m1 = (i0 & 0xF) | (i1 & 0xF) << 4 | (i2 & 0xF) << 8 | (i3 & 0xF) << 12 | (i4 & 0xF) << 16 | (i5 & 0xF) << 20 |
+                 (i6 & 0xF) << 24 | (i7 & 0xF) << 28;
+
+  // Mask to zero out negative indexes
+  const int mz = (i0 < 0 ? 0 : 0xF) | (i1 < 0 ? 0 : 0xF0) | (i2 < 0 ? 0 : 0xF00) | (i3 < 0 ? 0 : 0xF000) | (i4 < 0 ? 0 : 0xF0000) |
+                 (i5 < 0 ? 0 : 0xF00000) | (i6 < 0 ? 0 : 0xF000000) | (i7 < 0 ? 0 : 0xF0000000);
+  const int m2 = m1 & mz;
+
+  // zeroing needed
+  const bool dozero = ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) & 0x80) != 0;
+
+  // mask for elements not zeroed
+  const __mmask16 z = __mmask16((i0 >= 0) << 0 | (i1 >= 0) << 1 | (i2 >= 0) << 2 | (i3 >= 0) << 3 | (i4 >= 0) << 4 | (i5 >= 0) << 5 |
+                                (i6 >= 0) << 6 | (i7 >= 0) << 7);
+
+  // special case: all zero
+  if(mz == 0)
+    return _mm512_setzero_epi32();
+
+  // special case: all from a
+  if((m1 & 0x88888888 & mz) == 0)
+    {
+      return permute8q<i0, i1, i2, i3, i4, i5, i6, i7>(a);
+    }
+
+  // special case: all from b
+  if((~m1 & 0x88888888 & mz) == 0)
+    {
+      return permute8q<i0 ^ 8, i1 ^ 8, i2 ^ 8, i3 ^ 8, i4 ^ 8, i5 ^ 8, i6 ^ 8, i7 ^ 8>(b);
+    }
+
+  // special case: blend without permute
+  if(((m1 ^ 0x76543210) & 0x77777777 & mz) == 0)
+    {
+      __mmask16 blendmask = __mmask16((i0 & 8) >> 3 | (i1 & 8) >> 2 | (i2 & 8) >> 1 | (i3 & 8) >> 0 | (i4 & 8) << 1 | (i5 & 8) << 2 |
+                                      (i6 & 8) << 3 | (i7 & 8) << 4);
+      __m512i t           = _mm512_mask_blend_epi64(blendmask, a, b);
+      if(dozero)
+        {
+          t = _mm512_maskz_mov_epi64(z, t);
+        }
+      return t;
+    }
+  // special case: all data stay within their lane
+  if(((m1 ^ 0x66442200) & 0x66666666 & mz) == 0)
+    {
+      // mask for elements from a and b
+      const uint32_t mb = ((i0 & 8) ? 0xF : 0) | ((i1 & 8) ? 0xF0 : 0) | ((i2 & 8) ? 0xF00 : 0) | ((i3 & 8) ? 0xF000 : 0) |
+                          ((i4 & 8) ? 0xF0000 : 0) | ((i5 & 8) ? 0xF00000 : 0) | ((i6 & 8) ? 0xF000000 : 0) |
+                          ((i7 & 8) ? 0xF0000000 : 0);
+      const uint32_t mbz  = mb & mz;   // mask for nonzero elements from b
+      const uint32_t maz  = ~mb & mz;  // mask for nonzero elements from a
+      const uint32_t m1a  = m1 & maz;
+      const uint32_t m1b  = m1 & mbz;
+      const uint32_t pata = ((m1a | m1a >> 8 | m1a >> 16 | m1a >> 24) & 0xFF) * 0x01010101;  // permute pattern for elements from a
+      const uint32_t patb = ((m1b | m1b >> 8 | m1b >> 16 | m1b >> 24) & 0xFF) * 0x01010101;  // permute pattern for elements from b
+      if(((m1 ^ pata) & 0x11111111 & maz) == 0 && ((m1 ^ patb) & 0x11111111 & mbz) == 0)
+        {
+          // Same permute pattern in all lanes:
+          // This code generates two instructions instead of one, but we are avoiding the slow lane-crossing instruction,
+          // and we are saving 64 bytes of data cache.
+          // 1. Permute a, zero elements not from a (using _mm512_maskz_shuffle_epi32)
+          __m512i ta = permute8q < (maz & 0xF) ? i0 & 7 : -1, (maz & 0xF0) ? i1 & 7 : -1, (maz & 0xF00) ? i2 & 7 : -1,
+                  (maz & 0xF000) ? i3 & 7 : -1, (maz & 0xF0000) ? i4 & 7 : -1, (maz & 0xF00000) ? i5 & 7 : -1,
+                  (maz & 0xF000000) ? i6 & 7 : -1, (maz & 0xF0000000) ? i7 & 7 : -1 > (a);
+          // write mask for elements from b
+          const __mmask16 sb = ((mbz & 0xF) ? 3 : 0) | ((mbz & 0xF0) ? 0xC : 0) | ((mbz & 0xF00) ? 0x30 : 0) |
+                               ((mbz & 0xF000) ? 0xC0 : 0) | ((mbz & 0xF0000) ? 0x300 : 0) | ((mbz & 0xF00000) ? 0xC00 : 0) |
+                               ((mbz & 0xF000000) ? 0x3000 : 0) | ((mbz & 0xF0000000) ? 0xC000 : 0);
+          // permute index for elements from b
+          const int pi = ((patb & 1) * 10 + 4) | ((((patb >> 4) & 1) * 10 + 4) << 4);
+          // 2. Permute elements from b and combine with elements from a through write mask
+          return _mm512_mask_shuffle_epi32(ta, sb, b, (_MM_PERM_ENUM)pi);
+        }
+      // not same permute pattern in all lanes. use full permute
+    }
+  // general case: full permute
+  const __m512i pmask =
+      constant16i<i0 & 0xF, 0, i1 & 0xF, 0, i2 & 0xF, 0, i3 & 0xF, 0, i4 & 0xF, 0, i5 & 0xF, 0, i6 & 0xF, 0, i7 & 0xF, 0>();
+  if(dozero)
+    {
+      return _mm512_maskz_permutex2var_epi64(z, a, pmask, b);
+    }
+  else
+    {
+      return _mm512_permutex2var_epi64(a, pmask, b);
+    }
+}
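+
+// Illustrative usage sketch (not part of the original library; example_interleave_even is a made-up name):
+// indexes 0-7 select from a and 8-15 from b, so this picks the even elements of both inputs, interleaved.
+static inline Vec8q example_interleave_even(Vec8q const &a, Vec8q const &b) { return blend8q<0, 8, 2, 10, 4, 12, 6, 14>(a, b); }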
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8uq blend8uq(Vec8uq const &a, Vec8uq const &b)
+{
+  return Vec8uq(blend8q<i0, i1, i2, i3, i4, i5, i6, i7>(a, b));
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16i blend16i(Vec16i const &a, Vec16i const &b)
+{
+  // Combine indexes into a single bitfield, with 4 bits for each indicating shuffle, but not source
+  const uint64_t m1 = (i0 & 0xF) | (i1 & 0xF) << 4 | (i2 & 0xF) << 8 | (i3 & 0xF) << 12 | (i4 & 0xF) << 16 | (i5 & 0xF) << 20 |
+                      (i6 & 0xF) << 24 | (i7 & 0xFLL) << 28 | (i8 & 0xFLL) << 32 | (i9 & 0xFLL) << 36 | (i10 & 0xFLL) << 40 |
+                      (i11 & 0xFLL) << 44 | (i12 & 0xFLL) << 48 | (i13 & 0xFLL) << 52 | (i14 & 0xFLL) << 56 | (i15 & 0xFLL) << 60;
+
+  // Mask to zero out negative indexes
+  const uint64_t mz = (i0 < 0 ? 0 : 0xF) | (i1 < 0 ? 0 : 0xF0) | (i2 < 0 ? 0 : 0xF00) | (i3 < 0 ? 0 : 0xF000) |
+                      (i4 < 0 ? 0 : 0xF0000) | (i5 < 0 ? 0 : 0xF00000) | (i6 < 0 ? 0 : 0xF000000) | (i7 < 0 ? 0 : 0xF0000000ULL) |
+                      (i8 < 0 ? 0 : 0xF00000000) | (i9 < 0 ? 0 : 0xF000000000) | (i10 < 0 ? 0 : 0xF0000000000) |
+                      (i11 < 0 ? 0 : 0xF00000000000) | (i12 < 0 ? 0 : 0xF000000000000) | (i13 < 0 ? 0 : 0xF0000000000000) |
+                      (i14 < 0 ? 0 : 0xF00000000000000) | (i15 < 0 ? 0 : 0xF000000000000000);
+  const uint64_t m2 = m1 & mz;
+
+  // collect bit 4 of each index = select source
+  const uint64_t ms = ((i0 & 16) ? 0xF : 0) | ((i1 & 16) ? 0xF0 : 0) | ((i2 & 16) ? 0xF00 : 0) | ((i3 & 16) ? 0xF000 : 0) |
+                      ((i4 & 16) ? 0xF0000 : 0) | ((i5 & 16) ? 0xF00000 : 0) | ((i6 & 16) ? 0xF000000 : 0) |
+                      ((i7 & 16) ? 0xF0000000ULL : 0) | ((i8 & 16) ? 0xF00000000 : 0) | ((i9 & 16) ? 0xF000000000 : 0) |
+                      ((i10 & 16) ? 0xF0000000000 : 0) | ((i11 & 16) ? 0xF00000000000 : 0) | ((i12 & 16) ? 0xF000000000000 : 0) |
+                      ((i13 & 16) ? 0xF0000000000000 : 0) | ((i14 & 16) ? 0xF00000000000000 : 0) |
+                      ((i15 & 16) ? 0xF000000000000000 : 0);
+
+  // zeroing needed
+  const bool dozero = ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7 | i8 | i9 | i10 | i11 | i12 | i13 | i14 | i15) & 0x80) != 0;
+
+  // mask for elements not zeroed
+  const __mmask16 z = __mmask16((i0 >= 0) << 0 | (i1 >= 0) << 1 | (i2 >= 0) << 2 | (i3 >= 0) << 3 | (i4 >= 0) << 4 | (i5 >= 0) << 5 |
+                                (i6 >= 0) << 6 | (i7 >= 0) << 7 | (i8 >= 0) << 8 | (i9 >= 0) << 9 | (i10 >= 0) << 10 |
+                                (i11 >= 0) << 11 | (i12 >= 0) << 12 | (i13 >= 0) << 13 | (i14 >= 0) << 14 | (i15 >= 0) << 15);
+
+  // special case: all zero
+  if(mz == 0)
+    return _mm512_setzero_epi32();
+
+  // special case: all from a
+  if((ms & mz) == 0)
+    {
+      return permute16i<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15>(a);
+    }
+
+  // special case: all from b
+  if((~ms & mz) == 0)
+    {
+      return permute16i<i0 ^ 16, i1 ^ 16, i2 ^ 16, i3 ^ 16, i4 ^ 16, i5 ^ 16, i6 ^ 16, i7 ^ 16, i8 ^ 16, i9 ^ 16, i10 ^ 16, i11 ^ 16,
+                        i12 ^ 16, i13 ^ 16, i14 ^ 16, i15 ^ 16>(b);
+    }
+
+  // special case: blend without permute
+  if(((m1 ^ 0xFEDCBA9876543210) & mz) == 0)
+    {
+      __mmask16 blendmask = __mmask16((i0 & 16) >> 4 | (i1 & 16) >> 3 | (i2 & 16) >> 2 | (i3 & 16) >> 1 | (i4 & 16) | (i5 & 16) << 1 |
+                                      (i6 & 16) << 2 | (i7 & 16) << 3 | (i8 & 16) << 4 | (i9 & 16) << 5 | (i10 & 16) << 6 |
+                                      (i11 & 16) << 7 | (i12 & 16) << 8 | (i13 & 16) << 9 | (i14 & 16) << 10 | (i15 & 16) << 11);
+      __m512i t           = _mm512_mask_blend_epi32(blendmask, a, b);
+      if(dozero)
+        {
+          t = _mm512_maskz_mov_epi32(z, t);
+        }
+      return t;
+    }
+
+  // special case: all data stay within their lane
+  if(((m1 ^ 0xCCCC888844440000) & 0xCCCCCCCCCCCCCCCC & mz) == 0)
+    {
+      // mask for elements from a and b
+      const uint64_t mb  = ms;
+      const uint64_t mbz = mb & mz;   // mask for nonzero elements from b
+      const uint64_t maz = ~mb & mz;  // mask for nonzero elements from a
+      const uint64_t m1a = m1 & maz;
+      const uint64_t m1b = m1 & mbz;
+      const uint64_t pata =
+          ((m1a | m1a >> 16 | m1a >> 32 | m1a >> 48) & 0xFFFF) * 0x0001000100010001;  // permute pattern for elements from a
+      const uint64_t patb =
+          ((m1b | m1b >> 16 | m1b >> 32 | m1b >> 48) & 0xFFFF) * 0x0001000100010001;  // permute pattern for elements from b
+      if(((m1 ^ pata) & 0x3333333333333333 & maz) == 0 && ((m1 ^ patb) & 0x3333333333333333 & mbz) == 0)
+        {
+          // Same permute pattern in all lanes:
+          // This code generates two instructions instead of one, but we are avoiding the slow lane-crossing instruction,
+          // and we are saving 64 bytes of data cache.
+          // 1. Permute a, zero elements not from a (using _mm512_maskz_shuffle_epi32)
+          __m512i ta = permute16i < (maz & 0xF) ? i0 & 15 : -1, (maz & 0xF0) ? i1 & 15 : -1, (maz & 0xF00) ? i2 & 15 : -1,
+                  (maz & 0xF000) ? i3 & 15 : -1, (maz & 0xF0000) ? i4 & 15 : -1, (maz & 0xF00000) ? i5 & 15 : -1,
+                  (maz & 0xF000000) ? i6 & 15 : -1, (maz & 0xF0000000) ? i7 & 15 : -1, (maz & 0xF00000000) ? i8 & 15 : -1,
+                  (maz & 0xF000000000) ? i9 & 15 : -1, (maz & 0xF0000000000) ? i10 & 15 : -1, (maz & 0xF00000000000) ? i11 & 15 : -1,
+                  (maz & 0xF000000000000) ? i12 & 15 : -1, (maz & 0xF0000000000000) ? i13 & 15 : -1,
+                  (maz & 0xF00000000000000) ? i14 & 15 : -1, (maz & 0xF000000000000000) ? i15 & 15 : -1 > (a);
+          // write mask for elements from b
+          const __mmask16 sb = ((mbz & 0xF) ? 1 : 0) | ((mbz & 0xF0) ? 0x2 : 0) | ((mbz & 0xF00) ? 0x4 : 0) |
+                               ((mbz & 0xF000) ? 0x8 : 0) | ((mbz & 0xF0000) ? 0x10 : 0) | ((mbz & 0xF00000) ? 0x20 : 0) |
+                               ((mbz & 0xF000000) ? 0x40 : 0) | ((mbz & 0xF0000000) ? 0x80 : 0) | ((mbz & 0xF00000000) ? 0x100 : 0) |
+                               ((mbz & 0xF000000000) ? 0x200 : 0) | ((mbz & 0xF0000000000) ? 0x400 : 0) |
+                               ((mbz & 0xF00000000000) ? 0x800 : 0) | ((mbz & 0xF000000000000) ? 0x1000 : 0) |
+                               ((mbz & 0xF0000000000000) ? 0x2000 : 0) | ((mbz & 0xF00000000000000) ? 0x4000 : 0) |
+                               ((mbz & 0xF000000000000000) ? 0x8000 : 0);
+          // permute index for elements from b
+          const int pi = (patb & 3) | (((patb >> 4) & 3) << 2) | (((patb >> 8) & 3) << 4) | (((patb >> 12) & 3) << 6);
+          // 2. Permute elements from b and combine with elements from a through write mask
+          return _mm512_mask_shuffle_epi32(ta, sb, b, (_MM_PERM_ENUM)pi);
+        }
+      // not same permute pattern in all lanes. use full permute
+    }
+
+  // general case: full permute
+  const __m512i pmask = constant16i<i0 & 0x1F, i1 & 0x1F, i2 & 0x1F, i3 & 0x1F, i4 & 0x1F, i5 & 0x1F, i6 & 0x1F, i7 & 0x1F, i8 & 0x1F,
+                                    i9 & 0x1F, i10 & 0x1F, i11 & 0x1F, i12 & 0x1F, i13 & 0x1F, i14 & 0x1F, i15 & 0x1F>();
+  if(dozero)
+    {
+      return _mm512_maskz_permutex2var_epi32(z, a, pmask, b);
+    }
+  else
+    {
+      return _mm512_permutex2var_epi32(a, pmask, b);
+    }
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16ui blend16ui(Vec16ui const &a, Vec16ui const &b)
+{
+  return Vec16ui(blend16i<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15>(Vec16i(a), Vec16i(b)));
+}
+
+/*****************************************************************************
+ *
+ *          Vector lookup functions
+ *
+ ******************************************************************************
+ *
+ * These functions use vector elements as indexes into a table.
+ * The table is given as one or more vectors or as an array.
+ *
+ * This can be used for several purposes:
+ *  - table lookup
+ *  - permute or blend with variable indexes
+ *  - blend from more than two sources
+ *  - gather non-contiguous data
+ *
+ * An index out of range may produce any value - the actual value produced is
+ * implementation dependent and may be different for different instruction
+ * sets. An index out of range does not produce an error message or exception.
+ *
+ * Example:
+ * Vec8q a(2,0,0,6,4,3,5,0);                 // index a is (  2,   0,   0,   6,   4,   3,   5,   0)
+ * Vec8q b(100,101,102,103,104,105,106,107); // table b is (100, 101, 102, 103, 104, 105, 106, 107)
+ * Vec8q c;
+ * c = lookup8 (a,b);                        // c is       (102, 100, 100, 106, 104, 103, 105, 100)
+ *
+ *****************************************************************************/
+
+static inline Vec16i lookup16(Vec16i const &index, Vec16i const &table) { return _mm512_permutexvar_epi32(index, table); }
+
+template <int n>
+static inline Vec16i lookup(Vec16i const &index, void const *table)
+{
+  if(n <= 0)
+    return 0;
+  if(n <= 16)
+    {
+      Vec16i table1 = Vec16i().load(table);
+      return lookup16(index, table1);
+    }
+  if(n <= 32)
+    {
+      Vec16i table1 = Vec16i().load(table);
+      Vec16i table2 = Vec16i().load((int8_t *)table + 64);
+      return _mm512_permutex2var_epi32(table1, index, table2);
+    }
+  // n > 32. Limit index
+  Vec16ui index1;
+  if((n & (n - 1)) == 0)
+    {
+      // n is a power of 2, make index modulo n
+      index1 = Vec16ui(index) & (n - 1);
+    }
+  else
+    {
+      // n is not a power of 2, limit to n-1
+      index1 = min(Vec16ui(index), uint32_t(n - 1));
+    }
+  return _mm512_i32gather_epi32(index1, (const int *)table, 4);
+  // return  _mm512_i32gather_epi32(index1, table, _MM_UPCONV_EPI32_NONE, 4, 0);
+}
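+
+// Illustrative usage sketch (not part of the original library; example_lut16 is a made-up name):
+// a 16-entry table fits in one vector, so lookup<16> takes the single-register permute path above.
+// The caller is assumed to pass indexes in the range 0..15.
+static inline Vec16i example_lut16(Vec16i const &idx)
+{
+  static const int32_t table[16] = {0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225};  // i*i
+  return lookup<16>(idx, table);
+}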
+
+static inline Vec8q lookup8(Vec8q const &index, Vec8q const &table) { return _mm512_permutexvar_epi64(index, table); }
+
+template <int n>
+static inline Vec8q lookup(Vec8q const &index, void const *table)
+{
+  if(n <= 0)
+    return 0;
+  if(n <= 8)
+    {
+      Vec8q table1 = Vec8q().load(table);
+      return lookup8(index, table1);
+    }
+  if(n <= 16)
+    {
+      Vec8q table1 = Vec8q().load(table);
+      Vec8q table2 = Vec8q().load((int8_t *)table + 64);
+      return _mm512_permutex2var_epi64(table1, index, table2);
+    }
+  // n > 16. Limit index
+  Vec8uq index1;
+  if((n & (n - 1)) == 0)
+    {
+      // n is a power of 2, make index modulo n
+      index1 = Vec8uq(index) & (n - 1);
+    }
+  else
+    {
+      // n is not a power of 2, limit to n-1
+      index1 = min(Vec8uq(index), uint32_t(n - 1));
+    }
+  return _mm512_i64gather_epi64(index1, (const long long *)table, 8);
+}
+
+/*****************************************************************************
+ *
+ *          Gather functions with fixed indexes
+ *
+ *****************************************************************************/
+// Load elements from array a with indices i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16i gather16i(void const *a)
+{
+  Static_error_check<(i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7 | i8 | i9 | i10 | i11 | i12 | i13 | i14 | i15) >= 0>
+      Negative_array_index;  // Error message if index is negative
+  // find smallest and biggest index, using only compile-time constant expressions
+  const int i01min    = i0 < i1 ? i0 : i1;
+  const int i23min    = i2 < i3 ? i2 : i3;
+  const int i45min    = i4 < i5 ? i4 : i5;
+  const int i67min    = i6 < i7 ? i6 : i7;
+  const int i89min    = i8 < i9 ? i8 : i9;
+  const int i1011min  = i10 < i11 ? i10 : i11;
+  const int i1213min  = i12 < i13 ? i12 : i13;
+  const int i1415min  = i14 < i15 ? i14 : i15;
+  const int i0_3min   = i01min < i23min ? i01min : i23min;
+  const int i4_7min   = i45min < i67min ? i45min : i67min;
+  const int i8_11min  = i89min < i1011min ? i89min : i1011min;
+  const int i12_15min = i1213min < i1415min ? i1213min : i1415min;
+  const int i0_7min   = i0_3min < i4_7min ? i0_3min : i4_7min;
+  const int i8_15min  = i8_11min < i12_15min ? i8_11min : i12_15min;
+  const int imin      = i0_7min < i8_15min ? i0_7min : i8_15min;
+  const int i01max    = i0 > i1 ? i0 : i1;
+  const int i23max    = i2 > i3 ? i2 : i3;
+  const int i45max    = i4 > i5 ? i4 : i5;
+  const int i67max    = i6 > i7 ? i6 : i7;
+  const int i89max    = i8 > i9 ? i8 : i9;
+  const int i1011max  = i10 > i11 ? i10 : i11;
+  const int i1213max  = i12 > i13 ? i12 : i13;
+  const int i1415max  = i14 > i15 ? i14 : i15;
+  const int i0_3max   = i01max > i23max ? i01max : i23max;
+  const int i4_7max   = i45max > i67max ? i45max : i67max;
+  const int i8_11max  = i89max > i1011max ? i89max : i1011max;
+  const int i12_15max = i1213max > i1415max ? i1213max : i1415max;
+  const int i0_7max   = i0_3max > i4_7max ? i0_3max : i4_7max;
+  const int i8_15max  = i8_11max > i12_15max ? i8_11max : i12_15max;
+  const int imax      = i0_7max > i8_15max ? i0_7max : i8_15max;
+  if(imax - imin <= 15)
+    {
+      // load one contiguous block and permute
+      if(imax > 15)
+        {
+          // make sure we don't read past the end of the array
+          Vec16i b = Vec16i().load((int32_t const *)a + imax - 15);
+          return permute16i<i0 - imax + 15, i1 - imax + 15, i2 - imax + 15, i3 - imax + 15, i4 - imax + 15, i5 - imax + 15,
+                            i6 - imax + 15, i7 - imax + 15, i8 - imax + 15, i9 - imax + 15, i10 - imax + 15, i11 - imax + 15,
+                            i12 - imax + 15, i13 - imax + 15, i14 - imax + 15, i15 - imax + 15>(b);
+        }
+      else
+        {
+          Vec16i b = Vec16i().load((int32_t const *)a + imin);
+          return permute16i<i0 - imin, i1 - imin, i2 - imin, i3 - imin, i4 - imin, i5 - imin, i6 - imin, i7 - imin, i8 - imin,
+                            i9 - imin, i10 - imin, i11 - imin, i12 - imin, i13 - imin, i14 - imin, i15 - imin>(b);
+        }
+    }
+  if((i0 < imin + 16 || i0 > imax - 16) && (i1 < imin + 16 || i1 > imax - 16) && (i2 < imin + 16 || i2 > imax - 16) &&
+     (i3 < imin + 16 || i3 > imax - 16) && (i4 < imin + 16 || i4 > imax - 16) && (i5 < imin + 16 || i5 > imax - 16) &&
+     (i6 < imin + 16 || i6 > imax - 16) && (i7 < imin + 16 || i7 > imax - 16) && (i8 < imin + 16 || i8 > imax - 16) &&
+     (i9 < imin + 16 || i9 > imax - 16) && (i10 < imin + 16 || i10 > imax - 16) && (i11 < imin + 16 || i11 > imax - 16) &&
+     (i12 < imin + 16 || i12 > imax - 16) && (i13 < imin + 16 || i13 > imax - 16) && (i14 < imin + 16 || i14 > imax - 16) &&
+     (i15 < imin + 16 || i15 > imax - 16))
+    {
+      // load two contiguous blocks and blend
+      Vec16i b      = Vec16i().load((int32_t const *)a + imin);
+      Vec16i c      = Vec16i().load((int32_t const *)a + imax - 15);
+      const int j0  = i0 < imin + 16 ? i0 - imin : 31 - imax + i0;
+      const int j1  = i1 < imin + 16 ? i1 - imin : 31 - imax + i1;
+      const int j2  = i2 < imin + 16 ? i2 - imin : 31 - imax + i2;
+      const int j3  = i3 < imin + 16 ? i3 - imin : 31 - imax + i3;
+      const int j4  = i4 < imin + 16 ? i4 - imin : 31 - imax + i4;
+      const int j5  = i5 < imin + 16 ? i5 - imin : 31 - imax + i5;
+      const int j6  = i6 < imin + 16 ? i6 - imin : 31 - imax + i6;
+      const int j7  = i7 < imin + 16 ? i7 - imin : 31 - imax + i7;
+      const int j8  = i8 < imin + 16 ? i8 - imin : 31 - imax + i8;
+      const int j9  = i9 < imin + 16 ? i9 - imin : 31 - imax + i9;
+      const int j10 = i10 < imin + 16 ? i10 - imin : 31 - imax + i10;
+      const int j11 = i11 < imin + 16 ? i11 - imin : 31 - imax + i11;
+      const int j12 = i12 < imin + 16 ? i12 - imin : 31 - imax + i12;
+      const int j13 = i13 < imin + 16 ? i13 - imin : 31 - imax + i13;
+      const int j14 = i14 < imin + 16 ? i14 - imin : 31 - imax + i14;
+      const int j15 = i15 < imin + 16 ? i15 - imin : 31 - imax + i15;
+      return blend16i<j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15>(b, c);
+    }
+  // use gather instruction
+  return _mm512_i32gather_epi32(Vec16i(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15), (const int *)a, 4);
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8q gather8q(void const *a)
+{
+  Static_error_check<(i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) >= 0> Negative_array_index;  // Error message if index is negative
+
+  const int i01min   = i0 < i1 ? i0 : i1;
+  const int i23min   = i2 < i3 ? i2 : i3;
+  const int i45min   = i4 < i5 ? i4 : i5;
+  const int i67min   = i6 < i7 ? i6 : i7;
+  const int i0123min = i01min < i23min ? i01min : i23min;
+  const int i4567min = i45min < i67min ? i45min : i67min;
+  const int imin     = i0123min < i4567min ? i0123min : i4567min;
+  const int i01max   = i0 > i1 ? i0 : i1;
+  const int i23max   = i2 > i3 ? i2 : i3;
+  const int i45max   = i4 > i5 ? i4 : i5;
+  const int i67max   = i6 > i7 ? i6 : i7;
+  const int i0123max = i01max > i23max ? i01max : i23max;
+  const int i4567max = i45max > i67max ? i45max : i67max;
+  const int imax     = i0123max > i4567max ? i0123max : i4567max;
+  if(imax - imin <= 7)
+    {
+      // load one contiguous block and permute
+      if(imax > 7)
+        {
+          // make sure we don't read past the end of the array
+          Vec8q b = Vec8q().load((int64_t const *)a + imax - 7);
+          return permute8q<i0 - imax + 7, i1 - imax + 7, i2 - imax + 7, i3 - imax + 7, i4 - imax + 7, i5 - imax + 7, i6 - imax + 7,
+                           i7 - imax + 7>(b);
+        }
+      else
+        {
+          Vec8q b = Vec8q().load((int64_t const *)a + imin);
+          return permute8q<i0 - imin, i1 - imin, i2 - imin, i3 - imin, i4 - imin, i5 - imin, i6 - imin, i7 - imin>(b);
+        }
+    }
+  if((i0 < imin + 8 || i0 > imax - 8) && (i1 < imin + 8 || i1 > imax - 8) && (i2 < imin + 8 || i2 > imax - 8) &&
+     (i3 < imin + 8 || i3 > imax - 8) && (i4 < imin + 8 || i4 > imax - 8) && (i5 < imin + 8 || i5 > imax - 8) &&
+     (i6 < imin + 8 || i6 > imax - 8) && (i7 < imin + 8 || i7 > imax - 8))
+    {
+      // load two contiguous blocks and blend
+      Vec8q b      = Vec8q().load((int64_t const *)a + imin);
+      Vec8q c      = Vec8q().load((int64_t const *)a + imax - 7);
+      const int j0 = i0 < imin + 8 ? i0 - imin : 15 - imax + i0;
+      const int j1 = i1 < imin + 8 ? i1 - imin : 15 - imax + i1;
+      const int j2 = i2 < imin + 8 ? i2 - imin : 15 - imax + i2;
+      const int j3 = i3 < imin + 8 ? i3 - imin : 15 - imax + i3;
+      const int j4 = i4 < imin + 8 ? i4 - imin : 15 - imax + i4;
+      const int j5 = i5 < imin + 8 ? i5 - imin : 15 - imax + i5;
+      const int j6 = i6 < imin + 8 ? i6 - imin : 15 - imax + i6;
+      const int j7 = i7 < imin + 8 ? i7 - imin : 15 - imax + i7;
+      return blend8q<j0, j1, j2, j3, j4, j5, j6, j7>(b, c);
+    }
+  // use gather instruction
+  return _mm512_i64gather_epi64(Vec8q(i0, i1, i2, i3, i4, i5, i6, i7), (const long long *)a, 8);
+}
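+
+// Illustrative usage sketch (not part of the original library; example_gather_column0 is a made-up name):
+// gather the first column of an 8x8 row-major matrix of int64_t. The fixed indexes span 0..56, which is
+// too wide for the block-load shortcuts above, so this ends up using the hardware gather instruction.
+static inline Vec8q example_gather_column0(int64_t const *matrix8x8) { return gather8q<0, 8, 16, 24, 32, 40, 48, 56>(matrix8x8); }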
+
+/*****************************************************************************
+ *
+ *          Vector scatter functions
+ *
+ ******************************************************************************
+ *
+ * These functions write the elements of a vector to arbitrary positions in an
+ * array in memory. Each vector element is written to an array position
+ * determined by an index. An element is not written if the corresponding
+ * index is out of range.
+ * The indexes can be specified as constant template parameters or as an
+ * integer vector.
+ *
+ * The scatter functions are useful if the data are distributed in a sparse
+ * manner into the array. If the array is dense then it is more efficient
+ * to permute the data into the right positions and then write the whole
+ * permuted vector into the array.
+ *
+ * Example:
+ * Vec8q a(10,11,12,13,14,15,16,17);
+ * int64_t b[16] = {0};
+ * scatter<0,2,14,10,1,-1,5,9>(a,b);
+ * // Now, b = {10,14,11,0,0,16,0,0,0,17,13,0,0,0,12,0}
+ *
+ *****************************************************************************/
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline void scatter(Vec16i const &data, void *array)
+{
+  __m512i indx = constant16i<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15>();
+  Vec16ib mask(i0 >= 0, i1 >= 0, i2 >= 0, i3 >= 0, i4 >= 0, i5 >= 0, i6 >= 0, i7 >= 0, i8 >= 0, i9 >= 0, i10 >= 0, i11 >= 0, i12 >= 0,
+               i13 >= 0, i14 >= 0, i15 >= 0);
+  _mm512_mask_i32scatter_epi32((int *)array, mask, indx, data, 4);
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline void scatter(Vec8q const &data, void *array)
+{
+  __m256i indx = constant8i<i0, i1, i2, i3, i4, i5, i6, i7>();
+  Vec8qb mask(i0 >= 0, i1 >= 0, i2 >= 0, i3 >= 0, i4 >= 0, i5 >= 0, i6 >= 0, i7 >= 0);
+  _mm512_mask_i32scatter_epi64((long long *)array, mask, indx, data, 8);
+}
+
+static inline void scatter(Vec16i const &index, uint32_t limit, Vec16i const &data, void *array)
+{
+  Vec16ib mask = Vec16ui(index) < limit;
+  _mm512_mask_i32scatter_epi32((int *)array, mask, index, data, 4);
+}
+
+static inline void scatter(Vec8q const &index, uint32_t limit, Vec8q const &data, void *array)
+{
+  Vec8qb mask = Vec8uq(index) < uint64_t(limit);
+  _mm512_mask_i64scatter_epi64((long long *)array, mask, index, data, 8);
+}
+
+static inline void scatter(Vec8i const &index, uint32_t limit, Vec8q const &data, void *array)
+{
+#if defined(__AVX512VL__)
+  __mmask16 mask = _mm256_cmplt_epu32_mask(index, Vec8ui(limit));
+#else
+  __mmask16 mask = _mm512_cmplt_epu32_mask(_mm512_castsi256_si512(index), _mm512_castsi256_si512(Vec8ui(limit)));
+#endif
+  _mm512_mask_i32scatter_epi64((long long *)array, mask, index, data, 8);
+}
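+
+// Illustrative usage sketch (not part of the original library; example_scatter_clipped is a made-up name):
+// run-time indexes outside [0, count) are masked off, so out-of-range elements are simply not written.
+static inline void example_scatter_clipped(Vec16i const &idx, Vec16i const &values, int32_t *out, uint32_t count)
+{
+  scatter(idx, count, values, out);  // uses the limit-checked overload above
+}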
+
+/*****************************************************************************
+ *
+ *          Functions for conversion between integer sizes
+ *
+ *****************************************************************************/
+
+// Extend 16-bit integers to 32-bit integers, signed and unsigned
+
+// Function extend_to_int : extends Vec16s to Vec16i with sign extension
+static inline Vec16i extend_to_int(Vec16s const &a) { return _mm512_cvtepi16_epi32(a); }
+
+// Function extend_to_int : extends Vec16us to Vec16ui with zero extension
+static inline Vec16ui extend_to_int(Vec16us const &a) { return _mm512_cvtepu16_epi32(a); }
+
+// Function extend_to_int : extends Vec16c to Vec16i with sign extension
+static inline Vec16i extend_to_int(Vec16c const &a) { return _mm512_cvtepi8_epi32(a); }
+
+// Function extend_to_int : extends Vec16uc to Vec16ui with zero extension
+static inline Vec16ui extend_to_int(Vec16uc const &a) { return _mm512_cvtepu8_epi32(a); }
+
+// Extend 32-bit integers to 64-bit integers, signed and unsigned
+
+// Function extend_low : extends the low 8 elements to 64 bits with sign extension
+static inline Vec8q extend_low(Vec16i const &a) { return _mm512_cvtepi32_epi64(a.get_low()); }
+
+// Function extend_high : extends the high 8 elements to 64 bits with sign extension
+static inline Vec8q extend_high(Vec16i const &a) { return _mm512_cvtepi32_epi64(a.get_high()); }
+
+// Function extend_low : extends the low 8 elements to 64 bits with zero extension
+static inline Vec8uq extend_low(Vec16ui const &a) { return _mm512_cvtepu32_epi64(a.get_low()); }
+
+// Function extend_high : extends the high 8 elements to 64 bits with zero extension
+static inline Vec8uq extend_high(Vec16ui const &a) { return _mm512_cvtepu32_epi64(a.get_high()); }
+
+// Compress 32-bit integers to 8-bit integers, signed and unsigned, with and without saturation
+
+// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers
+// Overflow wraps around
+static inline Vec16c compress_to_int8(Vec16i const &a) { return _mm512_cvtepi32_epi8(a); }
+
+static inline Vec16s compress_to_int16(Vec16i const &a) { return _mm512_cvtepi32_epi16(a); }
+
+// with signed saturation
+static inline Vec16c compress_to_int8_saturated(Vec16i const &a) { return _mm512_cvtsepi32_epi8(a); }
+
+static inline Vec16s compress_to_int16_saturated(Vec16i const &a) { return _mm512_cvtsepi32_epi16(a); }
+
+// with unsigned saturation
+static inline Vec16uc compress_to_int8_saturated(Vec16ui const &a) { return _mm512_cvtusepi32_epi8(a); }
+
+static inline Vec16us compress_to_int16_saturated(Vec16ui const &a) { return _mm512_cvtusepi32_epi16(a); }
+
+// Compress 64-bit integers to 32-bit integers, signed and unsigned, with and without saturation
+
+// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers
+// Overflow wraps around
+static inline Vec16i compress(Vec8q const &low, Vec8q const &high)
+{
+  Vec8i low2  = _mm512_cvtepi64_epi32(low);
+  Vec8i high2 = _mm512_cvtepi64_epi32(high);
+  return Vec16i(low2, high2);
+}
+
+// Function compress_saturated : packs two vectors of 64-bit integers into one vector of 32-bit integers
+// Signed, with saturation
+static inline Vec16i compress_saturated(Vec8q const &low, Vec8q const &high)
+{
+  Vec8i low2  = _mm512_cvtsepi64_epi32(low);
+  Vec8i high2 = _mm512_cvtsepi64_epi32(high);
+  return Vec16i(low2, high2);
+}
+
+// Function compress_saturated : packs two vectors of 64-bit integers into one vector of 32-bit integers
+// Unsigned, with saturation
+static inline Vec16ui compress_saturated(Vec8uq const &low, Vec8uq const &high)
+{
+  Vec8ui low2  = _mm512_cvtusepi64_epi32(low);
+  Vec8ui high2 = _mm512_cvtusepi64_epi32(high);
+  return Vec16ui(low2, high2);
+}
+
+/*****************************************************************************
+ *
+ *          Integer division operators
+ *
+ *          Please see the file vectori128.h for explanation.
+ *
+ *****************************************************************************/
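+
+// The operators below use the usual multiply-by-reciprocal method for division by an invariant integer:
+// multiply by a precomputed "magic" multiplier stored in Divisor_i / Divisor_ui, keep the high 32 bits of
+// the product, and apply the correction shifts; vectori128.h derives the constants.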
+
+// vector operator / : divide each element by divisor
+
+// vector of 16 32-bit signed integers
+static inline Vec16i operator/(Vec16i const &a, Divisor_i const &d)
+{
+  __m512i m   = _mm512_broadcast_i32x4(d.getm());                  // broadcast multiplier
+  __m512i sgn = _mm512_broadcast_i32x4(d.getsign());               // broadcast sign of d
+  __m512i t1  = _mm512_mul_epi32(a, m);                            // 32x32->64 bit signed multiplication of even elements of a
+  __m512i t3  = _mm512_srli_epi64(a, 32);                          // get odd elements of a into position for multiplication
+  __m512i t4  = _mm512_mul_epi32(t3, m);                           // 32x32->64 bit signed multiplication of odd elements
+  __m512i t2  = _mm512_srli_epi64(t1, 32);                         // dword of even index results
+  __m512i t7  = _mm512_mask_mov_epi32(t2, __mmask16(0xAAAA), t4);  // blend two results
+  __m512i t8  = _mm512_add_epi32(t7, a);                           // add
+  __m512i t9  = _mm512_sra_epi32(t8, d.gets1());                   // shift right arithmetic
+  __m512i t10 = _mm512_srai_epi32(a, 31);                          // sign of a
+  __m512i t11 = _mm512_sub_epi32(t10, sgn);                        // sign of a - sign of d
+  __m512i t12 = _mm512_sub_epi32(t9, t11);                         // + 1 if a < 0, -1 if d < 0
+  return _mm512_xor_si512(t12, sgn);                               // change sign if divisor negative
+}
+
+// vector of 16 32-bit unsigned integers
+static inline Vec16ui operator/(Vec16ui const &a, Divisor_ui const &d)
+{
+  __m512i m   = _mm512_broadcast_i32x4(d.getm());                  // broadcast multiplier
+  __m512i t1  = _mm512_mul_epu32(a, m);                            // 32x32->64 bit unsigned multiplication of even elements of a
+  __m512i t3  = _mm512_srli_epi64(a, 32);                          // get odd elements of a into position for multiplication
+  __m512i t4  = _mm512_mul_epu32(t3, m);                           // 32x32->64 bit unsigned multiplication of odd elements
+  __m512i t2  = _mm512_srli_epi64(t1, 32);                         // high dword of even index results
+  __m512i t7  = _mm512_mask_mov_epi32(t2, __mmask16(0xAAAA), t4);  // blend two results
+  __m512i t8  = _mm512_sub_epi32(a, t7);                           // subtract
+  __m512i t9  = _mm512_srl_epi32(t8, d.gets1());                   // shift right logical
+  __m512i t10 = _mm512_add_epi32(t7, t9);                          // add
+  return _mm512_srl_epi32(t10, d.gets2());                         // shift right logical
+}
+
+// vector operator /= : divide
+static inline Vec16i &operator/=(Vec16i &a, Divisor_i const &d)
+{
+  a = a / d;
+  return a;
+}
+
+// vector operator /= : divide
+static inline Vec16ui &operator/=(Vec16ui &a, Divisor_ui const &d)
+{
+  a = a / d;
+  return a;
+}
+
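+// Usage sketch (illustrative only, not part of the original library): division by a divisor
+// that is only known at run time goes through the Divisor_i / Divisor_ui helper classes from
+// vectori128.h, which precompute the multiplier and shift count once so that the operators
+// above reduce to multiplies and shifts. Assuming the usual vectorclass.h umbrella include:
+//
+//   Divisor_i d(7);        // precompute multiplier, shift and sign for divisor 7
+//   Vec16i    a(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+//   Vec16i    q = a / d;   // q = {0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2}
+//   a /= d;                // in-place form uses the operator/= defined above
+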
+/*****************************************************************************
+ *
+ *          Integer division 2: divisor is a compile-time constant
+ *
+ *****************************************************************************/
+
+// Divide Vec16i by compile-time constant
+template <int32_t d>
+static inline Vec16i divide_by_i(Vec16i const &x)
+{
+  Static_error_check<(d != 0)> Dividing_by_zero;  // Error message if dividing by zero
+  if(d == 1)
+    return x;
+  if(d == -1)
+    return -x;
+  if(uint32_t(d) == 0x80000000u)
+    {
+      return _mm512_maskz_set1_epi32(x == Vec16i(0x80000000), 1);  // avoid overflow of abs(d). return (x == 0x80000000) ? 1 : 0;
+    }
+  const uint32_t d1 =
+      d > 0 ? uint32_t(d) : -uint32_t(d);  // compile-time abs(d). (force GCC compiler to treat d as 32 bits, not 64 bits)
+  if((d1 & (d1 - 1)) == 0)
+    {
+      // d1 is a power of 2. use shift
+      const int k = bit_scan_reverse_const(d1);
+      __m512i sign;
+      if(k > 1)
+        sign = _mm512_srai_epi32(x, k - 1);
+      else
+        sign = x;                                        // k copies of sign bit
+      __m512i bias   = _mm512_srli_epi32(sign, 32 - k);  // bias = x >= 0 ? 0 : 2^k - 1
+      __m512i xpbias = _mm512_add_epi32(x, bias);        // x + bias
+      __m512i q      = _mm512_srai_epi32(xpbias, k);     // (x + bias) >> k
+      if(d > 0)
+        return q;                                          // d > 0: return  q
+      return _mm512_sub_epi32(_mm512_setzero_epi32(), q);  // d < 0: return -q
+    }
+  // general case
+  const int32_t sh   = bit_scan_reverse_const(uint32_t(d1) - 1);  // ceil(log2(d1)) - 1. (d1 < 2 handled by power of 2 case)
+  const int32_t mult = int(1 + (uint64_t(1) << (32 + sh)) / uint32_t(d1) - (int64_t(1) << 32));  // multiplier
+  const Divisor_i div(mult, sh, d < 0 ? -1 : 0);
+  return x / div;
+}
+
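+// Worked example (for illustration, not part of the original library): for d = 10 the general
+// case above gives sh = bit_scan_reverse_const(9) = 3 and
+// mult = 1 + 2^35/10 - 2^32 = -858993459 (0xCCCCCCCD as an unsigned value), so each quotient
+// is obtained as
+//   q = (((a * mult) >> 32) + a) >> 3
+// followed by the sign corrections; e.g. a = 100 gives ((-20) + 100) >> 3 = 10.
+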
+// define Vec16i a / const_int(d)
+template <int32_t d>
+static inline Vec16i operator/(Vec16i const &a, Const_int_t<d>)
+{
+  return divide_by_i<d>(a);
+}
+
+// define Vec16i a / const_uint(d)
+template <uint32_t d>
+static inline Vec16i operator/(Vec16i const &a, Const_uint_t<d>)
+{
+  Static_error_check<(d < 0x80000000u)> Error_overflow_dividing_signed_by_unsigned;  // Error: dividing signed by overflowing unsigned
+  return divide_by_i<int32_t(d)>(a);                                                 // signed divide
+}
+
+// vector operator /= : divide
+template <int32_t d>
+static inline Vec16i &operator/=(Vec16i &a, Const_int_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec16i &operator/=(Vec16i &a, Const_uint_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// Divide Vec16ui by compile-time constant
+template <uint32_t d>
+static inline Vec16ui divide_by_ui(Vec16ui const &x)
+{
+  Static_error_check<(d != 0)> Dividing_by_zero;  // Error message if dividing by zero
+  if(d == 1)
+    return x;                               // divide by 1
+  const int b = bit_scan_reverse_const(d);  // floor(log2(d))
+  if((uint32_t(d) & (uint32_t(d) - 1)) == 0)
+    {
+      // d is a power of 2. use shift
+      return _mm512_srli_epi32(x, b);  // x >> b
+    }
+  // general case (d > 2)
+  uint32_t mult         = uint32_t((uint64_t(1) << (b + 32)) / d);         // multiplier = 2^(32+b) / d
+  const uint64_t rem    = (uint64_t(1) << (b + 32)) - uint64_t(d) * mult;  // remainder 2^(32+b) % d
+  const bool round_down = (2 * rem < d);                                   // check if fraction is less than 0.5
+  if(!round_down)
+    {
+      mult = mult + 1;  // round up mult
+    }
+  // do 32*32->64 bit unsigned multiplication and get high part of result
+  const __m512i multv = Vec8uq(uint64_t(mult));      // zero-extend mult into each 64-bit element and broadcast
+  __m512i t1          = _mm512_mul_epu32(x, multv);  // 32x32->64 bit unsigned multiplication of even elements
+  if(round_down)
+    {
+      t1 = _mm512_add_epi64(t1, multv);  // compensate for rounding error. (x+1)*m replaced by x*m+m to avoid overflow
+    }
+  __m512i t2 = _mm512_srli_epi64(t1, 32);    // high dword of result 0 and 2
+  __m512i t3 = _mm512_srli_epi64(x, 32);     // get odd elements into position for multiplication
+  __m512i t4 = _mm512_mul_epu32(t3, multv);  // 32x32->64 bit unsigned multiplication of x[1] and x[3]
+  if(round_down)
+    {
+      t4 = _mm512_add_epi64(t4, multv);  // compensate for rounding error. (x+1)*m replaced by x*m+m to avoid overflow
+    }
+  __m512i t7 = _mm512_mask_mov_epi32(t2, __mmask16(0xAAAA), t4);  // blend two results
+  Vec16ui q  = _mm512_srli_epi32(t7, b);                        // shift right by b
+  return q;                                                     // no overflow possible
+}
+
+// define Vec16ui a / const_uint(d)
+template <uint32_t d>
+static inline Vec16ui operator/(Vec16ui const &a, Const_uint_t<d>)
+{
+  return divide_by_ui<d>(a);
+}
+
+// define Vec16ui a / const_int(d)
+template <int32_t d>
+static inline Vec16ui operator/(Vec16ui const &a, Const_int_t<d>)
+{
+  Static_error_check<(d >= 0)> Error_dividing_unsigned_by_negative;  // Error: dividing unsigned by negative is ambiguous
+  return divide_by_ui<d>(a);                                         // unsigned divide
+}
+
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec16ui &operator/=(Vec16ui &a, Const_uint_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator /= : divide
+template <int32_t d>
+static inline Vec16ui &operator/=(Vec16ui &a, Const_int_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
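+// Usage sketch (illustrative only, not part of the original library): when the divisor is a
+// compile-time constant, the const_int()/const_uint() macros from vectori128.h select the
+// template overloads above so the multiplier and shift are computed at compile time:
+//
+//   Vec16i  a(100);                   // broadcast 100 to all 16 elements
+//   Vec16i  q = a / const_int(10);    // signed division by the constant 10
+//   Vec16ui b(100u);
+//   Vec16ui r = b / const_uint(10);   // unsigned division by the constant 10
+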
+/*****************************************************************************
+ *
+ *          Horizontal scan functions
+ *
+ *****************************************************************************/
+
+// Get index to the first element that is true. Return -1 if all are false
+
+static inline int horizontal_find_first(Vec16ib const &x)
+{
+  uint32_t b = uint16_t(__mmask16(x));
+  if(b)
+    {
+      return bit_scan_forward(b);
+    }
+  else
+    {
+      return -1;
+    }
+}
+
+static inline int horizontal_find_first(Vec8qb const &x)
+{
+  uint32_t b = uint8_t(__mmask16(x));
+  if(b)
+    {
+      return bit_scan_forward(b);
+    }
+  else
+    {
+      return -1;
+    }
+}
+
+static inline uint32_t horizontal_count(Vec16ib const &x) { return vml_popcnt(uint32_t(uint16_t(__mmask16(x)))); }
+
+static inline uint32_t horizontal_count(Vec8qb const &x) { return vml_popcnt(uint32_t(uint16_t(__mmask16(x)))); }
+
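+// Usage sketch (illustrative only, not part of the original library): the horizontal scan
+// functions operate on the boolean vectors produced by the comparison operators, e.g.
+//
+//   Vec16i   a(3);                               // broadcast 3
+//   Vec16ib  m = (a == Vec16i(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15));
+//   int      first = horizontal_find_first(m);   // 3 : index of the first true element
+//   uint32_t count = horizontal_count(m);        // 1 : number of true elements
+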
+/*****************************************************************************
+ *
+ *          Boolean <-> bitfield conversion functions
+ *
+ *****************************************************************************/
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec4ib x)
+{
+  __m512i a   = _mm512_castsi128_si512(x);
+  __mmask16 b = _mm512_mask_testn_epi32_mask(0xF, a, a);
+  return uint8_t(b) ^ 0xF;
+}
+
+// to_Vec4ib: convert integer bitfield to boolean vector
+static inline Vec4ib to_Vec4ib(uint8_t x) { return _mm512_castsi512_si128(_mm512_maskz_set1_epi32(__mmask16(x), -1)); }
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec2qb x)
+{
+  __m512i a   = _mm512_castsi128_si512(x);
+  __mmask16 b = _mm512_mask_testn_epi64_mask(0x3, a, a);
+  return uint8_t(b) ^ 0x3;
+}
+
+// to_Vec2qb: convert integer bitfield to boolean vector
+static inline Vec2qb to_Vec2qb(uint8_t x) { return _mm512_castsi512_si128(_mm512_maskz_set1_epi64(__mmask16(x), -1LL)); }
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec8ib x)
+{
+  __m512i a   = _mm512_castsi256_si512(x);
+  __mmask16 b = _mm512_mask_testn_epi32_mask(0xFF, a, a);
+  return ~uint8_t(b);
+}
+
+// to_Vec8ib: convert integer bitfield to boolean vector
+static inline Vec8ib to_Vec8ib(uint8_t x) { return _mm512_castsi512_si256(_mm512_maskz_set1_epi32(__mmask16(x), -1)); }
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec4qb x)
+{
+  __m512i a   = _mm512_castsi256_si512(x);
+  __mmask16 b = _mm512_mask_testn_epi64_mask(0xF, a, a);
+  return uint8_t(b) ^ 0xF;
+}
+
+// to_Vec4qb: convert integer bitfield to boolean vector
+static inline Vec4qb to_Vec4qb(uint8_t x) { return _mm512_castsi512_si256(_mm512_maskz_set1_epi64(__mmask16(x), -1LL)); }
+
+// to_bits: convert to integer bitfield
+static inline uint16_t to_bits(Vec16b a) { return (uint16_t)(__mmask16)a; }
+
+// to_Vec16b: convert integer bitfield to boolean vector
+static inline Vec16b to_Vec16b(uint16_t x) { return (__mmask16)x; }
+
+// to_Vec16ib: convert integer bitfield to boolean vector
+static inline Vec16ib to_Vec16ib(uint16_t x) { return to_Vec16b(x); }
+
+// to_Vec8b: convert integer bitfield to boolean vector
+static inline Vec8qb to_Vec8qb(uint8_t x) { return (__mmask16)x; }
+
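+// Usage sketch (illustrative only, not part of the original library): these functions exchange
+// a boolean vector with an integer bitfield, one bit per element:
+//
+//   Vec16ib  m    = to_Vec16ib(0x00FF);   // elements 0..7 true, 8..15 false
+//   uint16_t bits = to_bits(m);           // bits == 0x00FF again
+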
+#ifdef VCL_NAMESPACE
+}
+#endif
+
+#endif  // VECTORI512_H
diff --git a/src/vectorclass/vectori512e.h b/src/vectorclass/vectori512e.h
new file mode 100644
index 0000000000000000000000000000000000000000..4f069d7d2461d5a4135a53790ae383f0b14e02d2
--- /dev/null
+++ b/src/vectorclass/vectori512e.h
@@ -0,0 +1,2632 @@
+/****************************  vectori512e.h   *******************************
+ * Author:        Agner Fog
+ * Date created:  2014-07-23
+ * Last modified: 2017-02-19
+ * Version:       1.27
+ * Project:       vector classes
+ * Description:
+ * Header file defining integer vector classes as interface to intrinsic
+ * functions in x86 microprocessors with AVX512 and later instruction sets.
+ *
+ * Instructions:
+ * Use Gnu, Intel or Microsoft C++ compiler. Compile for the desired
+ * instruction set, which must be at least AVX512.
+ *
+ * The following vector classes are defined here:
+ * Vec16i    Vector of  16  32-bit signed   integers
+ * Vec16ui   Vector of  16  32-bit unsigned integers
+ * Vec16ib   Vector of  16  Booleans for use with Vec16i and Vec16ui
+ * Vec8q     Vector of   8  64-bit signed   integers
+ * Vec8uq    Vector of   8  64-bit unsigned integers
+ * Vec8qb    Vector of   8  Booleans for use with Vec8q and Vec8uq
+ *
+ * Each vector object is represented internally in the CPU as a 512-bit register.
+ * This header file defines operators and functions for these vectors.
+ *
+ * For detailed instructions, see VectorClass.pdf
+ *
+ * (c) Copyright 2014-2017 GNU General Public License http://www.gnu.org/licenses
+ *****************************************************************************/
+
+// check combination of header files
+#if defined(VECTORI512_H)
+#if VECTORI512_H != 1
+#error Two different versions of vectori512.h included
+#endif
+#else
+#define VECTORI512_H 1
+
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE
+{
+#endif
+
+/*****************************************************************************
+ *
+ *          base class Vec512ie
+ *
+ *****************************************************************************/
+// base class to replace __m512i when AVX512 is not supported
+class Vec512ie
+{
+ protected:
+  Vec256b z0;  // low half
+  Vec256b z1;  // high half
+ public:
+  Vec512ie(void){};  // default constructor
+  Vec512ie(Vec8i const &x0, Vec8i const &x1)
+  {  // constructor to build from two Vec8i
+    z0 = x0;
+    z1 = x1;
+  }
+  Vec8i get_low() const
+  {  // get low half
+    return Vec8i(z0);
+  }
+  Vec8i get_high() const
+  {  // get high half
+    return Vec8i(z1);
+  }
+};
+
+/*****************************************************************************
+ *
+ *          Vector of 512 1-bit unsigned integers or Booleans
+ *
+ *****************************************************************************/
+class Vec512b : public Vec512ie
+{
+ public:
+  // Default constructor:
+  Vec512b() {}
+  // Constructor to build from two Vec256b:
+  Vec512b(Vec256b const &a0, Vec256b const &a1)
+  {
+    z0 = a0;
+    z1 = a1;
+  }
+  // Constructor to convert from type Vec512ie
+  Vec512b(Vec512ie const &x)
+  {
+    z0 = x.get_low();
+    z1 = x.get_high();
+  }
+  // Assignment operator to convert from type Vec512ie
+  Vec512b &operator=(Vec512ie const &x)
+  {
+    z0 = x.get_low();
+    z1 = x.get_high();
+    return *this;
+  }
+  // Member function to load from array (unaligned)
+  Vec512b &load(void const *p)
+  {
+    z0 = Vec8i().load(p);
+    z1 = Vec8i().load((int32_t const *)p + 8);
+    return *this;
+  }
+  // Member function to load from array, aligned by 64
+  Vec512b &load_a(void const *p)
+  {
+    z0 = Vec8i().load_a(p);
+    z1 = Vec8i().load_a((int32_t const *)p + 8);
+    return *this;
+  }
+  // Member function to store into array (unaligned)
+  void store(void *p) const
+  {
+    Vec8i(z0).store(p);
+    Vec8i(z1).store((int32_t *)p + 8);
+  }
+  // Member function to store into array, aligned by 64
+  void store_a(void *p) const
+  {
+    Vec8i(z0).store_a(p);
+    Vec8i(z1).store_a((int32_t *)p + 8);
+  }
+  // Member function to change a single bit
+  // Note: This function is inefficient. Use load function if changing more than one bit
+  Vec512b const &set_bit(uint32_t index, int value)
+  {
+    if(index < 256)
+      {
+        z0 = Vec8i(z0).set_bit(index, value);
+      }
+    else
+      {
+        z1 = Vec8i(z1).set_bit(index - 256, value);
+      }
+    return *this;
+  }
+  // Member function to get a single bit
+  // Note: This function is inefficient. Use store function if reading more than one bit
+  int get_bit(uint32_t index) const
+  {
+    if(index < 256)
+      {
+        return Vec8i(z0).get_bit(index);
+      }
+    else
+      {
+        return Vec8i(z1).get_bit(index - 256);
+      }
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  bool operator[](uint32_t index) const { return get_bit(index) != 0; }
+  // Member functions to split into two Vec128b:
+  Vec256b get_low() const { return z0; }
+  Vec256b get_high() const { return z1; }
+  static int size() { return 512; }
+};
+
+// Define operators for this class
+
+// vector operator & : bitwise and
+static inline Vec512b operator&(Vec512b const &a, Vec512b const &b)
+{
+  return Vec512b(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+static inline Vec512b operator&&(Vec512b const &a, Vec512b const &b) { return a & b; }
+
+// vector operator | : bitwise or
+static inline Vec512b operator|(Vec512b const &a, Vec512b const &b)
+{
+  return Vec512b(a.get_low() | b.get_low(), a.get_high() | b.get_high());
+}
+static inline Vec512b operator||(Vec512b const &a, Vec512b const &b) { return a | b; }
+
+// vector operator ^ : bitwise xor
+static inline Vec512b operator^(Vec512b const &a, Vec512b const &b)
+{
+  return Vec512b(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
+}
+
+// vector operator ~ : bitwise not
+static inline Vec512b operator~(Vec512b const &a) { return Vec512b(~a.get_low(), ~a.get_high()); }
+
+// vector operator &= : bitwise and
+static inline Vec512b &operator&=(Vec512b &a, Vec512b const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator |= : bitwise or
+static inline Vec512b &operator|=(Vec512b &a, Vec512b const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^= : bitwise xor
+static inline Vec512b &operator^=(Vec512b &a, Vec512b const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// Define functions for this class
+
+// function andnot: a & ~ b
+static inline Vec512b andnot(Vec512b const &a, Vec512b const &b)
+{
+  return Vec512b(andnot(a.get_low(), b.get_low()), andnot(a.get_high(), b.get_high()));
+}
+
+/*****************************************************************************
+ *
+ *          Generate compile-time constant vector
+ *
+ *****************************************************************************/
+// Generate a constant vector of 16 integers stored in memory.
+// Can be converted to any integer vector type
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec512ie constant16i()
+{
+  static const union
+  {
+    int32_t i[16];
+    Vec256b y[2];  // note: requires C++0x or later. Use option -std=c++0x
+  } u = {{i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15}};
+  return Vec512ie(u.y[0], u.y[1]);
+}
+
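+// Usage sketch (illustrative only, not part of the original library): constant16i builds a
+// compile-time constant that can be converted to any 512-bit integer vector type defined
+// later in this file, e.g.
+//
+//   Vec16i ramp = Vec16i(constant16i<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15>());
+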
+/*****************************************************************************
+ *
+ *          Boolean vector base classes for AVX512
+ *
+ *****************************************************************************/
+
+class Vec16b : public Vec512b
+{
+ public:
+  // Default constructor:
+  Vec16b() {}
+  // Constructor to build from all elements:
+  Vec16b(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7, bool b8, bool b9, bool b10, bool b11, bool b12,
+         bool b13, bool b14, bool b15)
+  {
+    *this = Vec512b(Vec8i(-(int)b0, -(int)b1, -(int)b2, -(int)b3, -(int)b4, -(int)b5, -(int)b6, -(int)b7),
+                    Vec8i(-(int)b8, -(int)b9, -(int)b10, -(int)b11, -(int)b12, -(int)b13, -(int)b14, -(int)b15));
+  }
+  // Constructor to convert from type Vec512b
+  Vec16b(Vec512b const &x)
+  {
+    z0 = x.get_low();
+    z1 = x.get_high();
+  }
+  // Constructor to make from two halves
+  Vec16b(Vec8ib const &x0, Vec8ib const &x1)
+  {
+    z0 = x0;
+    z1 = x1;
+  }
+  // Constructor to make from two halves
+  Vec16b(Vec8i const &x0, Vec8i const &x1)
+  {
+    z0 = x0;
+    z1 = x1;
+  }
+  // Constructor to broadcast single value:
+  Vec16b(bool b) { z0 = z1 = Vec8i(-int32_t(b)); }
+  // Assignment operator to broadcast scalar value:
+  Vec16b &operator=(bool b)
+  {
+    z0 = z1 = Vec8i(-int32_t(b));
+    return *this;
+  }
+
+ private:
+  // Prevent constructing from int, etc. because of ambiguity
+  Vec16b(int b);
+  // Prevent assigning int because of ambiguity
+  Vec16b &operator=(int x);
+
+ public:
+  // split into two halves
+  Vec8ib get_low() const { return Vec8ib(z0); }
+  Vec8ib get_high() const { return Vec8ib(z1); }
+  // Assignment operator to convert from type Vec512b
+  Vec16b &operator=(Vec512b const &x)
+  {
+    z0 = x.get_low();
+    z1 = x.get_high();
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec16b const &insert(uint32_t index, bool value)
+  {
+    if(index < 8)
+      {
+        z0 = Vec8ib(z0).insert(index, value);
+      }
+    else
+      {
+        z1 = Vec8ib(z1).insert(index - 8, value);
+      }
+    return *this;
+  }
+  // Member function extract a single element from vector
+  bool extract(uint32_t index) const
+  {
+    if(index < 8)
+      {
+        return Vec8ib(z0).extract(index);
+      }
+    else
+      {
+        return Vec8ib(z1).extract(index - 8);
+      }
+  }
+  // Extract a single element. Operator [] can only read an element, not write.
+  bool operator[](uint32_t index) const { return extract(index); }
+  static int size() { return 16; }
+};
+
+// Define operators for this class
+
+// vector operator & : bitwise and
+static inline Vec16b operator&(Vec16b const &a, Vec16b const &b)
+{
+  return Vec16b(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+static inline Vec16b operator&&(Vec16b const &a, Vec16b const &b) { return a & b; }
+
+// vector operator | : bitwise or
+static inline Vec16b operator|(Vec16b const &a, Vec16b const &b)
+{
+  return Vec16b(a.get_low() | b.get_low(), a.get_high() | b.get_high());
+}
+static inline Vec16b operator||(Vec16b const &a, Vec16b const &b) { return a | b; }
+
+// vector operator ^ : bitwise xor
+static inline Vec16b operator^(Vec16b const &a, Vec16b const &b)
+{
+  return Vec16b(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
+}
+
+// vector operator ~ : bitwise not
+static inline Vec16b operator~(Vec16b const &a) { return Vec16b(~(a.get_low()), ~(a.get_high())); }
+
+// vector operator ! : element not
+static inline Vec16b operator!(Vec16b const &a) { return ~a; }
+
+// vector operator &= : bitwise and
+static inline Vec16b &operator&=(Vec16b &a, Vec16b const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator |= : bitwise or
+static inline Vec16b &operator|=(Vec16b &a, Vec16b const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^= : bitwise xor
+static inline Vec16b &operator^=(Vec16b &a, Vec16b const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+/*****************************************************************************
+ *
+ *          Functions for boolean vectors
+ *
+ *****************************************************************************/
+
+// function andnot: a & ~ b
+static inline Vec16b andnot(Vec16b const &a, Vec16b const &b)
+{
+  return Vec16b(Vec8ib(andnot(a.get_low(), b.get_low())), Vec8ib(andnot(a.get_high(), b.get_high())));
+}
+
+// horizontal_and. Returns true if all bits are 1
+static inline bool horizontal_and(Vec16b const &a) { return horizontal_and(a.get_low() & a.get_high()); }
+
+// horizontal_or. Returns true if at least one bit is 1
+static inline bool horizontal_or(Vec16b const &a) { return horizontal_or(a.get_low() | a.get_high()); }
+
+/*****************************************************************************
+ *
+ *          Vec16ib: Vector of 16 Booleans for use with Vec16i and Vec16ui
+ *
+ *****************************************************************************/
+
+class Vec16ib : public Vec16b
+{
+ public:
+  // Default constructor:
+  Vec16ib() {}
+  Vec16ib(Vec16b const &x)
+  {
+    z0 = x.get_low();
+    z1 = x.get_high();
+  }
+  // Constructor to build from all elements:
+  Vec16ib(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7, bool x8, bool x9, bool x10, bool x11, bool x12,
+          bool x13, bool x14, bool x15)
+  {
+    z0 = Vec8ib(x0, x1, x2, x3, x4, x5, x6, x7);
+    z1 = Vec8ib(x8, x9, x10, x11, x12, x13, x14, x15);
+  }
+  // Constructor to convert from type Vec512b
+  Vec16ib(Vec512b const &x)
+  {
+    z0 = x.get_low();
+    z1 = x.get_high();
+  }
+  // Construct from two halves
+  Vec16ib(Vec8ib const &x0, Vec8ib const &x1)
+  {
+    z0 = x0;
+    z1 = x1;
+  }
+  // Assignment operator to convert from type Vec512b
+  Vec16ib &operator=(Vec512b const &x)
+  {
+    z0 = x.get_low();
+    z1 = x.get_high();
+    return *this;
+  }
+  // Constructor to broadcast scalar value:
+  Vec16ib(bool b) : Vec16b(b) {}
+  // Assignment operator to broadcast scalar value:
+  Vec16ib &operator=(bool b)
+  {
+    *this = Vec16b(b);
+    return *this;
+  }
+
+ private:  // Prevent constructing from int, etc.
+  Vec16ib(int b);
+  Vec16ib &operator=(int x);
+
+ public:
+};
+
+// Define operators for Vec16ib
+
+// vector operator & : bitwise and
+static inline Vec16ib operator&(Vec16ib const &a, Vec16ib const &b) { return Vec16b(a) & Vec16b(b); }
+static inline Vec16ib operator&&(Vec16ib const &a, Vec16ib const &b) { return a & b; }
+
+// vector operator | : bitwise or
+static inline Vec16ib operator|(Vec16ib const &a, Vec16ib const &b) { return Vec16b(a) | Vec16b(b); }
+static inline Vec16ib operator||(Vec16ib const &a, Vec16ib const &b) { return a | b; }
+
+// vector operator ^ : bitwise xor
+static inline Vec16ib operator^(Vec16ib const &a, Vec16ib const &b) { return Vec16b(a) ^ Vec16b(b); }
+
+// vector operator ~ : bitwise not
+static inline Vec16ib operator~(Vec16ib const &a) { return ~Vec16b(a); }
+
+// vector operator ! : element not
+static inline Vec16ib operator!(Vec16ib const &a) { return ~a; }
+
+// vector operator &= : bitwise and
+static inline Vec16ib &operator&=(Vec16ib &a, Vec16ib const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator |= : bitwise or
+static inline Vec16ib &operator|=(Vec16ib &a, Vec16ib const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^= : bitwise xor
+static inline Vec16ib &operator^=(Vec16ib &a, Vec16ib const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector function andnot
+static inline Vec16ib andnot(Vec16ib const &a, Vec16ib const &b) { return Vec16ib(andnot(Vec16b(a), Vec16b(b))); }
+
+/*****************************************************************************
+ *
+ *          Vec8b: Base class vector of 8 Booleans
+ *
+ *****************************************************************************/
+
+class Vec8b : public Vec16b
+{
+ public:
+  // Default constructor:
+  Vec8b() {}
+  Vec8b(Vec16b const &x)
+  {
+    z0 = x.get_low();
+    z1 = x.get_high();
+  }
+  // Constructor to convert from type Vec512b
+  Vec8b(Vec512b const &x)
+  {
+    z0 = x.get_low();
+    z1 = x.get_high();
+  }
+  // construct from two halves
+  Vec8b(Vec4qb const &x0, Vec4qb const &x1)
+  {
+    z0 = x0;
+    z1 = x1;
+  }
+  // Constructor to broadcast single value:
+  Vec8b(bool b) { z0 = z1 = Vec8i(-int32_t(b)); }
+  // Assignment operator to broadcast scalar value:
+  Vec8b &operator=(bool b)
+  {
+    z0 = z1 = Vec8i(-int32_t(b));
+    return *this;
+  }
+
+ private:
+  // Prevent constructing from int, etc. because of ambiguity
+  Vec8b(int b);
+  // Prevent assigning int because of ambiguity
+  Vec8b &operator=(int x);
+
+ public:
+  // split into two halves
+  Vec4qb get_low() const { return Vec4qb(z0); }
+  Vec4qb get_high() const { return Vec4qb(z1); }
+  // Assignment operator to convert from type Vec512b
+  Vec8b &operator=(Vec512b const &x)
+  {
+    z0 = x.get_low();
+    z1 = x.get_high();
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec8b const &insert(uint32_t index, bool value)
+  {
+    if(index < 4)
+      {
+        z0 = Vec4qb(z0).insert(index, value);
+      }
+    else
+      {
+        z1 = Vec4qb(z1).insert(index - 4, value);
+      }
+    return *this;
+  }
+  bool extract(uint32_t index) const
+  {
+    if(index < 4)
+      {
+        return Vec4qb(Vec4q(z0)).extract(index);
+      }
+    else
+      {
+        return Vec4qb(Vec4q(z1)).extract(index - 4);
+      }
+  }
+  bool operator[](uint32_t index) const { return extract(index); }
+  static int size() { return 8; }
+};
+
+/*****************************************************************************
+ *
+ *          Vec8qb: Vector of 8 Booleans for use with Vec8q and Vec8uq
+ *
+ *****************************************************************************/
+
+class Vec8qb : public Vec8b
+{
+ public:
+  // Default constructor:
+  Vec8qb() {}
+  Vec8qb(Vec16b const &x)
+  {
+    z0 = x.get_low();
+    z1 = x.get_high();
+  }
+  // Constructor to build from all elements:
+  Vec8qb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7)
+  {
+    z0 = Vec4qb(x0, x1, x2, x3);
+    z1 = Vec4qb(x4, x5, x6, x7);
+  }
+  // Constructor to convert from type Vec512b
+  Vec8qb(Vec512b const &x)
+  {
+    z0 = x.get_low();
+    z1 = x.get_high();
+  }
+  // construct from two halves
+  Vec8qb(Vec4qb const &x0, Vec4qb const &x1)
+  {
+    z0 = x0;
+    z1 = x1;
+  }
+  // Assignment operator to convert from type Vec512b
+  Vec8qb &operator=(Vec512b const &x)
+  {
+    z0 = x.get_low();
+    z1 = x.get_high();
+    return *this;
+  }
+  // Constructor to broadcast single value:
+  Vec8qb(bool b) : Vec8b(b) {}
+  // Assignment operator to broadcast scalar value:
+  Vec8qb &operator=(bool b)
+  {
+    *this = Vec8b(b);
+    return *this;
+  }
+
+ private:
+  // Prevent constructing from int, etc. because of ambiguity
+  Vec8qb(int b);
+  // Prevent assigning int because of ambiguity
+  Vec8qb &operator=(int x);
+
+ public:
+};
+
+// Define operators for Vec8qb
+
+// vector operator & : bitwise and
+static inline Vec8qb operator&(Vec8qb const &a, Vec8qb const &b) { return Vec16b(a) & Vec16b(b); }
+static inline Vec8qb operator&&(Vec8qb const &a, Vec8qb const &b) { return a & b; }
+
+// vector operator | : bitwise or
+static inline Vec8qb operator|(Vec8qb const &a, Vec8qb const &b) { return Vec16b(a) | Vec16b(b); }
+static inline Vec8qb operator||(Vec8qb const &a, Vec8qb const &b) { return a | b; }
+
+// vector operator ^ : bitwise xor
+static inline Vec8qb operator^(Vec8qb const &a, Vec8qb const &b) { return Vec16b(a) ^ Vec16b(b); }
+
+// vector operator ~ : bitwise not
+static inline Vec8qb operator~(Vec8qb const &a) { return ~Vec16b(a); }
+
+// vector operator ! : element not
+static inline Vec8qb operator!(Vec8qb const &a) { return ~a; }
+
+// vector operator &= : bitwise and
+static inline Vec8qb &operator&=(Vec8qb &a, Vec8qb const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator |= : bitwise or
+static inline Vec8qb &operator|=(Vec8qb &a, Vec8qb const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^= : bitwise xor
+static inline Vec8qb &operator^=(Vec8qb &a, Vec8qb const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector function andnot
+static inline Vec8qb andnot(Vec8qb const &a, Vec8qb const &b) { return Vec8qb(andnot(Vec16b(a), Vec16b(b))); }
+
+/*****************************************************************************
+ *
+ *          Vector of 16 32-bit signed integers
+ *
+ *****************************************************************************/
+
+class Vec16i : public Vec512b
+{
+ public:
+  // Default constructor:
+  Vec16i() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec16i(int i) { z0 = z1 = Vec8i(i); }
+  // Constructor to build from all elements:
+  Vec16i(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4, int32_t i5, int32_t i6, int32_t i7, int32_t i8, int32_t i9,
+         int32_t i10, int32_t i11, int32_t i12, int32_t i13, int32_t i14, int32_t i15)
+  {
+    z0 = Vec8i(i0, i1, i2, i3, i4, i5, i6, i7);
+    z1 = Vec8i(i8, i9, i10, i11, i12, i13, i14, i15);
+  }
+  // Constructor to build from two Vec8i:
+  Vec16i(Vec8i const &a0, Vec8i const &a1) { *this = Vec512b(a0, a1); }
+  // Constructor to convert from type Vec512b
+  Vec16i(Vec512b const &x)
+  {
+    z0 = x.get_low();
+    z1 = x.get_high();
+  }
+  // Assignment operator to convert from type Vec512b
+  Vec16i &operator=(Vec512b const &x)
+  {
+    z0 = x.get_low();
+    z1 = x.get_high();
+    return *this;
+  }
+  // Member function to load from array (unaligned)
+  Vec16i &load(void const *p)
+  {
+    Vec512b::load(p);
+    return *this;
+  }
+  // Member function to load from array, aligned by 64
+  Vec16i &load_a(void const *p)
+  {
+    Vec512b::load_a(p);
+    return *this;
+  }
+  // Partial load. Load n elements and set the rest to 0
+  Vec16i &load_partial(int n, void const *p)
+  {
+    if(n < 8)
+      {
+        z0 = Vec8i().load_partial(n, p);
+        z1 = Vec8i(0);
+      }
+    else
+      {
+        z0 = Vec8i().load(p);
+        z1 = Vec8i().load_partial(n - 8, (int32_t const *)p + 8);
+      }
+    return *this;
+  }
+  // Partial store. Store n elements
+  void store_partial(int n, void *p) const
+  {
+    if(n < 8)
+      {
+        Vec8i(get_low()).store_partial(n, p);
+      }
+    else
+      {
+        Vec8i(get_low()).store(p);
+        Vec8i(get_high()).store_partial(n - 8, (int32_t *)p + 8);
+      }
+  }
+  // cut off vector to n elements. The last 16-n elements are set to zero
+  Vec16i &cutoff(int n)
+  {
+    if(n < 8)
+      {
+        z0 = Vec8i(z0).cutoff(n);
+        z1 = Vec8i(0);
+      }
+    else
+      {
+        z1 = Vec8i(z1).cutoff(n - 8);
+      }
+    return *this;
+  }
+  // Member function to change a single element in vector
+  Vec16i const &insert(uint32_t index, int32_t value)
+  {
+    if(index < 8)
+      {
+        z0 = Vec8i(z0).insert(index, value);
+      }
+    else
+      {
+        z1 = Vec8i(z1).insert(index - 8, value);
+      }
+    return *this;
+  }
+  // Member function extract a single element from vector
+  int32_t extract(uint32_t index) const
+  {
+    if(index < 8)
+      {
+        return Vec8i(z0).extract(index);
+      }
+    else
+      {
+        return Vec8i(z1).extract(index - 8);
+      }
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  int32_t operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec8i:
+  Vec8i get_low() const { return Vec8i(z0); }
+  Vec8i get_high() const { return Vec8i(z1); }
+  static int size() { return 16; }
+};
+
+// Define operators for Vec16i
+
+// vector operator + : add element by element
+static inline Vec16i operator+(Vec16i const &a, Vec16i const &b)
+{
+  return Vec16i(a.get_low() + b.get_low(), a.get_high() + b.get_high());
+}
+
+// vector operator += : add
+static inline Vec16i &operator+=(Vec16i &a, Vec16i const &b)
+{
+  a = a + b;
+  return a;
+}
+
+// postfix operator ++
+static inline Vec16i operator++(Vec16i &a, int)
+{
+  Vec16i a0 = a;
+  a         = a + 1;
+  return a0;
+}
+
+// prefix operator ++
+static inline Vec16i &operator++(Vec16i &a)
+{
+  a = a + 1;
+  return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec16i operator-(Vec16i const &a, Vec16i const &b)
+{
+  return Vec16i(a.get_low() - b.get_low(), a.get_high() - b.get_high());
+}
+
+// vector operator - : unary minus
+static inline Vec16i operator-(Vec16i const &a) { return Vec16i(-a.get_low(), -a.get_high()); }
+
+// vector operator -= : subtract
+static inline Vec16i &operator-=(Vec16i &a, Vec16i const &b)
+{
+  a = a - b;
+  return a;
+}
+
+// postfix operator --
+static inline Vec16i operator--(Vec16i &a, int)
+{
+  Vec16i a0 = a;
+  a         = a - 1;
+  return a0;
+}
+
+// prefix operator --
+static inline Vec16i &operator--(Vec16i &a)
+{
+  a = a - 1;
+  return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec16i operator*(Vec16i const &a, Vec16i const &b)
+{
+  return Vec16i(a.get_low() * b.get_low(), a.get_high() * b.get_high());
+}
+
+// vector operator *= : multiply
+static inline Vec16i &operator*=(Vec16i &a, Vec16i const &b)
+{
+  a = a * b;
+  return a;
+}
+
+// vector operator / : divide all elements by same integer
+// See bottom of file
+
+// vector operator << : shift left
+static inline Vec16i operator<<(Vec16i const &a, int32_t b) { return Vec16i(a.get_low() << b, a.get_high() << b); }
+
+// vector operator <<= : shift left
+static inline Vec16i &operator<<=(Vec16i &a, int32_t b)
+{
+  a = a << b;
+  return a;
+}
+
+// vector operator >> : shift right arithmetic
+static inline Vec16i operator>>(Vec16i const &a, int32_t b) { return Vec16i(a.get_low() >> b, a.get_high() >> b); }
+
+// vector operator >>= : shift right arithmetic
+static inline Vec16i &operator>>=(Vec16i &a, int32_t b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec16ib operator==(Vec16i const &a, Vec16i const &b)
+{
+  return Vec16ib(a.get_low() == b.get_low(), a.get_high() == b.get_high());
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec16ib operator!=(Vec16i const &a, Vec16i const &b)
+{
+  return Vec16ib(a.get_low() != b.get_low(), a.get_high() != b.get_high());
+}
+
+// vector operator > : returns true for elements for which a > b
+static inline Vec16ib operator>(Vec16i const &a, Vec16i const &b)
+{
+  return Vec16ib(a.get_low() > b.get_low(), a.get_high() > b.get_high());
+}
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec16ib operator<(Vec16i const &a, Vec16i const &b) { return b > a; }
+
+// vector operator >= : returns true for elements for which a >= b (signed)
+static inline Vec16ib operator>=(Vec16i const &a, Vec16i const &b)
+{
+  return Vec16ib(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
+}
+
+// vector operator <= : returns true for elements for which a <= b (signed)
+static inline Vec16ib operator<=(Vec16i const &a, Vec16i const &b) { return b >= a; }
+
+// vector operator & : bitwise and
+static inline Vec16i operator&(Vec16i const &a, Vec16i const &b)
+{
+  return Vec16i(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+
+// vector operator &= : bitwise and
+static inline Vec16i &operator&=(Vec16i &a, Vec16i const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec16i operator|(Vec16i const &a, Vec16i const &b)
+{
+  return Vec16i(a.get_low() | b.get_low(), a.get_high() | b.get_high());
+}
+
+// vector operator |= : bitwise or
+static inline Vec16i &operator|=(Vec16i &a, Vec16i const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec16i operator^(Vec16i const &a, Vec16i const &b)
+{
+  return Vec16i(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
+}
+
+// vector operator ^= : bitwise xor
+static inline Vec16i &operator^=(Vec16i &a, Vec16i const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec16i operator~(Vec16i const &a) { return Vec16i(~(a.get_low()), ~(a.get_high())); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
+static inline Vec16i select(Vec16ib const &s, Vec16i const &a, Vec16i const &b)
+{
+  return Vec16i(select(s.get_low(), a.get_low(), b.get_low()), select(s.get_high(), a.get_high(), b.get_high()));
+}
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec16i if_add(Vec16ib const &f, Vec16i const &a, Vec16i const &b)
+{
+  return Vec16i(if_add(f.get_low(), a.get_low(), b.get_low()), if_add(f.get_high(), a.get_high(), b.get_high()));
+}
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline int32_t horizontal_add(Vec16i const &a) { return horizontal_add(a.get_low() + a.get_high()); }
+
+// function add_saturated: add element by element, signed with saturation
+static inline Vec16i add_saturated(Vec16i const &a, Vec16i const &b)
+{
+  return Vec16i(add_saturated(a.get_low(), b.get_low()), add_saturated(a.get_high(), b.get_high()));
+}
+
+// function sub_saturated: subtract element by element, signed with saturation
+static inline Vec16i sub_saturated(Vec16i const &a, Vec16i const &b)
+{
+  return Vec16i(sub_saturated(a.get_low(), b.get_low()), sub_saturated(a.get_high(), b.get_high()));
+}
+
+// function max: a > b ? a : b
+static inline Vec16i max(Vec16i const &a, Vec16i const &b)
+{
+  return Vec16i(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high()));
+}
+
+// function min: a < b ? a : b
+static inline Vec16i min(Vec16i const &a, Vec16i const &b)
+{
+  return Vec16i(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high()));
+}
+
+// function abs: a >= 0 ? a : -a
+static inline Vec16i abs(Vec16i const &a) { return Vec16i(abs(a.get_low()), abs(a.get_high())); }
+
+// function abs_saturated: same as abs, saturate if overflow
+static inline Vec16i abs_saturated(Vec16i const &a) { return Vec16i(abs_saturated(a.get_low()), abs_saturated(a.get_high())); }
+
+// function rotate_left all elements
+// Use negative count to rotate right
+static inline Vec16i rotate_left(Vec16i const &a, int b) { return Vec16i(rotate_left(a.get_low(), b), rotate_left(a.get_high(), b)); }
+
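+// Usage sketch (illustrative only, not part of the original library): select and if_add
+// combine with the comparison operators above to express branch-free element-wise logic, e.g.
+//
+//   Vec16i x(-2);                                            // broadcast -2
+//   Vec16i clamped = select(x > Vec16i(0), x, Vec16i(0));    // negative elements become 0
+//   Vec16i bumped  = if_add(x < Vec16i(0), x, Vec16i(10));   // add 10 only where x < 0
+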
+/*****************************************************************************
+ *
+ *          Vector of 16 32-bit unsigned integers
+ *
+ *****************************************************************************/
+
+class Vec16ui : public Vec16i
+{
+ public:
+  // Default constructor:
+  Vec16ui(){};
+  // Constructor to broadcast the same value into all elements:
+  Vec16ui(uint32_t i) { z0 = z1 = Vec8ui(i); };
+  // Constructor to build from all elements:
+  Vec16ui(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7, uint32_t i8,
+          uint32_t i9, uint32_t i10, uint32_t i11, uint32_t i12, uint32_t i13, uint32_t i14, uint32_t i15)
+  {
+    z0 = Vec8ui(i0, i1, i2, i3, i4, i5, i6, i7);
+    z1 = Vec8ui(i8, i9, i10, i11, i12, i13, i14, i15);
+  };
+  // Constructor to build from two Vec8ui:
+  Vec16ui(Vec8ui const &a0, Vec8ui const &a1)
+  {
+    z0 = a0;
+    z1 = a1;
+  }
+  // Constructor to convert from type Vec512b
+  Vec16ui(Vec512b const &x) { *this = x; };
+  // Assignment operator to convert from type Vec512b
+  Vec16ui &operator=(Vec512b const &x)
+  {
+    z0 = x.get_low();
+    z1 = x.get_high();
+    return *this;
+  };
+  // Member function to load from array (unaligned)
+  Vec16ui &load(void const *p)
+  {
+    Vec16i::load(p);
+    return *this;
+  }
+  // Member function to load from array, aligned by 64
+  Vec16ui &load_a(void const *p)
+  {
+    Vec16i::load_a(p);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec16ui const &insert(uint32_t index, uint32_t value)
+  {
+    Vec16i::insert(index, value);
+    return *this;
+  }
+  // Member function extract a single element from vector
+  uint32_t extract(uint32_t index) const { return Vec16i::extract(index); }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  uint32_t operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec8ui:
+  Vec8ui get_low() const { return Vec8ui(Vec16i::get_low()); }
+  Vec8ui get_high() const { return Vec8ui(Vec16i::get_high()); }
+};
+
+// Define operators for this class
+
+// vector operator + : add
+static inline Vec16ui operator+(Vec16ui const &a, Vec16ui const &b) { return Vec16ui(Vec16i(a) + Vec16i(b)); }
+
+// vector operator - : subtract
+static inline Vec16ui operator-(Vec16ui const &a, Vec16ui const &b) { return Vec16ui(Vec16i(a) - Vec16i(b)); }
+
+// vector operator * : multiply
+static inline Vec16ui operator*(Vec16ui const &a, Vec16ui const &b) { return Vec16ui(Vec16i(a) * Vec16i(b)); }
+
+// vector operator / : divide
+// See bottom of file
+
+// vector operator >> : shift right logical all elements
+static inline Vec16ui operator>>(Vec16ui const &a, uint32_t b) { return Vec16ui(a.get_low() >> b, a.get_high() >> b); }
+
+// vector operator >> : shift right logical all elements
+static inline Vec16ui operator>>(Vec16ui const &a, int32_t b) { return a >> (uint32_t)b; }
+
+// vector operator >>= : shift right logical
+static inline Vec16ui &operator>>=(Vec16ui &a, uint32_t b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator >>= : shift right logical
+static inline Vec16ui &operator>>=(Vec16ui &a, int32_t b)
+{
+  a = a >> uint32_t(b);
+  return a;
+}
+
+// vector operator << : shift left all elements
+static inline Vec16ui operator<<(Vec16ui const &a, uint32_t b) { return Vec16ui((Vec16i)a << (int32_t)b); }
+
+// vector operator << : shift left all elements
+static inline Vec16ui operator<<(Vec16ui const &a, int32_t b) { return Vec16ui((Vec16i)a << (int32_t)b); }
+
+// vector operator < : returns true for elements for which a < b (unsigned)
+static inline Vec16ib operator<(Vec16ui const &a, Vec16ui const &b)
+{
+  return Vec16ib(a.get_low() < b.get_low(), a.get_high() < b.get_high());
+}
+
+// vector operator > : returns true for elements for which a > b (unsigned)
+static inline Vec16ib operator>(Vec16ui const &a, Vec16ui const &b) { return b < a; }
+
+// vector operator >= : returns true for elements for which a >= b (unsigned)
+static inline Vec16ib operator>=(Vec16ui const &a, Vec16ui const &b)
+{
+  return Vec16ib(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
+}
+
+// vector operator <= : returns true for elements for which a <= b (unsigned)
+static inline Vec16ib operator<=(Vec16ui const &a, Vec16ui const &b) { return b >= a; }
+
+// vector operator & : bitwise and
+static inline Vec16ui operator&(Vec16ui const &a, Vec16ui const &b) { return Vec16ui(Vec16i(a) & Vec16i(b)); }
+
+// vector operator | : bitwise or
+static inline Vec16ui operator|(Vec16ui const &a, Vec16ui const &b) { return Vec16ui(Vec16i(a) | Vec16i(b)); }
+
+// vector operator ^ : bitwise xor
+static inline Vec16ui operator^(Vec16ui const &a, Vec16ui const &b) { return Vec16ui(Vec16i(a) ^ Vec16i(b)); }
+
+// vector operator ~ : bitwise not
+static inline Vec16ui operator~(Vec16ui const &a) { return Vec16ui(~Vec16i(a)); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
+static inline Vec16ui select(Vec16ib const &s, Vec16ui const &a, Vec16ui const &b) { return Vec16ui(select(s, Vec16i(a), Vec16i(b))); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec16ui if_add(Vec16ib const &f, Vec16ui const &a, Vec16ui const &b) { return Vec16ui(if_add(f, Vec16i(a), Vec16i(b))); }
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline uint32_t horizontal_add(Vec16ui const &a) { return horizontal_add((Vec16i)a); }
+
+// horizontal_add_x: Horizontal add extended: Calculates the sum of all vector elements. Defined later in this file
+
+// function add_saturated: add element by element, unsigned with saturation
+static inline Vec16ui add_saturated(Vec16ui const &a, Vec16ui const &b)
+{
+  return Vec16ui(add_saturated(a.get_low(), b.get_low()), add_saturated(a.get_high(), b.get_high()));
+}
+
+// function sub_saturated: subtract element by element, unsigned with saturation
+static inline Vec16ui sub_saturated(Vec16ui const &a, Vec16ui const &b)
+{
+  return Vec16ui(sub_saturated(a.get_low(), b.get_low()), sub_saturated(a.get_high(), b.get_high()));
+}
+
+// function max: a > b ? a : b
+static inline Vec16ui max(Vec16ui const &a, Vec16ui const &b)
+{
+  return Vec16ui(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high()));
+}
+
+// function min: a < b ? a : b
+static inline Vec16ui min(Vec16ui const &a, Vec16ui const &b)
+{
+  return Vec16ui(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high()));
+}
+
+/*****************************************************************************
+ *
+ *          Vector of 8 64-bit signed integers
+ *
+ *****************************************************************************/
+
+class Vec8q : public Vec512b
+{
+ public:
+  // Default constructor:
+  Vec8q() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec8q(int64_t i) { z0 = z1 = Vec4q(i); }
+  // Constructor to build from all elements:
+  Vec8q(int64_t i0, int64_t i1, int64_t i2, int64_t i3, int64_t i4, int64_t i5, int64_t i6, int64_t i7)
+  {
+    z0 = Vec4q(i0, i1, i2, i3);
+    z1 = Vec4q(i4, i5, i6, i7);
+  }
+  // Constructor to build from two Vec4q:
+  Vec8q(Vec4q const &a0, Vec4q const &a1)
+  {
+    z0 = a0;
+    z1 = a1;
+  }
+  // Constructor to convert from type Vec512b
+  Vec8q(Vec512b const &x)
+  {
+    z0 = x.get_low();
+    z1 = x.get_high();
+  }
+  // Assignment operator to convert from type Vec512b
+  Vec8q &operator=(Vec512b const &x)
+  {
+    z0 = x.get_low();
+    z1 = x.get_high();
+    return *this;
+  }
+  // Member function to load from array (unaligned)
+  Vec8q &load(void const *p)
+  {
+    z0 = Vec4q().load(p);
+    z1 = Vec4q().load((int64_t const *)p + 4);
+    return *this;
+  }
+  // Member function to load from array, aligned by 64
+  Vec8q &load_a(void const *p)
+  {
+    z0 = Vec4q().load_a(p);
+    z1 = Vec4q().load_a((int64_t const *)p + 4);
+    return *this;
+  }
+  // Partial load. Load n elements and set the rest to 0
+  Vec8q &load_partial(int n, void const *p)
+  {
+    if(n < 4)
+      {
+        z0 = Vec4q().load_partial(n, p);
+        z1 = Vec4q(0);
+      }
+    else
+      {
+        z0 = Vec4q().load(p);
+        z1 = Vec4q().load_partial(n - 4, (int64_t const *)p + 4);
+      }
+    return *this;
+  }
+  // Partial store. Store n elements
+  void store_partial(int n, void *p) const
+  {
+    if(n < 4)
+      {
+        Vec4q(get_low()).store_partial(n, p);
+      }
+    else
+      {
+        Vec4q(get_low()).store(p);
+        Vec4q(get_high()).store_partial(n - 4, (int64_t *)p + 4);
+      }
+  }
+  // cut off vector to n elements. The last 8-n elements are set to zero
+  Vec8q &cutoff(int n)
+  {
+    if(n < 4)
+      {
+        z0 = Vec4q(z0).cutoff(n);
+        z1 = Vec4q(0);
+      }
+    else
+      {
+        z1 = Vec4q(z1).cutoff(n - 4);
+      }
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec8q const &insert(uint32_t index, int64_t value)
+  {
+    if(index < 4)
+      {
+        z0 = Vec4q(z0).insert(index, value);
+      }
+    else
+      {
+        z1 = Vec4q(z1).insert(index - 4, value);
+      }
+    return *this;
+  }
+  // Member function extract a single element from vector
+  int64_t extract(uint32_t index) const
+  {
+    if(index < 4)
+      {
+        return Vec4q(z0).extract(index);
+      }
+    else
+      {
+        return Vec4q(z1).extract(index - 4);
+      }
+  }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  int64_t operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec4q:
+  Vec4q get_low() const { return Vec4q(z0); }
+  Vec4q get_high() const { return Vec4q(z1); }
+  static int size() { return 8; }
+};
+
+// Define operators for Vec8q
+
+// vector operator + : add element by element
+static inline Vec8q operator+(Vec8q const &a, Vec8q const &b) { return Vec8q(a.get_low() + b.get_low(), a.get_high() + b.get_high()); }
+
+// vector operator += : add
+static inline Vec8q &operator+=(Vec8q &a, Vec8q const &b)
+{
+  a = a + b;
+  return a;
+}
+
+// postfix operator ++
+static inline Vec8q operator++(Vec8q &a, int)
+{
+  Vec8q a0 = a;
+  a        = a + 1;
+  return a0;
+}
+
+// prefix operator ++
+static inline Vec8q &operator++(Vec8q &a)
+{
+  a = a + 1;
+  return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec8q operator-(Vec8q const &a, Vec8q const &b) { return Vec8q(a.get_low() - b.get_low(), a.get_high() - b.get_high()); }
+
+// vector operator - : unary minus
+static inline Vec8q operator-(Vec8q const &a) { return Vec8q(-a.get_low(), -a.get_high()); }
+
+// vector operator -= : subtract
+static inline Vec8q &operator-=(Vec8q &a, Vec8q const &b)
+{
+  a = a - b;
+  return a;
+}
+
+// postfix operator --
+static inline Vec8q operator--(Vec8q &a, int)
+{
+  Vec8q a0 = a;
+  a        = a - 1;
+  return a0;
+}
+
+// prefix operator --
+static inline Vec8q &operator--(Vec8q &a)
+{
+  a = a - 1;
+  return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec8q operator*(Vec8q const &a, Vec8q const &b) { return Vec8q(a.get_low() * b.get_low(), a.get_high() * b.get_high()); }
+
+// vector operator *= : multiply
+static inline Vec8q &operator*=(Vec8q &a, Vec8q const &b)
+{
+  a = a * b;
+  return a;
+}
+
+// vector operator << : shift left
+static inline Vec8q operator<<(Vec8q const &a, int32_t b) { return Vec8q(a.get_low() << b, a.get_high() << b); }
+
+// vector operator <<= : shift left
+static inline Vec8q &operator<<=(Vec8q &a, int32_t b)
+{
+  a = a << b;
+  return a;
+}
+
+// vector operator >> : shift right arithmetic
+static inline Vec8q operator>>(Vec8q const &a, int32_t b) { return Vec8q(a.get_low() >> b, a.get_high() >> b); }
+
+// vector operator >>= : shift right arithmetic
+static inline Vec8q &operator>>=(Vec8q &a, int32_t b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec8qb operator==(Vec8q const &a, Vec8q const &b)
+{
+  return Vec8qb(a.get_low() == b.get_low(), a.get_high() == b.get_high());
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec8qb operator!=(Vec8q const &a, Vec8q const &b)
+{
+  return Vec8qb(a.get_low() != b.get_low(), a.get_high() != b.get_high());
+}
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec8qb operator<(Vec8q const &a, Vec8q const &b)
+{
+  return Vec8qb(a.get_low() < b.get_low(), a.get_high() < b.get_high());
+}
+
+// vector operator > : returns true for elements for which a > b
+static inline Vec8qb operator>(Vec8q const &a, Vec8q const &b) { return b < a; }
+
+// vector operator >= : returns true for elements for which a >= b (signed)
+static inline Vec8qb operator>=(Vec8q const &a, Vec8q const &b)
+{
+  return Vec8qb(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
+}
+
+// vector operator <= : returns true for elements for which a <= b (signed)
+static inline Vec8qb operator<=(Vec8q const &a, Vec8q const &b) { return b >= a; }
+
+// vector operator & : bitwise and
+static inline Vec8q operator&(Vec8q const &a, Vec8q const &b) { return Vec8q(a.get_low() & b.get_low(), a.get_high() & b.get_high()); }
+
+// vector operator &= : bitwise and
+static inline Vec8q &operator&=(Vec8q &a, Vec8q const &b)
+{
+  a = a & b;
+  return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec8q operator|(Vec8q const &a, Vec8q const &b) { return Vec8q(a.get_low() | b.get_low(), a.get_high() | b.get_high()); }
+
+// vector operator |= : bitwise or
+static inline Vec8q &operator|=(Vec8q &a, Vec8q const &b)
+{
+  a = a | b;
+  return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec8q operator^(Vec8q const &a, Vec8q const &b) { return Vec8q(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); }
+// vector operator ^= : bitwise xor
+static inline Vec8q &operator^=(Vec8q &a, Vec8q const &b)
+{
+  a = a ^ b;
+  return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec8q operator~(Vec8q const &a) { return Vec8q(~(a.get_low()), ~(a.get_high())); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+static inline Vec8q select(Vec8qb const &s, Vec8q const &a, Vec8q const &b)
+{
+  return Vec8q(select(s.get_low(), a.get_low(), b.get_low()), select(s.get_high(), a.get_high(), b.get_high()));
+}
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec8q if_add(Vec8qb const &f, Vec8q const &a, Vec8q const &b)
+{
+  return Vec8q(if_add(f.get_low(), a.get_low(), b.get_low()), if_add(f.get_high(), a.get_high(), b.get_high()));
+}
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline int64_t horizontal_add(Vec8q const &a) { return horizontal_add(a.get_low() + a.get_high()); }
+
+// Horizontal add extended: Calculates the sum of all vector elements
+// Elements are sign extended before adding to avoid overflow
+static inline int64_t horizontal_add_x(Vec16i const &x) { return horizontal_add_x(x.get_low()) + horizontal_add_x(x.get_high()); }
+
+// Horizontal add extended: Calculates the sum of all vector elements
+// Elements are zero extended before adding to avoid overflow
+static inline uint64_t horizontal_add_x(Vec16ui const &x) { return horizontal_add_x(x.get_low()) + horizontal_add_x(x.get_high()); }
+
+// function max: a > b ? a : b
+static inline Vec8q max(Vec8q const &a, Vec8q const &b)
+{
+  return Vec8q(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high()));
+}
+
+// function min: a < b ? a : b
+static inline Vec8q min(Vec8q const &a, Vec8q const &b)
+{
+  return Vec8q(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high()));
+}
+
+// function abs: a >= 0 ? a : -a
+static inline Vec8q abs(Vec8q const &a) { return Vec8q(abs(a.get_low()), abs(a.get_high())); }
+
+// function abs_saturated: same as abs, saturate if overflow
+static inline Vec8q abs_saturated(Vec8q const &a) { return Vec8q(abs_saturated(a.get_low()), abs_saturated(a.get_high())); }
+
+// function rotate_left all elements
+// Use negative count to rotate right
+static inline Vec8q rotate_left(Vec8q const &a, int b) { return Vec8q(rotate_left(a.get_low(), b), rotate_left(a.get_high(), b)); }
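+
+// Example (illustrative) of combining the Vec8q comparison, select and horizontal_add
+// functions defined above:
+// Vec8q   a(1,2,3,4,5,6,7,8);
+// Vec8q   b(8,7,6,5,4,3,2,1);
+// Vec8qb  m = a > b;               // m is (false,false,false,false,true,true,true,true)
+// Vec8q   c = select(m, a, b);     // c is (8,7,6,5,5,6,7,8), same as max(a,b)
+// int64_t s = horizontal_add(a);   // s is 36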
+
+/*****************************************************************************
+ *
+ *          Vector of 8 64-bit unsigned integers
+ *
+ *****************************************************************************/
+
+class Vec8uq : public Vec8q
+{
+ public:
+  // Default constructor:
+  Vec8uq() {}
+  // Constructor to broadcast the same value into all elements:
+  Vec8uq(uint64_t i) { z0 = z1 = Vec4uq(i); }
+  // Constructor to convert from Vec8q:
+  Vec8uq(Vec8q const &x)
+  {
+    z0 = x.get_low();
+    z1 = x.get_high();
+  }
+  // Constructor to convert from type Vec512b
+  Vec8uq(Vec512b const &x)
+  {
+    z0 = x.get_low();
+    z1 = x.get_high();
+  }
+  // Constructor to build from all elements:
+  Vec8uq(uint64_t i0, uint64_t i1, uint64_t i2, uint64_t i3, uint64_t i4, uint64_t i5, uint64_t i6, uint64_t i7)
+  {
+    z0 = Vec4q(i0, i1, i2, i3);
+    z1 = Vec4q(i4, i5, i6, i7);
+  }
+  // Constructor to build from two Vec4uq:
+  Vec8uq(Vec4uq const &a0, Vec4uq const &a1)
+  {
+    z0 = a0;
+    z1 = a1;
+  }
+  // Assignment operator to convert from Vec8q:
+  Vec8uq &operator=(Vec8q const &x)
+  {
+    z0 = x.get_low();
+    z1 = x.get_high();
+    return *this;
+  }
+  // Assignment operator to convert from type Vec512b
+  Vec8uq &operator=(Vec512b const &x)
+  {
+    z0 = x.get_low();
+    z1 = x.get_high();
+    return *this;
+  }
+  // Member function to load from array (unaligned)
+  Vec8uq &load(void const *p)
+  {
+    Vec8q::load(p);
+    return *this;
+  }
+  // Member function to load from array, aligned by 32
+  Vec8uq &load_a(void const *p)
+  {
+    Vec8q::load_a(p);
+    return *this;
+  }
+  // Member function to change a single element in vector
+  // Note: This function is inefficient. Use load function if changing more than one element
+  Vec8uq const &insert(uint32_t index, uint64_t value)
+  {
+    Vec8q::insert(index, value);
+    return *this;
+  }
+  // Member function extract a single element from vector
+  uint64_t extract(uint32_t index) const { return Vec8q::extract(index); }
+  // Extract a single element. Use store function if extracting more than one element.
+  // Operator [] can only read an element, not write.
+  uint64_t operator[](uint32_t index) const { return extract(index); }
+  // Member functions to split into two Vec4uq:
+  Vec4uq get_low() const { return Vec4uq(Vec8q::get_low()); }
+  Vec4uq get_high() const { return Vec4uq(Vec8q::get_high()); }
+};
+
+// Define operators for this class
+
+// vector operator + : add
+static inline Vec8uq operator+(Vec8uq const &a, Vec8uq const &b) { return Vec8uq(Vec8q(a) + Vec8q(b)); }
+
+// vector operator - : subtract
+static inline Vec8uq operator-(Vec8uq const &a, Vec8uq const &b) { return Vec8uq(Vec8q(a) - Vec8q(b)); }
+
+// vector operator * : multiply element by element
+static inline Vec8uq operator*(Vec8uq const &a, Vec8uq const &b) { return Vec8uq(Vec8q(a) * Vec8q(b)); }
+
+// vector operator >> : shift right logical all elements
+static inline Vec8uq operator>>(Vec8uq const &a, uint32_t b) { return Vec8uq(a.get_low() >> b, a.get_high() >> b); }
+
+// vector operator >> : shift right logical all elements
+static inline Vec8uq operator>>(Vec8uq const &a, int32_t b) { return a >> (uint32_t)b; }
+
+// vector operator >>= : shift right logical
+static inline Vec8uq &operator>>=(Vec8uq &a, uint32_t b)
+{
+  a = a >> b;
+  return a;
+}
+
+// vector operator >>= : shift right logical
+static inline Vec8uq &operator>>=(Vec8uq &a, int32_t b)
+{
+  a = a >> uint32_t(b);
+  return a;
+}
+
+// vector operator << : shift left all elements
+static inline Vec8uq operator<<(Vec8uq const &a, uint32_t b) { return Vec8uq((Vec8q)a << (int32_t)b); }
+
+// vector operator << : shift left all elements
+static inline Vec8uq operator<<(Vec8uq const &a, int32_t b) { return Vec8uq((Vec8q)a << b); }
+
+// vector operator < : returns true for elements for which a < b (unsigned)
+static inline Vec8qb operator<(Vec8uq const &a, Vec8uq const &b)
+{
+  return Vec8qb(a.get_low() < b.get_low(), a.get_high() < b.get_high());
+}
+
+// vector operator > : returns true for elements for which a > b (unsigned)
+static inline Vec8qb operator>(Vec8uq const &a, Vec8uq const &b) { return b < a; }
+
+// vector operator >= : returns true for elements for which a >= b (unsigned)
+static inline Vec8qb operator>=(Vec8uq const &a, Vec8uq const &b)
+{
+  return Vec8qb(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
+}
+
+// vector operator <= : returns true for elements for which a <= b (unsigned)
+static inline Vec8qb operator<=(Vec8uq const &a, Vec8uq const &b) { return b >= a; }
+
+// vector operator & : bitwise and
+static inline Vec8uq operator&(Vec8uq const &a, Vec8uq const &b) { return Vec8uq(Vec8q(a) & Vec8q(b)); }
+
+// vector operator | : bitwise or
+static inline Vec8uq operator|(Vec8uq const &a, Vec8uq const &b) { return Vec8uq(Vec8q(a) | Vec8q(b)); }
+
+// vector operator ^ : bitwise xor
+static inline Vec8uq operator^(Vec8uq const &a, Vec8uq const &b) { return Vec8uq(Vec8q(a) ^ Vec8q(b)); }
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+static inline Vec8uq select(Vec8qb const &s, Vec8uq const &a, Vec8uq const &b) { return Vec8uq(select(s, Vec8q(a), Vec8q(b))); }
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec8uq if_add(Vec8qb const &f, Vec8uq const &a, Vec8uq const &b)
+{
+  return Vec8uq(if_add(f.get_low(), a.get_low(), b.get_low()), if_add(f.get_high(), a.get_high(), b.get_high()));
+}
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline uint64_t horizontal_add(Vec8uq const &a) { return horizontal_add(Vec8q(a)); }
+
+// function max: a > b ? a : b
+static inline Vec8uq max(Vec8uq const &a, Vec8uq const &b)
+{
+  return Vec8uq(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high()));
+}
+
+// function min: a < b ? a : b
+static inline Vec8uq min(Vec8uq const &a, Vec8uq const &b)
+{
+  return Vec8uq(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high()));
+}
+
+/*****************************************************************************
+ *
+ *          Vector permute functions
+ *
+ ******************************************************************************
+ *
+ * These permute functions can reorder the elements of a vector and optionally
+ * set some elements to zero.
+ *
+ * The indexes are inserted as template parameters in <>. These indexes must be
+ * constants. Each template parameter is an index to the element you want to select.
+ * An index of -1 will generate zero. An index of -256 means don't care.
+ *
+ * Example:
+ * Vec8q a(10,11,12,13,14,15,16,17);      // a is (10,11,12,13,14,15,16,17)
+ * Vec8q b;
+ * b = permute8q<0,2,7,7,-1,-1,1,1>(a);   // b is (10,12,17,17, 0, 0,11,11)
+ *
+ * A lot of the code here is metaprogramming aiming to find the instructions
+ * that best fit the template parameters and instruction set. The metacode
+ * will be reduced out to leave only a few vector instructions in release
+ * mode with optimization on.
+ *****************************************************************************/
+
+// Permute vector of 8 64-bit integers.
+// Index -1 gives 0, index -256 means don't care.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8q permute8q(Vec8q const &a)
+{
+  return Vec8q(blend4q<i0, i1, i2, i3>(a.get_low(), a.get_high()), blend4q<i4, i5, i6, i7>(a.get_low(), a.get_high()));
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8uq permute8uq(Vec8uq const &a)
+{
+  return Vec8uq(permute8q<i0, i1, i2, i3, i4, i5, i6, i7>(a));
+}
+
+// Permute vector of 16 32-bit integers.
+// Index -1 gives 0, index -256 means don't care.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16i permute16i(Vec16i const &a)
+{
+  return Vec16i(blend8i<i0, i1, i2, i3, i4, i5, i6, i7>(a.get_low(), a.get_high()),
+                blend8i<i8, i9, i10, i11, i12, i13, i14, i15>(a.get_low(), a.get_high()));
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16ui permute16ui(Vec16ui const &a)
+{
+  return Vec16ui(permute16i<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15>(a));
+}
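+
+// Example (illustrative) of permute16i. An index of -1 produces a zero element:
+// Vec16i a(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
+// Vec16i b = permute16i<15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0>(a);    // b is a reversed
+// Vec16i c = permute16i<0,-1,1,-1,2,-1,3,-1,4,-1,5,-1,6,-1,7,-1>(a);  // c is (0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0)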
+
+/*****************************************************************************
+ *
+ *          Vector blend functions
+ *
+ ******************************************************************************
+ *
+ * These blend functions can mix elements from two different vectors and
+ * optionally set some elements to zero.
+ *
+ * The indexes are inserted as template parameters in <>. These indexes must be
+ * constants. Each template parameter is an index to the element you want to
+ * select, where higher indexes indicate an element from the second source
+ * vector. For example, if each vector has 8 elements, then indexes 0 - 7
+ * will select an element from the first vector and indexes 8 - 15 will select
+ * an element from the second vector. A negative index will generate zero.
+ *
+ * Example:
+ * Vec8q a(100,101,102,103,104,105,106,107); // a is (100, 101, 102, 103, 104, 105, 106, 107)
+ * Vec8q b(200,201,202,203,204,205,206,207); // b is (200, 201, 202, 203, 204, 205, 206, 207)
+ * Vec8q c;
+ * c = blend8q<1,0,9,8,7,-1,15,15> (a,b);    // c is (101, 100, 201, 200, 107,   0, 207, 207)
+ *
+ * A lot of the code here is metaprogramming aiming to find the instructions
+ * that best fit the template parameters and instruction set. The metacode
+ * will be reduced out to leave only a few vector instructions in release
+ * mode with optimization on.
+ *****************************************************************************/
+
+// helper function used below
+template <int n>
+static inline Vec4q select4(Vec8q const &a, Vec8q const &b)
+{
+  switch(n)
+    {
+      case 0:
+        return a.get_low();
+      case 1:
+        return a.get_high();
+      case 2:
+        return b.get_low();
+      case 3:
+        return b.get_high();
+    }
+  return Vec4q(0);
+}
+
+// blend vectors Vec8q
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8q blend8q(Vec8q const &a, Vec8q const &b)
+{
+  const int j0 = i0 >= 0 ? i0 / 4 : i0;
+  const int j1 = i1 >= 0 ? i1 / 4 : i1;
+  const int j2 = i2 >= 0 ? i2 / 4 : i2;
+  const int j3 = i3 >= 0 ? i3 / 4 : i3;
+  const int j4 = i4 >= 0 ? i4 / 4 : i4;
+  const int j5 = i5 >= 0 ? i5 / 4 : i5;
+  const int j6 = i6 >= 0 ? i6 / 4 : i6;
+  const int j7 = i7 >= 0 ? i7 / 4 : i7;
+  Vec4q x0, x1;
+
+  const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2 >= 0 ? j2 : j3;
+  const int r1 = j4 >= 0 ? j4 : j5 >= 0 ? j5 : j6 >= 0 ? j6 : j7;
+  const int s0 = (j1 >= 0 && j1 != r0) ? j1 : (j2 >= 0 && j2 != r0) ? j2 : j3;
+  const int s1 = (j5 >= 0 && j5 != r1) ? j5 : (j6 >= 0 && j6 != r1) ? j6 : j7;
+
+  // Combine all the indexes into a single bitfield, with 4 bits for each
+  const int m1 = (i0 & 0xF) | (i1 & 0xF) << 4 | (i2 & 0xF) << 8 | (i3 & 0xF) << 12 | (i4 & 0xF) << 16 | (i5 & 0xF) << 20 |
+                 (i6 & 0xF) << 24 | (i7 & 0xF) << 28;
+
+  // Mask to zero out negative indexes
+  const int mz = (i0 < 0 ? 0 : 0xF) | (i1 < 0 ? 0 : 0xF) << 4 | (i2 < 0 ? 0 : 0xF) << 8 | (i3 < 0 ? 0 : 0xF) << 12 |
+                 (i4 < 0 ? 0 : 0xF) << 16 | (i5 < 0 ? 0 : 0xF) << 20 | (i6 < 0 ? 0 : 0xF) << 24 | (i7 < 0 ? 0 : 0xF) << 28;
+
+  if(r0 < 0)
+    {
+      x0 = Vec4q(0);
+    }
+  else if(((m1 ^ r0 * 0x4444) & 0xCCCC & mz) == 0)
+    {
+      // i0 - i3 all from same source
+      x0 = permute4q<i0 & -13, i1 & -13, i2 & -13, i3 & -13>(select4<r0>(a, b));
+    }
+  else if((j2 < 0 || j2 == r0 || j2 == s0) && (j3 < 0 || j3 == r0 || j3 == s0))
+    {
+      // i0 - i3 all from two sources
+      const int k0 = i0 >= 0 ? i0 & 3 : i0;
+      const int k1 = (i1 >= 0 ? i1 & 3 : i1) | (j1 == s0 ? 4 : 0);
+      const int k2 = (i2 >= 0 ? i2 & 3 : i2) | (j2 == s0 ? 4 : 0);
+      const int k3 = (i3 >= 0 ? i3 & 3 : i3) | (j3 == s0 ? 4 : 0);
+      x0           = blend4q<k0, k1, k2, k3>(select4<r0>(a, b), select4<s0>(a, b));
+    }
+  else
+    {
+      // i0 - i3 from three or four different sources
+      x0 = blend4q<0, 1, 6, 7>(blend4q<i0 & -13, (i1 & -13) | 4, -0x100, -0x100>(select4<j0>(a, b), select4<j1>(a, b)),
+                               blend4q<-0x100, -0x100, i2 & -13, (i3 & -13) | 4>(select4<j2>(a, b), select4<j3>(a, b)));
+    }
+
+  if(r1 < 0)
+    {
+      x1 = Vec4q(0);
+    }
+  else if(((m1 ^ uint32_t(r1) * 0x44440000u) & 0xCCCC0000 & mz) == 0)
+    {
+      // i4 - i7 all from same source
+      x1 = permute4q<i4 & -13, i5 & -13, i6 & -13, i7 & -13>(select4<r1>(a, b));
+    }
+  else if((j6 < 0 || j6 == r1 || j6 == s1) && (j7 < 0 || j7 == r1 || j7 == s1))
+    {
+      // i4 - i7 all from two sources
+      const int k4 = i4 >= 0 ? i4 & 3 : i4;
+      const int k5 = (i5 >= 0 ? i5 & 3 : i5) | (j5 == s1 ? 4 : 0);
+      const int k6 = (i6 >= 0 ? i6 & 3 : i6) | (j6 == s1 ? 4 : 0);
+      const int k7 = (i7 >= 0 ? i7 & 3 : i7) | (j7 == s1 ? 4 : 0);
+      x1           = blend4q<k4, k5, k6, k7>(select4<r1>(a, b), select4<s1>(a, b));
+    }
+  else
+    {
+      // i4 - i7 from three or four different sources
+      x1 = blend4q<0, 1, 6, 7>(blend4q<i4 & -13, (i5 & -13) | 4, -0x100, -0x100>(select4<j4>(a, b), select4<j5>(a, b)),
+                               blend4q<-0x100, -0x100, i6 & -13, (i7 & -13) | 4>(select4<j6>(a, b), select4<j7>(a, b)));
+    }
+
+  return Vec8q(x0, x1);
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8uq blend8uq(Vec8uq const &a, Vec8uq const &b)
+{
+  return Vec8uq(blend8q<i0, i1, i2, i3, i4, i5, i6, i7>(a, b));
+}
+
+// helper function used below
+template <int n>
+static inline Vec8i select4(Vec16i const &a, Vec16i const &b)
+{
+  switch(n)
+    {
+      case 0:
+        return a.get_low();
+      case 1:
+        return a.get_high();
+      case 2:
+        return b.get_low();
+      case 3:
+        return b.get_high();
+    }
+  return Vec8i(0);
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16i blend16i(Vec16i const &a, Vec16i const &b)
+{
+  const int j0  = i0 >= 0 ? i0 / 8 : i0;
+  const int j1  = i1 >= 0 ? i1 / 8 : i1;
+  const int j2  = i2 >= 0 ? i2 / 8 : i2;
+  const int j3  = i3 >= 0 ? i3 / 8 : i3;
+  const int j4  = i4 >= 0 ? i4 / 8 : i4;
+  const int j5  = i5 >= 0 ? i5 / 8 : i5;
+  const int j6  = i6 >= 0 ? i6 / 8 : i6;
+  const int j7  = i7 >= 0 ? i7 / 8 : i7;
+  const int j8  = i8 >= 0 ? i8 / 8 : i8;
+  const int j9  = i9 >= 0 ? i9 / 8 : i9;
+  const int j10 = i10 >= 0 ? i10 / 8 : i10;
+  const int j11 = i11 >= 0 ? i11 / 8 : i11;
+  const int j12 = i12 >= 0 ? i12 / 8 : i12;
+  const int j13 = i13 >= 0 ? i13 / 8 : i13;
+  const int j14 = i14 >= 0 ? i14 / 8 : i14;
+  const int j15 = i15 >= 0 ? i15 / 8 : i15;
+
+  Vec8i x0, x1;
+
+  const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2 >= 0 ? j2 : j3 >= 0 ? j3 : j4 >= 0 ? j4 : j5 >= 0 ? j5 : j6 >= 0 ? j6 : j7;
+  const int r1 =
+      j8 >= 0 ? j8 : j9 >= 0 ? j9 : j10 >= 0 ? j10 : j11 >= 0 ? j11 : j12 >= 0 ? j12 : j13 >= 0 ? j13 : j14 >= 0 ? j14 : j15;
+  const int s0 = (j1 >= 0 && j1 != r0)
+                     ? j1
+                     : (j2 >= 0 && j2 != r0)
+                           ? j2
+                           : (j3 >= 0 && j3 != r0)
+                                 ? j3
+                                 : (j4 >= 0 && j4 != r0) ? j4 : (j5 >= 0 && j5 != r0) ? j5 : (j6 >= 0 && j6 != r0) ? j6 : j7;
+  const int s1 = (j9 >= 0 && j9 != r1)
+                     ? j9
+                     : (j10 >= 0 && j10 != r1)
+                           ? j10
+                           : (j11 >= 0 && j11 != r1)
+                                 ? j11
+                                 : (j12 >= 0 && j12 != r1) ? j12 : (j13 >= 0 && j13 != r1) ? j13 : (j14 >= 0 && j14 != r1) ? j14 : j15;
+
+  if(r0 < 0)
+    {
+      x0 = Vec8i(0);
+    }
+  else if(r0 == s0)
+    {
+      // i0 - i7 all from same source
+      x0 = permute8i<i0 & -25, i1 & -25, i2 & -25, i3 & -25, i4 & -25, i5 & -25, i6 & -25, i7 & -25>(select4<r0>(a, b));
+    }
+  else if((j2 < 0 || j2 == r0 || j2 == s0) && (j3 < 0 || j3 == r0 || j3 == s0) && (j4 < 0 || j4 == r0 || j4 == s0) &&
+          (j5 < 0 || j5 == r0 || j5 == s0) && (j6 < 0 || j6 == r0 || j6 == s0) && (j7 < 0 || j7 == r0 || j7 == s0))
+    {
+      // i0 - i7 all from two sources
+      const int k0 = i0 >= 0 ? (i0 & 7) : i0;
+      const int k1 = (i1 >= 0 ? (i1 & 7) : i1) | (j1 == s0 ? 8 : 0);
+      const int k2 = (i2 >= 0 ? (i2 & 7) : i2) | (j2 == s0 ? 8 : 0);
+      const int k3 = (i3 >= 0 ? (i3 & 7) : i3) | (j3 == s0 ? 8 : 0);
+      const int k4 = (i4 >= 0 ? (i4 & 7) : i4) | (j4 == s0 ? 8 : 0);
+      const int k5 = (i5 >= 0 ? (i5 & 7) : i5) | (j5 == s0 ? 8 : 0);
+      const int k6 = (i6 >= 0 ? (i6 & 7) : i6) | (j6 == s0 ? 8 : 0);
+      const int k7 = (i7 >= 0 ? (i7 & 7) : i7) | (j7 == s0 ? 8 : 0);
+      x0           = blend8i<k0, k1, k2, k3, k4, k5, k6, k7>(select4<r0>(a, b), select4<s0>(a, b));
+    }
+  else
+    {
+      // i0 - i7 from three or four different sources
+      const int n0 = j0 >= 0 ? j0 / 2 * 8 + 0 : j0;
+      const int n1 = j1 >= 0 ? j1 / 2 * 8 + 1 : j1;
+      const int n2 = j2 >= 0 ? j2 / 2 * 8 + 2 : j2;
+      const int n3 = j3 >= 0 ? j3 / 2 * 8 + 3 : j3;
+      const int n4 = j4 >= 0 ? j4 / 2 * 8 + 4 : j4;
+      const int n5 = j5 >= 0 ? j5 / 2 * 8 + 5 : j5;
+      const int n6 = j6 >= 0 ? j6 / 2 * 8 + 6 : j6;
+      const int n7 = j7 >= 0 ? j7 / 2 * 8 + 7 : j7;
+      x0           = blend8i<n0, n1, n2, n3, n4, n5, n6, n7>(
+          blend8i < j0 & 2 ? -256 : i0 & 15, j1 & 2 ? -256 : i1 & 15, j2 & 2 ? -256 : i2 & 15, j3 & 2 ? -256 : i3 & 15,
+          j4 & 2 ? -256 : i4 & 15, j5 & 2 ? -256 : i5 & 15, j6 & 2 ? -256 : i6 & 15,
+          j7 & 2 ? -256 : i7 & 15 > (a.get_low(), a.get_high()), blend8i < (j0 ^ 2) & 6 ? -256 : i0 & 15,
+          (j1 ^ 2) & 6 ? -256 : i1 & 15, (j2 ^ 2) & 6 ? -256 : i2 & 15, (j3 ^ 2) & 6 ? -256 : i3 & 15, (j4 ^ 2) & 6 ? -256 : i4 & 15,
+          (j5 ^ 2) & 6 ? -256 : i5 & 15, (j6 ^ 2) & 6 ? -256 : i6 & 15, (j7 ^ 2) & 6 ? -256 : i7 & 15 > (b.get_low(), b.get_high()));
+    }
+
+  if(r1 < 0)
+    {
+      x1 = Vec8i(0);
+    }
+  else if(r1 == s1)
+    {
+      // i8 - i15 all from same source
+      x1 = permute8i<i8 & -25, i9 & -25, i10 & -25, i11 & -25, i12 & -25, i13 & -25, i14 & -25, i15 & -25>(select4<r1>(a, b));
+    }
+  else if((j10 < 0 || j10 == r1 || j10 == s1) && (j11 < 0 || j11 == r1 || j11 == s1) && (j12 < 0 || j12 == r1 || j12 == s1) &&
+          (j13 < 0 || j13 == r1 || j13 == s1) && (j14 < 0 || j14 == r1 || j14 == s1) && (j15 < 0 || j15 == r1 || j15 == s1))
+    {
+      // i8 - i15 all from two sources
+      const int k8  = i8 >= 0 ? (i8 & 7) : i8;
+      const int k9  = (i9 >= 0 ? (i9 & 7) : i9) | (j9 == s1 ? 8 : 0);
+      const int k10 = (i10 >= 0 ? (i10 & 7) : i10) | (j10 == s1 ? 8 : 0);
+      const int k11 = (i11 >= 0 ? (i11 & 7) : i11) | (j11 == s1 ? 8 : 0);
+      const int k12 = (i12 >= 0 ? (i12 & 7) : i12) | (j12 == s1 ? 8 : 0);
+      const int k13 = (i13 >= 0 ? (i13 & 7) : i13) | (j13 == s1 ? 8 : 0);
+      const int k14 = (i14 >= 0 ? (i14 & 7) : i14) | (j14 == s1 ? 8 : 0);
+      const int k15 = (i15 >= 0 ? (i15 & 7) : i15) | (j15 == s1 ? 8 : 0);
+      x1            = blend8i<k8, k9, k10, k11, k12, k13, k14, k15>(select4<r1>(a, b), select4<s1>(a, b));
+    }
+  else
+    {
+      // i8 - i15 from three or four different sources
+      const int n8  = j8 >= 0 ? j8 / 2 * 8 + 0 : j8;
+      const int n9  = j9 >= 0 ? j9 / 2 * 8 + 1 : j9;
+      const int n10 = j10 >= 0 ? j10 / 2 * 8 + 2 : j10;
+      const int n11 = j11 >= 0 ? j11 / 2 * 8 + 3 : j11;
+      const int n12 = j12 >= 0 ? j12 / 2 * 8 + 4 : j12;
+      const int n13 = j13 >= 0 ? j13 / 2 * 8 + 5 : j13;
+      const int n14 = j14 >= 0 ? j14 / 2 * 8 + 6 : j14;
+      const int n15 = j15 >= 0 ? j15 / 2 * 8 + 7 : j15;
+      x1            = blend8i<n8, n9, n10, n11, n12, n13, n14, n15>(
+          blend8i < j8 & 2 ? -256 : i8 & 15, j9 & 2 ? -256 : i9 & 15, j10 & 2 ? -256 : i10 & 15, j11 & 2 ? -256 : i11 & 15,
+          j12 & 2 ? -256 : i12 & 15, j13 & 2 ? -256 : i13 & 15, j14 & 2 ? -256 : i14 & 15,
+          j15 & 2 ? -256 : i15 & 15 > (a.get_low(), a.get_high()), blend8i < (j8 ^ 2) & 6 ? -256 : i8 & 15,
+          (j9 ^ 2) & 6 ? -256 : i9 & 15, (j10 ^ 2) & 6 ? -256 : i10 & 15, (j11 ^ 2) & 6 ? -256 : i11 & 15,
+          (j12 ^ 2) & 6 ? -256 : i12 & 15, (j13 ^ 2) & 6 ? -256 : i13 & 15, (j14 ^ 2) & 6 ? -256 : i14 & 15,
+          (j15 ^ 2) & 6 ? -256 : i15 & 15 > (b.get_low(), b.get_high()));
+    }
+  return Vec16i(x0, x1);
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16ui blend16ui(Vec16ui const &a, Vec16ui const &b)
+{
+  return Vec16ui(blend16i<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15>(Vec16i(a), Vec16i(b)));
+}
+
+/*****************************************************************************
+ *
+ *          Vector lookup functions
+ *
+ ******************************************************************************
+ *
+ * These functions use vector elements as indexes into a table.
+ * The table is given as one or more vectors or as an array.
+ *
+ * This can be used for several purposes:
+ *  - table lookup
+ *  - permute or blend with variable indexes
+ *  - blend from more than two sources
+ *  - gather non-contiguous data
+ *
+ * An index out of range may produce any value - the actual value produced is
+ * implementation dependent and may be different for different instruction
+ * sets. An index out of range does not produce an error message or exception.
+ *
+ * Example:
+ * Vec8q a(2,0,0,6,4,3,5,0);                 // index a is (  2,   0,   0,   6,   4,   3,   5,   0)
+ * Vec8q b(100,101,102,103,104,105,106,107); // table b is (100, 101, 102, 103, 104, 105, 106, 107)
+ * Vec8q c;
+ * c = lookup8 (a,b);                        // c is       (102, 100, 100, 106, 104, 103, 105, 100)
+ *
+ *****************************************************************************/
+
+static inline Vec16i lookup16(Vec16i const &index, Vec16i const &table)
+{
+  int32_t tab[16];
+  table.store(tab);
+  Vec8i t0 = lookup<16>(index.get_low(), tab);
+  Vec8i t1 = lookup<16>(index.get_high(), tab);
+  return Vec16i(t0, t1);
+}
+
+template <int n>
+static inline Vec16i lookup(Vec16i const &index, void const *table)
+{
+  if(n <= 0)
+    return 0;
+  if(n <= 8)
+    {
+      Vec8i table1 = Vec8i().load(table);
+      return Vec16i(lookup8(index.get_low(), table1), lookup8(index.get_high(), table1));
+    }
+  if(n <= 16)
+    return lookup16(index, Vec16i().load(table));
+  // n > 16. Limit index
+  Vec16ui i1;
+  if((n & (n - 1)) == 0)
+    {
+      // n is a power of 2, make index modulo n
+      i1 = Vec16ui(index) & (n - 1);
+    }
+  else
+    {
+      // n is not a power of 2, limit to n-1
+      i1 = min(Vec16ui(index), n - 1);
+    }
+  int32_t const *t = (int32_t const *)table;
+  return Vec16i(t[i1[0]], t[i1[1]], t[i1[2]], t[i1[3]], t[i1[4]], t[i1[5]], t[i1[6]], t[i1[7]], t[i1[8]], t[i1[9]], t[i1[10]],
+                t[i1[11]], t[i1[12]], t[i1[13]], t[i1[14]], t[i1[15]]);
+}
+
+static inline Vec8q lookup8(Vec8q const &index, Vec8q const &table)
+{
+  int64_t tab[8];
+  table.store(tab);
+  Vec4q t0 = lookup<8>(index.get_low(), tab);
+  Vec4q t1 = lookup<8>(index.get_high(), tab);
+  return Vec8q(t0, t1);
+}
+
+template <int n>
+static inline Vec8q lookup(Vec8q const &index, void const *table)
+{
+  if(n <= 0)
+    return 0;
+  if(n <= 4)
+    {
+      Vec4q table1 = Vec4q().load(table);
+      return Vec8q(lookup4(index.get_low(), table1), lookup4(index.get_high(), table1));
+    }
+  if(n <= 8)
+    {
+      return lookup8(index, Vec8q().load(table));
+    }
+  // n > 8. Limit index
+  Vec8uq i1;
+  if((n & (n - 1)) == 0)
+    {
+      // n is a power of 2, make index modulo n
+      i1 = Vec8uq(index) & (n - 1);
+    }
+  else
+    {
+      // n is not a power of 2, limit to n-1
+      i1 = min(Vec8uq(index), n - 1);
+    }
+  int64_t const *t = (int64_t const *)table;
+  return Vec8q(t[i1[0]], t[i1[1]], t[i1[2]], t[i1[3]], t[i1[4]], t[i1[5]], t[i1[6]], t[i1[7]]);
+}
+
+/*****************************************************************************
+ *
+ *          Vector scatter functions
+ *
+ ******************************************************************************
+ *
+ * These functions write the elements of a vector to arbitrary positions in an
+ * array in memory. Each vector element is written to an array position
+ * determined by an index. An element is not written if the corresponding
+ * index is out of range.
+ * The indexes can be specified as constant template parameters or as an
+ * integer vector.
+ *
+ * The scatter functions are useful if the data are distributed in a sparse
+ * manner into the array. If the array is dense then it is more efficient
+ * to permute the data into the right positions and then write the whole
+ * permuted vector into the array.
+ *
+ * Example:
+ * Vec8q a(10,11,12,13,14,15,16,17);
+ * int64_t b[16] = {0};
+ * scatter<0,2,14,10,1,-1,5,9>(a,b);
+ * // Now, b = {10,14,11,0,0,16,0,0,0,17,13,0,0,0,12,0}
+ *
+ *****************************************************************************/
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline void scatter(Vec16i const &data, void *array)
+{
+  int32_t *arr        = (int32_t *)array;
+  const int index[16] = {i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15};
+  for(int i = 0; i < 16; i++)
+    {
+      if(index[i] >= 0)
+        arr[index[i]] = data[i];
+    }
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline void scatter(Vec8q const &data, void *array)
+{
+  int64_t *arr       = (int64_t *)array;
+  const int index[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
+  for(int i = 0; i < 8; i++)
+    {
+      if(index[i] >= 0)
+        arr[index[i]] = data[i];
+    }
+}
+
+static inline void scatter(Vec16i const &index, uint32_t limit, Vec16i const &data, void *array)
+{
+  int32_t *arr = (int32_t *)array;
+  for(int i = 0; i < 16; i++)
+    {
+      if(uint32_t(index[i]) < limit)
+        arr[index[i]] = data[i];
+    }
+}
+
+static inline void scatter(Vec8q const &index, uint32_t limit, Vec8q const &data, void *array)
+{
+  int64_t *arr = (int64_t *)array;
+  for(int i = 0; i < 8; i++)
+    {
+      if(uint64_t(index[i]) < uint64_t(limit))
+        arr[index[i]] = data[i];
+    }
+}
+
+static inline void scatter(Vec8i const &index, uint32_t limit, Vec8q const &data, void *array)
+{
+  int64_t *arr = (int64_t *)array;
+  for(int i = 0; i < 8; i++)
+    {
+      if(uint32_t(index[i]) < limit)
+        arr[index[i]] = data[i];
+    }
+}
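+
+// Example (illustrative) of the variable-index scatter with a limit; indexes outside
+// the range [0, limit) are skipped:
+// Vec8q   data(10,11,12,13,14,15,16,17);
+// Vec8q   idx (0, 2, 4, 6, 8, 20, -1, 3);   // 20 and -1 are out of range and ignored
+// int64_t arr[10] = {0};
+// scatter(idx, 10, data, arr);              // arr becomes {10,0,11,17,12,0,13,0,14,0}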
+
+/*****************************************************************************
+ *
+ *          Gather functions with fixed indexes
+ *
+ *****************************************************************************/
+// Load elements from array a with indices i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14,
+          int i15>
+static inline Vec16i gather16i(void const *a)
+{
+  Static_error_check<(i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7 | i8 | i9 | i10 | i11 | i12 | i13 | i14 | i15) >= 0>
+      Negative_array_index;  // Error message if index is negative
+  // find smallest and biggest index, using only compile-time constant expressions
+  const int i01min    = i0 < i1 ? i0 : i1;
+  const int i23min    = i2 < i3 ? i2 : i3;
+  const int i45min    = i4 < i5 ? i4 : i5;
+  const int i67min    = i6 < i7 ? i6 : i7;
+  const int i89min    = i8 < i9 ? i8 : i9;
+  const int i1011min  = i10 < i11 ? i10 : i11;
+  const int i1213min  = i12 < i13 ? i12 : i13;
+  const int i1415min  = i14 < i15 ? i14 : i15;
+  const int i0_3min   = i01min < i23min ? i01min : i23min;
+  const int i4_7min   = i45min < i67min ? i45min : i67min;
+  const int i8_11min  = i89min < i1011min ? i89min : i1011min;
+  const int i12_15min = i1213min < i1415min ? i1213min : i1415min;
+  const int i0_7min   = i0_3min < i4_7min ? i0_3min : i4_7min;
+  const int i8_15min  = i8_11min < i12_15min ? i8_11min : i12_15min;
+  const int imin      = i0_7min < i8_15min ? i0_7min : i8_15min;
+  const int i01max    = i0 > i1 ? i0 : i1;
+  const int i23max    = i2 > i3 ? i2 : i3;
+  const int i45max    = i4 > i5 ? i4 : i5;
+  const int i67max    = i6 > i7 ? i6 : i7;
+  const int i89max    = i8 > i9 ? i8 : i9;
+  const int i1011max  = i10 > i11 ? i10 : i11;
+  const int i1213max  = i12 > i13 ? i12 : i13;
+  const int i1415max  = i14 > i15 ? i14 : i15;
+  const int i0_3max   = i01max > i23max ? i01max : i23max;
+  const int i4_7max   = i45max > i67max ? i45max : i67max;
+  const int i8_11max  = i89max > i1011max ? i89max : i1011max;
+  const int i12_15max = i1213max > i1415max ? i1213max : i1415max;
+  const int i0_7max   = i0_3max > i4_7max ? i0_3max : i4_7max;
+  const int i8_15max  = i8_11max > i12_15max ? i8_11max : i12_15max;
+  const int imax      = i0_7max > i8_15max ? i0_7max : i8_15max;
+  if(imax - imin <= 15)
+    {
+      // load one contiguous block and permute
+      if(imax > 15)
+        {
+          // make sure we don't read past the end of the array
+          Vec16i b = Vec16i().load((int32_t const *)a + imax - 15);
+          return permute16i<i0 - imax + 15, i1 - imax + 15, i2 - imax + 15, i3 - imax + 15, i4 - imax + 15, i5 - imax + 15,
+                            i6 - imax + 15, i7 - imax + 15, i8 - imax + 15, i9 - imax + 15, i10 - imax + 15, i11 - imax + 15,
+                            i12 - imax + 15, i13 - imax + 15, i14 - imax + 15, i15 - imax + 15>(b);
+        }
+      else
+        {
+          Vec16i b = Vec16i().load((int32_t const *)a + imin);
+          return permute16i<i0 - imin, i1 - imin, i2 - imin, i3 - imin, i4 - imin, i5 - imin, i6 - imin, i7 - imin, i8 - imin,
+                            i9 - imin, i10 - imin, i11 - imin, i12 - imin, i13 - imin, i14 - imin, i15 - imin>(b);
+        }
+    }
+  if((i0 < imin + 16 || i0 > imax - 16) && (i1 < imin + 16 || i1 > imax - 16) && (i2 < imin + 16 || i2 > imax - 16) &&
+     (i3 < imin + 16 || i3 > imax - 16) && (i4 < imin + 16 || i4 > imax - 16) && (i5 < imin + 16 || i5 > imax - 16) &&
+     (i6 < imin + 16 || i6 > imax - 16) && (i7 < imin + 16 || i7 > imax - 16) && (i8 < imin + 16 || i8 > imax - 16) &&
+     (i9 < imin + 16 || i9 > imax - 16) && (i10 < imin + 16 || i10 > imax - 16) && (i11 < imin + 16 || i11 > imax - 16) &&
+     (i12 < imin + 16 || i12 > imax - 16) && (i13 < imin + 16 || i13 > imax - 16) && (i14 < imin + 16 || i14 > imax - 16) &&
+     (i15 < imin + 16 || i15 > imax - 16))
+    {
+      // load two contiguous blocks and blend
+      Vec16i b      = Vec16i().load((int32_t const *)a + imin);
+      Vec16i c      = Vec16i().load((int32_t const *)a + imax - 15);
+      const int j0  = i0 < imin + 16 ? i0 - imin : 31 - imax + i0;
+      const int j1  = i1 < imin + 16 ? i1 - imin : 31 - imax + i1;
+      const int j2  = i2 < imin + 16 ? i2 - imin : 31 - imax + i2;
+      const int j3  = i3 < imin + 16 ? i3 - imin : 31 - imax + i3;
+      const int j4  = i4 < imin + 16 ? i4 - imin : 31 - imax + i4;
+      const int j5  = i5 < imin + 16 ? i5 - imin : 31 - imax + i5;
+      const int j6  = i6 < imin + 16 ? i6 - imin : 31 - imax + i6;
+      const int j7  = i7 < imin + 16 ? i7 - imin : 31 - imax + i7;
+      const int j8  = i8 < imin + 16 ? i8 - imin : 31 - imax + i8;
+      const int j9  = i9 < imin + 16 ? i9 - imin : 31 - imax + i9;
+      const int j10 = i10 < imin + 16 ? i10 - imin : 31 - imax + i10;
+      const int j11 = i11 < imin + 16 ? i11 - imin : 31 - imax + i11;
+      const int j12 = i12 < imin + 16 ? i12 - imin : 31 - imax + i12;
+      const int j13 = i13 < imin + 16 ? i13 - imin : 31 - imax + i13;
+      const int j14 = i14 < imin + 16 ? i14 - imin : 31 - imax + i14;
+      const int j15 = i15 < imin + 16 ? i15 - imin : 31 - imax + i15;
+      return blend16i<j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15>(b, c);
+    }
+  // use lookup function
+  return lookup<imax + 1>(Vec16i(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15), a);
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8q gather8q(void const *a)
+{
+  Static_error_check<(i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) >= 0> Negative_array_index;  // Error message if index is negative
+
+  const int i01min   = i0 < i1 ? i0 : i1;
+  const int i23min   = i2 < i3 ? i2 : i3;
+  const int i45min   = i4 < i5 ? i4 : i5;
+  const int i67min   = i6 < i7 ? i6 : i7;
+  const int i0123min = i01min < i23min ? i01min : i23min;
+  const int i4567min = i45min < i67min ? i45min : i67min;
+  const int imin     = i0123min < i4567min ? i0123min : i4567min;
+  const int i01max   = i0 > i1 ? i0 : i1;
+  const int i23max   = i2 > i3 ? i2 : i3;
+  const int i45max   = i4 > i5 ? i4 : i5;
+  const int i67max   = i6 > i7 ? i6 : i7;
+  const int i0123max = i01max > i23max ? i01max : i23max;
+  const int i4567max = i45max > i67max ? i45max : i67max;
+  const int imax     = i0123max > i4567max ? i0123max : i4567max;
+  if(imax - imin <= 7)
+    {
+      // load one contiguous block and permute
+      if(imax > 7)
+        {
+          // make sure we don't read past the end of the array
+          Vec8q b = Vec8q().load((int64_t const *)a + imax - 7);
+          return permute8q<i0 - imax + 7, i1 - imax + 7, i2 - imax + 7, i3 - imax + 7, i4 - imax + 7, i5 - imax + 7, i6 - imax + 7,
+                           i7 - imax + 7>(b);
+        }
+      else
+        {
+          Vec8q b = Vec8q().load((int64_t const *)a + imin);
+          return permute8q<i0 - imin, i1 - imin, i2 - imin, i3 - imin, i4 - imin, i5 - imin, i6 - imin, i7 - imin>(b);
+        }
+    }
+  if((i0 < imin + 8 || i0 > imax - 8) && (i1 < imin + 8 || i1 > imax - 8) && (i2 < imin + 8 || i2 > imax - 8) &&
+     (i3 < imin + 8 || i3 > imax - 8) && (i4 < imin + 8 || i4 > imax - 8) && (i5 < imin + 8 || i5 > imax - 8) &&
+     (i6 < imin + 8 || i6 > imax - 8) && (i7 < imin + 8 || i7 > imax - 8))
+    {
+      // load two contiguous blocks and blend
+      Vec8q b      = Vec8q().load((int64_t const *)a + imin);
+      Vec8q c      = Vec8q().load((int64_t const *)a + imax - 7);
+      const int j0 = i0 < imin + 8 ? i0 - imin : 15 - imax + i0;
+      const int j1 = i1 < imin + 8 ? i1 - imin : 15 - imax + i1;
+      const int j2 = i2 < imin + 8 ? i2 - imin : 15 - imax + i2;
+      const int j3 = i3 < imin + 8 ? i3 - imin : 15 - imax + i3;
+      const int j4 = i4 < imin + 8 ? i4 - imin : 15 - imax + i4;
+      const int j5 = i5 < imin + 8 ? i5 - imin : 15 - imax + i5;
+      const int j6 = i6 < imin + 8 ? i6 - imin : 15 - imax + i6;
+      const int j7 = i7 < imin + 8 ? i7 - imin : 15 - imax + i7;
+      return blend8q<j0, j1, j2, j3, j4, j5, j6, j7>(b, c);
+    }
+  // use lookup function
+  return lookup<imax + 1>(Vec8q(i0, i1, i2, i3, i4, i5, i6, i7), a);
+}
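+
+// Example (illustrative) of gather8q with compile-time constant indexes:
+// int64_t tbl[8] = {100,101,102,103,104,105,106,107};
+// Vec8q g = gather8q<7,6,5,4,3,2,1,0>(tbl);   // g is (107,106,105,104,103,102,101,100)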
+
+/*****************************************************************************
+ *
+ *          Functions for conversion between integer sizes
+ *
+ *****************************************************************************/
+
+// Extend 16-bit integers to 32-bit integers, signed and unsigned
+
+// Function extend_to_int : extends Vec16s to Vec16i with sign extension
+static inline Vec16i extend_to_int(Vec16s const &a) { return Vec16i(extend_low(a), extend_high(a)); }
+
+// Function extend_to_int : extends Vec16us to Vec16ui with zero extension
+static inline Vec16ui extend_to_int(Vec16us const &a) { return Vec16i(extend_low(a), extend_high(a)); }
+
+// Function extend_to_int : extends Vec16c to Vec16i with sign extension
+static inline Vec16i extend_to_int(Vec16c const &a) { return extend_to_int(Vec16s(extend_low(a), extend_high(a))); }
+
+// Function extend_to_int : extends Vec16uc to Vec16ui with zero extension
+static inline Vec16ui extend_to_int(Vec16uc const &a) { return extend_to_int(Vec16s(extend_low(a), extend_high(a))); }
+
+// Extend 32-bit integers to 64-bit integers, signed and unsigned
+
+// Function extend_low : extends the low 8 elements to 64 bits with sign extension
+static inline Vec8q extend_low(Vec16i const &a) { return Vec8q(extend_low(a.get_low()), extend_high(a.get_low())); }
+
+// Function extend_high : extends the high 8 elements to 64 bits with sign extension
+static inline Vec8q extend_high(Vec16i const &a) { return Vec8q(extend_low(a.get_high()), extend_high(a.get_high())); }
+
+// Function extend_low : extends the low 8 elements to 64 bits with zero extension
+static inline Vec8uq extend_low(Vec16ui const &a) { return Vec8q(extend_low(a.get_low()), extend_high(a.get_low())); }
+
+// Function extend_high : extends the high 8 elements to 64 bits with zero extension
+static inline Vec8uq extend_high(Vec16ui const &a) { return Vec8q(extend_low(a.get_high()), extend_high(a.get_high())); }
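+
+// Example (illustrative) of extending 32-bit elements to 64 bits:
+// Vec16i a(-1,2,-3,4,-5,6,-7,8,-9,10,-11,12,-13,14,-15,16);
+// Vec8q  lo = extend_low(a);    // (-1,2,-3,4,-5,6,-7,8) as 64-bit signed integers
+// Vec8q  hi = extend_high(a);   // (-9,10,-11,12,-13,14,-15,16) as 64-bit signed integers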
+
+// Compress 32-bit integers to 8-bit integers, signed and unsigned, with and without saturation
+
+// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers
+// Overflow wraps around
+static inline Vec16c compress_to_int8(Vec16i const &a)
+{
+  Vec16s b = compress(a.get_low(), a.get_high());
+  Vec16c c = compress(b.get_low(), b.get_high());
+  return c;
+}
+
+static inline Vec16s compress_to_int16(Vec16i const &a) { return compress(a.get_low(), a.get_high()); }
+
+// with signed saturation
+static inline Vec16c compress_to_int8_saturated(Vec16i const &a)
+{
+  Vec16s b = compress_saturated(a.get_low(), a.get_high());
+  Vec16c c = compress_saturated(b.get_low(), b.get_high());
+  return c;
+}
+
+static inline Vec16s compress_to_int16_saturated(Vec16i const &a) { return compress_saturated(a.get_low(), a.get_high()); }
+
+// with unsigned saturation
+static inline Vec16uc compress_to_int8_saturated(Vec16ui const &a)
+{
+  Vec16us b = compress_saturated(a.get_low(), a.get_high());
+  Vec16uc c = compress_saturated(b.get_low(), b.get_high());
+  return c;
+}
+
+static inline Vec16us compress_to_int16_saturated(Vec16ui const &a) { return compress_saturated(a.get_low(), a.get_high()); }
+
+// Compress 64-bit integers to 32-bit integers, signed and unsigned, with and without saturation
+
+// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers
+// Overflow wraps around
+static inline Vec16i compress(Vec8q const &low, Vec8q const &high)
+{
+  return Vec16i(compress(low.get_low(), low.get_high()), compress(high.get_low(), high.get_high()));
+}
+
+// Function compress_saturated : packs two vectors of 64-bit integers into one vector of 32-bit integers
+// Signed, with saturation
+static inline Vec16i compress_saturated(Vec8q const &low, Vec8q const &high)
+{
+  return Vec16i(compress_saturated(low.get_low(), low.get_high()), compress_saturated(high.get_low(), high.get_high()));
+}
+
+// Function compress_saturated : packs two vectors of 64-bit integers into one vector of 32-bit integers
+// Unsigned, with saturation
+static inline Vec16ui compress_saturated(Vec8uq const &low, Vec8uq const &high)
+{
+  return Vec16ui(compress_saturated(low.get_low(), low.get_high()), compress_saturated(high.get_low(), high.get_high()));
+}
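+
+// Example (illustrative) of compressing two Vec8q into one Vec16i:
+// Vec8q  lo(0,1,2,3,4,5,6,7);
+// Vec8q  hi(8,9,10,11,12,13,14,15);
+// Vec16i c = compress(lo, hi);   // c is (0,1,...,15); values outside 32-bit range would wrap around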
+
+/*****************************************************************************
+ *
+ *          Integer division operators
+ *
+ *          Please see the file vectori128.h for explanation.
+ *
+ *****************************************************************************/
+
+// vector operator / : divide each element by divisor
+
+// vector operator / : divide all elements by same integer
+static inline Vec16i operator/(Vec16i const &a, Divisor_i const &d) { return Vec16i(a.get_low() / d, a.get_high() / d); }
+
+// vector operator /= : divide
+static inline Vec16i &operator/=(Vec16i &a, Divisor_i const &d)
+{
+  a = a / d;
+  return a;
+}
+
+// vector operator / : divide all elements by same integer
+static inline Vec16ui operator/(Vec16ui const &a, Divisor_ui const &d) { return Vec16ui(a.get_low() / d, a.get_high() / d); }
+
+// vector operator /= : divide
+static inline Vec16ui &operator/=(Vec16ui &a, Divisor_ui const &d)
+{
+  a = a / d;
+  return a;
+}
+
+/*****************************************************************************
+ *
+ *          Integer division 2: divisor is a compile-time constant
+ *
+ *****************************************************************************/
+
+// Divide Vec16i by compile-time constant
+template <int32_t d>
+static inline Vec16i divide_by_i(Vec16i const &a)
+{
+  return Vec16i(divide_by_i<d>(a.get_low()), divide_by_i<d>(a.get_high()));
+}
+
+// define Vec16i a / const_int(d)
+template <int32_t d>
+static inline Vec16i operator/(Vec16i const &a, Const_int_t<d>)
+{
+  return divide_by_i<d>(a);
+}
+
+// define Vec16i a / const_uint(d)
+template <uint32_t d>
+static inline Vec16i operator/(Vec16i const &a, Const_uint_t<d>)
+{
+  Static_error_check<(d < 0x80000000u)> Error_overflow_dividing_signed_by_unsigned;  // Error: dividing signed by overflowing unsigned
+  return divide_by_i<int32_t(d)>(a);                                                 // signed divide
+}
+
+// vector operator /= : divide
+template <int32_t d>
+static inline Vec16i &operator/=(Vec16i &a, Const_int_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec16i &operator/=(Vec16i &a, Const_uint_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// Divide Vec16ui by compile-time constant
+template <uint32_t d>
+static inline Vec16ui divide_by_ui(Vec16ui const &a)
+{
+  return Vec16ui(divide_by_ui<d>(a.get_low()), divide_by_ui<d>(a.get_high()));
+}
+
+// define Vec16ui a / const_uint(d)
+template <uint32_t d>
+static inline Vec16ui operator/(Vec16ui const &a, Const_uint_t<d>)
+{
+  return divide_by_ui<d>(a);
+}
+
+// define Vec16ui a / const_int(d)
+template <int32_t d>
+static inline Vec16ui operator/(Vec16ui const &a, Const_int_t<d>)
+{
+  Static_error_check<(d >= 0)> Error_dividing_unsigned_by_negative;  // Error: dividing unsigned by negative is ambiguous
+  return divide_by_ui<d>(a);                                         // unsigned divide
+}
+
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec16ui &operator/=(Vec16ui &a, Const_uint_t<d> b)
+{
+  a = a / b;
+  return a;
+}
+
+// vector operator /= : divide
+template <int32_t d>
+static inline Vec16ui &operator/=(Vec16ui &a, Const_int_t<d> b)
+{
+  a = a / b;
+  return a;
+}
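+
+// Example (illustrative) of division by a compile-time constant, using the
+// const_int()/const_uint() helpers declared in vectori128.h:
+// Vec16i a(0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45);
+// Vec16i b = a / const_int(3);    // b is (0,1,2,...,15)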
+
+/*****************************************************************************
+ *
+ *          Horizontal scan functions
+ *
+ *****************************************************************************/
+
+// Get index to the first element that is true. Return -1 if all are false
+static inline int horizontal_find_first(Vec16ib const &x)
+{
+  int a1 = horizontal_find_first(x.get_low());
+  if(a1 >= 0)
+    return a1;
+  int a2 = horizontal_find_first(x.get_high());
+  if(a2 < 0)
+    return a2;
+  return a2 + 8;
+}
+
+static inline int horizontal_find_first(Vec8qb const &x)
+{
+  int a1 = horizontal_find_first(x.get_low());
+  if(a1 >= 0)
+    return a1;
+  int a2 = horizontal_find_first(x.get_high());
+  if(a2 < 0)
+    return a2;
+  return a2 + 4;
+}
+
+// count the number of true elements
+static inline uint32_t horizontal_count(Vec16ib const &x) { return horizontal_count(x.get_low()) + horizontal_count(x.get_high()); }
+
+static inline uint32_t horizontal_count(Vec8qb const &x) { return horizontal_count(x.get_low()) + horizontal_count(x.get_high()); }
+
+/*****************************************************************************
+ *
+ *          Boolean <-> bitfield conversion functions
+ *
+ *****************************************************************************/
+
+// to_bits: convert to integer bitfield
+static inline uint16_t to_bits(Vec16b const &a) { return to_bits(a.get_low()) | ((uint16_t)to_bits(a.get_high()) << 8); }
+
+// to_bits: convert to integer bitfield
+static inline uint16_t to_bits(Vec16ib const &a) { return to_bits(a.get_low()) | ((uint16_t)to_bits(a.get_high()) << 8); }
+
+// to_Vec16ib: convert integer bitfield to boolean vector
+static inline Vec16ib to_Vec16ib(uint16_t const &x) { return Vec16i(to_Vec8ib(uint8_t(x)), to_Vec8ib(uint8_t(x >> 8))); }
+
+// to_bits: convert to integer bitfield
+static inline uint8_t to_bits(Vec8b const &a) { return to_bits(a.get_low()) | (to_bits(a.get_high()) << 4); }
+
+// to_Vec8qb: convert integer bitfield to boolean vector
+static inline Vec8qb to_Vec8qb(uint8_t x) { return Vec8q(to_Vec4qb(x), to_Vec4qb(x >> 4)); }
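+
+// Example (illustrative): element i of the boolean vector maps to bit i of the bitfield:
+// Vec16i   a(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
+// Vec16ib  m    = a > Vec16i(7);     // true for elements 8..15
+// uint16_t bits = to_bits(m);        // bits is 0xFF00
+// Vec16ib  m2   = to_Vec16ib(bits);  // reconstructs the same boolean vector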
+
+#ifdef VCL_NAMESPACE
+}
+#endif
+
+#endif  // VECTORI512_H
diff --git a/src/vectorclass/vectormath_common.h b/src/vectorclass/vectormath_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..562350876aa47eb039ab380083a4a464572ccfe5
--- /dev/null
+++ b/src/vectorclass/vectormath_common.h
@@ -0,0 +1,340 @@
+/***************************  vectormath_common.h   ****************************
+ * Author:        Agner Fog
+ * Date created:  2014-04-18
+ * Last modified: 2016-11-25
+ * Version:       1.25
+ * Project:       vector classes
+ * Description:
+ * Header file containing common code for inline version of mathematical functions.
+ *
+ * Theory, methods and inspiration based partially on these sources:
+ * > Moshier, Stephen Lloyd Baluk: Methods and programs for mathematical functions.
+ *   Ellis Horwood, 1989.
+ * > VDT library developed at CERN by Danilo Piparo, Thomas Hauth and
+ *   Vincenzo Innocente, 2012, https://svnweb.cern.ch/trac/vdt
+ * > Cephes math library by Stephen L. Moshier 1992,
+ *   http://www.netlib.org/cephes/
+ *
+ * Calculation methods:
+ * Some functions use Padé approximations f(x) = P(x)/Q(x)
+ * Most single-precision functions use Taylor expansions
+ *
+ * For detailed instructions, see VectorClass.pdf
+ *
+ * (c) Copyright 2014-2016 GNU General Public License http://www.gnu.org/licenses
+ ******************************************************************************/
+
+#ifndef VECTORMATH_COMMON_H
+#define VECTORMATH_COMMON_H 1
+
+#ifdef VECTORMATH_LIB_H
+#error conflicting header files: vectormath_lib.h for external math functions, other vectormath_xxx.h for inline math functions
+#endif
+
+#include <math.h>
+#include "vectorclass.h"
+
+/******************************************************************************
+               define mathematical constants
+******************************************************************************/
+#define VM_PI 3.14159265358979323846                // pi
+#define VM_PI_2 1.57079632679489661923              // pi / 2
+#define VM_PI_4 0.785398163397448309616             // pi / 4
+#define VM_SQRT2 1.41421356237309504880             // sqrt(2)
+#define VM_LOG2E 1.44269504088896340736             // 1/log(2)
+#define VM_LOG10E 0.434294481903251827651           // 1/log(10)
+#define VM_LOG210 3.321928094887362347808           // log2(10)
+#define VM_LN2 0.693147180559945309417              // log(2)
+#define VM_LN10 2.30258509299404568402              // log(10)
+#define VM_SMALLEST_NORMAL 2.2250738585072014E-308  // smallest normal number, double
+#define VM_SMALLEST_NORMALF 1.17549435E-38f         // smallest normal number, float
+
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE
+{
+#endif
+
+/******************************************************************************
+      templates for producing infinite and nan in desired vector type
+******************************************************************************/
+template <class VTYPE>
+static inline VTYPE infinite_vec();
+
+template <>
+inline Vec2d infinite_vec<Vec2d>()
+{
+  return infinite2d();
+}
+
+template <>
+inline Vec4f infinite_vec<Vec4f>()
+{
+  return infinite4f();
+}
+
+#if MAX_VECTOR_SIZE >= 256
+
+template <>
+inline Vec4d infinite_vec<Vec4d>()
+{
+  return infinite4d();
+}
+
+template <>
+inline Vec8f infinite_vec<Vec8f>()
+{
+  return infinite8f();
+}
+
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+
+template <>
+inline Vec8d infinite_vec<Vec8d>()
+{
+  return infinite8d();
+}
+
+template <>
+inline Vec16f infinite_vec<Vec16f>()
+{
+  return infinite16f();
+}
+
+#endif  // MAX_VECTOR_SIZE >= 512
+
+// template for producing quiet NAN
+template <class VTYPE>
+static inline VTYPE nan_vec(int n = 0x100);
+
+template <>
+inline Vec2d nan_vec<Vec2d>(int n)
+{
+  return nan2d(n);
+}
+
+template <>
+inline Vec4f nan_vec<Vec4f>(int n)
+{
+  return nan4f(n);
+}
+
+#if MAX_VECTOR_SIZE >= 256
+
+template <>
+inline Vec4d nan_vec<Vec4d>(int n)
+{
+  return nan4d(n);
+}
+
+template <>
+inline Vec8f nan_vec<Vec8f>(int n)
+{
+  return nan8f(n);
+}
+
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+
+template <>
+inline Vec8d nan_vec<Vec8d>(int n)
+{
+  return nan8d(n);
+}
+
+template <>
+inline Vec16f nan_vec<Vec16f>(int n)
+{
+  return nan16f(n);
+}
+
+#endif  // MAX_VECTOR_SIZE >= 512
+
+// Define NAN trace values
+#define NAN_LOG 0x101  // logarithm for x<0
+#define NAN_POW 0x102  // negative number raised to non-integer power
+#define NAN_HYP 0x104  // acosh for x<1 and atanh for abs(x)>1
+
+/******************************************************************************
+                  templates for polynomials
+Using Estrin's scheme to make shorter dependency chains and use FMA, starting
+longest dependency chains first.
+******************************************************************************/
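+// For instance, polynomial_5 below evaluates
+//   P(x) = (c0 + c1*x) + (c2 + c3*x)*x^2 + (c4 + c5*x)*x^4
+// so the three inner mul_add pairs are independent of each other and only the
+// final combination depends on all of them.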
+
+// template <typedef VECTYPE, typedef CTYPE>
+template <class VTYPE, class CTYPE>
+static inline VTYPE polynomial_2(VTYPE const& x, CTYPE c0, CTYPE c1, CTYPE c2)
+{
+  // calculates polynomial c2*x^2 + c1*x + c0
+  // VTYPE may be a vector type, CTYPE is a scalar type
+  VTYPE x2 = x * x;
+  // return = x2 * c2 + (x * c1 + c0);
+  return mul_add(x2, c2, mul_add(x, c1, c0));
+}
+
+template <class VTYPE, class CTYPE>
+static inline VTYPE polynomial_3(VTYPE const& x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3)
+{
+  // calculates polynomial c3*x^3 + c2*x^2 + c1*x + c0
+  // VTYPE may be a vector type, CTYPE is a scalar type
+  VTYPE x2 = x * x;
+  // return (c2 + c3*x)*x2 + (c1*x + c0);
+  return mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0));
+}
+
+template <class VTYPE, class CTYPE>
+static inline VTYPE polynomial_4(VTYPE const& x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4)
+{
+  // calculates polynomial c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
+  // VTYPE may be a vector type, CTYPE is a scalar type
+  VTYPE x2 = x * x;
+  VTYPE x4 = x2 * x2;
+  // return (c2+c3*x)*x2 + ((c0+c1*x) + c4*x4);
+  return mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0) + c4 * x4);
+}
+
+template <class VTYPE, class CTYPE>
+static inline VTYPE polynomial_4n(VTYPE const& x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3)
+{
+  // calculates polynomial 1*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
+  // VTYPE may be a vector type, CTYPE is a scalar type
+  VTYPE x2 = x * x;
+  VTYPE x4 = x2 * x2;
+  // return (c2+c3*x)*x2 + ((c0+c1*x) + x4);
+  return mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0) + x4);
+}
+
+template <class VTYPE, class CTYPE>
+static inline VTYPE polynomial_5(VTYPE const& x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5)
+{
+  // calculates polynomial c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
+  // VTYPE may be a vector type, CTYPE is a scalar type
+  VTYPE x2 = x * x;
+  VTYPE x4 = x2 * x2;
+  // return (c2+c3*x)*x2 + ((c4+c5*x)*x4 + (c0+c1*x));
+  return mul_add(mul_add(c3, x, c2), x2, mul_add(mul_add(c5, x, c4), x4, mul_add(c1, x, c0)));
+}
+
+template <class VTYPE, class CTYPE>
+static inline VTYPE polynomial_5n(VTYPE const& x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4)
+{
+  // calculates polynomial 1*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
+  // VTYPE may be a vector type, CTYPE is a scalar type
+  VTYPE x2 = x * x;
+  VTYPE x4 = x2 * x2;
+  // return (c2+c3*x)*x2 + ((c4+x)*x4 + (c0+c1*x));
+  return mul_add(mul_add(c3, x, c2), x2, mul_add(c4 + x, x4, mul_add(c1, x, c0)));
+}
+
+template <class VTYPE, class CTYPE>
+static inline VTYPE polynomial_6(VTYPE const& x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6)
+{
+  // calculates polynomial c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
+  // VTYPE may be a vector type, CTYPE is a scalar type
+  VTYPE x2 = x * x;
+  VTYPE x4 = x2 * x2;
+  // return  (c4+c5*x+c6*x2)*x4 + ((c2+c3*x)*x2 + (c0+c1*x));
+  return mul_add(mul_add(c6, x2, mul_add(c5, x, c4)), x4, mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0)));
+}
+
+template <class VTYPE, class CTYPE>
+static inline VTYPE polynomial_6n(VTYPE const& x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5)
+{
+  // calculates polynomial 1*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
+  // VTYPE may be a vector type, CTYPE is a scalar type
+  VTYPE x2 = x * x;
+  VTYPE x4 = x2 * x2;
+  // return  (c4+c5*x+x2)*x4 + ((c2+c3*x)*x2 + (c0+c1*x));
+  return mul_add(mul_add(c5, x, c4 + x2), x4, mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0)));
+}
+
+template <class VTYPE, class CTYPE>
+static inline VTYPE polynomial_7(VTYPE const& x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7)
+{
+  // calculates polynomial c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
+  // VTYPE may be a vector type, CTYPE is a scalar type
+  VTYPE x2 = x * x;
+  VTYPE x4 = x2 * x2;
+  // return  ((c6+c7*x)*x2 + (c4+c5*x))*x4 + ((c2+c3*x)*x2 + (c0+c1*x));
+  return mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0)));
+}
+
+template <class VTYPE, class CTYPE>
+static inline VTYPE polynomial_8(VTYPE const& x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7,
+                                 CTYPE c8)
+{
+  // calculates polynomial c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
+  // VTYPE may be a vector type, CTYPE is a scalar type
+  VTYPE x2 = x * x;
+  VTYPE x4 = x2 * x2;
+  VTYPE x8 = x4 * x4;
+  // return  ((c6+c7*x)*x2 + (c4+c5*x))*x4 + (c8*x8 + (c2+c3*x)*x2 + (c0+c1*x));
+  return mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4,
+                 mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0) + c8 * x8));
+}
+
+template <class VTYPE, class CTYPE>
+static inline VTYPE polynomial_9(VTYPE const& x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7,
+                                 CTYPE c8, CTYPE c9)
+{
+  // calculates polynomial c9*x^9 + c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
+  // VTYPE may be a vector type, CTYPE is a scalar type
+  VTYPE x2 = x * x;
+  VTYPE x4 = x2 * x2;
+  VTYPE x8 = x4 * x4;
+  // return  (((c6+c7*x)*x2 + (c4+c5*x))*x4 + (c8+c9*x)*x8) + ((c2+c3*x)*x2 + (c0+c1*x));
+  return mul_add(
+      mul_add(c9, x, c8), x8,
+      mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0))));
+}
+
+template <class VTYPE, class CTYPE>
+static inline VTYPE polynomial_10(VTYPE const& x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7,
+                                  CTYPE c8, CTYPE c9, CTYPE c10)
+{
+  // calculates polynomial c10*x^10 + c9*x^9 + c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
+  // VTYPE may be a vector type, CTYPE is a scalar type
+  VTYPE x2 = x * x;
+  VTYPE x4 = x2 * x2;
+  VTYPE x8 = x4 * x4;
+  // return  (((c6+c7*x)*x2 + (c4+c5*x))*x4 + (c8+c9*x+c10*x2)*x8) + ((c2+c3*x)*x2 + (c0+c1*x));
+  return mul_add(
+      mul_add(x2, c10, mul_add(c9, x, c8)), x8,
+      mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0))));
+}
+
+template <class VTYPE, class CTYPE>
+static inline VTYPE polynomial_13(VTYPE const& x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7,
+                                  CTYPE c8, CTYPE c9, CTYPE c10, CTYPE c11, CTYPE c12, CTYPE c13)
+{
+  // calculates polynomial c13*x^13 + c12*x^12 + ... + c1*x + c0
+  // VTYPE may be a vector type, CTYPE is a scalar type
+  VTYPE x2 = x * x;
+  VTYPE x4 = x2 * x2;
+  VTYPE x8 = x4 * x4;
+  return mul_add(
+      mul_add(mul_add(c13, x, c12), x4, mul_add(mul_add(c11, x, c10), x2, mul_add(c9, x, c8))), x8,
+      mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0))));
+}
+
+template <class VTYPE, class CTYPE>
+static inline VTYPE polynomial_13m(VTYPE const& x, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9,
+                                   CTYPE c10, CTYPE c11, CTYPE c12, CTYPE c13)
+{
+  // calculates polynomial c13*x^13 + c12*x^12 + ... + x + 0
+  // VTYPE may be a vector type, CTYPE is a scalar type
+  VTYPE x2 = x * x;
+  VTYPE x4 = x2 * x2;
+  VTYPE x8 = x4 * x4;
+  // return  ((c8+c9*x) + (c10+c11*x)*x2 + (c12+c13*x)*x4)*x8 + (((c6+c7*x)*x2 + (c4+c5*x))*x4 + ((c2+c3*x)*x2 + x));
+  return mul_add(mul_add(mul_add(c13, x, c12), x4, mul_add(mul_add(c11, x, c10), x2, mul_add(c9, x, c8))), x8,
+                 mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, mul_add(mul_add(c3, x, c2), x2, x)));
+}
+
+#ifdef VCL_NAMESPACE
+}
+#endif
+
+#endif
diff --git a/src/vectorclass/vectormath_exp.h b/src/vectorclass/vectormath_exp.h
new file mode 100644
index 0000000000000000000000000000000000000000..c5e120ad1e63cf79b50bd9b2cad083e5fc5647ed
--- /dev/null
+++ b/src/vectorclass/vectormath_exp.h
@@ -0,0 +1,2231 @@
+/****************************  vectormath_exp.h   ******************************
+ * Author:        Agner Fog
+ * Date created:  2014-04-18
+ * Last modified: 2016-12-26
+ * Version:       1.26
+ * Project:       vector classes
+ * Description:
+ * Header file containing inline vector functions of logarithms, exponential
+ * and power functions:
+ * exp         exponential function
+ * exp2        exponential function base 2
+ * exp10       exponential function base 10
+ * expm1       exponential function minus 1
+ * log         natural logarithm
+ * log2        logarithm base 2
+ * log10       logarithm base 10
+ * log1p       natural logarithm of 1+x
+ * cbrt        cube root
+ * pow         raise vector elements to power
+ * pow_ratio   raise vector elements to rational power
+ *
+ * Theory, methods and inspiration based partially on these sources:
+ * > Moshier, Stephen Lloyd Baluk: Methods and programs for mathematical functions.
+ *   Ellis Horwood, 1989.
+ * > VDT library developed at CERN by Danilo Piparo, Thomas Hauth and
+ *   Vincenzo Innocente, 2012, https://svnweb.cern.ch/trac/vdt
+ * > Cephes math library by Stephen L. Moshier 1992,
+ *   http://www.netlib.org/cephes/
+ *
+ * For detailed instructions, see vectormath_common.h and VectorClass.pdf
+ *
+ * (c) Copyright 2014-2016 GNU General Public License http://www.gnu.org/licenses
+ ******************************************************************************/
+
+#ifndef VECTORMATH_EXP_H
+#define VECTORMATH_EXP_H 1
+
+#include "vectormath_common.h"
+
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE
+{
+#endif
+
+/******************************************************************************
+ *                 Exponential functions
+ ******************************************************************************/
+
+// Helper functions, used internally:
+
+// This function calculates pow(2,n) where n must be an integer. Does not check for overflow or underflow
+static inline Vec2d vm_pow2n(Vec2d const& n)
+{
+  const double pow2_52 = 4503599627370496.0;    // 2^52
+  const double bias    = 1023.0;                // bias in exponent
+  Vec2d a              = n + (bias + pow2_52);  // put n + bias in least significant bits
+  Vec2q b              = reinterpret_i(a);      // bit-cast to integer
+  Vec2q c              = b << 52;               // shift left 52 places to get into exponent field
+  Vec2d d              = reinterpret_d(c);      // bit-cast back to double
+  return d;
+}
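+// For illustration, the same trick in scalar form (a minimal sketch, assuming
+// IEEE-754 doubles and integral n in the normal exponent range): adding n to
+// (bias + 2^52) places n + bias in the low mantissa bits of the double, and
+// shifting that bit pattern left by 52 moves it into the exponent field, which
+// then encodes 2^n.
+//   double scalar_pow2n(double n) {
+//     double a = n + (1023.0 + 4503599627370496.0);  // n + bias in low mantissa bits
+//     uint64_t b;  std::memcpy(&b, &a, sizeof b);    // bit-cast to integer
+//     b <<= 52;                                      // shift into exponent field
+//     double d;    std::memcpy(&d, &b, sizeof d);    // bit-cast back to double
+//     return d;                                      // == 2^n for integral n
+//   }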
+
+static inline Vec4f vm_pow2n(Vec4f const& n)
+{
+  const float pow2_23 = 8388608.0;             // 2^23
+  const float bias    = 127.0;                 // bias in exponent
+  Vec4f a             = n + (bias + pow2_23);  // put n + bias in least significant bits
+  Vec4i b             = reinterpret_i(a);      // bit-cast to integer
+  Vec4i c             = b << 23;               // shift left 23 places to get into exponent field
+  Vec4f d             = reinterpret_f(c);      // bit-cast back to float
+  return d;
+}
+
+#if MAX_VECTOR_SIZE >= 256
+
+static inline Vec4d vm_pow2n(Vec4d const& n)
+{
+  const double pow2_52 = 4503599627370496.0;    // 2^52
+  const double bias    = 1023.0;                // bias in exponent
+  Vec4d a              = n + (bias + pow2_52);  // put n + bias in least significant bits
+  Vec4q b              = reinterpret_i(a);      // bit-cast to integer
+  Vec4q c              = b << 52;               // shift left 52 places to get value into exponent field
+  Vec4d d              = reinterpret_d(c);      // bit-cast back to double
+  return d;
+}
+
+static inline Vec8f vm_pow2n(Vec8f const& n)
+{
+  const float pow2_23 = 8388608.0;             // 2^23
+  const float bias    = 127.0;                 // bias in exponent
+  Vec8f a             = n + (bias + pow2_23);  // put n + bias in least significant bits
+  Vec8i b             = reinterpret_i(a);      // bit-cast to integer
+  Vec8i c             = b << 23;               // shift left 23 places to get into exponent field
+  Vec8f d             = reinterpret_f(c);      // bit-cast back to float
+  return d;
+}
+
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+
+static inline Vec8d vm_pow2n(Vec8d const& n)
+{
+#ifdef __AVX512ER__
+  return _mm512_exp2a23_round_pd(n, _MM_FROUND_NO_EXC);  // this is exact only for integral n
+#else
+  const double pow2_52 = 4503599627370496.0;       // 2^52
+  const double bias    = 1023.0;                   // bias in exponent
+  Vec8d a              = n + (bias + pow2_52);     // put n + bias in least significant bits
+  Vec8q b              = Vec8q(reinterpret_i(a));  // bit-cast to integer
+  Vec8q c              = b << 52;                  // shift left 52 places to get value into exponent field
+  Vec8d d              = Vec8d(reinterpret_d(c));  // bit-cast back to double
+  return d;
+#endif
+}
+
+static inline Vec16f vm_pow2n(Vec16f const& n)
+{
+#ifdef __AVX512ER__
+  return _mm512_exp2a23_round_ps(n, _MM_FROUND_NO_EXC);
+#else
+  const float pow2_23 = 8388608.0;                 // 2^23
+  const float bias    = 127.0;                     // bias in exponent
+  Vec16f a            = n + (bias + pow2_23);      // put n + bias in least significant bits
+  Vec16i b            = Vec16i(reinterpret_i(a));  // bit-cast to integer
+  Vec16i c            = b << 23;                   // shift left 23 places to get into exponent field
+  Vec16f d            = Vec16f(reinterpret_f(c));  // bit-cast back to float
+  return d;
+#endif
+}
+
+#endif  // MAX_VECTOR_SIZE >= 512
+
+// Template for exp function, double precision
+// The limit of abs(x) is defined by max_x below
+// This function does not produce denormals
+// Template parameters:
+// VTYPE:  double vector type
+// BVTYPE: boolean vector type
+// M1: 0 for exp, 1 for expm1
+// BA: 0 for exp, 1 for 0.5*exp, 2 for pow(2,x), 10 for pow(10,x)
+
+#if 1  // choose method
+
+// Taylor expansion
+template <class VTYPE, class BVTYPE, int M1, int BA>
+static inline VTYPE exp_d(VTYPE const& initial_x)
+{
+  // Taylor coefficients, 1/n!
+  // Not using minimax approximation because we prioritize precision close to x = 0
+  const double p2  = 1. / 2.;
+  const double p3  = 1. / 6.;
+  const double p4  = 1. / 24.;
+  const double p5  = 1. / 120.;
+  const double p6  = 1. / 720.;
+  const double p7  = 1. / 5040.;
+  const double p8  = 1. / 40320.;
+  const double p9  = 1. / 362880.;
+  const double p10 = 1. / 3628800.;
+  const double p11 = 1. / 39916800.;
+  const double p12 = 1. / 479001600.;
+  const double p13 = 1. / 6227020800.;
+
+  // maximum abs(x), value depends on BA, defined below
+  // The lower limit of x is slightly more restrictive than the upper limit.
+  // We are specifying the lower limit, except for BA = 1 because it is not used for negative x
+  double max_x;
+
+  // data vectors
+  VTYPE x, r, z, n2;
+  BVTYPE inrange;  // boolean vector
+
+  if(BA <= 1)
+    {                                    // exp(x)
+      max_x = BA == 0 ? 708.39 : 709.7;  // lower limit for 0.5*exp(x) is -707.6, but we are using 0.5*exp(x) only for positive x in
+                                         // hyperbolic functions
+      const double ln2d_hi = 0.693145751953125;
+      const double ln2d_lo = 1.42860682030941723212E-6;
+      x                    = initial_x;
+      r                    = round(initial_x * VM_LOG2E);
+      // subtraction in two steps for higher precision
+      x = nmul_add(r, ln2d_hi, x);  //  x -= r * ln2d_hi;
+      x = nmul_add(r, ln2d_lo, x);  //  x -= r * ln2d_lo;
+    }
+  else if(BA == 2)
+    {  // pow(2,x)
+      max_x = 1022.0;
+      r     = round(initial_x);
+      x     = initial_x - r;
+      x *= VM_LN2;
+    }
+  else if(BA == 10)
+    {  // pow(10,x)
+      max_x                   = 307.65;
+      const double log10_2_hi = 0.30102999554947019;  // log10(2) in two parts
+      const double log10_2_lo = 1.1451100899212592E-10;
+      x                       = initial_x;
+      r                       = round(initial_x * (VM_LOG2E * VM_LN10));
+      // subtraction in two steps for higher precision
+      x = nmul_add(r, log10_2_hi, x);  //  x -= r * log10_2_hi;
+      x = nmul_add(r, log10_2_lo, x);  //  x -= r * log10_2_lo;
+      x *= VM_LN10;
+    }
+  else
+    {  // undefined value of BA
+      return 0.;
+    }
+
+  z = polynomial_13m(x, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13);
+
+  if(BA == 1)
+    r--;  // 0.5 * exp(x)
+
+  // multiply by power of 2
+  n2 = vm_pow2n(r);
+
+  if(M1 == 0)
+    {
+      // exp
+      z = (z + 1.0) * n2;
+    }
+  else
+    {
+      // expm1
+      z = mul_add(z, n2, n2 - 1.0);  // z = z * n2 + (n2 - 1.0);
+    }
+
+  // check for overflow
+  inrange = abs(initial_x) < max_x;
+  // check for INF and NAN
+  inrange &= is_finite(initial_x);
+
+  if(horizontal_and(inrange))
+    {
+      // fast normal path
+      return z;
+    }
+  else
+    {
+      // overflow, underflow and NAN
+      r = select(sign_bit(initial_x), 0. - M1, infinite_vec<VTYPE>());  // value in case of +/- overflow or INF
+      z = select(inrange, z, r);                                        // +/- underflow
+      z = select(is_nan(initial_x), initial_x, z);                      // NAN goes through
+      return z;
+    }
+}
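+// For illustration, the range reduction above follows the identity
+// exp(x) = 2^r * exp(x - r*ln2) with r = round(x*log2(e)), so the Taylor
+// polynomial only ever sees |x - r*ln2| <= 0.5*ln2 ~ 0.347. A minimal scalar
+// sketch of the same idea (truncated series, for illustration only):
+//   double exp_sketch(double x) {
+//     double r = std::round(x * 1.4426950408889634);     // r = round(x * log2(e))
+//     double t = x - r * 0.6931471805599453;             // reduced argument, |t| <= ln2/2
+//     double p = 1.0 + t + t*t*(1.0/2) + t*t*t*(1.0/6);  // low-order Taylor terms
+//     return std::ldexp(p, (int)r);                      // scale by 2^r
+//   }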
+
+#else
+
+// Pade expansion uses less code and fewer registers, but is slower
+template <class VTYPE, class BVTYPE, int M1, int BA>
+static inline VTYPE exp_d(VTYPE const& initial_x)
+{
+  // define constants
+  const double ln2p1   = 0.693145751953125;
+  const double ln2p2   = 1.42860682030941723212E-6;
+  const double log2e   = VM_LOG2E;
+  const double max_exp = 708.39;
+  // coefficients of pade polynomials
+  const double P0exp = 9.99999999999999999910E-1;
+  const double P1exp = 3.02994407707441961300E-2;
+  const double P2exp = 1.26177193074810590878E-4;
+  const double Q0exp = 2.00000000000000000009E0;
+  const double Q1exp = 2.27265548208155028766E-1;
+  const double Q2exp = 2.52448340349684104192E-3;
+  const double Q3exp = 3.00198505138664455042E-6;
+
+  VTYPE x, r, xx, px, qx, y, n2;  // data vectors
+  BVTYPE inrange;                 // boolean vector
+
+  x = initial_x;
+  r = round(initial_x * log2e);
+
+  // subtraction in one step would give a loss of precision
+  x -= r * ln2p1;
+  x -= r * ln2p2;
+
+  xx = x * x;
+
+  // px = x * P(x^2).
+  px = polynomial_2(xx, P0exp, P1exp, P2exp) * x;
+
+  // Evaluate Q(x^2).
+  qx = polynomial_3(xx, Q0exp, Q1exp, Q2exp, Q3exp);
+
+  // e^x = 1 + 2*P(x^2)/( Q(x^2) - P(x^2) )
+  y = (2.0 * px) / (qx - px);
+
+  // Get 2^n in double.
+  // n  = round_to_int64_limited(r);
+  // n2 = exp2(n);
+  n2 = vm_pow2n(r);  // this is faster
+
+  if(M1 == 0)
+    {
+      // exp
+      y = (y + 1.0) * n2;
+    }
+  else
+    {
+      // expm1
+      y = y * n2 + (n2 - 1.0);
+    }
+
+  // overflow
+  inrange = abs(initial_x) < max_exp;
+  // check for INF and NAN
+  inrange &= is_finite(initial_x);
+
+  if(horizontal_and(inrange))
+    {
+      // fast normal path
+      return y;
+    }
+  else
+    {
+      // overflow, underflow and NAN
+      r = select(sign_bit(initial_x), 0. - M1, infinite_vec<VTYPE>());  // value in case of overflow or INF
+      y = select(inrange, y, r);                                        // +/- overflow
+      y = select(is_nan(initial_x), initial_x, y);                      // NAN goes through
+      return y;
+    }
+}
+#endif
+
+// instances of exp_d template
+static inline Vec2d exp(Vec2d const& x) { return exp_d<Vec2d, Vec2db, 0, 0>(x); }
+
+static inline Vec2d expm1(Vec2d const& x) { return exp_d<Vec2d, Vec2db, 1, 0>(x); }
+
+static inline Vec2d exp2(Vec2d const& x) { return exp_d<Vec2d, Vec2db, 0, 2>(x); }
+
+static inline Vec2d exp10(Vec2d const& x) { return exp_d<Vec2d, Vec2db, 0, 10>(x); }
+
+#if MAX_VECTOR_SIZE >= 256
+
+static inline Vec4d exp(Vec4d const& x) { return exp_d<Vec4d, Vec4db, 0, 0>(x); }
+
+static inline Vec4d expm1(Vec4d const& x) { return exp_d<Vec4d, Vec4db, 1, 0>(x); }
+
+static inline Vec4d exp2(Vec4d const& x) { return exp_d<Vec4d, Vec4db, 0, 2>(x); }
+
+static inline Vec4d exp10(Vec4d const& x) { return exp_d<Vec4d, Vec4db, 0, 10>(x); }
+
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+
+static inline Vec8d exp(Vec8d const& x) { return exp_d<Vec8d, Vec8db, 0, 0>(x); }
+
+static inline Vec8d expm1(Vec8d const& x) { return exp_d<Vec8d, Vec8db, 1, 0>(x); }
+
+static inline Vec8d exp2(Vec8d const& x) { return exp_d<Vec8d, Vec8db, 0, 2>(x); }
+
+static inline Vec8d exp10(Vec8d const& x) { return exp_d<Vec8d, Vec8db, 0, 10>(x); }
+
+#endif  // MAX_VECTOR_SIZE >= 512
+
+// Template for exp function, single precision
+// The limit of abs(x) is defined by max_x below
+// This function does not produce denormals
+// Template parameters:
+// VTYPE:  float vector type
+// BVTYPE: boolean vector type
+// M1: 0 for exp, 1 for expm1
+// BA: 0 for exp, 1 for 0.5*exp, 2 for pow(2,x), 10 for pow(10,x)
+
+template <class VTYPE, class BVTYPE, int M1, int BA>
+static inline VTYPE exp_f(VTYPE const& initial_x)
+{
+  // Taylor coefficients
+  const float P0expf = 1.f / 2.f;
+  const float P1expf = 1.f / 6.f;
+  const float P2expf = 1.f / 24.f;
+  const float P3expf = 1.f / 120.f;
+  const float P4expf = 1.f / 720.f;
+  const float P5expf = 1.f / 5040.f;
+
+  VTYPE x, r, x2, z, n2;  // data vectors
+  BVTYPE inrange;         // boolean vector
+
+  // maximum abs(x), value depends on BA, defined below
+  // The lower limit of x is slightly more restrictive than the upper limit.
+  // We are specifying the lower limit, except for BA = 1 because it is not used for negative x
+  float max_x;
+
+  if(BA <= 1)
+    {  // exp(x)
+      const float ln2f_hi = 0.693359375f;
+      const float ln2f_lo = -2.12194440e-4f;
+      max_x               = (BA == 0) ? 87.3f : 89.0f;
+
+      x = initial_x;
+      r = round(initial_x * float(VM_LOG2E));
+      x = nmul_add(r, VTYPE(ln2f_hi), x);  //  x -= r * ln2f_hi;
+      x = nmul_add(r, VTYPE(ln2f_lo), x);  //  x -= r * ln2f_lo;
+    }
+  else if(BA == 2)
+    {  // pow(2,x)
+      max_x = 126.f;
+      r     = round(initial_x);
+      x     = initial_x - r;
+      x     = x * (float)VM_LN2;
+    }
+  else if(BA == 10)
+    {  // pow(10,x)
+      max_x                  = 37.9f;
+      const float log10_2_hi = 0.301025391f;  // log10(2) in two parts
+      const float log10_2_lo = 4.60503907E-6f;
+      x                      = initial_x;
+      r                      = round(initial_x * float(VM_LOG2E * VM_LN10));
+      x                      = nmul_add(r, VTYPE(log10_2_hi), x);  //  x -= r * log10_2_hi;
+      x                      = nmul_add(r, VTYPE(log10_2_lo), x);  //  x -= r * log10_2_lo;
+      x                      = x * (float)VM_LN10;
+    }
+  else
+    {  // undefined value of BA
+      return 0.;
+    }
+
+  x2 = x * x;
+  z  = polynomial_5(x, P0expf, P1expf, P2expf, P3expf, P4expf, P5expf);
+  z  = mul_add(z, x2, x);  // z *= x2;  z += x;
+
+  if(BA == 1)
+    r--;  // 0.5 * exp(x)
+
+  // multiply by power of 2
+  n2 = vm_pow2n(r);
+
+  if(M1 == 0)
+    {
+      // exp
+      z = (z + 1.0f) * n2;
+    }
+  else
+    {
+      // expm1
+      z = mul_add(z, n2, n2 - 1.0f);  //  z = z * n2 + (n2 - 1.0f);
+    }
+
+  // check for overflow
+  inrange = abs(initial_x) < max_x;
+  // check for INF and NAN
+  inrange &= is_finite(initial_x);
+
+  if(horizontal_and(inrange))
+    {
+      // fast normal path
+      return z;
+    }
+  else
+    {
+      // overflow, underflow and NAN
+      r = select(sign_bit(initial_x), 0.f - M1, infinite_vec<VTYPE>());  // value in case of +/- overflow or INF
+      z = select(inrange, z, r);                                         // +/- underflow
+      z = select(is_nan(initial_x), initial_x, z);                       // NAN goes through
+      return z;
+    }
+}
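+// For illustration: the single-precision path above evaluates the degree-7
+// Taylor series of exp(x) as z = x + x^2*(1/2! + x/3! + ... + x^5/7!); the
+// polynomial_5 call supplies the coefficients from 1/2! onward, mul_add(z, x2, x)
+// restores the leading x, and the constant term 1 is added just before the
+// final scaling by 2^r.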
+#if defined(__AVX512ER__) && MAX_VECTOR_SIZE >= 512
+// forward declarations of fast 512 bit versions
+static Vec16f exp(Vec16f const& x);
+static Vec16f exp2(Vec16f const& x);
+static Vec16f exp10(Vec16f const& x);
+#endif
+
+// instances of exp_f template
+static inline Vec4f exp(Vec4f const& x)
+{
+#if defined(__AVX512ER__) && MAX_VECTOR_SIZE >= 512  // use faster 512 bit version
+  return _mm512_castps512_ps128(exp(Vec16f(_mm512_castps128_ps512(x))));
+#else
+  return exp_f<Vec4f, Vec4fb, 0, 0>(x);
+#endif
+}
+
+static inline Vec4f expm1(Vec4f const& x) { return exp_f<Vec4f, Vec4fb, 1, 0>(x); }
+
+static inline Vec4f exp2(Vec4f const& x)
+{
+#if defined(__AVX512ER__) && MAX_VECTOR_SIZE >= 512  // use faster 512 bit version
+  return _mm512_castps512_ps128(exp2(Vec16f(_mm512_castps128_ps512(x))));
+#else
+  return exp_f<Vec4f, Vec4fb, 0, 2>(x);
+#endif
+}
+
+static inline Vec4f exp10(Vec4f const& x)
+{
+#if defined(__AVX512ER__) && MAX_VECTOR_SIZE >= 512  // use faster 512 bit version
+  return _mm512_castps512_ps128(exp10(Vec16f(_mm512_castps128_ps512(x))));
+#else
+  return exp_f<Vec4f, Vec4fb, 0, 10>(x);
+#endif
+}
+
+#if MAX_VECTOR_SIZE >= 256
+
+static inline Vec8f exp(Vec8f const& x)
+{
+#if defined(__AVX512ER__) && MAX_VECTOR_SIZE >= 512  // use faster 512 bit version
+  return _mm512_castps512_ps256(exp(Vec16f(_mm512_castps256_ps512(x))));
+#else
+  return exp_f<Vec8f, Vec8fb, 0, 0>(x);
+#endif
+}
+
+static inline Vec8f expm1(Vec8f const& x) { return exp_f<Vec8f, Vec8fb, 1, 0>(x); }
+
+static inline Vec8f exp2(Vec8f const& x)
+{
+#if defined(__AVX512ER__) && MAX_VECTOR_SIZE >= 512  // use faster 512 bit version
+  return _mm512_castps512_ps256(exp2(Vec16f(_mm512_castps256_ps512(x))));
+#else
+  return exp_f<Vec8f, Vec8fb, 0, 2>(x);
+#endif
+}
+
+static inline Vec8f exp10(Vec8f const& x)
+{
+#if defined(__AVX512ER__) && MAX_VECTOR_SIZE >= 512  // use faster 512 bit version
+  return _mm512_castps512_ps256(exp10(Vec16f(_mm512_castps256_ps512(x))));
+#else
+  return exp_f<Vec8f, Vec8fb, 0, 10>(x);
+#endif
+}
+
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+
+static inline Vec16f exp(Vec16f const& x)
+{
+#ifdef __AVX512ER__  // AVX512ER instruction set includes fast exponential function
+#ifdef VCL_FASTEXP
+  // very fast, but less precise for large x:
+  return _mm512_exp2a23_round_ps(x * float(VM_LOG2E), _MM_FROUND_NO_EXC);
+#else
+  // best precision, also for large x:
+  const Vec16f log2e  = float(VM_LOG2E);
+  const float ln2f_hi = 0.693359375f;
+  const float ln2f_lo = -2.12194440e-4f;
+  Vec16f x1           = x, r, y;
+  r                   = round(x1 * log2e);
+  x1                  = nmul_add(r, Vec16f(ln2f_hi), x1);  //  x -= r * ln2f_hi;
+  x1                  = nmul_add(r, Vec16f(ln2f_lo), x1);  //  x -= r * ln2f_lo;
+  x1                  = x1 * log2e;
+  y                   = _mm512_exp2a23_round_ps(r, _MM_FROUND_NO_EXC);
+  // y = vm_pow2n(r);
+  return y * _mm512_exp2a23_round_ps(x1, _MM_FROUND_NO_EXC);
+#endif  // VCL_FASTEXP
+#else   // no AVX512ER, use above template
+  return exp_f<Vec16f, Vec16fb, 0, 0>(x);
+#endif
+}
+
+static inline Vec16f expm1(Vec16f const& x) { return exp_f<Vec16f, Vec16fb, 1, 0>(x); }
+
+static inline Vec16f exp2(Vec16f const& x)
+{
+#ifdef __AVX512ER__
+  return Vec16f(_mm512_exp2a23_round_ps(x, _MM_FROUND_NO_EXC));
+#else
+  return exp_f<Vec16f, Vec16fb, 0, 2>(x);
+#endif
+}
+
+static inline Vec16f exp10(Vec16f const& x)
+{
+#ifdef __AVX512ER__  // AVX512ER instruction set includes fast exponential function
+#ifdef VCL_FASTEXP
+  // very fast, but less precise for large x:
+  return _mm512_exp2a23_round_ps(x * float(VM_LOG210), _MM_FROUND_NO_EXC);
+#else
+  // best precision, also for large x:
+  const float log10_2_hi = 0.301025391f;  // log10(2) in two parts
+  const float log10_2_lo = 4.60503907E-6f;
+  Vec16f x1              = x, r, y;
+  Vec16f log210          = float(VM_LOG210);
+  r                      = round(x1 * log210);
+  x1                     = nmul_add(r, Vec16f(log10_2_hi), x1);  //  x -= r * log10_2_hi
+  x1                     = nmul_add(r, Vec16f(log10_2_lo), x1);  //  x -= r * log10_2_lo
+  x1                     = x1 * log210;
+  // y = vm_pow2n(r);
+  y = _mm512_exp2a23_round_ps(r, _MM_FROUND_NO_EXC);
+  return y * _mm512_exp2a23_round_ps(x1, _MM_FROUND_NO_EXC);
+#endif  // VCL_FASTEXP
+#else   // no AVX512ER, use above template
+  return exp_f<Vec16f, Vec16fb, 0, 10>(x);
+#endif
+}
+
+#endif  // MAX_VECTOR_SIZE >= 512
+
+/******************************************************************************
+ *                 Logarithm functions
+ ******************************************************************************/
+
+// Helper functions: fraction_2(x) = fraction(x)*0.5
+
+// Modified fraction function:
+// Extract the fraction part of a floating point number, and divide by 2
+// The fraction function is defined in vectorf128.h etc.
+// fraction_2(x) = fraction(x)*0.5
+// This version gives half the fraction without extra delay
+// Does not work for x = 0
+static inline Vec4f fraction_2(Vec4f const& a)
+{
+  Vec4ui t1 = _mm_castps_si128(a);                     // reinterpret as 32-bit integer
+  Vec4ui t2 = Vec4ui((t1 & 0x007FFFFF) | 0x3F000000);  // set exponent to 0 + bias
+  return _mm_castsi128_ps(t2);
+}
+
+static inline Vec2d fraction_2(Vec2d const& a)
+{
+  Vec2uq t1 = _mm_castpd_si128(a);                                         // reinterpret as 64-bit integer
+  Vec2uq t2 = Vec2uq((t1 & 0x000FFFFFFFFFFFFFll) | 0x3FE0000000000000ll);  // set exponent to 0 + bias
+  return _mm_castsi128_pd(t2);
+}
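+// For illustration: fraction_2 keeps the mantissa bits of a and overwrites the
+// exponent field with bias-1 (0x3FE... / 0x3F0...), mapping any normal a onto
+// [0.5, 1.0). E.g. a = 12.0 = 1.5 * 2^3 yields 1.5 * 2^-1 = 0.75, which equals
+// fraction(12.0) * 0.5.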
+
+#if MAX_VECTOR_SIZE >= 256
+
+static inline Vec8f fraction_2(Vec8f const& a)
+{
+#if defined(VECTORI256_H) && VECTORI256_H > 2  // 256 bit integer vectors are available, AVX2
+  Vec8ui t1 = _mm256_castps_si256(a);          // reinterpret as 32-bit integer
+  Vec8ui t2 = (t1 & 0x007FFFFF) | 0x3F000000;  // set exponent to 0 + bias
+  return _mm256_castsi256_ps(t2);
+#else
+  return Vec8f(fraction_2(a.get_low()), fraction_2(a.get_high()));
+#endif
+}
+
+static inline Vec4d fraction_2(Vec4d const& a)
+{
+#if VECTORI256_H > 1                                                       // AVX2
+  Vec4uq t1 = _mm256_castpd_si256(a);                                      // reinterpret as 64-bit integer
+  Vec4uq t2 = Vec4uq((t1 & 0x000FFFFFFFFFFFFFll) | 0x3FE0000000000000ll);  // set exponent to 0 + bias
+  return _mm256_castsi256_pd(t2);
+#else
+  return Vec4d(fraction_2(a.get_low()), fraction_2(a.get_high()));
+#endif
+}
+
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+
+static inline Vec16f fraction_2(Vec16f const& a)
+{
+#if INSTRSET >= 9  // 512 bit integer vectors are available, AVX512
+  return _mm512_getmant_ps(a, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_zero);
+  // return Vec16f(_mm512_getmant_ps(a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero)) * 0.5f;
+#else
+  return Vec16f(fraction_2(a.get_low()), fraction_2(a.get_high()));
+#endif
+}
+
+static inline Vec8d fraction_2(Vec8d const& a)
+{
+#if INSTRSET >= 9  // 512 bit integer vectors are available, AVX512
+  return _mm512_getmant_pd(a, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_zero);
+  // return Vec8d(_mm512_getmant_pd(a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero)) * 0.5;
+#else
+  return Vec8d(fraction_2(a.get_low()), fraction_2(a.get_high()));
+#endif
+}
+
+#endif  // MAX_VECTOR_SIZE >= 512
+
+// Helper functions: exponent_f(x) = exponent(x) as floating point number
+
+union vm_ufi
+{
+  float f;
+  uint32_t i;
+};
+
+union vm_udi
+{
+  double d;
+  uint64_t i;
+};
+
+// extract exponent of a positive number x as a floating point number
+// Note: the AVX512 versions return -inf for x=0, the non-AVX512 versions return a negative number
+static inline Vec4f exponent_f(Vec4f const& x)
+{
+#ifdef __AVX512VL__  // AVX512VL
+  // note: this version returns -inf for x=0
+  return _mm_getexp_ps(x);
+#else
+  const float pow2_23   = 8388608.0f;  // 2^23
+  const float bias      = 127.f;       // bias in exponent
+  const vm_ufi upow2_23 = {pow2_23};
+  Vec4ui a              = reinterpret_i(x);        // bit-cast x to integer
+  Vec4ui b              = a >> 23;                 // shift down exponent to low bits
+  Vec4ui c              = b | Vec4ui(upow2_23.i);  // insert new exponent
+  Vec4f d               = reinterpret_f(c);        // bit-cast back to float
+  Vec4f e               = d - (pow2_23 + bias);    // subtract magic number and bias
+  return e;
+#endif
+}
+
+static inline Vec2d exponent_f(Vec2d const& x)
+{
+#ifdef __AVX512VL__  // AVX512VL
+  // note: this version returns -inf for x=0
+  return _mm_getexp_pd(x);
+#else
+  const double pow2_52  = 4503599627370496.0;  // 2^52
+  const double bias     = 1023.0;              // bias in exponent
+  const vm_udi upow2_52 = {pow2_52};
+
+  Vec2uq a = reinterpret_i(x);        // bit-cast x to integer
+  Vec2uq b = a >> 52;                 // shift down exponent to low bits
+  Vec2uq c = b | Vec2uq(upow2_52.i);  // insert new exponent
+  Vec2d d  = reinterpret_d(c);        // bit-cast back to double
+  Vec2d e  = d - (pow2_52 + bias);    // subtract magic number and bias
+  return e;
+#endif
+}
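+// For illustration: in the non-AVX512 branch the biased exponent of x is shifted
+// down into the low bits and OR'ed onto the bit pattern of 2^52, so the resulting
+// double equals 2^52 + (biased exponent); subtracting (2^52 + bias) then leaves
+// the unbiased exponent as a double. E.g. x = 8.0 (biased exponent 1026) gives 3.0.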
+
+#if MAX_VECTOR_SIZE >= 256
+
+static inline Vec8f exponent_f(Vec8f const& x)
+{
+#ifdef __AVX512VL__  // AVX512VL
+  // note: this version returns -inf for x=0
+  return _mm256_getexp_ps(x);
+#else
+  const float pow2_23   = 8388608.0f;  // 2^23
+  const float bias      = 127.f;       // bias in exponent
+  const vm_ufi upow2_23 = {pow2_23};
+  Vec8ui a              = reinterpret_i(x);        // bit-cast x to integer
+  Vec8ui b              = a >> 23;                 // shift down exponent to low bits
+  Vec8ui c              = b | Vec8ui(upow2_23.i);  // insert new exponent
+  Vec8f d               = reinterpret_f(c);        // bit-cast back to float
+  Vec8f e               = d - (pow2_23 + bias);    // subtract magic number and bias
+  return e;
+#endif
+}
+
+// extract exponent of a positive number x as a floating point number
+static inline Vec4d exponent_f(Vec4d const& x)
+{
+#ifdef __AVX512VL__  // AVX512VL
+  return _mm256_getexp_pd(x);
+#else
+  const double pow2_52  = 4503599627370496.0;  // 2^52
+  const double bias     = 1023.0;              // bias in exponent
+  const vm_udi upow2_52 = {pow2_52};
+
+  Vec4uq a = reinterpret_i(x);        // bit-cast x to integer
+  Vec4uq b = a >> 52;                 // shift down exponent to low bits
+  Vec4uq c = b | Vec4uq(upow2_52.i);  // insert new exponent
+  Vec4d d  = reinterpret_d(c);        // bit-cast back to double
+  Vec4d e  = d - (pow2_52 + bias);    // subtract magic number and bias
+  return e;
+#endif
+}
+
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+
+static inline Vec16f exponent_f(Vec16f const& x)
+{
+#if INSTRSET >= 9  // AVX512
+  // note: this version returns -inf for x=0
+  return _mm512_getexp_ps(x);
+#else
+  return Vec16f(exponent_f(x.get_low()), exponent_f(x.get_high()));
+#endif
+}
+
+// extract exponent of a positive number x as a floating point number
+static inline Vec8d exponent_f(Vec8d const& x)
+{
+#if INSTRSET >= 9  // AVX512
+  // note: this returns -inf for x=0
+  return _mm512_getexp_pd(x);
+#else
+  return Vec8d(exponent_f(x.get_low()), exponent_f(x.get_high()));
+#endif
+}
+
+#endif  // MAX_VECTOR_SIZE >= 512
+
+// log function, double precision
+// template parameters:
+// VTYPE:  f.p. vector type
+// BVTYPE: boolean vector type
+// M1: 0 for log, 1 for log1p
+template <class VTYPE, class BVTYPE, int M1>
+static inline VTYPE log_d(VTYPE const& initial_x)
+{
+  // define constants
+  const double ln2_hi = 0.693359375;
+  const double ln2_lo = -2.121944400546905827679E-4;
+  const double P0log  = 7.70838733755885391666E0;
+  const double P1log  = 1.79368678507819816313E1;
+  const double P2log  = 1.44989225341610930846E1;
+  const double P3log  = 4.70579119878881725854E0;
+  const double P4log  = 4.97494994976747001425E-1;
+  const double P5log  = 1.01875663804580931796E-4;
+  const double Q0log  = 2.31251620126765340583E1;
+  const double Q1log  = 7.11544750618563894466E1;
+  const double Q2log  = 8.29875266912776603211E1;
+  const double Q3log  = 4.52279145837532221105E1;
+  const double Q4log  = 1.12873587189167450590E1;
+
+  VTYPE x1, x, x2, px, qx, res, fe;   // data vectors
+  BVTYPE blend, overflow, underflow;  // boolean vectors
+
+  if(M1 == 0)
+    {
+      x1 = initial_x;  // log(x)
+    }
+  else
+    {
+      x1 = initial_x + 1.0;  // log(x+1)
+    }
+  // separate mantissa from exponent
+  // VTYPE x  = fraction(x1) * 0.5;
+  x  = fraction_2(x1);
+  fe = exponent_f(x1);
+
+  blend = x > VM_SQRT2 * 0.5;
+  x     = if_add(!blend, x, x);   // conditional add
+  fe    = if_add(blend, fe, 1.);  // conditional add
+
+  if(M1 == 0)
+    {
+      // log(x). Expand around 1.0
+      x -= 1.0;
+    }
+  else
+    {
+      // log(x+1). Avoid loss of precision when adding 1 and later subtracting 1 if exponent = 0
+      x = select(fe == 0., initial_x, x - 1.0);
+    }
+
+  // rational form
+  px = polynomial_5(x, P0log, P1log, P2log, P3log, P4log, P5log);
+  x2 = x * x;
+  px *= x * x2;
+  qx  = polynomial_5n(x, Q0log, Q1log, Q2log, Q3log, Q4log);
+  res = px / qx;
+
+  // add exponent
+  res = mul_add(fe, ln2_lo, res);  // res += fe * ln2_lo;
+  res += nmul_add(x2, 0.5, x);     // res += x  - 0.5 * x2;
+  res = mul_add(fe, ln2_hi, res);  // res += fe * ln2_hi;
+
+  overflow  = !is_finite(x1);
+  underflow = x1 < VM_SMALLEST_NORMAL;  // denormals are not supported by this function
+
+  if(!horizontal_or(overflow | underflow))
+    {
+      // normal path
+      return res;
+    }
+  else
+    {
+      // overflow and underflow
+      res = select(underflow, nan_vec<VTYPE>(NAN_LOG), res);                    // x1  < 0 gives NAN
+      res = select(x1 == 0. || is_subnormal(x1), -infinite_vec<VTYPE>(), res);  // x1 == 0 or denormal gives -INF
+      res = select(overflow, x1, res);                                          // INF or NAN goes through
+      res = select(is_inf(x1) & sign_bit(x1), nan_vec<VTYPE>(NAN_LOG), res);    // -INF gives NAN
+      return res;
+    }
+}
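+// For illustration, the reduction above writes x1 = m * 2^e with the mantissa m
+// brought into [sqrt(2)/2, sqrt(2)) by the blend/if_add step, so that
+// log(x1) = log(m) + e*ln2 and the rational approximation only has to cover a
+// short interval around 1. A minimal scalar sketch of the same decomposition:
+//   double log_sketch(double x) {
+//     int e;
+//     double m = std::frexp(x, &e);                         // x = m * 2^e, m in [0.5, 1)
+//     if(m < 0.70710678118654752440) { m *= 2.0; e -= 1; }  // bring m into [sqrt(2)/2, sqrt(2))
+//     return std::log(m) + e * 0.6931471805599453;          // log(m) + e*ln2
+//   }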
+
+static inline Vec2d log(Vec2d const& x) { return log_d<Vec2d, Vec2db, 0>(x); }
+
+static inline Vec2d log1p(Vec2d const& x) { return log_d<Vec2d, Vec2db, 1>(x); }
+
+static inline Vec2d log2(Vec2d const& x) { return VM_LOG2E * log_d<Vec2d, Vec2db, 0>(x); }
+
+static inline Vec2d log10(Vec2d const& x) { return VM_LOG10E * log_d<Vec2d, Vec2db, 0>(x); }
+
+#if MAX_VECTOR_SIZE >= 256
+
+static inline Vec4d log(Vec4d const& x) { return log_d<Vec4d, Vec4db, 0>(x); }
+
+static inline Vec4d log1p(Vec4d const& x) { return log_d<Vec4d, Vec4db, 1>(x); }
+
+static inline Vec4d log2(Vec4d const& x) { return VM_LOG2E * log_d<Vec4d, Vec4db, 0>(x); }
+
+static inline Vec4d log10(Vec4d const& x) { return VM_LOG10E * log_d<Vec4d, Vec4db, 0>(x); }
+
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+
+static inline Vec8d log(Vec8d const& x) { return log_d<Vec8d, Vec8db, 0>(x); }
+
+static inline Vec8d log1p(Vec8d const& x) { return log_d<Vec8d, Vec8db, 1>(x); }
+
+static inline Vec8d log2(Vec8d const& x) { return VM_LOG2E * log_d<Vec8d, Vec8db, 0>(x); }
+
+static inline Vec8d log10(Vec8d const& x) { return VM_LOG10E * log_d<Vec8d, Vec8db, 0>(x); }
+
+#endif  // MAX_VECTOR_SIZE >= 512
+
+// log function, single precision
+// template parameters:
+// VTYPE:  f.p. vector type
+// ITYPE:  integer vector type with same element size
+// BVTYPE: boolean vector type
+// BTYPEI: boolean vector type for ITYPE
+// M1: 0 for log, 1 for log1p
+template <class VTYPE, class ITYPE, class BVTYPE, class BTYPEI, int M1>
+static inline VTYPE log_f(VTYPE const& initial_x)
+{
+  // define constants
+  const float ln2f_hi = 0.693359375f;
+  const float ln2f_lo = -2.12194440E-4f;
+  const float P0logf  = 3.3333331174E-1f;
+  const float P1logf  = -2.4999993993E-1f;
+  const float P2logf  = 2.0000714765E-1f;
+  const float P3logf  = -1.6668057665E-1f;
+  const float P4logf  = 1.4249322787E-1f;
+  const float P5logf  = -1.2420140846E-1f;
+  const float P6logf  = 1.1676998740E-1f;
+  const float P7logf  = -1.1514610310E-1f;
+  const float P8logf  = 7.0376836292E-2f;
+
+  VTYPE x1, x, res, x2, fe;           // data vectors
+  ITYPE e;                            // integer vector
+  BVTYPE blend, overflow, underflow;  // boolean vectors
+
+  if(M1 == 0)
+    {
+      x1 = initial_x;  // log(x)
+    }
+  else
+    {
+      x1 = initial_x + 1.0f;  // log(x+1)
+    }
+
+  // separate mantissa from exponent
+  x = fraction_2(x1);
+  e = exponent(x1);
+
+  blend = x > float(VM_SQRT2 * 0.5);
+  x     = if_add(!blend, x, x);                // conditional add
+  e     = if_add(BTYPEI(blend), e, ITYPE(1));  // conditional add
+  fe    = to_float(e);
+
+  if(M1 == 0)
+    {
+      // log(x). Expand around 1.0
+      x -= 1.0f;
+    }
+  else
+    {
+      // log(x+1). Avoid loss of precision when adding 1 and later subtracting 1 if exponent = 0
+      x = select(BVTYPE(e == 0), initial_x, x - 1.0f);
+    }
+
+  // Taylor expansion
+  res = polynomial_8(x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf);
+  x2  = x * x;
+  res *= x2 * x;
+
+  // add exponent
+  res = mul_add(fe, ln2f_lo, res);  // res += ln2f_lo  * fe;
+  res += nmul_add(x2, 0.5f, x);     // res += x - 0.5f * x2;
+  res = mul_add(fe, ln2f_hi, res);  // res += ln2f_hi  * fe;
+
+  overflow  = !is_finite(x1);
+  underflow = x1 < VM_SMALLEST_NORMALF;  // denormals are not supported by this function
+
+  if(!horizontal_or(overflow | underflow))
+    {
+      // normal path
+      return res;
+    }
+  else
+    {
+      // overflow and underflow
+      res = select(underflow, nan_vec<VTYPE>(NAN_LOG), res);                     // x1 < 0 gives NAN
+      res = select(x1 == 0.f || is_subnormal(x1), -infinite_vec<VTYPE>(), res);  // x1 == 0 or denormal gives -INF
+      res = select(overflow, x1, res);                                           // INF or NAN goes through
+      res = select(is_inf(x1) & sign_bit(x1), nan_vec<VTYPE>(NAN_LOG), res);     // -INF gives NAN
+      return res;
+    }
+}
+
+static inline Vec4f log(Vec4f const& x) { return log_f<Vec4f, Vec4i, Vec4fb, Vec4ib, 0>(x); }
+
+static inline Vec4f log1p(Vec4f const& x) { return log_f<Vec4f, Vec4i, Vec4fb, Vec4ib, 1>(x); }
+
+static inline Vec4f log2(Vec4f const& x) { return float(VM_LOG2E) * log_f<Vec4f, Vec4i, Vec4fb, Vec4ib, 0>(x); }
+
+static inline Vec4f log10(Vec4f const& x) { return float(VM_LOG10E) * log_f<Vec4f, Vec4i, Vec4fb, Vec4ib, 0>(x); }
+
+#if MAX_VECTOR_SIZE >= 256
+
+static inline Vec8f log(Vec8f const& x) { return log_f<Vec8f, Vec8i, Vec8fb, Vec8ib, 0>(x); }
+
+static inline Vec8f log1p(Vec8f const& x) { return log_f<Vec8f, Vec8i, Vec8fb, Vec8ib, 1>(x); }
+
+static inline Vec8f log2(Vec8f const& x) { return float(VM_LOG2E) * log_f<Vec8f, Vec8i, Vec8fb, Vec8ib, 0>(x); }
+
+static inline Vec8f log10(Vec8f const& x) { return float(VM_LOG10E) * log_f<Vec8f, Vec8i, Vec8fb, Vec8ib, 0>(x); }
+
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+
+static inline Vec16f log(Vec16f const& x) { return log_f<Vec16f, Vec16i, Vec16fb, Vec16ib, 0>(x); }
+
+static inline Vec16f log1p(Vec16f const& x) { return log_f<Vec16f, Vec16i, Vec16fb, Vec16ib, 1>(x); }
+
+static inline Vec16f log2(Vec16f const& x) { return float(VM_LOG2E) * log_f<Vec16f, Vec16i, Vec16fb, Vec16ib, 0>(x); }
+
+static inline Vec16f log10(Vec16f const& x) { return float(VM_LOG10E) * log_f<Vec16f, Vec16i, Vec16fb, Vec16ib, 0>(x); }
+
+#endif  // MAX_VECTOR_SIZE >= 512
+
+/******************************************************************************
+ *           Cube root and reciprocal cube root
+ ******************************************************************************/
+
+// cube root template, double precision
+// template parameters:
+// VTYPE:  f.p. vector type
+// ITYPE:  uint32_t integer vector type with same total number of bits
+// ITYPE2: uint64_t integer vector type with same total number of bits
+// BVTYPE: boolean vector type
+// CR:     -1 for reciprocal cube root, 1 for cube root, 2 for cube root squared
+template <class VTYPE, class ITYPE, class ITYPE2, class BVTYPE, int CR>
+static inline VTYPE cbrt_d(VTYPE const& x)
+{
+  const int iter = 7;  // iteration count of x^(-1/3) loop
+  int i;
+  VTYPE xa, xa3, a, a2;
+  ITYPE m1, m2;
+  BVTYPE underflow;
+  ITYPE2 q1(0x5540000000000000ULL);  // exponent bias
+  ITYPE2 q2(0x0005555500000000ULL);  // exponent multiplier for 1/3
+  ITYPE2 q3(0x0010000000000000ULL);  // denormal limit
+  const double one_third  = 1. / 3.;
+  const double four_third = 4. / 3.;
+
+  xa  = abs(x);
+  xa3 = one_third * xa;
+
+  // multiply exponent by -1/3
+  m1        = reinterpret_i(xa);
+  m2        = ITYPE(q1) - (m1 >> 20) * ITYPE(q2);
+  a         = reinterpret_d(m2);
+  underflow = BVTYPE(ITYPE2(m1) < q3);  // true if denormal or zero
+
+  // Newton Raphson iteration
+  for(i = 0; i < iter - 1; i++)
+    {
+      a2 = a * a;
+      a  = nmul_add(xa3, a2 * a2, four_third * a);  // a = four_third*a - xa3*a2*a2;
+    }
+  // last iteration with better precision
+  a2 = a * a;
+  a  = mul_add(one_third, nmul_add(xa, a2 * a2, a), a);  // a = a + one_third*(a - xa*a2*a2);
+
+  if(CR == -1)
+    {  // reciprocal cube root
+      // (note: gives wrong sign when input is INF)
+      // generate INF if underflow
+      a = select(underflow, infinite_vec<VTYPE>(), a);
+      // get sign
+      a = sign_combine(a, x);
+    }
+  else if(CR == 1)
+    {  // cube root
+      a = a * a * x;
+      // generate 0 if underflow
+      a = select(underflow, 0., a);
+    }
+  else if(CR == 2)
+    {  // cube root squared
+      // (note: gives wrong sign when input is INF)
+      a = a * xa;
+      // generate 0 if underflow
+      a = select(underflow, 0., a);
+    }
+  return a;
+}
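+// For illustration: the loop above is a Newton-Raphson iteration for
+// a ~ x^(-1/3). With f(a) = a^(-3) - x, the Newton step simplifies to
+// a' = (4/3)*a - (x/3)*a^4, which is exactly nmul_add(xa3, a2*a2, four_third*a).
+// The starting guess comes from dividing the biased exponent by 3 via the
+// integer constants q1 and q2. The final results follow from a ~ |x|^(-1/3):
+//   cube root            x^(1/3)  = a*a*x
+//   reciprocal cbrt      x^(-1/3) = a (with sign combined from x)
+//   cube root squared    x^(2/3)  = a*|x|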
+
+// template instances for cbrt and reciprocal_cbrt
+
+// cube root
+static inline Vec2d cbrt(Vec2d const& x) { return cbrt_d<Vec2d, Vec4ui, Vec2uq, Vec2db, 1>(x); }
+
+// reciprocal cube root
+static inline Vec2d reciprocal_cbrt(Vec2d const& x) { return cbrt_d<Vec2d, Vec4ui, Vec2uq, Vec2db, -1>(x); }
+
+// cube root squared
+static inline Vec2d square_cbrt(Vec2d const& x) { return cbrt_d<Vec2d, Vec4ui, Vec2uq, Vec2db, 2>(x); }
+
+#if MAX_VECTOR_SIZE >= 256
+
+static inline Vec4d cbrt(Vec4d const& x) { return cbrt_d<Vec4d, Vec8ui, Vec4uq, Vec4db, 1>(x); }
+
+static inline Vec4d reciprocal_cbrt(Vec4d const& x) { return cbrt_d<Vec4d, Vec8ui, Vec4uq, Vec4db, -1>(x); }
+
+static inline Vec4d square_cbrt(Vec4d const& x) { return cbrt_d<Vec4d, Vec8ui, Vec4uq, Vec4db, 2>(x); }
+
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+
+static inline Vec8d cbrt(Vec8d const& x) { return cbrt_d<Vec8d, Vec16ui, Vec8uq, Vec8db, 1>(x); }
+
+static inline Vec8d reciprocal_cbrt(Vec8d const& x) { return cbrt_d<Vec8d, Vec16ui, Vec8uq, Vec8db, -1>(x); }
+
+static inline Vec8d square_cbrt(Vec8d const& x) { return cbrt_d<Vec8d, Vec16ui, Vec8uq, Vec8db, 2>(x); }
+
+#endif  // MAX_VECTOR_SIZE >= 512
+
+// cube root template, single precision
+// template parameters:
+// VTYPE:  f.p. vector type
+// ITYPE:  uint32_t integer vector type
+// BVTYPE: boolean vector type
+// CR:     -1 for reciprocal cube root, 1 for cube root, 2 for cube root squared
+template <class VTYPE, class ITYPE, class BVTYPE, int CR>
+static inline VTYPE cbrt_f(VTYPE const& x)
+{
+  const int iter = 6;  // iteration count of x^(-1/3) loop
+  int i;
+  VTYPE xa, xa3, a, a2;
+  ITYPE m1, m2;
+  BVTYPE underflow;
+  ITYPE q1(0x54800000U);  // exponent bias
+  ITYPE q2(0x002AAAAAU);  // exponent multiplier for 1/3
+  ITYPE q3(0x00800000U);  // denormal limit
+  const float one_third  = float(1. / 3.);
+  const float four_third = float(4. / 3.);
+
+  xa  = abs(x);
+  xa3 = one_third * xa;
+
+  // multiply exponent by -1/3
+  m1 = reinterpret_i(xa);
+  m2 = q1 - (m1 >> 23) * q2;
+  a  = reinterpret_f(m2);
+
+  underflow = BVTYPE(m1 < q3);  // true if denormal or zero
+
+  // Newton Raphson iteration
+  for(i = 0; i < iter - 1; i++)
+    {
+      a2 = a * a;
+      a  = nmul_add(xa3, a2 * a2, four_third * a);  // a = four_third*a - xa3*a2*a2;
+    }
+  // last iteration with better precision
+  a2 = a * a;
+  a  = mul_add(one_third, nmul_add(xa, a2 * a2, a), a);  // a = a + one_third*(a - xa*a2*a2);
+
+  if(CR == -1)
+    {  // reciprocal cube root
+      // generate INF if underflow
+      a = select(underflow, infinite_vec<VTYPE>(), a);
+      // get sign
+      a = sign_combine(a, x);
+    }
+  else if(CR == 1)
+    {  // cube root
+      a = a * a * x;
+      // generate 0 if underflow
+      a = select(underflow, 0., a);
+    }
+  else if(CR == 2)
+    {  // cube root squared
+      a = a * xa;
+      // generate 0 if underflow
+      a = select(underflow, 0., a);
+    }
+  return a;
+}
+
+// template instances for cbrt and reciprocal_cbrt
+
+// cube root
+static inline Vec4f cbrt(Vec4f const& x) { return cbrt_f<Vec4f, Vec4ui, Vec4fb, 1>(x); }
+
+// reciprocal cube root
+static inline Vec4f reciprocal_cbrt(Vec4f const& x) { return cbrt_f<Vec4f, Vec4ui, Vec4fb, -1>(x); }
+
+// cube root squared
+static inline Vec4f square_cbrt(Vec4f const& x) { return cbrt_f<Vec4f, Vec4ui, Vec4fb, 2>(x); }
+
+#if MAX_VECTOR_SIZE >= 256
+
+static inline Vec8f cbrt(Vec8f const& x) { return cbrt_f<Vec8f, Vec8ui, Vec8fb, 1>(x); }
+
+static inline Vec8f reciprocal_cbrt(Vec8f const& x) { return cbrt_f<Vec8f, Vec8ui, Vec8fb, -1>(x); }
+
+static inline Vec8f square_cbrt(Vec8f const& x) { return cbrt_f<Vec8f, Vec8ui, Vec8fb, 2>(x); }
+
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+
+static inline Vec16f cbrt(Vec16f const& x) { return cbrt_f<Vec16f, Vec16ui, Vec16fb, 1>(x); }
+
+static inline Vec16f reciprocal_cbrt(Vec16f const& x) { return cbrt_f<Vec16f, Vec16ui, Vec16fb, -1>(x); }
+
+static inline Vec16f square_cbrt(Vec16f const& x) { return cbrt_f<Vec16f, Vec16ui, Vec16fb, 2>(x); }
+
+// Helper function for power function: insert special values of pow(x,y) when x=0:
+// y<0 -> inf, y=0 -> 1, y>0 -> 0, y=nan -> nan
+static inline Vec8d wm_pow_case_x0(Vec8db const& xiszero, Vec8d const& y, Vec8d const& z)
+{
+#if INSTRSET >= 9
+  const __m512i table = Vec8q(0x85858A00);
+  return _mm512_mask_fixupimm_pd(z, xiszero, y, table, 0);
+#else
+  return select(xiszero, select(y < 0., infinite_vec<Vec8d>(), select(y == 0., Vec8d(1.), Vec8d(0.))), z);
+#endif
+}
+
+#endif  // MAX_VECTOR_SIZE >= 512
+
+#if MAX_VECTOR_SIZE >= 256
+
+static inline Vec4d wm_pow_case_x0(Vec4db const& xiszero, Vec4d const& y, Vec4d const& z)
+{
+  //#if defined __AVX512VL__ && defined ?
+  //   const __m256i table = Vec4q(0x85858A00);
+  //    return _mm256_mask_fixupimm_pd(z, xiszero, y, table, 0);
+  //#else
+  return select(xiszero, select(y < 0., infinite_vec<Vec4d>(), select(y == 0., Vec4d(1.), Vec4d(0.))), z);
+  //#endif
+}
+#endif
+
+static inline Vec2d wm_pow_case_x0(Vec2db const& xiszero, Vec2d const& y, Vec2d const& z)
+{
+  //#if defined __AVX512VL__ && defined ?
+  //    const __m128i table = Vec2q(0x85858A00);
+  //    return _mm_mask_fixupimm_pd(z, xiszero, y, table, 0);
+  //#else
+  return select(xiszero, select(y < 0., infinite_vec<Vec2d>(), select(y == 0., Vec2d(1.), Vec2d(0.))), z);
+  //#endif
+}
+
+// ****************************************************************************
+//                pow template, double precision
+// ****************************************************************************
+// Calculate x to the power of y.
+
+// Precision is important here because rounding errors get multiplied by y.
+// The logarithm is calculated with extra precision, and the exponent is
+// calculated separately.
+// The logarithm is calculated by Padé approximation with 6th-degree
+// polynomials. A 7th degree would be preferable for best precision at high y.
+// The alternative method: log(x) = z + z^3*R(z)/S(z), where z = 2(x-1)/(x+1)
+// did not give better precision.
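+// For illustration, the overall scheme is x^y = 2^(y*log2(x)), evaluated as
+// exp(y*log(x) - e*ln2) * 2^e with the integer part e split off so that the
+// exp argument stays small and the rounding error of log(x) is not amplified
+// by a large y. A minimal scalar sketch of the splitting (ignoring the
+// extra-precision bookkeeping done below):
+//   double pow_sketch(double x, double y) {
+//     double lg = std::log(x);                               // log(x)
+//     double e  = std::round(y * lg * 1.4426950408889634);   // integer part of y*log2(x)
+//     double v  = y * lg - e * 0.6931471805599453;           // remainder, |v| small
+//     return std::ldexp(std::exp(v), (int)e);                // exp(v) * 2^e
+//   }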
+
+// Template parameters:
+// VTYPE:  data vector type
+// ITYPE:  signed integer vector type
+// BVTYPE: boolean vector type
+template <class VTYPE, class ITYPE, class BVTYPE>
+static inline VTYPE pow_template_d(VTYPE const& x0, VTYPE const& y)
+{
+  // define constants
+  const double ln2d_hi = 0.693145751953125;          // log(2) in extra precision, high bits
+  const double ln2d_lo = 1.42860682030941723212E-6;  // low bits of log(2)
+  const double log2e   = VM_LOG2E;                   // 1/log(2)
+  const double pow2_52 = 4503599627370496.0;         // 2^52
+
+  // coefficients for Padé polynomials
+  const double P0logl = 2.0039553499201281259648E1;
+  const double P1logl = 5.7112963590585538103336E1;
+  const double P2logl = 6.0949667980987787057556E1;
+  const double P3logl = 2.9911919328553073277375E1;
+  const double P4logl = 6.5787325942061044846969E0;
+  const double P5logl = 4.9854102823193375972212E-1;
+  const double P6logl = 4.5270000862445199635215E-5;
+  const double Q0logl = 6.0118660497603843919306E1;
+  const double Q1logl = 2.1642788614495947685003E2;
+  const double Q2logl = 3.0909872225312059774938E2;
+  const double Q3logl = 2.2176239823732856465394E2;
+  const double Q4logl = 8.3047565967967209469434E1;
+  const double Q5logl = 1.5062909083469192043167E1;
+
+  // Taylor coefficients for exp function, 1/n!
+  const double p2  = 1. / 2.;
+  const double p3  = 1. / 6.;
+  const double p4  = 1. / 24.;
+  const double p5  = 1. / 120.;
+  const double p6  = 1. / 720.;
+  const double p7  = 1. / 5040.;
+  const double p8  = 1. / 40320.;
+  const double p9  = 1. / 362880.;
+  const double p10 = 1. / 3628800.;
+  const double p11 = 1. / 39916800.;
+  const double p12 = 1. / 479001600.;
+  const double p13 = 1. / 6227020800.;
+
+  // data vectors
+  VTYPE x, x1, x2;
+  VTYPE px, qx, ef, yr, v, z, z1;
+  VTYPE lg, lg1, lg2;
+  VTYPE lgerr, x2err;
+  VTYPE e1, e2, e3, ee;
+  // integer vectors
+  ITYPE ei, ej, yodd;
+  // boolean vectors
+  BVTYPE blend, xzero, xnegative;
+  BVTYPE overflow, underflow, xfinite, yfinite, efinite;
+
+  // remove sign
+  x1 = abs(x0);
+
+  // Separate mantissa from exponent
+  // This gives the mantissa * 0.5
+  x = fraction_2(x1);
+
+  // reduce range of x = +/- sqrt(2)/2
+  blend = x > VM_SQRT2 * 0.5;
+  x     = if_add(!blend, x, x);  // conditional add
+
+  // Pade approximation
+  // Higher precision than in log function. Still higher precision wanted
+  x -= 1.0;
+  x2 = x * x;
+  px = polynomial_6(x, P0logl, P1logl, P2logl, P3logl, P4logl, P5logl, P6logl);
+  px *= x * x2;
+  qx  = polynomial_6n(x, Q0logl, Q1logl, Q2logl, Q3logl, Q4logl, Q5logl);
+  lg1 = px / qx;
+
+  // extract exponent
+  ef = exponent_f(x1);
+  ef = if_add(blend, ef, 1.);  // conditional add
+
+  // multiply exponent by y
+  // nearest integer e1 goes into exponent of result, remainder yr is added to log
+  e1 = round(ef * y);
+  yr = mul_sub_x(ef, y, e1);  // calculate remainder yr. precision very important here
+
+  // add initial terms to Pade expansion
+  lg = nmul_add(0.5, x2, x) + lg1;  // lg = (x - 0.5 * x2) + lg1;
+  // calculate rounding errors in lg
+  // rounding error in multiplication 0.5*x*x
+  x2err = mul_sub_x(0.5 * x, x, 0.5 * x2);
+  // rounding error in additions and subtractions
+  lgerr = mul_add(0.5, x2, lg - x) - lg1;  // lgerr = ((lg - x) + 0.5 * x2) - lg1;
+
+  // extract something for the exponent
+  e2 = round(lg * y * VM_LOG2E);
+  // subtract this from lg, with extra precision
+  v = mul_sub_x(lg, y, e2 * ln2d_hi);
+  v = nmul_add(e2, ln2d_lo, v);  // v -= e2 * ln2d_lo;
+
+  // add remainder from ef * y
+  v = mul_add(yr, VM_LN2, v);  // v += yr * VM_LN2;
+
+  // correct for previous rounding errors
+  v = nmul_add(lgerr + x2err, y, v);  // v -= (lgerr + x2err) * y;
+
+  // exp function
+
+  // extract something for the exponent if possible
+  x  = v;
+  e3 = round(x * log2e);
+  // high precision multiplication not needed here because abs(e3) <= 1
+  x = nmul_add(e3, VM_LN2, x);  // x -= e3 * VM_LN2;
+
+  z = polynomial_13m(x, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13);
+  z = z + 1.0;
+
+  // contributions to exponent
+  ee = e1 + e2 + e3;
+  ei = round_to_int64_limited(ee);
+  // biased exponent of result:
+  ej = ei + (ITYPE(reinterpret_i(z)) >> 52);
+  // check exponent for overflow and underflow
+  overflow  = BVTYPE(ej >= 0x07FF) | (ee > 3000.);
+  underflow = BVTYPE(ej <= 0x0000) | (ee < -3000.);
+
+  // add exponent by integer addition
+  z = reinterpret_d(ITYPE(reinterpret_i(z)) + (ei << 52));
+
+  // check for special cases
+  xfinite   = is_finite(x0);
+  yfinite   = is_finite(y);
+  efinite   = is_finite(ee);
+  xzero     = is_zero_or_subnormal(x0);
+  xnegative = x0 < 0.;
+
+  // check for overflow and underflow
+  if(horizontal_or(overflow | underflow))
+    {
+      // handle errors
+      z = select(underflow, VTYPE(0.), z);
+      z = select(overflow, infinite_vec<VTYPE>(), z);
+    }
+
+  // check for x == 0
+  z = wm_pow_case_x0(xzero, y, z);
+  // z = select(xzero, select(y < 0., infinite_vec<VTYPE>(), select(y == 0., VTYPE(1.), VTYPE(0.))), z);
+
+  // check for x < 0. y must be integer
+  if(horizontal_or(xnegative))
+    {
+      // test if y odd
+      yodd = ITYPE(reinterpret_i(abs(y) + pow2_52)) << 63;        // convert y to integer and shift bit 0 to position of sign bit
+      z1   = z | (x0 & VTYPE(reinterpret_d(yodd)));               // apply sign if y odd
+      z1   = select(y == round(y), z1, nan_vec<VTYPE>(NAN_POW));  // NAN if y not integer
+      z    = select(xnegative, z1, z);
+    }
+
+  // check for range errors
+  if(horizontal_and(xfinite & yfinite & (efinite | xzero)))
+    {
+      // fast return if no special cases
+      return z;
+    }
+
+  // handle special error cases
+  z    = select(yfinite & efinite, z, select(x1 == 1., VTYPE(1.), select((x1 > 1.) ^ sign_bit(y), infinite_vec<VTYPE>(), 0.)));
+  yodd = ITYPE(reinterpret_i(abs(y) + pow2_52)) << 63;  // same as above
+  z    = select(xfinite, z,
+             select(y == 0., VTYPE(1.), select(y < 0., VTYPE(0.), infinite_vec<VTYPE>() | (VTYPE(reinterpret_d(yodd)) & x0))));
+  z    = select(is_nan(x0), select(is_nan(y), x0 | y, x0), select(is_nan(y), y, z));
+  return z;
+}
+
+// This template is in vectorf128.h to prevent implicit conversion of float y to int when float version is not defined:
+// template <typename TT> static Vec2d pow(Vec2d const & a, TT n);
+
+// instantiations of pow_template_d:
+template <>
+inline Vec2d pow<Vec2d>(Vec2d const& x, Vec2d const& y)
+{
+  return pow_template_d<Vec2d, Vec2q, Vec2db>(x, y);
+}
+
+template <>
+inline Vec2d pow<double>(Vec2d const& x, double const& y)
+{
+  return pow_template_d<Vec2d, Vec2q, Vec2db>(x, y);
+}
+template <>
+inline Vec2d pow<float>(Vec2d const& x, float const& y)
+{
+  return pow_template_d<Vec2d, Vec2q, Vec2db>(x, (double)y);
+}
+
+#if MAX_VECTOR_SIZE >= 256
+
+template <>
+inline Vec4d pow<Vec4d>(Vec4d const& x, Vec4d const& y)
+{
+  return pow_template_d<Vec4d, Vec4q, Vec4db>(x, y);
+}
+
+template <>
+inline Vec4d pow<double>(Vec4d const& x, double const& y)
+{
+  return pow_template_d<Vec4d, Vec4q, Vec4db>(x, y);
+}
+
+template <>
+inline Vec4d pow<float>(Vec4d const& x, float const& y)
+{
+  return pow_template_d<Vec4d, Vec4q, Vec4db>(x, (double)y);
+}
+
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+
+template <>
+inline Vec8d pow<Vec8d>(Vec8d const& x, Vec8d const& y)
+{
+  return pow_template_d<Vec8d, Vec8q, Vec8db>(x, y);
+}
+
+template <>
+inline Vec8d pow<double>(Vec8d const& x, double const& y)
+{
+  return pow_template_d<Vec8d, Vec8q, Vec8db>(x, y);
+}
+
+template <>
+inline Vec8d pow<float>(Vec8d const& x, float const& y)
+{
+  return pow_template_d<Vec8d, Vec8q, Vec8db>(x, (double)y);
+}
+
+// Helper function for power function: insert special values of pow(x,y) when x=0:
+// y<0 -> inf, y=0 -> 1, y>0 -> 0, y=nan -> nan
+static inline Vec16f wm_pow_case_x0(Vec16fb const& xiszero, Vec16f const& y, Vec16f const& z)
+{
+#if INSTRSET >= 9
+  const __m512i table = Vec16ui(0x85858A00);
+  return _mm512_mask_fixupimm_ps(z, xiszero, y, table, 0);
+#else
+  return select(xiszero, select(y < 0.f, infinite_vec<Vec16f>(), select(y == 0.f, Vec16f(1.f), Vec16f(0.f))), z);
+#endif
+}
+
+#endif  // MAX_VECTOR_SIZE >= 512
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec8f wm_pow_case_x0(Vec8fb const& xiszero, Vec8f const& y, Vec8f const& z)
+{
+  return select(xiszero, select(y < 0.f, infinite_vec<Vec8f>(), select(y == 0.f, Vec8f(1.f), Vec8f(0.f))), z);
+}
+#endif
+
+static inline Vec4f wm_pow_case_x0(Vec4fb const& xiszero, Vec4f const& y, Vec4f const& z)
+{
+  return select(xiszero, select(y < 0.f, infinite_vec<Vec4f>(), select(y == 0.f, Vec4f(1.f), Vec4f(0.f))), z);
+}
+
+// ****************************************************************************
+//                pow template, single precision
+// ****************************************************************************
+
+// Template parameters:
+// VTYPE:  data vector type
+// ITYPE:  signed integer vector type
+// BVTYPE: boolean vector type
+// Calculate x to the power of y
+template <class VTYPE, class ITYPE, class BVTYPE>
+static inline VTYPE pow_template_f(VTYPE const& x0, VTYPE const& y)
+{
+  // define constants
+  const float ln2f_hi = 0.693359375f;
+  const float ln2f_lo = -2.12194440e-4f;
+  // const float max_expf =  87.3f;
+  const float log2e   = float(VM_LOG2E);  // 1/log(2)
+  const float pow2_23 = 8388608.0f;       // 2^23
+
+  const float P0logf = 3.3333331174E-1f;
+  const float P1logf = -2.4999993993E-1f;
+  const float P2logf = 2.0000714765E-1f;
+  const float P3logf = -1.6668057665E-1f;
+  const float P4logf = 1.4249322787E-1f;
+  const float P5logf = -1.2420140846E-1f;
+  const float P6logf = 1.1676998740E-1f;
+  const float P7logf = -1.1514610310E-1f;
+  const float P8logf = 7.0376836292E-2f;
+
+  // Taylor coefficients for exp function, 1/n!
+  const float p2expf = 1.f / 2.f;
+  const float p3expf = 1.f / 6.f;
+  const float p4expf = 1.f / 24.f;
+  const float p5expf = 1.f / 120.f;
+  const float p6expf = 1.f / 720.f;
+  const float p7expf = 1.f / 5040.f;
+
+  // data vectors
+  VTYPE x, x1, x2;
+  VTYPE ef, yr, v, z, z1;
+  VTYPE lg, lg1;
+  VTYPE lgerr, x2err;
+  VTYPE e1, e2, e3, ee;
+  // integer vectors
+  ITYPE ei, ej, yodd;
+  // boolean vectors
+  BVTYPE blend, xzero, xnegative;
+  BVTYPE overflow, underflow, xfinite, yfinite, efinite;
+
+  // remove sign
+  x1 = abs(x0);
+
+  // Separate mantissa from exponent
+  // This gives the mantissa * 0.5
+  x = fraction_2(x1);
+
+  // reduce range of x = +/- sqrt(2)/2
+  blend = x > float(VM_SQRT2 * 0.5);
+  x     = if_add(!blend, x, x);  // conditional add
+
+  // Taylor expansion, high precision
+  x -= 1.0f;
+  x2  = x * x;
+  lg1 = polynomial_8(x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf);
+  lg1 *= x2 * x;
+
+  // extract exponent
+  ef = exponent_f(x1);
+  ef = if_add(blend, ef, 1.0f);  // conditional add
+
+  // multiply exponent by y
+  // nearest integer e1 goes into exponent of result, remainder yr is added to log
+  e1 = round(ef * y);
+  yr = mul_sub_x(ef, y, e1);  // calculate remainder yr. precision very important here
+
+  // add initial terms to expansion
+  lg = nmul_add(0.5f, x2, x) + lg1;  // lg = (x - 0.5f * x2) + lg1;
+
+  // calculate rounding errors in lg
+  // rounding error in multiplication 0.5*x*x
+  x2err = mul_sub_x(0.5f * x, x, 0.5f * x2);
+  // rounding error in additions and subtractions
+  lgerr = mul_add(0.5f, x2, lg - x) - lg1;  // lgerr = ((lg - x) + 0.5f * x2) - lg1;
+
+  // extract something for the exponent
+  e2 = round(lg * y * float(VM_LOG2E));
+  // subtract this from lg, with extra precision
+  v = mul_sub_x(lg, y, e2 * ln2f_hi);
+  v = nmul_add(e2, ln2f_lo, v);  // v -= e2 * ln2f_lo;
+
+  // correct for previous rounding errors
+  v -= mul_sub(lgerr + x2err, y, yr * float(VM_LN2));  // v -= (lgerr + x2err) * y - yr * float(VM_LN2) ;
+
+  // exp function
+
+  // extract something for the exponent if possible
+  x  = v;
+  e3 = round(x * log2e);
+  // high precision multiplication not needed here because abs(e3) <= 1
+  x = nmul_add(e3, float(VM_LN2), x);  // x -= e3 * float(VM_LN2);
+
+  // Taylor polynomial
+  x2 = x * x;
+  z  = polynomial_5(x, p2expf, p3expf, p4expf, p5expf, p6expf, p7expf) * x2 + x + 1.0f;
+
+  // contributions to exponent
+  ee = e1 + e2 + e3;
+  ei = round_to_int(ee);
+  // biased exponent of result:
+  ej = ei + (ITYPE(reinterpret_i(z)) >> 23);
+  // check exponent for overflow and underflow
+  overflow  = BVTYPE(ej >= 0x0FF) | (ee > 300.f);
+  underflow = BVTYPE(ej <= 0x000) | (ee < -300.f);
+
+  // add exponent by integer addition
+  z = reinterpret_f(ITYPE(reinterpret_i(z)) + (ei << 23));  // the extra 0x10000 is shifted out here
+
+  // check for special cases
+  xfinite = is_finite(x0);
+  yfinite = is_finite(y);
+  efinite = is_finite(ee);
+
+  xzero     = is_zero_or_subnormal(x0);
+  xnegative = x0 < 0.f;
+
+  // check for overflow and underflow
+  if(horizontal_or(overflow | underflow))
+    {
+      // handle errors
+      z = select(underflow, VTYPE(0.f), z);
+      z = select(overflow, infinite_vec<VTYPE>(), z);
+    }
+
+  // check for x == 0
+  z = wm_pow_case_x0(xzero, y, z);
+  // z = select(xzero, select(y < 0.f, infinite_vec<VTYPE>(), select(y == 0.f, VTYPE(1.f), VTYPE(0.f))), z);
+
+  // check for x < 0. y must be integer
+  if(horizontal_or(xnegative))
+    {
+      // test if y odd
+      yodd = ITYPE(reinterpret_i(abs(y) + pow2_23)) << 31;        // convert y to integer and shift bit 0 to position of sign bit
+      z1   = z | (x0 & VTYPE(reinterpret_f(yodd)));               // apply sign if y odd
+      z1   = select(y == round(y), z1, nan_vec<VTYPE>(NAN_POW));  // NAN if y not integer
+      z    = select(xnegative, z1, z);
+    }
+
+  // check for range errors
+  if(horizontal_and(xfinite & yfinite & (efinite | xzero)))
+    {
+      // fast return if no special cases
+      return z;
+    }
+
+  // handle special error cases
+  z    = select(yfinite & efinite, z, select(x1 == 1.f, VTYPE(1.f), select((x1 > 1.f) ^ sign_bit(y), infinite_vec<VTYPE>(), 0.f)));
+  yodd = ITYPE(reinterpret_i(abs(y) + pow2_23)) << 31;  // same as above
+  z    = select(xfinite, z,
+             select(y == 0.f, VTYPE(1.f), select(y < 0.f, VTYPE(0.f), infinite_vec<VTYPE>() | (VTYPE(reinterpret_f(yodd)) & x0))));
+  z    = select(is_nan(x0), select(is_nan(y), x0 | y, x0), select(is_nan(y), y, z));
+  return z;
+}
+
+// This template is in vectorf128.h to prevent implicit conversion of float y to int when float version is not defined:
+// template <typename TT> static Vec4f pow(Vec4f const & a, TT n);
+
+template <>
+inline Vec4f pow<Vec4f>(Vec4f const& x, Vec4f const& y)
+{
+  return pow_template_f<Vec4f, Vec4i, Vec4fb>(x, y);
+}
+
+template <>
+inline Vec4f pow<float>(Vec4f const& x, float const& y)
+{
+  return pow_template_f<Vec4f, Vec4i, Vec4fb>(x, y);
+}
+
+template <>
+inline Vec4f pow<double>(Vec4f const& x, double const& y)
+{
+  return pow_template_f<Vec4f, Vec4i, Vec4fb>(x, (float)y);
+}
+
+#if MAX_VECTOR_SIZE >= 256
+
+template <>
+inline Vec8f pow<Vec8f>(Vec8f const& x, Vec8f const& y)
+{
+  return pow_template_f<Vec8f, Vec8i, Vec8fb>(x, y);
+}
+
+template <>
+inline Vec8f pow<float>(Vec8f const& x, float const& y)
+{
+  return pow_template_f<Vec8f, Vec8i, Vec8fb>(x, y);
+}
+template <>
+inline Vec8f pow<double>(Vec8f const& x, double const& y)
+{
+  return pow_template_f<Vec8f, Vec8i, Vec8fb>(x, (float)y);
+}
+
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+
+template <>
+inline Vec16f pow<Vec16f>(Vec16f const& x, Vec16f const& y)
+{
+  return pow_template_f<Vec16f, Vec16i, Vec16fb>(x, y);
+}
+
+template <>
+inline Vec16f pow<float>(Vec16f const& x, float const& y)
+{
+  return pow_template_f<Vec16f, Vec16i, Vec16fb>(x, y);
+}
+
+template <>
+inline Vec16f pow<double>(Vec16f const& x, double const& y)
+{
+  return pow_template_f<Vec16f, Vec16i, Vec16fb>(x, (float)y);
+}
+
+#endif  // MAX_VECTOR_SIZE >= 512
+
+// *************************************************************
+//             power function with rational exponent
+// *************************************************************
+// Power function with rational exponent: x^(a/b)
+// Template must be defined as class to allow partial template specialization
+template <int a, int b>
+class Power_rational
+{
+ public:
+  // overloaded member function for each vector type
+  Vec4f pow(Vec4f const& x)
+  {
+    Vec4f y = x;
+    // negative x allowed when b odd or a even
+    // (if a is even then either b is odd or a/b can be reduced,
+    // but we can check a even anyway at no cost to be sure)
+    if(a == 0)
+      return 1.f;
+    if((b | ~a) & 1)
+      y = abs(y);
+    y = pow(y, float(double(a) / double(b)));
+    if(a & b & 1)
+      y = sign_combine(y, x);  // apply sign if a and b both odd
+    if((a ^ b) >= 0)
+      y = select(x == 0.f, 0.f, y);  // zero allowed for positive a and b
+    return y;
+  }
+  Vec2d pow(Vec2d const& x)
+  {
+    Vec2d y = x;
+    if(a == 0)
+      return 1.;
+    if((b | ~a) & 1)
+      y = abs(y);
+    y = pow(y, double((long double)a / (long double)b));
+    if(a & b & 1)
+      y = sign_combine(y, x);
+    if((a ^ b) >= 0)
+      y = select(x == 0., 0., y);
+    return y;
+  }
+#if MAX_VECTOR_SIZE >= 256
+  Vec8f pow(Vec8f const& x)
+  {
+    Vec8f y = x;
+    if(a == 0)
+      return 1.f;
+    if((b | ~a) & 1)
+      y = abs(y);
+    y = pow(y, float(double(a) / double(b)));
+    if(a & b & 1)
+      y = sign_combine(y, x);
+    if((a ^ b) >= 0)
+      y = select(x == 0.f, 0.f, y);
+    return y;
+  }
+  Vec4d pow(Vec4d const& x)
+  {
+    Vec4d y = x;
+    if(a == 0)
+      return 1.;
+    if((b | ~a) & 1)
+      y = abs(y);
+    y = pow(y, double((long double)a / (long double)b));
+    if(a & b & 1)
+      y = sign_combine(y, x);
+    if((a ^ b) >= 0)
+      y = select(x == 0., 0., y);
+    return y;
+  }
+#endif  // MAX_VECTOR_SIZE >= 256
+#if MAX_VECTOR_SIZE >= 512
+  Vec16f pow(Vec16f const& x)
+  {
+    Vec16f y = x;
+    if(a == 0)
+      return 1.f;
+    if((b | ~a) & 1)
+      y = abs(y);
+    y = pow(y, float(double(a) / double(b)));
+    if(a & b & 1)
+      y = sign_combine(y, x);
+    if((a ^ b) >= 0)
+      y = select(x == 0.f, 0.f, y);
+    return y;
+  }
+  Vec8d pow(Vec8d const& x)
+  {
+    Vec8d y = x;
+    if(a == 0)
+      return 1.;
+    if((b | ~a) & 1)
+      y = abs(y);
+    y = pow(y, double((long double)a / (long double)b));
+    if(a & b & 1)
+      y = sign_combine(y, x);
+    if((a ^ b) >= 0)
+      y = select(x == 0., 0., y);
+    return y;
+  }
+#endif  // MAX_VECTOR_SIZE >= 512
+};
+
+// partial specialization for b = 1
+template <int a>
+class Power_rational<a, 1>
+{
+ public:
+  template <class VTYPE>
+  VTYPE pow(VTYPE const& x)
+  {
+    return pow_n<a>(x);
+  }
+};
+
+// partial specialization for b = 2
+template <int a>
+class Power_rational<a, 2>
+{
+ public:
+  template <class VTYPE>
+  VTYPE pow(VTYPE const& x)
+  {
+    VTYPE y = pow_n<(a > 0 ? a / 2 : (a - 1) / 2)>(x);
+    if(a & 1)
+      y *= sqrt(x);
+    return y;
+  }
+};
+
+// full specialization for a = 1, b = 2
+template <>
+class Power_rational<1, 2>
+{
+ public:
+  template <class VTYPE>
+  VTYPE pow(VTYPE const& x)
+  {
+    return sqrt(x);
+  }
+};
+
+// full specialization for a = -1, b = 2
+template <>
+class Power_rational<-1, 2>
+{
+ public:
+  template <class VTYPE>
+  VTYPE pow(VTYPE const& x)
+  {
+    // (this is faster than iteration method on modern CPUs)
+    return VTYPE(1.f) / sqrt(x);
+  }
+};
+
+// partial specialization for b = 3
+template <int a>
+class Power_rational<a, 3>
+{
+ public:
+  template <class VTYPE>
+  VTYPE pow(VTYPE const& x)
+  {
+    VTYPE t;
+    switch(a % 3)
+      {
+        case -2:
+          t = reciprocal_cbrt(x);
+          t *= t;
+          if(a == -2)
+            return t;
+          t = t / pow_n<(-a - 2) / 3>(x);
+          break;
+        case -1:
+          t = reciprocal_cbrt(x);
+          if(a == -1)
+            return t;
+          t = t / pow_n<(-a - 1) / 3>(x);
+          break;
+        case 0:
+          t = pow_n<a / 3>(x);
+          break;
+        case 1:
+          t = cbrt(x);
+          if(a == 1)
+            return t;
+          t = t * pow_n<a / 3>(x);
+          break;
+        case 2:
+          t = square_cbrt(x);
+          if(a == 2)
+            return t;
+          t = t * pow_n<a / 3>(x);
+          break;
+      }
+    return t;
+  }
+};
+
+// partial specialization for b = 4
+template <int a>
+class Power_rational<a, 4>
+{
+ public:
+  template <class VTYPE>
+  VTYPE pow(VTYPE const& x)
+  {
+    VTYPE t, s1, s2;
+    s1 = sqrt(x);
+    if(a & 1)
+      s2 = sqrt(s1);
+    switch(a % 4)
+      {
+        case -3:
+          t = s2 / pow_n<1 + (-a) / 4>(x);
+          break;
+        case -2:
+          t = s1 / pow_n<1 + (-a) / 4>(x);
+          break;
+        case -1:
+          if(a != -1)
+            s2 *= pow_n<(-a) / 4>(x);
+          t = VTYPE(1.f) / s2;
+          break;
+        case 0:
+        default:
+          t = pow_n<a / 4>(x);
+          break;
+        case 1:
+          t = s2;
+          if(a != 1)
+            t *= pow_n<a / 4>(x);
+          break;
+        case 2:
+          t = s1;
+          if(a != 2)
+            t *= pow_n<a / 4>(x);
+          break;
+        case 3:
+          t = s1 * s2;
+          if(a != 3)
+            t *= pow_n<a / 4>(x);
+          break;
+      }
+    return t;
+  }
+};
+
+// partial specialization for b = 6
+template <int a>
+class Power_rational<a, 6>
+{
+ public:
+  template <class VTYPE>
+  VTYPE pow(VTYPE const& x)
+  {
+    VTYPE t, s1, s2, s3;
+    switch(a % 6)
+      {
+        case -5:
+          t = reciprocal_cbrt(x);
+          t = t * t * sqrt(t);
+          if(a != -5)
+            t /= pow_n<(-a) / 6>(x);
+          break;
+        case -4:
+          t = reciprocal_cbrt(x);
+          t *= t;
+          if(a != -4)
+            t /= pow_n<(-a) / 6>(x);
+          break;
+        case -3:
+          t = pow_n<a / 6>(x);
+          t /= sqrt(x);
+          break;
+        case -2:
+          t = reciprocal_cbrt(x);
+          if(a != -2)
+            t /= pow_n<(-a) / 6>(x);
+          break;
+        case -1:
+          t = sqrt(reciprocal_cbrt(x));
+          if(a != -1)
+            t /= pow_n<(-a) / 6>(x);
+          break;
+        case 0:
+        default:
+          t = pow_n<a / 6>(x);
+          break;
+        case 1:
+          t = sqrt(cbrt(x));
+          if(a != 1)
+            t *= pow_n<a / 6>(x);
+          break;
+        case 2:
+          t = cbrt(x);
+          if(a != 2)
+            t *= pow_n<a / 6>(x);
+          break;
+        case 3:
+          t = sqrt(x);
+          if(a != 3)
+            t *= pow_n<a / 6>(x);
+          break;
+        case 4:
+          t = square_cbrt(x);
+          if(a != 4)
+            t *= pow_n<a / 6>(x);
+          break;
+        case 5:
+          t = cbrt(x);
+          t = t * t * sqrt(t);
+          if(a != 5)
+            t *= pow_n<a / 6>(x);
+          break;
+      }
+    return t;
+  }
+};
+
+// partial specialization for b = 8
+template <int a>
+class Power_rational<a, 8>
+{
+ public:
+  template <class VTYPE>
+  VTYPE pow(VTYPE const& x)
+  {
+    VTYPE t, s1, s2, s3;
+    s1 = sqrt(x);  // x^(1/2)
+    if(a & 3)
+      s2 = sqrt(s1);  // x^(1/4)
+    if(a & 1)
+      s3 = sqrt(s2);  // x^(1/8)
+    switch(a % 8)
+      {
+        case -7:
+          t = s3 / pow_n<1 + (-a) / 8>(x);
+          break;
+        case -6:
+          t = s2 / pow_n<1 + (-a) / 8>(x);
+          break;
+        case -5:
+          t = s3 * (s2 / pow_n<1 + (-a) / 8>(x));
+          break;
+        case -4:
+          t = s1 / pow_n<1 + (-a) / 8>(x);
+          break;
+        case -3:
+          t = s3 * (s1 / pow_n<1 + (-a) / 8>(x));
+          break;
+        case -2:
+          if(a != -2)
+            s2 *= pow_n<(-a) / 8>(x);
+          t = VTYPE(1.f) / s2;
+          break;
+        case -1:
+          if(a != -1)
+            s3 *= pow_n<(-a) / 8>(x);
+          t = VTYPE(1.f) / s3;
+          break;
+        case 0:
+        default:
+          t = pow_n<a / 8>(x);
+          break;
+        case 1:
+          t = s3;
+          if(a != 1)
+            t *= pow_n<a / 8>(x);
+          break;
+        case 2:
+          t = s2;
+          if(a != 2)
+            t *= pow_n<a / 8>(x);
+          break;
+        case 3:
+          t = s2 * s3;
+          if(a != 3)
+            t *= pow_n<a / 8>(x);
+          break;
+        case 4:
+          t = s1;
+          if(a != 4)
+            t *= pow_n<a / 8>(x);
+          break;
+        case 5:
+          t = s1 * s3;
+          if(a != 5)
+            t *= pow_n<a / 8>(x);
+          break;
+        case 6:
+          t = s1 * s2;
+          if(a != 6)
+            t *= pow_n<a / 8>(x);
+          break;
+        case 7:
+          t = s2 * s3;
+          if(a != 7)
+            s1 *= pow_n<a / 8>(x);
+          t *= s1;
+          break;
+      }
+    return t;
+  }
+};
+
+// macro to call template class member function pow
+#define pow_ratio(x, a, b) (Power_rational < (b) < 0 ? -(a) : (a), (b) < 0 ? -(b) : (b) > ().pow(x))
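+// Illustrative usage (sketch): pow_ratio(x, 2, 3) expands to Power_rational<2, 3>().pow(x)
+// and computes x^(2/3); a negative denominator is folded into the numerator first, so
+// pow_ratio(x, 2, -3) computes x^(-2/3). The specializations above provide sqrt/cbrt based
+// short cuts for b = 1, 2, 3, 4, 6 and 8; other exponents fall back to the general template.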
+
+/******************************************************************************
+ *                 Detect NAN codes
+ *
+ * These functions return the code hidden in a NAN. The sign bit is ignored
+ ******************************************************************************/
+
+static inline Vec4i nan_code(Vec4f const& x)
+{
+  Vec4i a  = reinterpret_i(x);
+  Vec4ib b = (a & 0x7F800000) == 0x7F800000;  // check if NAN/INF
+  return a & 0x007FFFFF & Vec4i(b);           // isolate NAN code bits
+}
+
+// This function returns the code hidden in a NAN. The sign bit is ignored
+static inline Vec2q nan_code(Vec2d const& x)
+{
+  Vec2q a       = reinterpret_i(x);
+  Vec2q const m = 0x7FF0000000000000;
+  Vec2q const n = 0x000FFFFFFFFFFFFF;
+  Vec2qb b      = (a & m) == m;  // check if NAN/INF
+  return a & n & Vec2q(b);       // isolate NAN code bits
+}
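+
+// Example (illustrative): the error paths in this file encode a code such as NAN_POW or
+// NAN_HYP into the payload of the returned NAN via nan_vec<VTYPE>(code); nan_code() recovers
+// that payload again, and yields 0 for elements that are not NAN/INF.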
+
+#if MAX_VECTOR_SIZE >= 256
+
+// This function returns the code hidden in a NAN. The sign bit is ignored
+static inline Vec8i nan_code(Vec8f const& x)
+{
+  Vec8i a  = reinterpret_i(x);
+  Vec8ib b = (a & 0x7F800000) == 0x7F800000;  // check if NAN/INF
+  return a & 0x007FFFFF & Vec8i(b);           // isolate NAN code bits
+}
+
+// This function returns the code hidden in a NAN. The sign bit is ignored
+static inline Vec4q nan_code(Vec4d const& x)
+{
+  Vec4q a       = reinterpret_i(x);
+  Vec4q const m = 0x7FF0000000000000;
+  Vec4q const n = 0x000FFFFFFFFFFFFF;
+  Vec4qb b      = (a & m) == m;  // check if NAN/INF
+  return a & n & Vec4q(b);       // isolate NAN code bits
+}
+
+#endif  // MAX_VECTOR_SIZE >= 256
+#if MAX_VECTOR_SIZE >= 512
+
+// This function returns the code hidden in a NAN. The sign bit is ignored
+static inline Vec16i nan_code(Vec16f const& x)
+{
+  Vec16i a  = Vec16i(reinterpret_i(x));
+  Vec16ib b = (a & 0x7F800000) == 0x7F800000;  // check if NAN/INF
+  return a & 0x007FFFFF & Vec16i(b);           // isolate NAN code bits
+}
+
+// This function returns the code hidden in a NAN. The sign bit is ignored
+static inline Vec8q nan_code(Vec8d const& x)
+{
+  Vec8q a       = Vec8q(reinterpret_i(x));
+  Vec8q const m = 0x7FF0000000000000;
+  Vec8q const n = 0x000FFFFFFFFFFFFF;
+  Vec8qb b      = (a & m) == m;  // check if NAN/INF
+  return a & n & Vec8q(b);       // isolate NAN code bits
+}
+
+#endif  // MAX_VECTOR_SIZE >= 512
+
+#ifdef VCL_NAMESPACE
+}
+#endif
+
+#endif  // VECTORMATH_EXP_H
diff --git a/src/vectorclass/vectormath_hyp.h b/src/vectorclass/vectormath_hyp.h
new file mode 100644
index 0000000000000000000000000000000000000000..4c2c1606ed817695434abdb00fb31bad0dc2f55f
--- /dev/null
+++ b/src/vectorclass/vectormath_hyp.h
@@ -0,0 +1,689 @@
+/****************************  vectormath_hyp.h   ******************************
+ * Author:        Agner Fog
+ * Date created:  2014-07-09
+ * Last modified: 2015-02-10
+ * Version:       1.16
+ * Project:       vector classes
+ * Description:
+ * Header file containing inline vector functions of hyperbolic and inverse
+ * hyperbolic functions:
+ * sinh        hyperbolic sine
+ * cosh        hyperbolic cosine
+ * tanh        hyperbolic tangent
+ * asinh       inverse hyperbolic sine
+ * acosh       inverse hyperbolic cosine
+ * atanh       inverse hyperbolic tangent
+ *
+ * Theory, methods and inspiration based partially on these sources:
+ * > Moshier, Stephen Lloyd Baluk: Methods and programs for mathematical functions.
+ *   Ellis Horwood, 1989.
+ * > VDT library developed at CERN by Danilo Piparo, Thomas Hauth and
+ *   Vincenzo Innocente, 2012, https://svnweb.cern.ch/trac/vdt
+ * > Cephes math library by Stephen L. Moshier 1992,
+ *   http://www.netlib.org/cephes/
+ *
+ * For detailed instructions, see vectormath_common.h and VectorClass.pdf
+ *
+ * (c) Copyright 2014-2016 GNU General Public License http://www.gnu.org/licenses
+ ******************************************************************************/
+
+#ifndef VECTORMATH_HYP_H
+#define VECTORMATH_HYP_H 1
+
+#include "vectormath_exp.h"
+
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE
+{
+#endif
+
+/******************************************************************************
+ *                 Hyperbolic functions
+ ******************************************************************************/
+
+// Template for sinh function, double precision
+// This function does not produce denormals
+// Template parameters:
+// VTYPE:  double vector type
+// BVTYPE: boolean vector type
+template <class VTYPE, class BVTYPE>
+static inline VTYPE sinh_d(VTYPE const& x0)
+{
+  // The limit of abs(x) is 709.7, as defined by max_x in vectormath_exp.h for 0.5*exp(x).
+
+  // Coefficients
+  const double p0 = -3.51754964808151394800E5;
+  const double p1 = -1.15614435765005216044E4;
+  const double p2 = -1.63725857525983828727E2;
+  const double p3 = -7.89474443963537015605E-1;
+
+  const double q0 = -2.11052978884890840399E6;
+  const double q1 = 3.61578279834431989373E4;
+  const double q2 = -2.77711081420602794433E2;
+  const double q3 = 1.0;
+
+  // data vectors
+  VTYPE x, x2, y1, y2;
+  BVTYPE x_small;  // boolean vector
+
+  x       = abs(x0);
+  x_small = x <= 1.0;  // use Pade approximation if abs(x) <= 1
+
+  if(horizontal_or(x_small))
+    {
+      // At least one element needs small method
+      x2 = x * x;
+      y1 = polynomial_3(x2, p0, p1, p2, p3) / polynomial_3(x2, q0, q1, q2, q3);
+      y1 = mul_add(y1, x * x2, x);  // y1 = x + x2*(x*y1);
+    }
+  if(!horizontal_and(x_small))
+    {
+      // At least one element needs big method
+      y2 = exp_d<VTYPE, BVTYPE, 0, 1>(x);  //   0.5 * exp(x)
+      y2 -= 0.25 / y2;                     // - 0.5 * exp(-x)
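+      // 0.25 / (0.5*exp(x)) equals 0.5*exp(-x), so y2 = 0.5*(exp(x) - exp(-x)) = sinh(x)
+      // without a second exp call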
+    }
+  y1 = select(x_small, y1, y2);  // choose method
+  y1 = sign_combine(y1, x0);     // get original sign
+
+  return y1;
+}
+
+// instances of sinh_d template
+static inline Vec2d sinh(Vec2d const& x) { return sinh_d<Vec2d, Vec2db>(x); }
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec4d sinh(Vec4d const& x) { return sinh_d<Vec4d, Vec4db>(x); }
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec8d sinh(Vec8d const& x) { return sinh_d<Vec8d, Vec8db>(x); }
+#endif  // MAX_VECTOR_SIZE >= 512
+
+// Template for sinh function, single precision
+// This function does not produce denormals
+// Template parameters:
+// VTYPE:  float vector type
+// BVTYPE: boolean vector type
+template <class VTYPE, class BVTYPE>
+static inline VTYPE sinh_f(VTYPE const& x0)
+{
+  // The limit of abs(x) is 89.0, as defined by max_x in vectormath_exp.h for 0.5*exp(x).
+
+  // Coefficients
+  const float r0 = 1.66667160211E-1f;
+  const float r1 = 8.33028376239E-3f;
+  const float r2 = 2.03721912945E-4f;
+
+  // data vectors
+  VTYPE x, x2, y1, y2;
+  BVTYPE x_small;  // boolean vector
+
+  x       = abs(x0);
+  x_small = x <= 1.0f;  // use polynomial approximation if abs(x) <= 1
+
+  if(horizontal_or(x_small))
+    {
+      // At least one element needs small method
+      x2 = x * x;
+      y1 = polynomial_2(x2, r0, r1, r2);
+      y1 = mul_add(y1, x2 * x, x);  // y1 = x + x2*(x*y1);
+    }
+  if(!horizontal_and(x_small))
+    {
+      // At least one element needs big method
+      y2 = exp_f<VTYPE, BVTYPE, 0, 1>(x);  //   0.5 * exp(x)
+      y2 -= 0.25f / y2;                    // - 0.5 * exp(-x)
+    }
+  y1 = select(x_small, y1, y2);  // choose method
+  y1 = sign_combine(y1, x0);     // get original sign
+
+  return y1;
+}
+
+// instances of sinh_f template
+static inline Vec4f sinh(Vec4f const& x) { return sinh_f<Vec4f, Vec4fb>(x); }
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec8f sinh(Vec8f const& x) { return sinh_f<Vec8f, Vec8fb>(x); }
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec16f sinh(Vec16f const& x) { return sinh_f<Vec16f, Vec16fb>(x); }
+#endif  // MAX_VECTOR_SIZE >= 512
+
+// Template for cosh function, double precision
+// This function does not produce denormals
+// Template parameters:
+// VTYPE:  double vector type
+// BVTYPE: boolean vector type
+template <class VTYPE, class BVTYPE>
+static inline VTYPE cosh_d(VTYPE const& x0)
+{
+  // The limit of abs(x) is 709.7, as defined by max_x in vectormath_exp.h for 0.5*exp(x).
+
+  // data vectors
+  VTYPE x, y;
+
+  x = abs(x0);
+  y = exp_d<VTYPE, BVTYPE, 0, 1>(x);  //   0.5 * exp(x)
+  y += 0.25 / y;                      // + 0.5 * exp(-x)
+  return y;
+}
+
+// instances of cosh_d template
+static inline Vec2d cosh(Vec2d const& x) { return cosh_d<Vec2d, Vec2db>(x); }
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec4d cosh(Vec4d const& x) { return cosh_d<Vec4d, Vec4db>(x); }
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec8d cosh(Vec8d const& x) { return cosh_d<Vec8d, Vec8db>(x); }
+#endif  // MAX_VECTOR_SIZE >= 512
+
+// Template for cosh function, single precision
+// This function does not produce denormals
+// Template parameters:
+// VTYPE:  float vector type
+// BVTYPE: boolean vector type
+template <class VTYPE, class BVTYPE>
+static inline VTYPE cosh_f(VTYPE const& x0)
+{
+  // The limit of abs(x) is 89.0, as defined by max_x in vectormath_exp.h for 0.5*exp(x).
+
+  // data vectors
+  VTYPE x, y;
+
+  x = abs(x0);
+  y = exp_f<VTYPE, BVTYPE, 0, 1>(x);  //   0.5 * exp(x)
+  y += 0.25f / y;                     // + 0.5 * exp(-x)
+  return y;
+}
+
+// instances of cosh_f template
+static inline Vec4f cosh(Vec4f const& x) { return cosh_f<Vec4f, Vec4fb>(x); }
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec8f cosh(Vec8f const& x) { return cosh_f<Vec8f, Vec8fb>(x); }
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec16f cosh(Vec16f const& x) { return cosh_f<Vec16f, Vec16fb>(x); }
+#endif  // MAX_VECTOR_SIZE >= 512
+
+// Template for tanh function, double precision
+// This function does not produce denormals
+// Template parameters:
+// VTYPE:  double vector type
+// BVTYPE: boolean vector type
+template <class VTYPE, class BVTYPE>
+static inline VTYPE tanh_d(VTYPE const& x0)
+{
+  // Coefficients
+  const double p0 = -1.61468768441708447952E3;
+  const double p1 = -9.92877231001918586564E1;
+  const double p2 = -9.64399179425052238628E-1;
+
+  const double q0 = 4.84406305325125486048E3;
+  const double q1 = 2.23548839060100448583E3;
+  const double q2 = 1.12811678491632931402E2;
+  const double q3 = 1.0;
+
+  // data vectors
+  VTYPE x, x2, y1, y2;
+  BVTYPE x_small, x_big;  // boolean vectors
+
+  x       = abs(x0);
+  x_small = x <= 0.625;  // use Pade approximation if abs(x) <= 5/8
+
+  if(horizontal_or(x_small))
+    {
+      // At least one element needs small method
+      x2 = x * x;
+      y1 = polynomial_2(x2, p0, p1, p2) / polynomial_3(x2, q0, q1, q2, q3);
+      y1 = mul_add(y1, x2 * x, x);  // y1 = x + x2*(x*y1);
+    }
+  if(!horizontal_and(x_small))
+    {
+      // At least one element needs big method
+      y2 = exp(x + x);              // exp(2*x)
+      y2 = 1.0 - 2.0 / (y2 + 1.0);  // tanh(x)
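+      // uses the identity tanh(x) = (exp(2x) - 1) / (exp(2x) + 1) = 1 - 2 / (exp(2x) + 1)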
+    }
+  x_big = x > 350.;
+  y1    = select(x_small, y1, y2);  // choose method
+  y1    = select(x_big, 1.0, y1);   // avoid overflow
+  y1    = sign_combine(y1, x0);     // get original sign
+
+  return y1;
+}
+
+// instances of tanh_d template
+static inline Vec2d tanh(Vec2d const& x) { return tanh_d<Vec2d, Vec2db>(x); }
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec4d tanh(Vec4d const& x) { return tanh_d<Vec4d, Vec4db>(x); }
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec8d tanh(Vec8d const& x) { return tanh_d<Vec8d, Vec8db>(x); }
+#endif  // MAX_VECTOR_SIZE >= 512
+
+// Template for tanh function, single precision
+// This function does not produce denormals
+// Template parameters:
+// VTYPE:  float vector type
+// BVTYPE: boolean vector type
+template <class VTYPE, class BVTYPE>
+static inline VTYPE tanh_f(VTYPE const& x0)
+{
+  // The limit of abs(x) is 89.0, as defined by max_x in vectormath_exp.h for 0.5*exp(x).
+
+  // Coefficients
+  const float r0 = -3.33332819422E-1f;
+  const float r1 = 1.33314422036E-1f;
+  const float r2 = -5.37397155531E-2f;
+  const float r3 = 2.06390887954E-2f;
+  const float r4 = -5.70498872745E-3f;
+
+  // data vectors
+  VTYPE x, x2, y1, y2;
+  BVTYPE x_small, x_big;  // boolean vectors
+
+  x       = abs(x0);
+  x_small = x <= 0.625f;  // use polynomial approximation if abs(x) <= 5/8
+
+  if(horizontal_or(x_small))
+    {
+      // At least one element needs small method
+      x2 = x * x;
+      y1 = polynomial_4(x2, r0, r1, r2, r3, r4);
+      y1 = mul_add(y1, x2 * x, x);  // y1 = x + (x2*x)*y1;
+    }
+  if(!horizontal_and(x_small))
+    {
+      // At least one element needs big method
+      y2 = exp(x + x);                 // exp(2*x)
+      y2 = 1.0f - 2.0f / (y2 + 1.0f);  // tanh(x)
+    }
+  x_big = x > 44.4f;
+  y1    = select(x_small, y1, y2);  // choose method
+  y1    = select(x_big, 1.0f, y1);  // avoid overflow
+  y1    = sign_combine(y1, x0);     // get original sign
+
+  return y1;
+}
+
+// instances of tanh_f template
+static inline Vec4f tanh(Vec4f const& x) { return tanh_f<Vec4f, Vec4fb>(x); }
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec8f tanh(Vec8f const& x) { return tanh_f<Vec8f, Vec8fb>(x); }
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec16f tanh(Vec16f const& x) { return tanh_f<Vec16f, Vec16fb>(x); }
+#endif  // MAX_VECTOR_SIZE >= 512
+
+/******************************************************************************
+ *                 Inverse hyperbolic functions
+ ******************************************************************************/
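+
+// The large-argument branches below use the standard logarithmic identities
+//   asinh(x) = log(x + sqrt(x^2 + 1))
+//   acosh(x) = log(x + sqrt(x^2 - 1)),        x >= 1
+//   atanh(x) = 0.5 * log((1 + x) / (1 - x)),  |x| < 1
+// while small arguments are handled by the rational/polynomial approximations in each template.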
+
+// Template for asinh function, double precision
+// This function does not produce denormals
+// Template parameters:
+// VTYPE:  double vector type
+// BVTYPE: boolean vector type
+template <class VTYPE, class BVTYPE>
+static inline VTYPE asinh_d(VTYPE const& x0)
+{
+  // Coefficients
+  const double p0 = -5.56682227230859640450E0;
+  const double p1 = -9.09030533308377316566E0;
+  const double p2 = -4.37390226194356683570E0;
+  const double p3 = -5.91750212056387121207E-1;
+  const double p4 = -4.33231683752342103572E-3;
+
+  const double q0 = 3.34009336338516356383E1;
+  const double q1 = 6.95722521337257608734E1;
+  const double q2 = 4.86042483805291788324E1;
+  const double q3 = 1.28757002067426453537E1;
+  const double q4 = 1.0;
+
+  // data vectors
+  VTYPE x, x2, y1, y2;
+  BVTYPE x_small, x_huge;  // boolean vectors
+
+  x2      = x0 * x0;
+  x       = abs(x0);
+  x_small = x <= 0.533;  // use Pade approximation if abs(x) <= 0.5
+                         // both methods give the highest error close to 0.5. this limit is adjusted for minimum error
+  x_huge = x > 1.E20;    // simple approximation, avoid overflow
+
+  if(horizontal_or(x_small))
+    {
+      // At least one element needs small method
+      y1 = polynomial_4(x2, p0, p1, p2, p3, p4) / polynomial_4(x2, q0, q1, q2, q3, q4);
+      y1 = mul_add(y1, x2 * x, x);  // y1 = x + (x2*x)*y1;
+    }
+  if(!horizontal_and(x_small))
+    {
+      // At least one element needs big method
+      y2 = log(x + sqrt(x2 + 1.0));
+      if(horizontal_or(x_huge))
+        {
+          // At least one element needs huge method to avoid overflow
+          y2 = select(x_huge, log(x) + VM_LN2, y2);
+        }
+    }
+  y1 = select(x_small, y1, y2);  // choose method
+  y1 = sign_combine(y1, x0);     // get original sign
+
+  return y1;
+}
+
+// instances of asinh_d template
+static inline Vec2d asinh(Vec2d const& x) { return asinh_d<Vec2d, Vec2db>(x); }
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec4d asinh(Vec4d const& x) { return asinh_d<Vec4d, Vec4db>(x); }
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec8d asinh(Vec8d const& x) { return asinh_d<Vec8d, Vec8db>(x); }
+#endif  // MAX_VECTOR_SIZE >= 512
+
+// Template for asinh function, single precision
+// This function does not produce denormals
+// Template parameters:
+// VTYPE:  float vector type
+// BVTYPE: boolean vector type
+template <class VTYPE, class BVTYPE>
+static inline VTYPE asinh_f(VTYPE const& x0)
+{
+  // Coefficients
+  const float r0 = -1.6666288134E-1f;
+  const float r1 = 7.4847586088E-2f;
+  const float r2 = -4.2699340972E-2f;
+  const float r3 = 2.0122003309E-2f;
+
+  // data vectors
+  VTYPE x, x2, y1, y2;
+  BVTYPE x_small, x_huge;  // boolean vectors
+
+  x2      = x0 * x0;
+  x       = abs(x0);
+  x_small = x <= 0.51f;  // use polynomial approximation if abs(x) <= 0.5
+  x_huge  = x > 1.E10f;  // simple approximation, avoid overflow
+
+  if(horizontal_or(x_small))
+    {
+      // At least one element needs small method
+      y1 = polynomial_3(x2, r0, r1, r2, r3);
+      y1 = mul_add(y1, x2 * x, x);  // y1 = x + (x2*x)*y1;
+    }
+  if(!horizontal_and(x_small))
+    {
+      // At least one element needs big method
+      y2 = log(x + sqrt(x2 + 1.0f));
+      if(horizontal_or(x_huge))
+        {
+          // At least one element needs huge method to avoid overflow
+          y2 = select(x_huge, log(x) + (float)VM_LN2, y2);
+        }
+    }
+  y1 = select(x_small, y1, y2);  // choose method
+  y1 = sign_combine(y1, x0);     // get original sign
+
+  return y1;
+}
+
+// instances of asinh_f template
+static inline Vec4f asinh(Vec4f const& x) { return asinh_f<Vec4f, Vec4fb>(x); }
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec8f asinh(Vec8f const& x) { return asinh_f<Vec8f, Vec8fb>(x); }
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec16f asinh(Vec16f const& x) { return asinh_f<Vec16f, Vec16fb>(x); }
+#endif  // MAX_VECTOR_SIZE >= 512
+
+// Template for acosh function, double precision
+// This function does not produce denormals
+// Template parameters:
+// VTYPE:  double vector type
+// BVTYPE: boolean vector type
+template <class VTYPE, class BVTYPE>
+static inline VTYPE acosh_d(VTYPE const& x0)
+{
+  // Coefficients
+  const double p0 = 1.10855947270161294369E5;
+  const double p1 = 1.08102874834699867335E5;
+  const double p2 = 3.43989375926195455866E4;
+  const double p3 = 3.94726656571334401102E3;
+  const double p4 = 1.18801130533544501356E2;
+
+  const double q0 = 7.83869920495893927727E4;
+  const double q1 = 8.29725251988426222434E4;
+  const double q2 = 2.97683430363289370382E4;
+  const double q3 = 4.15352677227719831579E3;
+  const double q4 = 1.86145380837903397292E2;
+  const double q5 = 1.0;
+
+  // data vectors
+  VTYPE x1, y1, y2;
+  BVTYPE x_small, x_huge, undef;  // boolean vectors
+
+  x1      = x0 - 1.0;
+  undef   = x0 < 1.0;    // result is NAN
+  x_small = x1 < 0.49;   // use Pade approximation if abs(x-1) < 0.5
+  x_huge  = x1 > 1.E20;  // simple approximation, avoid overflow
+
+  if(horizontal_or(x_small))
+    {
+      // At least one element needs small method
+      y1 = sqrt(x1) * (polynomial_4(x1, p0, p1, p2, p3, p4) / polynomial_5(x1, q0, q1, q2, q3, q4, q5));
+      // x < 1 generates NAN
+      y1 = select(undef, nan_vec<VTYPE>(NAN_HYP), y1);
+    }
+  if(!horizontal_and(x_small))
+    {
+      // At least one element needs big method
+      y2 = log(x0 + sqrt(mul_sub(x0, x0, 1.0)));
+      if(horizontal_or(x_huge))
+        {
+          // At least one element needs huge method to avoid overflow
+          y2 = select(x_huge, log(x0) + VM_LN2, y2);
+        }
+    }
+  y1 = select(x_small, y1, y2);  // choose method
+  return y1;
+}
+
+// instances of acosh_d template
+static inline Vec2d acosh(Vec2d const& x) { return acosh_d<Vec2d, Vec2db>(x); }
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec4d acosh(Vec4d const& x) { return acosh_d<Vec4d, Vec4db>(x); }
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec8d acosh(Vec8d const& x) { return acosh_d<Vec8d, Vec8db>(x); }
+#endif  // MAX_VECTOR_SIZE >= 512
+
+// Template for acosh function, single precision
+// This function does not produce denormals
+// Template parameters:
+// VTYPE:  float vector type
+// BVTYPE: boolean vector type
+template <class VTYPE, class BVTYPE>
+static inline VTYPE acosh_f(VTYPE const& x0)
+{
+  // Coefficients
+  const float r0 = 1.4142135263E0f;
+  const float r1 = -1.1784741703E-1f;
+  const float r2 = 2.6454905019E-2f;
+  const float r3 = -7.5272886713E-3f;
+  const float r4 = 1.7596881071E-3f;
+
+  // data vectors
+  VTYPE x1, y1, y2;
+  BVTYPE x_small, x_huge, undef;  // boolean vectors
+
+  x1      = x0 - 1.0f;
+  undef   = x0 < 1.0f;    // result is NAN
+  x_small = x1 < 0.49f;   // use Pade approximation if abs(x-1) < 0.5
+  x_huge  = x1 > 1.E10f;  // simple approximation, avoid overflow
+
+  if(horizontal_or(x_small))
+    {
+      // At least one element needs small method
+      y1 = sqrt(x1) * polynomial_4(x1, r0, r1, r2, r3, r4);
+      // x < 1 generates NAN
+      y1 = select(undef, nan_vec<VTYPE>(NAN_HYP), y1);
+    }
+  if(!horizontal_and(x_small))
+    {
+      // At least one element needs big method
+      y2 = log(x0 + sqrt(mul_sub(x0, x0, 1.0)));
+      if(horizontal_or(x_huge))
+        {
+          // At least one element needs huge method to avoid overflow
+          y2 = select(x_huge, log(x0) + (float)VM_LN2, y2);
+        }
+    }
+  y1 = select(x_small, y1, y2);  // choose method
+  return y1;
+}
+
+// instances of acosh_f template
+static inline Vec4f acosh(Vec4f const& x) { return acosh_f<Vec4f, Vec4fb>(x); }
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec8f acosh(Vec8f const& x) { return acosh_f<Vec8f, Vec8fb>(x); }
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec16f acosh(Vec16f const& x) { return acosh_f<Vec16f, Vec16fb>(x); }
+#endif  // MAX_VECTOR_SIZE >= 512
+
+// Template for atanh function, double precision
+// This function does not produce denormals
+// Template parameters:
+// VTYPE:  double vector type
+// BVTYPE: boolean vector type
+template <class VTYPE, class BVTYPE>
+static inline VTYPE atanh_d(VTYPE const& x0)
+{
+  // Coefficients
+  const double p0 = -3.09092539379866942570E1;
+  const double p1 = 6.54566728676544377376E1;
+  const double p2 = -4.61252884198732692637E1;
+  const double p3 = 1.20426861384072379242E1;
+  const double p4 = -8.54074331929669305196E-1;
+
+  const double q0 = -9.27277618139601130017E1;
+  const double q1 = 2.52006675691344555838E2;
+  const double q2 = -2.49839401325893582852E2;
+  const double q3 = 1.08938092147140262656E2;
+  const double q4 = -1.95638849376911654834E1;
+  const double q5 = 1.0;
+
+  // data vectors
+  VTYPE x, x2, y1, y2, y3;
+  BVTYPE x_small;  // boolean vector
+
+  x       = abs(x0);
+  x_small = x < 0.5;  // use Pade approximation if abs(x) < 0.5
+
+  if(horizontal_or(x_small))
+    {
+      // At least one element needs small method
+      x2 = x * x;
+      y1 = polynomial_4(x2, p0, p1, p2, p3, p4) / polynomial_5(x2, q0, q1, q2, q3, q4, q5);
+      y1 = mul_add(y1, x2 * x, x);
+    }
+  if(!horizontal_and(x_small))
+    {
+      // At least one element needs big method
+      y2 = log((1.0 + x) / (1.0 - x)) * 0.5;
+      // check if out of range
+      y3 = select(x == 1.0, infinite_vec<VTYPE>(), nan_vec<VTYPE>(NAN_HYP));
+      y2 = select(x >= 1.0, y3, y2);
+    }
+  y1 = select(x_small, y1, y2);  // choose method
+  y1 = sign_combine(y1, x0);     // get original sign
+
+  return y1;
+}
+
+// instances of atanh_d template
+static inline Vec2d atanh(Vec2d const& x) { return atanh_d<Vec2d, Vec2db>(x); }
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec4d atanh(Vec4d const& x) { return atanh_d<Vec4d, Vec4db>(x); }
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec8d atanh(Vec8d const& x) { return atanh_d<Vec8d, Vec8db>(x); }
+#endif  // MAX_VECTOR_SIZE >= 512
+
+// Template for atanh function, single precision
+// This function does not produce denormals
+// Template parameters:
+// VTYPE:  float vector type
+// BVTYPE: boolean vector type
+template <class VTYPE, class BVTYPE>
+static inline VTYPE atanh_f(VTYPE const& x0)
+{
+  // Coefficients
+  const float r0 = 3.33337300303E-1f;
+  const float r1 = 1.99782164500E-1f;
+  const float r2 = 1.46691431730E-1f;
+  const float r3 = 8.24370301058E-2f;
+  const float r4 = 1.81740078349E-1f;
+
+  // data vectors
+  VTYPE x, x2, y1, y2, y3;
+  BVTYPE x_small;  // boolean vector
+
+  x       = abs(x0);
+  x_small = x < 0.5f;  // use polynomial approximation if abs(x) < 0.5
+
+  if(horizontal_or(x_small))
+    {
+      // At least one element needs small method
+      x2 = x * x;
+      y1 = polynomial_4(x2, r0, r1, r2, r3, r4);
+      y1 = mul_add(y1, x2 * x, x);
+    }
+  if(!horizontal_and(x_small))
+    {
+      // At least one element needs big method
+      y2 = log((1.0f + x) / (1.0f - x)) * 0.5f;
+      // check if out of range
+      y3 = select(x == 1.0f, infinite_vec<VTYPE>(), nan_vec<VTYPE>(NAN_HYP));
+      y2 = select(x >= 1.0f, y3, y2);
+    }
+  y1 = select(x_small, y1, y2);  // choose method
+  y1 = sign_combine(y1, x0);     // get original sign
+
+  return y1;
+}
+
+// instances of atanh_f template
+static inline Vec4f atanh(Vec4f const& x) { return atanh_f<Vec4f, Vec4fb>(x); }
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec8f atanh(Vec8f const& x) { return atanh_f<Vec8f, Vec8fb>(x); }
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec16f atanh(Vec16f const& x) { return atanh_f<Vec16f, Vec16fb>(x); }
+#endif  // MAX_VECTOR_SIZE >= 512
+
+#ifdef VCL_NAMESPACE
+}
+#endif
+
+#endif
diff --git a/src/vectorclass/vectormath_lib.h b/src/vectorclass/vectormath_lib.h
new file mode 100644
index 0000000000000000000000000000000000000000..04a849532bafcc0d1f6315bdd4dfbc1f030b8e29
--- /dev/null
+++ b/src/vectorclass/vectormath_lib.h
@@ -0,0 +1,2559 @@
+/****************************  vectormath_lib.h   *****************************
+* Author:        Agner Fog
+* Date created:  2012-05-30
+* Last modified: 2016-04-26
+* Version:       1.22
+* Project:       vector classes
+* Description:
+* Header file defining mathematical functions on floating point vectors
+* May use Intel SVML library or AMD LIBM library
+*
+* Instructions:
+* Define VECTORMATH to one of the following values:
+*   0:  Use ordinary math library (slow)
+*   1:  Use AMD LIBM library
+*   2:  Use Intel SVML library with any compiler
+*   3:  Use Intel SVML library with Intel compiler
+*
+* For detailed instructions, see VectorClass.pdf
+*
+* (c) Copyright 2012-2016 GNU General Public License http://www.gnu.org/licenses
+\*****************************************************************************/
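+
+// Example (illustrative): to use the Intel SVML library with a non-Intel compiler, define
+// the selector before this header is first included, e.g.
+//   #define VECTORMATH 2
+//   #include "vectormath_lib.h"
+// If VECTORMATH is left undefined, it defaults to 3 under the Intel compiler and to 0
+// (the ordinary scalar math library) otherwise, as set up just below.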
+
+// check combination of header files
+#ifndef VECTORMATH_LIB_H
+#define VECTORMATH_LIB_H
+
+#include "vectorf128.h"
+
+#ifndef VECTORMATH
+#ifdef __INTEL_COMPILER
+#define VECTORMATH 3
+#else
+#define VECTORMATH 0
+#endif  // __INTEL_COMPILER
+#endif  // VECTORMATH
+
+#include <math.h>
+
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE
+{
+#endif
+
+/*****************************************************************************
+ *
+ *      VECTORMATH = 0. Use ordinary library (scalar)
+ *
+ *****************************************************************************/
+#if VECTORMATH == 0
+
+#ifndef VECTORMATH_COMMON_H
+// exponential and power functions
+static inline Vec4f exp(Vec4f const& x)
+{
+  float xx[4];
+  x.store(xx);
+  return Vec4f(expf(xx[0]), expf(xx[1]), expf(xx[2]), expf(xx[3]));
+}
+static inline Vec2d exp(Vec2d const& x)
+{
+  double xx[4];
+  x.store(xx);
+  return Vec2d(::exp(xx[0]), ::exp(xx[1]));
+}
+
+// There is no certain way to know which functions are available, but at least some (Gnu)
+// compilers have defines to specify this
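+// (e.g. pass -DHAVE_EXPM1, -DHAVE_EXP2, -DHAVE_LOG1P, -DHAVE_ERF or -DHAVE_ERFC on the
+// compiler command line if the C library is known to provide the corresponding functions)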
+#ifdef HAVE_EXPM1
+static inline Vec4f expm1(Vec4f const& x)
+{
+  float xx[4];
+  x.store(xx);
+  return Vec4f(expm1(xx[0]), expm1(xx[1]), expm1(xx[2]), expm1(xx[3]));
+}
+static inline Vec2d expm1(Vec2d const& x)
+{
+  double xx[4];
+  x.store(xx);
+  return Vec2d(expm1(xx[0]), expm1(xx[1]));
+}
+#endif
+
+#ifdef HAVE_EXP2
+static inline Vec4f exp2(Vec4f const& x)
+{
+  float xx[4];
+  x.store(xx);
+  return Vec4f(exp2(xx[0]), exp2(xx[1]), exp2(xx[2]), exp2(xx[3]));
+}
+static inline Vec2d exp2(Vec2d const& x)
+{
+  double xx[4];
+  x.store(xx);
+  return Vec2d(exp2(xx[0]), exp2(xx[1]));
+}
+#else
+static inline Vec4f exp2(Vec4f const& x) { return exp(x * Vec4f(0.693147180559945309417f /* log(2) */)); }
+static inline Vec2d exp2(Vec2d const& x) { return exp(x * Vec2d(0.693147180559945309417 /* log(2) */)); }
+#endif
+
+static inline Vec4f exp10(Vec4f const& x) { return exp(x * Vec4f(2.30258509299404568402f /* log(10) */)); }
+static inline Vec2d exp10(Vec2d const& x) { return exp(x * Vec2d(2.30258509299404568402 /* log(10) */)); }
+
+static inline Vec4f pow(Vec4f const& a, Vec4f const& b)
+{
+  float aa[4], bb[4];
+  a.store(aa);
+  b.store(bb);
+  return Vec4f(powf(aa[0], bb[0]), powf(aa[1], bb[1]), powf(aa[2], bb[2]), powf(aa[3], bb[3]));
+}
+static inline Vec2d pow(Vec2d const& a, Vec2d const& b)
+{
+  double aa[4], bb[4];
+  a.store(aa);
+  b.store(bb);
+  return Vec2d(::pow(aa[0], bb[0]), ::pow(aa[1], bb[1]));
+}
+
+static inline Vec4f log(Vec4f const& x)
+{
+  float xx[4];
+  x.store(xx);
+  return Vec4f(logf(xx[0]), logf(xx[1]), logf(xx[2]), logf(xx[3]));
+}
+static inline Vec2d log(Vec2d const& x)
+{
+  double xx[4];
+  x.store(xx);
+  return Vec2d(::log(xx[0]), ::log(xx[1]));
+}
+
+#ifdef HAVE_LOG1P
+static inline Vec4f log1p(Vec4f const& x)
+{
+  float xx[4];
+  x.store(xx);
+  return Vec4f(::log1p(xx[0]), ::log1p(xx[1]), ::log1p(xx[2]), ::log1p(xx[3]));
+}
+static inline Vec2d log1p(Vec2d const& x)
+{
+  double xx[4];
+  x.store(xx);
+  return Vec2d(::log1p(xx[0]), ::log1p(xx[1]));
+}
+#endif
+
+static inline Vec4f log2(Vec4f const& x)
+{  // logarithm base 2
+  return log(x) * Vec4f(1.44269504088896340736f /* log2(e) */);
+}
+static inline Vec2d log2(Vec2d const& x)
+{  // logarithm base 2
+  return log(x) * Vec2d(1.44269504088896340736 /* log2(e) */);
+}
+
+static inline Vec4f log10(Vec4f const& x)
+{  // logarithm base 10
+  float xx[4];
+  x.store(xx);
+  return Vec4f(log10f(xx[0]), log10f(xx[1]), log10f(xx[2]), log10f(xx[3]));
+}
+static inline Vec2d log10(Vec2d const& x)
+{  // logarithm base 10
+  double xx[4];
+  x.store(xx);
+  return Vec2d(::log10(xx[0]), ::log10(xx[1]));
+}
+
+// trigonometric functions
+static inline Vec4f sin(Vec4f const& x)
+{
+  float xx[4];
+  x.store(xx);
+  return Vec4f(sinf(xx[0]), sinf(xx[1]), sinf(xx[2]), sinf(xx[3]));
+}
+static inline Vec2d sin(Vec2d const& x)
+{
+  double xx[4];
+  x.store(xx);
+  return Vec2d(::sin(xx[0]), ::sin(xx[1]));
+}
+
+static inline Vec4f cos(Vec4f const& x)
+{
+  float xx[4];
+  x.store(xx);
+  return Vec4f(cosf(xx[0]), cosf(xx[1]), cosf(xx[2]), cosf(xx[3]));
+}
+static inline Vec2d cos(Vec2d const& x)
+{
+  double xx[4];
+  x.store(xx);
+  return Vec2d(::cos(xx[0]), ::cos(xx[1]));
+}
+
+static inline Vec4f sincos(Vec4f* pcos, Vec4f const& x)
+{  // sine and cosine. sin(x) returned, cos(x) in pcos
+  *pcos = cos(x);
+  return sin(x);
+}
+static inline Vec2d sincos(Vec2d* pcos, Vec2d const& x)
+{  // sine and cosine. sin(x) returned, cos(x) in pcos
+  *pcos = cos(x);
+  return sin(x);
+}
+
+static inline Vec4f tan(Vec4f const& x)
+{
+  float xx[4];
+  x.store(xx);
+  return Vec4f(tanf(xx[0]), tanf(xx[1]), tanf(xx[2]), tanf(xx[3]));
+}
+static inline Vec2d tan(Vec2d const& x)
+{
+  double xx[4];
+  x.store(xx);
+  return Vec2d(::tan(xx[0]), ::tan(xx[1]));
+}
+
+// inverse trigonometric functions
+static inline Vec4f asin(Vec4f const& x)
+{
+  float xx[4];
+  x.store(xx);
+  return Vec4f(asinf(xx[0]), asinf(xx[1]), asinf(xx[2]), asinf(xx[3]));
+}
+static inline Vec2d asin(Vec2d const& x)
+{
+  double xx[4];
+  x.store(xx);
+  return Vec2d(::asin(xx[0]), ::asin(xx[1]));
+}
+
+static inline Vec4f acos(Vec4f const& x)
+{
+  float xx[4];
+  x.store(xx);
+  return Vec4f(acosf(xx[0]), acosf(xx[1]), acosf(xx[2]), acosf(xx[3]));
+}
+static inline Vec2d acos(Vec2d const& x)
+{
+  double xx[4];
+  x.store(xx);
+  return Vec2d(::acos(xx[0]), ::acos(xx[1]));
+}
+
+static inline Vec4f atan(Vec4f const& x)
+{
+  float xx[4];
+  x.store(xx);
+  return Vec4f(atanf(xx[0]), atanf(xx[1]), atanf(xx[2]), atanf(xx[3]));
+}
+static inline Vec2d atan(Vec2d const& x)
+{
+  double xx[4];
+  x.store(xx);
+  return Vec2d(::atan(xx[0]), ::atan(xx[1]));
+}
+
+static inline Vec4f atan2(Vec4f const& a, Vec4f const& b)
+{  // inverse tangent of a/b
+  float aa[4], bb[4];
+  a.store(aa);
+  b.store(bb);
+  return Vec4f(atan2f(aa[0], bb[0]), atan2f(aa[1], bb[1]), atan2f(aa[2], bb[2]), atan2f(aa[3], bb[3]));
+}
+static inline Vec2d atan2(Vec2d const& a, Vec2d const& b)
+{  // inverse tangent of a/b
+  double aa[4], bb[4];
+  a.store(aa);
+  b.store(bb);
+  return Vec2d(::atan2(aa[0], bb[0]), ::atan2(aa[1], bb[1]));
+}
+#endif  // VECTORMATH_COMMON_H
+
+// hyperbolic functions
+static inline Vec4f sinh(Vec4f const& x)
+{  // hyperbolic sine
+  float xx[4];
+  x.store(xx);
+  return Vec4f(sinhf(xx[0]), sinhf(xx[1]), sinhf(xx[2]), sinhf(xx[3]));
+}
+static inline Vec2d sinh(Vec2d const& x)
+{
+  double xx[4];
+  x.store(xx);
+  return Vec2d(::sinh(xx[0]), ::sinh(xx[1]));
+}
+
+static inline Vec4f cosh(Vec4f const& x)
+{  // hyperbolic cosine
+  float xx[4];
+  x.store(xx);
+  return Vec4f(coshf(xx[0]), coshf(xx[1]), coshf(xx[2]), coshf(xx[3]));
+}
+static inline Vec2d cosh(Vec2d const& x)
+{
+  double xx[4];
+  x.store(xx);
+  return Vec2d(::cosh(xx[0]), ::cosh(xx[1]));
+}
+
+static inline Vec4f tanh(Vec4f const& x)
+{  // hyperbolic tangent
+  float xx[4];
+  x.store(xx);
+  return Vec4f(tanhf(xx[0]), tanhf(xx[1]), tanhf(xx[2]), tanhf(xx[3]));
+}
+static inline Vec2d tanh(Vec2d const& x)
+{
+  double xx[4];
+  x.store(xx);
+  return Vec2d(::tanh(xx[0]), ::tanh(xx[1]));
+}
+
+// error function
+#ifdef HAVE_ERF
+static inline Vec4f erf(Vec4f const& x)
+{
+  float xx[4];
+  x.store(xx);
+  return Vec4f(::erf(xx[0]), ::erf(xx[1]), ::erf(xx[2]), ::erf(xx[3]));
+}
+static inline Vec2d erf(Vec2d const& x)
+{
+  double xx[4];
+  x.store(xx);
+  return Vec2d(::erf(xx[0]), ::erf(xx[1]));
+}
+#endif
+
+#ifdef HAVE_ERFC
+static inline Vec4f erfc(Vec4f const& x)
+{
+  float xx[4];
+  x.store(xx);
+  return Vec4f(::erfc(xx[0]), ::erfc(xx[1]), ::erfc(xx[2]), ::erfc(xx[3]));
+}
+static inline Vec2d erfc(Vec2d const& x)
+{
+  double xx[4];
+  x.store(xx);
+  return Vec2d(::erfc(xx[0]), ::erfc(xx[1]));
+}
+#endif
+
+// complex exponential function (real part in even numbered elements, imaginary part in odd numbered elements)
+static inline Vec4f cexp(Vec4f const& x)
+{  // complex exponential function
+  float xx[4], ee[2];
+  x.store(xx);
+  Vec4f z(cosf(xx[1]), sinf(xx[1]), cosf(xx[3]), sinf(xx[3]));
+  ee[0] = expf(xx[0]);
+  ee[1] = expf(xx[2]);
+  return z * Vec4f(ee[0], ee[0], ee[1], ee[1]);
+}
+
+static inline Vec2d cexp(Vec2d const& x)
+{  // complex exponential function
+  double xx[2];
+  x.store(xx);
+  Vec2d z(::cos(xx[1]), ::sin(xx[1]));
+  return z * ::exp(xx[0]);
+}
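+
+// Example (illustrative): for Vec2d(re, im), cexp returns (exp(re)*cos(im), exp(re)*sin(im)),
+// i.e. the packed complex number exp(re + i*im); the Vec4f version above does the same for
+// two packed single-precision complex numbers.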
+
+#if defined(VECTORF256_H)  // 256 bit vectors defined
+
+#ifndef VECTORMATH_COMMON_H
+
+// exponential and power functions
+static inline Vec8f exp(Vec8f const& x)
+{  // exponential function
+  return Vec8f(exp(x.get_low()), exp(x.get_high()));
+}
+static inline Vec4d exp(Vec4d const& x)
+{  // exponential function
+  return Vec4d(exp(x.get_low()), exp(x.get_high()));
+}
+#ifdef HAVE_EXPM1
+static inline Vec8f expm1(Vec8f const& x)
+{  // exp(x)-1
+  return Vec8f(expm1(x.get_low()), expm1(x.get_high()));
+}
+static inline Vec4d expm1(Vec4d const& x)
+{  // exp(x)-1
+  return Vec4d(expm1(x.get_low()), expm1(x.get_high()));
+}
+#endif
+
+static inline Vec8f exp2(Vec8f const& x)
+{  // pow(2,x)
+  return Vec8f(exp2(x.get_low()), exp2(x.get_high()));
+}
+static inline Vec4d exp2(Vec4d const& x)
+{  // pow(2,x)
+  return Vec4d(exp2(x.get_low()), exp2(x.get_high()));
+}
+
+static inline Vec8f exp10(Vec8f const& x)
+{  // pow(10,x)
+  return Vec8f(exp10(x.get_low()), exp10(x.get_high()));
+}
+static inline Vec4d exp10(Vec4d const& x)
+{  // pow(10,x)
+  return Vec4d(exp10(x.get_low()), exp10(x.get_high()));
+}
+
+static inline Vec8f pow(Vec8f const& a, Vec8f const& b)
+{  // pow(a,b) = a to the power of b
+  return Vec8f(pow(a.get_low(), b.get_low()), pow(a.get_high(), b.get_high()));
+}
+static inline Vec4d pow(Vec4d const& a, Vec4d const& b)
+{  // pow(a,b) = a to the power of b
+  return Vec4d(pow(a.get_low(), b.get_low()), pow(a.get_high(), b.get_high()));
+}
+
+// logarithms
+static inline Vec8f log(Vec8f const& x)
+{  // natural logarithm
+  return Vec8f(log(x.get_low()), log(x.get_high()));
+}
+static inline Vec4d log(Vec4d const& x)
+{  // natural logarithm
+  return Vec4d(log(x.get_low()), log(x.get_high()));
+}
+#ifdef HAVE_LOG1P
+static inline Vec8f log1p(Vec8f const& x)
+{  // log(1+x). Avoids loss of precision if 1+x is close to 1
+  return Vec8f(log1p(x.get_low()), log1p(x.get_high()));
+}
+static inline Vec4d log1p(Vec4d const& x)
+{  // log(1+x). Avoids loss of precision if 1+x is close to 1
+  return Vec4d(log1p(x.get_low()), log1p(x.get_high()));
+}
+#endif
+
+static inline Vec8f log2(Vec8f const& x)
+{  // logarithm base 2
+  return Vec8f(log2(x.get_low()), log2(x.get_high()));
+}
+static inline Vec4d log2(Vec4d const& x)
+{  // logarithm base 2
+  return Vec4d(log2(x.get_low()), log2(x.get_high()));
+}
+
+static inline Vec8f log10(Vec8f const& x)
+{  // logarithm base 10
+  return Vec8f(log10(x.get_low()), log10(x.get_high()));
+}
+static inline Vec4d log10(Vec4d const& x)
+{  // logarithm base 10
+  return Vec4d(log10(x.get_low()), log10(x.get_high()));
+}
+
+// trigonometric functions (angles in radians)
+static inline Vec8f sin(Vec8f const& x)
+{  // sine
+  return Vec8f(sin(x.get_low()), sin(x.get_high()));
+}
+static inline Vec4d sin(Vec4d const& x)
+{  // sine
+  return Vec4d(sin(x.get_low()), sin(x.get_high()));
+}
+
+static inline Vec8f cos(Vec8f const& x)
+{  // cosine
+  return Vec8f(cos(x.get_low()), cos(x.get_high()));
+}
+static inline Vec4d cos(Vec4d const& x)
+{  // cosine
+  return Vec4d(cos(x.get_low()), cos(x.get_high()));
+}
+
+static inline Vec8f sincos(Vec8f* pcos, Vec8f const& x)
+{  // sine and cosine. sin(x) returned, cos(x) in pcos
+  *pcos = Vec8f(cos(x.get_low()), cos(x.get_high()));
+  return Vec8f(sin(x.get_low()), sin(x.get_high()));
+}
+static inline Vec4d sincos(Vec4d* pcos, Vec4d const& x)
+{  // sine and cosine. sin(x) returned, cos(x) in pcos
+  *pcos = Vec4d(cos(x.get_low()), cos(x.get_high()));
+  return Vec4d(sin(x.get_low()), sin(x.get_high()));
+}
+
+static inline Vec8f tan(Vec8f const& x)
+{  // tangent
+  return Vec8f(tan(x.get_low()), tan(x.get_high()));
+}
+static inline Vec4d tan(Vec4d const& x)
+{  // tangent
+  return Vec4d(tan(x.get_low()), tan(x.get_high()));
+}
+
+// inverse trigonometric functions
+static inline Vec8f asin(Vec8f const& x)
+{  // inverse sine
+  return Vec8f(asin(x.get_low()), asin(x.get_high()));
+}
+static inline Vec4d asin(Vec4d const& x)
+{  // inverse sine
+  return Vec4d(asin(x.get_low()), asin(x.get_high()));
+}
+
+static inline Vec8f acos(Vec8f const& x)
+{  // inverse cosine
+  return Vec8f(acos(x.get_low()), acos(x.get_high()));
+}
+static inline Vec4d acos(Vec4d const& x)
+{  // inverse cosine
+  return Vec4d(acos(x.get_low()), acos(x.get_high()));
+}
+
+static inline Vec8f atan(Vec8f const& x)
+{  // inverse tangent
+  return Vec8f(atan(x.get_low()), atan(x.get_high()));
+}
+static inline Vec4d atan(Vec4d const& x)
+{  // inverse tangent
+  return Vec4d(atan(x.get_low()), atan(x.get_high()));
+}
+
+static inline Vec8f atan2(Vec8f const& a, Vec8f const& b)
+{  // inverse tangent of a/b
+  return Vec8f(atan2(a.get_low(), b.get_low()), atan2(a.get_high(), b.get_high()));
+}
+static inline Vec4d atan2(Vec4d const& a, Vec4d const& b)
+{  // inverse tangent of a/b
+  return Vec4d(atan2(a.get_low(), b.get_low()), atan2(a.get_high(), b.get_high()));
+}
+#endif  // VECTORMATH_COMMON_H
+
+// hyperbolic functions and inverse hyperbolic functions
+static inline Vec8f sinh(Vec8f const& x)
+{  // hyperbolic sine
+  return Vec8f(sinh(x.get_low()), sinh(x.get_high()));
+}
+static inline Vec4d sinh(Vec4d const& x)
+{  // hyperbolic sine
+  return Vec4d(sinh(x.get_low()), sinh(x.get_high()));
+}
+
+static inline Vec8f cosh(Vec8f const& x)
+{  // hyperbolic cosine
+  return Vec8f(cosh(x.get_low()), cosh(x.get_high()));
+}
+static inline Vec4d cosh(Vec4d const& x)
+{  // hyperbolic cosine
+  return Vec4d(cosh(x.get_low()), cosh(x.get_high()));
+}
+
+static inline Vec8f tanh(Vec8f const& x)
+{  // hyperbolic tangent
+  return Vec8f(tanh(x.get_low()), tanh(x.get_high()));
+}
+static inline Vec4d tanh(Vec4d const& x)
+{  // hyperbolic tangent
+  return Vec4d(tanh(x.get_low()), tanh(x.get_high()));
+}
+
+// error function
+#ifdef HAVE_ERF
+static inline Vec8f erf(Vec8f const& x)
+{  // error function
+  return Vec8f(erf(x.get_low()), erf(x.get_high()));
+}
+static inline Vec4d erf(Vec4d const& x)
+{  // error function
+  return Vec4d(erf(x.get_low()), erf(x.get_high()));
+}
+#endif
+#ifdef HAVE_ERFC
+static inline Vec8f erfc(Vec8f const& x)
+{  // error function complement
+  return Vec8f(erfc(x.get_low()), erfc(x.get_high()));
+}
+static inline Vec4d erfc(Vec4d const& x)
+{  // error function complement
+  return Vec4d(erfc(x.get_low()), erfc(x.get_high()));
+}
+#endif
+
+// complex exponential function (real part in even numbered elements, imaginary part in odd numbered elements)
+static inline Vec8f cexp(Vec8f const& x)
+{  // complex exponential function
+  return Vec8f(cexp(x.get_low()), cexp(x.get_high()));
+}
+static inline Vec4d cexp(Vec4d const& x)
+{  // complex exponential function
+  return Vec4d(cexp(x.get_low()), cexp(x.get_high()));
+}
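+
+// Illustrative note (not part of the original library): with this interleaved
+// layout, a Vec4d {a, b, c, d} represents the complex numbers (a + b*i) and
+// (c + d*i), and cexp returns {e^a*cos(b), e^a*sin(b), e^c*cos(d), e^c*sin(d)},
+// since exp(a + b*i) = e^a * (cos(b) + i*sin(b)).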
+
+#endif  // VECTORF256_H == 1
+
+/*****************************************************************************
+ *
+ *      VECTORMATH = 1. Use AMD LIBM library
+ *
+ *****************************************************************************/
+#elif VECTORMATH == 1
+//#include <amdlibm.h>
+#include "amdlibm.h"  // if header file is in current directory
+
+#ifndef VECTORMATH_COMMON_H
+
+// exponential and power functions
+static inline Vec4f exp(Vec4f const& x)
+{  // exponential function
+  return amd_vrs4_expf(x);
+}
+static inline Vec2d exp(Vec2d const& x)
+{  // exponential function
+  return amd_vrd2_exp(x);
+}
+
+static inline Vec4f expm1(Vec4f const& x)
+{  // exp(x)-1. Avoids loss of precision if x is close to 0
+  return amd_vrs4_expm1f(x);
+}
+static inline Vec2d expm1(Vec2d const& x)
+{  // exp(x)-1. Avoids loss of precision if x is close to 0
+  return amd_vrd2_expm1(x);
+}
+
+static inline Vec4f exp2(Vec4f const& x)
+{  // pow(2,x)
+  return amd_vrs4_exp2f(x);
+}
+static inline Vec2d exp2(Vec2d const& x)
+{  // pow(2,x)
+  return amd_vrd2_exp2(x);
+}
+
+static inline Vec4f exp10(Vec4f const& x)
+{  // pow(10,x)
+  return amd_vrs4_exp10f(x);
+}
+static inline Vec2d exp10(Vec2d const& x)
+{  // pow(10,x)
+  return amd_vrd2_exp10(x);
+}
+
+static inline Vec4f pow(Vec4f const& a, Vec4f const& b)
+{  // pow(a,b) = a to the power of b
+  return amd_vrs4_powf(a, b);
+}
+static inline Vec2d pow(Vec2d const& a, Vec2d const& b)
+{  // pow(a,b) = a to the power of b
+  return amd_vrd2_pow(a, b);
+}
+
+static inline Vec4f cbrt(Vec4f const& x)
+{  // pow(x,1/3)
+  return amd_vrs4_cbrtf(x);
+}
+static inline Vec2d cbrt(Vec2d const& x)
+{  // pow(x,1/3)
+  return amd_vrd2_cbrt(x);
+}
+
+// logarithms
+static inline Vec4f log(Vec4f const& x)
+{  // natural logarithm
+  return amd_vrs4_logf(x);
+}
+static inline Vec2d log(Vec2d const& x)
+{  // natural logarithm
+  return amd_vrd2_log(x);
+}
+
+static inline Vec4f log1p(Vec4f const& x)
+{  // log(1+x). Avoids loss of precision if 1+x is close to 1
+  return amd_vrs4_log1pf(x);
+}
+static inline Vec2d log1p(Vec2d const& x)
+{  // log(1+x). Avoids loss of precision if 1+x is close to 1
+  return amd_vrd2_log1p(x);
+}
+
+static inline Vec4f log2(Vec4f const& x)
+{  // logarithm base 2
+  return amd_vrs4_log2f(x);
+}
+static inline Vec2d log2(Vec2d const& x)
+{  // logarithm base 2
+  return amd_vrd2_log2(x);
+}
+
+static inline Vec4f log10(Vec4f const& x)
+{  // logarithm base 10
+  return amd_vrs4_log10f(x);
+}
+static inline Vec2d log10(Vec2d const& x)
+{  // logarithm base 10
+  return amd_vrd2_log10(x);
+}
+
+// trigonometric functions (angles in radians)
+static inline Vec4f sin(Vec4f const& x)
+{  // sine
+  return amd_vrs4_sinf(x);
+}
+static inline Vec2d sin(Vec2d const& x)
+{  // sine
+  return amd_vrd2_sin(x);
+}
+
+static inline Vec4f cos(Vec4f const& x)
+{  // cosine
+  return amd_vrs4_cosf(x);
+}
+static inline Vec2d cos(Vec2d const& x)
+{  // cosine
+  return amd_vrd2_cos(x);
+}
+
+static inline Vec4f sincos(Vec4f* pcos, Vec4f const& x)
+{  // sine and cosine. sin(x) returned, cos(x) in pcos
+  __m128 r_sin;
+  amd_vrs4_sincosf(x, &r_sin, (__m128*)pcos);
+  return r_sin;
+}
+static inline Vec2d sincos(Vec2d* pcos, Vec2d const& x)
+{  // sine and cosine. sin(x) returned, cos(x) in pcos
+  __m128d r_sin;
+  amd_vrd2_sincos(x, &r_sin, (__m128d*)pcos);
+  return r_sin;
+}
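+
+// Illustrative usage (example only, not part of the library): computing sine and
+// cosine together is cheaper than two separate calls, e.g.
+//   Vec4f c;
+//   Vec4f s = sincos(&c, Vec4f(0.1f, 0.2f, 0.3f, 0.4f));  // s = sin, c = cos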
+
+static inline Vec4f tan(Vec4f const& x)
+{  // tangent
+  return amd_vrs4_tanf(x);
+}
+static inline Vec2d tan(Vec2d const& x)
+{  // tangent
+  return amd_vrd2_tan(x);
+}
+
+// inverse trigonometric functions not supported
+
+#endif  // VECTORMATH_COMMON_H
+
+// hyperbolic functions and inverse hyperbolic functions not supported
+
+// error function not supported
+
+// complex exponential function not supported
+
+#ifdef VECTORF256_H
+
+// Emulate 256 bit vector functions with two 128-bit vectors
+
+#ifndef VECTORMATH_COMMON_H
+
+// exponential and power functions
+static inline Vec8f exp(Vec8f const& x)
+{  // exponential function
+  return Vec8f(exp(x.get_low()), exp(x.get_high()));
+}
+static inline Vec4d exp(Vec4d const& x)
+{  // exponential function
+  return Vec4d(exp(x.get_low()), exp(x.get_high()));
+}
+
+static inline Vec8f expm1(Vec8f const& x)
+{  // exp(x)-1. Avoids loss of precision if x is close to 0
+  return Vec8f(expm1(x.get_low()), expm1(x.get_high()));
+}
+static inline Vec4d expm1(Vec4d const& x)
+{  // exp(x)-1. Avoids loss of precision if x is close to 0
+  return Vec4d(expm1(x.get_low()), expm1(x.get_high()));
+}
+
+static inline Vec8f exp2(Vec8f const& x)
+{  // pow(2,x)
+  return Vec8f(exp2(x.get_low()), exp2(x.get_high()));
+}
+static inline Vec4d exp2(Vec4d const& x)
+{  // pow(2,x)
+  return Vec4d(exp2(x.get_low()), exp2(x.get_high()));
+}
+
+static inline Vec8f exp10(Vec8f const& x)
+{  // pow(10,x)
+  return Vec8f(exp10(x.get_low()), exp10(x.get_high()));
+}
+static inline Vec4d exp10(Vec4d const& x)
+{  // pow(10,x)
+  return Vec4d(exp10(x.get_low()), exp10(x.get_high()));
+}
+
+static inline Vec8f pow(Vec8f const& a, Vec8f const& b)
+{  // pow(a,b) = a to the power of b
+  return Vec8f(pow(a.get_low(), b.get_low()), pow(a.get_high(), b.get_high()));
+}
+static inline Vec4d pow(Vec4d const& a, Vec4d const& b)
+{  // pow(a,b) = a to the power of b
+  return Vec4d(pow(a.get_low(), b.get_low()), pow(a.get_high(), b.get_high()));
+}
+
+static inline Vec8f cbrt(Vec8f const& x)
+{  // pow(x,1/3)
+  return Vec8f(cbrt(x.get_low()), cbrt(x.get_high()));
+}
+static inline Vec4d cbrt(Vec4d const& x)
+{  // pow(x,1/3)
+  return Vec4d(cbrt(x.get_low()), cbrt(x.get_high()));
+}
+
+// logarithms
+static inline Vec8f log(Vec8f const& x)
+{  // natural logarithm
+  return Vec8f(log(x.get_low()), log(x.get_high()));
+}
+static inline Vec4d log(Vec4d const& x)
+{  // natural logarithm
+  return Vec4d(log(x.get_low()), log(x.get_high()));
+}
+
+static inline Vec8f log1p(Vec8f const& x)
+{  // log(1+x). Avoids loss of precision if 1+x is close to 1
+  return Vec8f(log1p(x.get_low()), log1p(x.get_high()));
+}
+static inline Vec4d log1p(Vec4d const& x)
+{  // log(1+x). Avoids loss of precision if 1+x is close to 1
+  return Vec4d(log1p(x.get_low()), log1p(x.get_high()));
+}
+
+static inline Vec8f log2(Vec8f const& x)
+{  // logarithm base 2
+  return Vec8f(log2(x.get_low()), log2(x.get_high()));
+}
+static inline Vec4d log2(Vec4d const& x)
+{  // logarithm base 2
+  return Vec4d(log2(x.get_low()), log2(x.get_high()));
+}
+
+static inline Vec8f log10(Vec8f const& x)
+{  // logarithm base 10
+  return Vec8f(log10(x.get_low()), log10(x.get_high()));
+}
+static inline Vec4d log10(Vec4d const& x)
+{  // logarithm base 10
+  return Vec4d(log10(x.get_low()), log10(x.get_high()));
+}
+
+// trigonometric functions (angles in radians)
+static inline Vec8f sin(Vec8f const& x)
+{  // sine
+  return Vec8f(sin(x.get_low()), sin(x.get_high()));
+}
+static inline Vec4d sin(Vec4d const& x)
+{  // sine
+  return Vec4d(sin(x.get_low()), sin(x.get_high()));
+}
+
+static inline Vec8f cos(Vec8f const& x)
+{  // cosine
+  return Vec8f(cos(x.get_low()), cos(x.get_high()));
+}
+static inline Vec4d cos(Vec4d const& x)
+{  // cosine
+  return Vec4d(cos(x.get_low()), cos(x.get_high()));
+}
+
+static inline Vec8f sincos(Vec8f* pcos, Vec8f const& x)
+{  // sine and cosine. sin(x) returned, cos(x) in pcos
+  Vec4f r_sin0, r_sin1, r_cos0, r_cos1;
+  r_sin0 = sincos(&r_cos0, x.get_low());
+  r_sin1 = sincos(&r_cos1, x.get_high());
+  *pcos  = Vec8f(r_cos0, r_cos1);
+  return Vec8f(r_sin0, r_sin1);
+}
+static inline Vec4d sincos(Vec4d* pcos, Vec4d const& x)
+{  // sine and cosine. sin(x) returned, cos(x) in pcos
+  Vec2d r_sin0, r_sin1, r_cos0, r_cos1;
+  r_sin0 = sincos(&r_cos0, x.get_low());
+  r_sin1 = sincos(&r_cos1, x.get_high());
+  *pcos  = Vec4d(r_cos0, r_cos1);
+  return Vec4d(r_sin0, r_sin1);
+}
+
+static inline Vec8f tan(Vec8f const& x)
+{  // tangent
+  return Vec8f(tan(x.get_low()), tan(x.get_high()));
+}
+static inline Vec4d tan(Vec4d const& x)
+{  // tangent
+  return Vec4d(tan(x.get_low()), tan(x.get_high()));
+}
+
+#endif  // VECTORMATH_COMMON_H
+
+#endif  // VECTORF256_H == 1
+
+/*****************************************************************************
+ *
+ *      VECTORMATH = 2. Use Intel SVML library with any compiler
+ *
+ *****************************************************************************/
+#elif VECTORMATH == 2
+
+extern "C"
+{
+  extern __m128 __svml_expf4(__m128);
+  extern __m128d __svml_exp2(__m128d);
+  extern __m128 __svml_expm1f4(__m128);
+  extern __m128d __svml_expm12(__m128d);
+  extern __m128 __svml_exp2f4(__m128);
+  extern __m128d __svml_exp22(__m128d);
+  extern __m128 __svml_exp10f4(__m128);
+  extern __m128d __svml_exp102(__m128d);
+  extern __m128 __svml_powf4(__m128, __m128);
+  extern __m128d __svml_pow2(__m128d, __m128d);
+  extern __m128 __svml_cbrtf4(__m128);
+  extern __m128d __svml_cbrt2(__m128d);
+  extern __m128 __svml_invsqrtf4(__m128);
+  extern __m128d __svml_invsqrt2(__m128d);
+  extern __m128 __svml_logf4(__m128);
+  extern __m128d __svml_log2(__m128d);
+  extern __m128 __svml_log1pf4(__m128);
+  extern __m128d __svml_log1p2(__m128d);
+  extern __m128 __svml_log2f4(__m128);
+  extern __m128d __svml_log22(__m128d);
+  extern __m128 __svml_log10f4(__m128);
+  extern __m128d __svml_log102(__m128d);
+  extern __m128 __svml_sinf4(__m128);
+  extern __m128d __svml_sin2(__m128d);
+  extern __m128 __svml_cosf4(__m128);
+  extern __m128d __svml_cos2(__m128d);
+  extern __m128 __svml_sincosf4(__m128);   // cos returned in xmm1
+  extern __m128d __svml_sincos2(__m128d);  // cos returned in xmm1
+  extern __m128 __svml_tanf4(__m128);
+  extern __m128d __svml_tan2(__m128d);
+  extern __m128 __svml_asinf4(__m128);
+  extern __m128d __svml_asin2(__m128d);
+  extern __m128 __svml_acosf4(__m128);
+  extern __m128d __svml_acos2(__m128d);
+  extern __m128 __svml_atanf4(__m128);
+  extern __m128d __svml_atan2(__m128d);
+  extern __m128 __svml_atan2f4(__m128, __m128);
+  extern __m128d __svml_atan22(__m128d, __m128d);
+  extern __m128 __svml_sinhf4(__m128);
+  extern __m128d __svml_sinh2(__m128d);
+  extern __m128 __svml_coshf4(__m128);
+  extern __m128d __svml_cosh2(__m128d);
+  extern __m128 __svml_tanhf4(__m128);
+  extern __m128d __svml_tanh2(__m128d);
+  extern __m128 __svml_asinhf4(__m128);
+  extern __m128d __svml_asinh2(__m128d);
+  extern __m128 __svml_acoshf4(__m128);
+  extern __m128d __svml_acosh2(__m128d);
+  extern __m128 __svml_atanhf4(__m128);
+  extern __m128d __svml_atanh2(__m128d);
+  extern __m128 __svml_erff4(__m128);
+  extern __m128d __svml_erf2(__m128d);
+  extern __m128 __svml_erfcf4(__m128);
+  extern __m128d __svml_erfc2(__m128d);
+  extern __m128 __svml_erfinvf4(__m128);
+  extern __m128d __svml_erfinv2(__m128d);
+  extern __m128 __svml_cdfnorminvf4(__m128);
+  extern __m128d __svml_cdfnorminv2(__m128d);
+  extern __m128 __svml_cdfnormf4(__m128);
+  extern __m128d __svml_cdfnorm2(__m128d);
+  extern __m128 __svml_cexpf4(__m128);
+  extern __m128d __svml_cexp2(__m128d);
+}
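+
+// Note on the declarations above (explanatory, inferred from the signatures):
+// the SVML name suffix encodes the vector length, e.g. __svml_expf4 is exp for
+// 4 floats (__m128) and __svml_exp2 is exp for 2 doubles (__m128d). The trailing
+// digit is the element count, not part of the mathematical function name, so
+// __svml_exp22 is exp2 for 2 doubles and __svml_atan22 is the two-argument
+// atan2 for 2 doubles.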
+
+#ifndef VECTORMATH_COMMON_H
+
+// exponential and power functions
+static inline Vec4f exp(Vec4f const& x)
+{  // exponential function
+  return __svml_expf4(x);
+}
+static inline Vec2d exp(Vec2d const& x)
+{  // exponential function
+  return __svml_exp2(x);
+}
+
+static inline Vec4f expm1(Vec4f const& x)
+{  // exp(x)-1. Avoids loss of precision if x is close to 0
+  return __svml_expm1f4(x);
+}
+static inline Vec2d expm1(Vec2d const& x)
+{  // exp(x)-1. Avoids loss of precision if x is close to 0
+  return __svml_expm12(x);
+}
+
+static inline Vec4f exp2(Vec4f const& x)
+{  // pow(2,x)
+  return __svml_exp2f4(x);
+}
+static inline Vec2d exp2(Vec2d const& x)
+{  // pow(2,x)
+  return __svml_exp22(x);
+}
+
+static inline Vec4f exp10(Vec4f const& x)
+{  // pow(10,x)
+  return __svml_exp10f4(x);
+}
+static inline Vec2d exp10(Vec2d const& x)
+{  // pow(10,x)
+  return __svml_exp102(x);
+}
+
+static inline Vec4f pow(Vec4f const& a, Vec4f const& b)
+{  // pow(a,b) = a to the power of b
+  return __svml_powf4(a, b);
+}
+static inline Vec2d pow(Vec2d const& a, Vec2d const& b)
+{  // pow(a,b) = a to the power of b
+  return __svml_pow2(a, b);
+}
+
+static inline Vec4f cbrt(Vec4f const& x)
+{  // pow(x,1/3)
+  return __svml_cbrtf4(x);
+}
+static inline Vec2d cbrt(Vec2d const& x)
+{  // pow(x,1/3)
+  return __svml_cbrt2(x);
+}
+
+// logarithms
+static inline Vec4f log(Vec4f const& x)
+{  // natural logarithm
+  return __svml_logf4(x);
+}
+static inline Vec2d log(Vec2d const& x)
+{  // natural logarithm
+  return __svml_log2(x);
+}
+
+static inline Vec4f log1p(Vec4f const& x)
+{  // log(1+x). Avoids loss of precision if 1+x is close to 1
+  return __svml_log1pf4(x);
+}
+static inline Vec2d log1p(Vec2d const& x)
+{  // log(1+x). Avoids loss of precision if 1+x is close to 1
+  return __svml_log1p2(x);
+}
+
+static inline Vec4f log2(Vec4f const& x)
+{  // logarithm base 2
+  return __svml_log2f4(x);
+}
+static inline Vec2d log2(Vec2d const& x)
+{  // logarithm base 2
+  return __svml_log22(x);
+}
+
+static inline Vec4f log10(Vec4f const& x)
+{  // logarithm base 10
+  return __svml_log10f4(x);
+}
+static inline Vec2d log10(Vec2d const& x)
+{  // logarithm base 10
+  return __svml_log102(x);
+}
+
+// trigonometric functions (angles in radians)
+static inline Vec4f sin(Vec4f const& x)
+{  // sine
+  return __svml_sinf4(x);
+}
+static inline Vec2d sin(Vec2d const& x)
+{  // sine
+  return __svml_sin2(x);
+}
+
+static inline Vec4f cos(Vec4f const& x)
+{  // cosine
+  return __svml_cosf4(x);
+}
+static inline Vec2d cos(Vec2d const& x)
+{  // cosine
+  return __svml_cos2(x);
+}
+
+#if defined(__unix__) || defined(__INTEL_COMPILER) || !defined(__x86_64__) || !defined(_MSC_VER)
+// no inline assembly in 64 bit MS compiler
+static inline Vec4f sincos(Vec4f* pcos, Vec4f const& x)
+{  // sine and cosine. sin(x) returned, cos(x) in pcos
+  __m128 r_sin, r_cos;
+  r_sin = __svml_sincosf4(x);
+#if defined(__unix__) || defined(__GNUC__)
+  //   __asm__ ( "call __svml_sincosf4 \n movaps %%xmm0, %0 \n movaps %%xmm1, %1" : "=m"(r_sin), "=m"(r_cos) : "xmm0"(x) );
+  __asm__ __volatile__("movaps %%xmm1, %0" : "=m"(r_cos));
+#else  // Windows
+  _asm movaps r_cos, xmm1;
+#endif
+  *pcos = r_cos;
+  return r_sin;
+}
+static inline Vec2d sincos(Vec2d* pcos, Vec2d const& x)
+{  // sine and cosine. sin(x) returned, cos(x) in pcos
+  __m128d r_sin, r_cos;
+  r_sin = __svml_sincos2(x);
+#if defined(__unix__) || defined(__GNUC__)
+  __asm__ __volatile__("movaps %%xmm1, %0" : "=m"(r_cos));
+#else  // Windows
+  _asm movapd r_cos, xmm1;
+#endif
+  *pcos = r_cos;
+  return r_sin;
+}
+#endif  // inline assembly available
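+
+// Explanatory note (not part of the original library): __svml_sincosf4 and
+// __svml_sincos2 use a calling convention that returns the sine in xmm0 and
+// leaves the cosine in xmm1. The inline assembly above copies xmm1 into r_cos
+// immediately after the call; this assumes the compiler does not clobber xmm1
+// in between, and the 64-bit MS compiler is excluded because it does not
+// support inline assembly.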
+
+static inline Vec4f tan(Vec4f const& x)
+{  // tangent
+  return __svml_tanf4(x);
+}
+static inline Vec2d tan(Vec2d const& x)
+{  // tangent
+  return __svml_tan2(x);
+}
+
+// inverse trigonometric functions
+static inline Vec4f asin(Vec4f const& x)
+{  // inverse sine
+  return __svml_asinf4(x);
+}
+static inline Vec2d asin(Vec2d const& x)
+{  // inverse sine
+  return __svml_asin2(x);
+}
+
+static inline Vec4f acos(Vec4f const& x)
+{  // inverse cosine
+  return __svml_acosf4(x);
+}
+static inline Vec2d acos(Vec2d const& x)
+{  // inverse cosine
+  return __svml_acos2(x);
+}
+
+static inline Vec4f atan(Vec4f const& x)
+{  // inverse tangent
+  return __svml_atanf4(x);
+}
+static inline Vec2d atan(Vec2d const& x)
+{  // inverse tangent
+  return __svml_atan2(x);
+}
+
+static inline Vec4f atan2(Vec4f const& a, Vec4f const& b)
+{  // inverse tangent of a/b
+  return __svml_atan2f4(a, b);
+}
+static inline Vec2d atan2(Vec2d const& a, Vec2d const& b)
+{  // inverse tangent of a/b
+  return __svml_atan22(a, b);
+}
+
+#endif  // VECTORMATH_COMMON_H
+
+// hyperbolic functions and inverse hyperbolic functions
+static inline Vec4f sinh(Vec4f const& x)
+{  // hyperbolic sine
+  return __svml_sinhf4(x);
+}
+static inline Vec2d sinh(Vec2d const& x)
+{  // hyperbolic sine
+  return __svml_sinh2(x);
+}
+
+static inline Vec4f cosh(Vec4f const& x)
+{  // hyperbolic cosine
+  return __svml_coshf4(x);
+}
+static inline Vec2d cosh(Vec2d const& x)
+{  // hyperbolic cosine
+  return __svml_cosh2(x);
+}
+
+static inline Vec4f tanh(Vec4f const& x)
+{  // hyperbolic tangent
+  return __svml_tanhf4(x);
+}
+static inline Vec2d tanh(Vec2d const& x)
+{  // hyperbolic tangent
+  return __svml_tanh2(x);
+}
+
+static inline Vec4f asinh(Vec4f const& x)
+{  // inverse hyperbolic sine
+  return __svml_asinhf4(x);
+}
+static inline Vec2d asinh(Vec2d const& x)
+{  // inverse hyperbolic sine
+  return __svml_asinh2(x);
+}
+
+static inline Vec4f acosh(Vec4f const& x)
+{  // inverse hyperbolic cosine
+  return __svml_acoshf4(x);
+}
+static inline Vec2d acosh(Vec2d const& x)
+{  // inverse hyperbolic cosine
+  return __svml_acosh2(x);
+}
+
+static inline Vec4f atanh(Vec4f const& x)
+{  // inverse hyperbolic tangent
+  return __svml_atanhf4(x);
+}
+static inline Vec2d atanh(Vec2d const& x)
+{  // inverse hyperbolic tangent
+  return __svml_atanh2(x);
+}
+
+// error function
+static inline Vec4f erf(Vec4f const& x)
+{  // error function
+  return __svml_erff4(x);
+}
+static inline Vec2d erf(Vec2d const& x)
+{  // error function
+  return __svml_erf2(x);
+}
+
+static inline Vec4f erfc(Vec4f const& x)
+{  // error function complement
+  return __svml_erfcf4(x);
+}
+static inline Vec2d erfc(Vec2d const& x)
+{  // error function complement
+  return __svml_erfc2(x);
+}
+
+static inline Vec4f erfinv(Vec4f const& x)
+{  // inverse error function
+  return __svml_erfinvf4(x);
+}
+static inline Vec2d erfinv(Vec2d const& x)
+{  // inverse error function
+  return __svml_erfinv2(x);
+}
+
+static inline Vec4f cdfnorm(Vec4f const& x)
+{  // cumulative normal distribution function
+  return __svml_cdfnormf4(x);
+}
+static inline Vec2d cdfnorm(Vec2d const& x)
+{  // cumulative normal distribution function
+  return __svml_cdfnorm2(x);
+}
+
+static inline Vec4f cdfnorminv(Vec4f const& x)
+{  // inverse cumulative normal distribution function
+  return __svml_cdfnorminvf4(x);
+}
+static inline Vec2d cdfnorminv(Vec2d const& x)
+{  // inverse cumulative normal distribution function
+  return __svml_cdfnorminv2(x);
+}
+
+// complex exponential function (real part in even numbered elements, imaginary part in odd numbered elements)
+static inline Vec4f cexp(Vec4f const& x)
+{  // complex exponential function
+  return __svml_cexpf4(x);
+}
+static inline Vec2d cexp(Vec2d const& x)
+{  // complex exponential function
+  return __svml_cexp2(x);
+}
+
+#if defined(VECTORF256_H) && VECTORF256_H >= 2
+// AVX gives 256 bit vectors
+
+extern "C"
+{
+  extern __m256 __svml_expf8(__m256);
+  extern __m256d __svml_exp4(__m256d);
+  extern __m256 __svml_expm1f8(__m256);
+  extern __m256d __svml_expm14(__m256d);
+  extern __m256 __svml_exp2f8(__m256);
+  extern __m256d __svml_exp24(__m256d);
+  extern __m256 __svml_exp10f8(__m256);
+  extern __m256d __svml_exp104(__m256d);
+  extern __m256 __svml_powf8(__m256, __m256);
+  extern __m256d __svml_pow4(__m256d, __m256d);
+  extern __m256 __svml_cbrtf8(__m256);
+  extern __m256d __svml_cbrt4(__m256d);
+  extern __m256 __svml_invsqrtf8(__m256);
+  extern __m256d __svml_invsqrt4(__m256d);
+  extern __m256 __svml_logf8(__m256);
+  extern __m256d __svml_log4(__m256d);
+  extern __m256 __svml_log1pf8(__m256);
+  extern __m256d __svml_log1p4(__m256d);
+  extern __m256 __svml_log2f8(__m256);
+  extern __m256d __svml_log24(__m256d);
+  extern __m256 __svml_log10f8(__m256);
+  extern __m256d __svml_log104(__m256d);
+  extern __m256 __svml_sinf8(__m256);
+  extern __m256d __svml_sin4(__m256d);
+  extern __m256 __svml_cosf8(__m256);
+  extern __m256d __svml_cos4(__m256d);
+  extern __m256 __svml_sincosf8(__m256);   // cos returned in ymm1
+  extern __m256d __svml_sincos4(__m256d);  // cos returned in ymm1
+  extern __m256 __svml_tanf8(__m256);
+  extern __m256d __svml_tan4(__m256d);
+  extern __m256 __svml_asinf8(__m256);
+  extern __m256d __svml_asin4(__m256d);
+  extern __m256 __svml_acosf8(__m256);
+  extern __m256d __svml_acos4(__m256d);
+  extern __m256 __svml_atanf8(__m256);
+  extern __m256d __svml_atan4(__m256d);
+  extern __m256 __svml_atan2f8(__m256, __m256);
+  extern __m256d __svml_atan24(__m256d, __m256d);
+  extern __m256 __svml_sinhf8(__m256);
+  extern __m256d __svml_sinh4(__m256d);
+  extern __m256 __svml_coshf8(__m256);
+  extern __m256d __svml_cosh4(__m256d);
+  extern __m256 __svml_tanhf8(__m256);
+  extern __m256d __svml_tanh4(__m256d);
+  extern __m256 __svml_asinhf8(__m256);
+  extern __m256d __svml_asinh4(__m256d);
+  extern __m256 __svml_acoshf8(__m256);
+  extern __m256d __svml_acosh4(__m256d);
+  extern __m256 __svml_atanhf8(__m256);
+  extern __m256d __svml_atanh4(__m256d);
+  extern __m256 __svml_erff8(__m256);
+  extern __m256d __svml_erf4(__m256d);
+  extern __m256 __svml_erfcf8(__m256);
+  extern __m256d __svml_erfc4(__m256d);
+  extern __m256 __svml_erfinvf8(__m256);
+  extern __m256d __svml_erfinv4(__m256d);
+  extern __m256 __svml_cdfnorminvf8(__m256);
+  extern __m256d __svml_cdfnorminv4(__m256d);
+  extern __m256 __svml_cdfnormf8(__m256);
+  extern __m256d __svml_cdfnorm4(__m256d);
+  // extern __m256  __svml_cexpf8      (__m256); // missing in current version of SVML (jan 2012)
+  // extern __m256d __svml_cexp4       (__m256d);
+}
+
+#ifndef VECTORMATH_COMMON_H
+
+// exponential and power functions
+static inline Vec8f exp(Vec8f const& x)
+{  // exponential function
+  return __svml_expf8(x);
+}
+static inline Vec4d exp(Vec4d const& x)
+{  // exponential function
+  return __svml_exp4(x);
+}
+
+static inline Vec8f expm1(Vec8f const& x)
+{  // exp(x)-1. Avoids loss of precision if x is close to 0
+  return __svml_expm1f8(x);
+}
+static inline Vec4d expm1(Vec4d const& x)
+{  // exp(x)-1. Avoids loss of precision if x is close to 0
+  return __svml_expm14(x);
+}
+
+static inline Vec8f exp2(Vec8f const& x)
+{  // pow(2,x)
+  return __svml_exp2f8(x);
+}
+static inline Vec4d exp2(Vec4d const& x)
+{  // pow(2,x)
+  return __svml_exp24(x);
+}
+
+static inline Vec8f exp10(Vec8f const& x)
+{  // pow(10,x)
+  return __svml_exp10f8(x);
+}
+static inline Vec4d exp10(Vec4d const& x)
+{  // pow(10,x)
+  return __svml_exp104(x);
+}
+
+static inline Vec8f pow(Vec8f const& a, Vec8f const& b)
+{  // pow(a,b) = a to the power of b
+  return __svml_powf8(a, b);
+}
+static inline Vec4d pow(Vec4d const& a, Vec4d const& b)
+{  // pow(a,b) = a to the power of b
+  return __svml_pow4(a, b);
+}
+
+static inline Vec8f cbrt(Vec8f const& x)
+{  // pow(x,1/3)
+  return __svml_cbrtf8(x);
+}
+static inline Vec4d cbrt(Vec4d const& x)
+{  // pow(x,1/3)
+  return __svml_cbrt4(x);
+}
+
+// logarithms
+static inline Vec8f log(Vec8f const& x)
+{  // natural logarithm
+  return __svml_logf8(x);
+}
+static inline Vec4d log(Vec4d const& x)
+{  // natural logarithm
+  return __svml_log4(x);
+}
+
+static inline Vec8f log1p(Vec8f const& x)
+{  // log(1+x). Avoids loss of precision if 1+x is close to 1
+  return __svml_log1pf8(x);
+}
+static inline Vec4d log1p(Vec4d const& x)
+{  // log(1+x). Avoids loss of precision if 1+x is close to 1
+  return __svml_log1p4(x);
+}
+
+static inline Vec8f log2(Vec8f const& x)
+{  // logarithm base 2
+  return __svml_log2f8(x);
+}
+static inline Vec4d log2(Vec4d const& x)
+{  // logarithm base 2
+  return __svml_log24(x);
+}
+
+static inline Vec8f log10(Vec8f const& x)
+{  // logarithm base 10
+  return __svml_log10f8(x);
+}
+static inline Vec4d log10(Vec4d const& x)
+{  // logarithm base 10
+  return __svml_log104(x);
+}
+
+// trigonometric functions (angles in radians)
+static inline Vec8f sin(Vec8f const& x)
+{  // sine
+  return __svml_sinf8(x);
+}
+static inline Vec4d sin(Vec4d const& x)
+{  // sine
+  return __svml_sin4(x);
+}
+
+static inline Vec8f cos(Vec8f const& x)
+{  // cosine
+  return __svml_cosf8(x);
+}
+static inline Vec4d cos(Vec4d const& x)
+{  // cosine
+  return __svml_cos4(x);
+}
+
+#if defined(__unix__) || defined(__INTEL_COMPILER) || !defined(__x86_64__) || !defined(_MSC_VER)
+// no inline assembly in 64 bit MS compiler
+static inline Vec8f sincos(Vec8f* pcos, Vec8f const& x)
+{  // sine and cosine. sin(x) returned, cos(x) in pcos
+  __m256 r_sin, r_cos;
+  r_sin = __svml_sincosf8(x);
+#if defined(__unix__) || defined(__GNUC__)
+  __asm__ __volatile__("vmovaps %%ymm1, %0" : "=m"(r_cos));
+#else  // Windows
+  _asm vmovaps r_cos, ymm1;
+#endif
+  *pcos = r_cos;
+  return r_sin;
+}
+static inline Vec4d sincos(Vec4d* pcos, Vec4d const& x)
+{  // sine and cosine. sin(x) returned, cos(x) in pcos
+  __m256d r_sin, r_cos;
+  r_sin = __svml_sincos4(x);
+#if defined(__unix__) || defined(__GNUC__)
+  __asm__ __volatile__("vmovaps %%ymm1, %0" : "=m"(r_cos));
+#else  // Windows
+  _asm vmovapd r_cos, ymm1;
+#endif
+  *pcos = r_cos;
+  return r_sin;
+}
+#endif  // inline assembly available
+
+static inline Vec8f tan(Vec8f const& x)
+{  // tangent
+  return __svml_tanf8(x);
+}
+static inline Vec4d tan(Vec4d const& x)
+{  // tangent
+  return __svml_tan4(x);
+}
+
+// inverse trigonometric functions
+static inline Vec8f asin(Vec8f const& x)
+{  // inverse sine
+  return __svml_asinf8(x);
+}
+static inline Vec4d asin(Vec4d const& x)
+{  // inverse sine
+  return __svml_asin4(x);
+}
+
+static inline Vec8f acos(Vec8f const& x)
+{  // inverse cosine
+  return __svml_acosf8(x);
+}
+static inline Vec4d acos(Vec4d const& x)
+{  // inverse cosine
+  return __svml_acos4(x);
+}
+
+static inline Vec8f atan(Vec8f const& x)
+{  // inverse tangent
+  return __svml_atanf8(x);
+}
+static inline Vec4d atan(Vec4d const& x)
+{  // inverse tangent
+  return __svml_atan4(x);
+}
+
+static inline Vec8f atan2(Vec8f const& a, Vec8f const& b)
+{  // inverse tangent of a/b
+  return __svml_atan2f8(a, b);
+}
+static inline Vec4d atan2(Vec4d const& a, Vec4d const& b)
+{  // inverse tangent of a/b
+  return __svml_atan24(a, b);
+}
+
+#endif  // VECTORMATH_COMMON_H
+
+// hyperbolic functions and inverse hyperbolic functions
+static inline Vec8f sinh(Vec8f const& x)
+{  // hyperbolic sine
+  return __svml_sinhf8(x);
+}
+static inline Vec4d sinh(Vec4d const& x)
+{  // hyperbolic sine
+  return __svml_sinh4(x);
+}
+
+static inline Vec8f cosh(Vec8f const& x)
+{  // hyperbolic cosine
+  return __svml_coshf8(x);
+}
+static inline Vec4d cosh(Vec4d const& x)
+{  // hyperbolic cosine
+  return __svml_cosh4(x);
+}
+
+static inline Vec8f tanh(Vec8f const& x)
+{  // hyperbolic tangent
+  return __svml_tanhf8(x);
+}
+static inline Vec4d tanh(Vec4d const& x)
+{  // hyperbolic tangent
+  return __svml_tanh4(x);
+}
+
+static inline Vec8f asinh(Vec8f const& x)
+{  // inverse hyperbolic sine
+  return __svml_asinhf8(x);
+}
+static inline Vec4d asinh(Vec4d const& x)
+{  // inverse hyperbolic sine
+  return __svml_asinh4(x);
+}
+
+static inline Vec8f acosh(Vec8f const& x)
+{  // inverse hyperbolic cosine
+  return __svml_acoshf8(x);
+}
+static inline Vec4d acosh(Vec4d const& x)
+{  // inverse hyperbolic cosine
+  return __svml_acosh4(x);
+}
+
+static inline Vec8f atanh(Vec8f const& x)
+{  // inverse hyperbolic tangent
+  return __svml_atanhf8(x);
+}
+static inline Vec4d atanh(Vec4d const& x)
+{  // inverse hyperbolic tangent
+  return __svml_atanh4(x);
+}
+
+// error function
+static inline Vec8f erf(Vec8f const& x)
+{  // error function
+  return __svml_erff8(x);
+}
+static inline Vec4d erf(Vec4d const& x)
+{  // error function
+  return __svml_erf4(x);
+}
+
+static inline Vec8f erfc(Vec8f const& x)
+{  // error function complement
+  return __svml_erfcf8(x);
+}
+static inline Vec4d erfc(Vec4d const& x)
+{  // error function complement
+  return __svml_erfc4(x);
+}
+
+static inline Vec8f erfinv(Vec8f const& x)
+{  // inverse error function
+  return __svml_erfinvf8(x);
+}
+static inline Vec4d erfinv(Vec4d const& x)
+{  // inverse error function
+  return __svml_erfinv4(x);
+}
+
+static inline Vec8f cdfnorm(Vec8f const& x)
+{  // cumulative normal distribution function
+  return __svml_cdfnormf8(x);
+}
+static inline Vec4d cdfnorm(Vec4d const& x)
+{  // cumulative normal distribution function
+  return __svml_cdfnorm4(x);
+}
+
+static inline Vec8f cdfnorminv(Vec8f const& x)
+{  // inverse cumulative normal distribution function
+  return __svml_cdfnorminvf8(x);
+}
+static inline Vec4d cdfnorminv(Vec4d const& x)
+{  // inverse cumulative normal distribution function
+  return __svml_cdfnorminv4(x);
+}
+
+// complex exponential function (real part in even numbered elements, imaginary part in odd numbered elements)
+// 256-bit version missing in current version of SVML (jan 2012). Use 128 bit version
+static inline Vec8f cexp(Vec8f const& x)
+{  // complex exponential function
+  return Vec8f(cexp(x.get_low()), cexp(x.get_high()));
+}
+static inline Vec4d cexp(Vec4d const& x)
+{  // complex exponential function
+  return Vec4d(cexp(x.get_low()), cexp(x.get_high()));
+}
+
+#endif  // VECTORF256_H == 2
+
+/*****************************************************************************
+ *
+ *      VECTORMATH = 3. Use Intel SVML library with Intel compiler
+ *
+ *****************************************************************************/
+#elif VECTORMATH == 3
+#include <ia32intrin.h>  // intel svml functions defined in Intel version of immintrin.h
+
+// 128 bit vectors
+
+#ifndef VECTORMATH_COMMON_H
+
+// exponential and power functions
+static inline Vec4f exp(Vec4f const& x)
+{  // exponential function
+  return _mm_exp_ps(x);
+}
+static inline Vec2d exp(Vec2d const& x)
+{  // exponential function
+  return _mm_exp_pd(x);
+}
+
+static inline Vec4f expm1(Vec4f const& x)
+{  // exp(x)-1. Avoids loss of precision if x is close to 0
+  return _mm_expm1_ps(x);
+}
+static inline Vec2d expm1(Vec2d const& x)
+{  // exp(x)-1. Avoids loss of precision if x is close to 0
+  return _mm_expm1_pd(x);
+}
+
+static inline Vec4f exp2(Vec4f const& x)
+{  // pow(2,x)
+  return _mm_exp2_ps(x);
+}
+static inline Vec2d exp2(Vec2d const& x)
+{  // pow(2,x)
+  return _mm_exp2_pd(x);
+}
+
+static inline Vec4f exp10(Vec4f const& x)
+{  // pow(10,x)
+  return _mm_exp10_ps(x);
+}
+static inline Vec2d exp10(Vec2d const& x)
+{  // pow(10,x)
+  return _mm_exp10_pd(x);
+}
+
+static inline Vec4f pow(Vec4f const& a, Vec4f const& b)
+{  // pow(a,b) = a to the power of b
+  return _mm_pow_ps(a, b);
+}
+static inline Vec2d pow(Vec2d const& a, Vec2d const& b)
+{  // pow(a,b) = a to the power of b
+  return _mm_pow_pd(a, b);
+}
+
+static inline Vec4f cbrt(Vec4f const& x)
+{  // pow(x,1/3)
+  return _mm_cbrt_ps(x);
+}
+static inline Vec2d cbrt(Vec2d const& x)
+{  // pow(x,1/3)
+  return _mm_cbrt_pd(x);
+}
+
+// logarithms
+static inline Vec4f log(Vec4f const& x)
+{  // natural logarithm
+  return _mm_log_ps(x);
+}
+static inline Vec2d log(Vec2d const& x)
+{  // natural logarithm
+  return _mm_log_pd(x);
+}
+
+static inline Vec4f log1p(Vec4f const& x)
+{  // log(1+x). Avoids loss of precision if 1+x is close to 1
+  return _mm_log1p_ps(x);
+}
+static inline Vec2d log1p(Vec2d const& x)
+{  // log(1+x). Avoids loss of precision if 1+x is close to 1
+  return _mm_log1p_pd(x);
+}
+
+static inline Vec4f log2(Vec4f const& x)
+{  // logarithm base 2
+  return _mm_log2_ps(x);
+}
+static inline Vec2d log2(Vec2d const& x)
+{  // logarithm base 2
+  return _mm_log2_pd(x);
+}
+
+static inline Vec4f log10(Vec4f const& x)
+{  // logarithm base 10
+  return _mm_log10_ps(x);
+}
+static inline Vec2d log10(Vec2d const& x)
+{  // logarithm base 10
+  return _mm_log10_pd(x);
+}
+
+// trigonometric functions
+static inline Vec4f sin(Vec4f const& x)
+{  // sine
+  return _mm_sin_ps(x);
+}
+static inline Vec2d sin(Vec2d const& x)
+{  // sine
+  return _mm_sin_pd(x);
+}
+
+static inline Vec4f cos(Vec4f const& x)
+{  // cosine
+  return _mm_cos_ps(x);
+}
+static inline Vec2d cos(Vec2d const& x)
+{  // cosine
+  return _mm_cos_pd(x);
+}
+
+static inline Vec4f sincos(Vec4f* pcos, Vec4f const& x)
+{  // sine and cosine. sin(x) returned, cos(x) in pcos
+  __m128 r_sin, r_cos;
+  r_sin = _mm_sincos_ps(&r_cos, x);
+  *pcos = r_cos;
+  return r_sin;
+}
+static inline Vec2d sincos(Vec2d* pcos, Vec2d const& x)
+{  // sine and cosine. sin(x) returned, cos(x) in pcos
+  __m128d r_sin, r_cos;
+  r_sin = _mm_sincos_pd(&r_cos, x);
+  *pcos = r_cos;
+  return r_sin;
+}
+
+static inline Vec4f tan(Vec4f const& x)
+{  // tangent
+  return _mm_tan_ps(x);
+}
+static inline Vec2d tan(Vec2d const& x)
+{  // tangent
+  return _mm_tan_pd(x);
+}
+
+// inverse trigonometric functions
+static inline Vec4f asin(Vec4f const& x)
+{  // inverse sine
+  return _mm_asin_ps(x);
+}
+static inline Vec2d asin(Vec2d const& x)
+{  // inverse sine
+  return _mm_asin_pd(x);
+}
+
+static inline Vec4f acos(Vec4f const& x)
+{  // inverse cosine
+  return _mm_acos_ps(x);
+}
+static inline Vec2d acos(Vec2d const& x)
+{  // inverse cosine
+  return _mm_acos_pd(x);
+}
+
+static inline Vec4f atan(Vec4f const& x)
+{  // inverse tangent
+  return _mm_atan_ps(x);
+}
+static inline Vec2d atan(Vec2d const& x)
+{  // inverse tangent
+  return _mm_atan_pd(x);
+}
+
+static inline Vec4f atan2(Vec4f const& a, Vec4f const& b)
+{  // inverse tangent of a/b
+  return _mm_atan2_ps(a, b);
+}
+static inline Vec2d atan2(Vec2d const& a, Vec2d const& b)
+{  // inverse tangent of a/b
+  return _mm_atan2_pd(a, b);
+}
+
+#endif  // VECTORMATH_COMMON_H
+
+// hyperbolic functions and inverse hyperbolic functions
+static inline Vec4f sinh(Vec4f const& x)
+{  // hyperbolic sine
+  return _mm_sinh_ps(x);
+}
+static inline Vec2d sinh(Vec2d const& x)
+{  // hyperbolic sine
+  return _mm_sinh_pd(x);
+}
+
+static inline Vec4f cosh(Vec4f const& x)
+{  // hyperbolic cosine
+  return _mm_cosh_ps(x);
+}
+static inline Vec2d cosh(Vec2d const& x)
+{  // hyperbolic cosine
+  return _mm_cosh_pd(x);
+}
+
+static inline Vec4f tanh(Vec4f const& x)
+{  // hyperbolic tangent
+  return _mm_tanh_ps(x);
+}
+static inline Vec2d tanh(Vec2d const& x)
+{  // hyperbolic tangent
+  return _mm_tanh_pd(x);
+}
+
+static inline Vec4f asinh(Vec4f const& x)
+{  // inverse hyperbolic sine
+  return _mm_asinh_ps(x);
+}
+static inline Vec2d asinh(Vec2d const& x)
+{  // inverse hyperbolic sine
+  return _mm_asinh_pd(x);
+}
+
+static inline Vec4f acosh(Vec4f const& x)
+{  // inverse hyperbolic cosine
+  return _mm_acosh_ps(x);
+}
+static inline Vec2d acosh(Vec2d const& x)
+{  // inverse hyperbolic cosine
+  return _mm_acosh_pd(x);
+}
+
+static inline Vec4f atanh(Vec4f const& x)
+{  // inverse hyperbolic tangent
+  return _mm_atanh_ps(x);
+}
+static inline Vec2d atanh(Vec2d const& x)
+{  // inverse hyperbolic tangent
+  return _mm_atanh_pd(x);
+}
+
+// error function
+static inline Vec4f erf(Vec4f const& x)
+{  // error function
+  return _mm_erf_ps(x);
+}
+static inline Vec2d erf(Vec2d const& x)
+{  // error function
+  return _mm_erf_pd(x);
+}
+
+static inline Vec4f erfc(Vec4f const& x)
+{  // error function complement
+  return _mm_erfc_ps(x);
+}
+static inline Vec2d erfc(Vec2d const& x)
+{  // error function complement
+  return _mm_erfc_pd(x);
+}
+
+static inline Vec4f erfinv(Vec4f const& x)
+{  // inverse error function
+  return _mm_erfinv_ps(x);
+}
+static inline Vec2d erfinv(Vec2d const& x)
+{  // inverse error function
+  return _mm_erfinv_pd(x);
+}
+
+extern "C"
+{
+  extern __m128 __svml_cdfnormf4(__m128);   // not in immintrin.h
+  extern __m128d __svml_cdfnorm2(__m128d);  // not in immintrin.h
+}
+
+static inline Vec4f cdfnorm(Vec4f const& x)
+{  // cumulative normal distribution function
+  return __svml_cdfnormf4(x);
+}
+static inline Vec2d cdfnorm(Vec2d const& x)
+{  // cumulative normal distribution function
+  return __svml_cdfnorm2(x);
+}
+
+static inline Vec4f cdfnorminv(Vec4f const& x)
+{  // inverse cumulative normal distribution function
+  return _mm_cdfnorminv_ps(x);
+}
+static inline Vec2d cdfnorminv(Vec2d const& x)
+{  // inverse cumulative normal distribution function
+  return _mm_cdfnorminv_pd(x);
+}
+
+// complex functions
+extern "C"
+{
+  extern __m128 __svml_cexpf2(__m128);   // not in immintrin.h
+  extern __m128 __svml_cexpf4(__m128);   // not in immintrin.h
+  extern __m128d __svml_cexp2(__m128d);  // not in immintrin.h
+}
+
+static inline Vec4f cexp(Vec4f const& x)
+{  // complex exponential function
+  return __svml_cexpf4(x);
+}
+static inline Vec2d cexp(Vec2d const& x)
+{  // complex exponential function
+  return __svml_cexp2(x);
+}
+
+#if defined(VECTORF256_H) && VECTORF256_H >= 2
+
+// 256 bit vectors
+
+#ifndef VECTORMATH_COMMON_H
+
+// exponential and power functions
+static inline Vec8f exp(Vec8f const& x)
+{  // exponential function
+  return _mm256_exp_ps(x);
+}
+static inline Vec4d exp(Vec4d const& x)
+{  // exponential function
+  return _mm256_exp_pd(x);
+}
+
+static inline Vec8f expm1(Vec8f const& x)
+{  // exp(x)-1. Avoids loss of precision if x is close to 0
+  return _mm256_expm1_ps(x);
+}
+static inline Vec4d expm1(Vec4d const& x)
+{  // exp(x)-1. Avoids loss of precision if x is close to 0
+  return _mm256_expm1_pd(x);
+}
+
+static inline Vec8f exp2(Vec8f const& x)
+{  // pow(2,x)
+  return _mm256_exp2_ps(x);
+}
+static inline Vec4d exp2(Vec4d const& x)
+{  // pow(2,x)
+  return _mm256_exp2_pd(x);
+}
+
+static inline Vec8f exp10(Vec8f const& x)
+{  // pow(10,x)
+  return _mm256_exp10_ps(x);
+}
+static inline Vec4d exp10(Vec4d const& x)
+{  // pow(10,x)
+  return _mm256_exp10_pd(x);
+}
+
+static inline Vec8f pow(Vec8f const& a, Vec8f const& b)
+{  // pow(a,b) = a to the power of b
+  return _mm256_pow_ps(a, b);
+}
+static inline Vec4d pow(Vec4d const& a, Vec4d const& b)
+{  // pow(a,b) = a to the power of b
+  return _mm256_pow_pd(a, b);
+}
+
+static inline Vec8f cbrt(Vec8f const& x)
+{  // pow(x,1/3)
+  return _mm256_cbrt_ps(x);
+}
+static inline Vec4d cbrt(Vec4d const& x)
+{  // pow(x,1/3)
+  return _mm256_cbrt_pd(x);
+}
+
+// logarithms
+static inline Vec8f log(Vec8f const& x)
+{  // natural logarithm
+  return _mm256_log_ps(x);
+}
+static inline Vec4d log(Vec4d const& x)
+{  // natural logarithm
+  return _mm256_log_pd(x);
+}
+
+static inline Vec8f log1p(Vec8f const& x)
+{  // log(1+x). Avoids loss of precision if 1+x is close to 1
+  return _mm256_log1p_ps(x);
+}
+static inline Vec4d log1p(Vec4d const& x)
+{  // log(1+x). Avoids loss of precision if 1+x is close to 1
+  return _mm256_log1p_pd(x);
+}
+
+static inline Vec8f log2(Vec8f const& x)
+{  // logarithm base 2
+  return _mm256_log2_ps(x);
+}
+static inline Vec4d log2(Vec4d const& x)
+{  // logarithm base 2
+  return _mm256_log2_pd(x);
+}
+
+static inline Vec8f log10(Vec8f const& x)
+{  // logarithm base 10
+  return _mm256_log10_ps(x);
+}
+static inline Vec4d log10(Vec4d const& x)
+{  // logarithm base 10
+  return _mm256_log10_pd(x);
+}
+
+// trigonometric functions
+static inline Vec8f sin(Vec8f const& x)
+{  // sine
+  return _mm256_sin_ps(x);
+}
+static inline Vec4d sin(Vec4d const& x)
+{  // sine
+  return _mm256_sin_pd(x);
+}
+
+static inline Vec8f cos(Vec8f const& x)
+{  // cosine
+  return _mm256_cos_ps(x);
+}
+static inline Vec4d cos(Vec4d const& x)
+{  // cosine
+  return _mm256_cos_pd(x);
+}
+
+static inline Vec8f sincos(Vec8f* pcos, Vec8f const& x)
+{  // sine and cosine. sin(x) returned, cos(x) in pcos
+  __m256 r_sin, r_cos;
+  r_sin = _mm256_sincos_ps(&r_cos, x);
+  *pcos = r_cos;
+  return r_sin;
+}
+static inline Vec4d sincos(Vec4d* pcos, Vec4d const& x)
+{  // sine and cosine. sin(x) returned, cos(x) in pcos
+  __m256d r_sin, r_cos;
+  r_sin = _mm256_sincos_pd(&r_cos, x);
+  *pcos = r_cos;
+  return r_sin;
+}
+
+static inline Vec8f tan(Vec8f const& x)
+{  // tangent
+  return _mm256_tan_ps(x);
+}
+static inline Vec4d tan(Vec4d const& x)
+{  // tangent
+  return _mm256_tan_pd(x);
+}
+
+// inverse trigonometric functions
+static inline Vec8f asin(Vec8f const& x)
+{  // inverse sine
+  return _mm256_asin_ps(x);
+}
+static inline Vec4d asin(Vec4d const& x)
+{  // inverse sine
+  return _mm256_asin_pd(x);
+}
+
+static inline Vec8f acos(Vec8f const& x)
+{  // inverse cosine
+  return _mm256_acos_ps(x);
+}
+static inline Vec4d acos(Vec4d const& x)
+{  // inverse cosine
+  return _mm256_acos_pd(x);
+}
+
+static inline Vec8f atan(Vec8f const& x)
+{  // inverse tangent
+  return _mm256_atan_ps(x);
+}
+static inline Vec4d atan(Vec4d const& x)
+{  // inverse tangent
+  return _mm256_atan_pd(x);
+}
+
+static inline Vec8f atan2(Vec8f const& a, Vec8f const& b)
+{  // inverse tangent of a/b
+  return _mm256_atan2_ps(a, b);
+}
+static inline Vec4d atan2(Vec4d const& a, Vec4d const& b)
+{  // inverse tangent of a/b
+  return _mm256_atan2_pd(a, b);
+}
+
+#endif  // VECTORMATH_COMMON_H
+
+// hyperbolic functions and inverse hyperbolic functions
+static inline Vec8f sinh(Vec8f const& x)
+{  // hyperbolic sine
+  return _mm256_sinh_ps(x);
+}
+static inline Vec4d sinh(Vec4d const& x)
+{  // hyperbolic sine
+  return _mm256_sinh_pd(x);
+}
+
+static inline Vec8f cosh(Vec8f const& x)
+{  // hyperbolic cosine
+  return _mm256_cosh_ps(x);
+}
+static inline Vec4d cosh(Vec4d const& x)
+{  // hyperbolic cosine
+  return _mm256_cosh_pd(x);
+}
+
+static inline Vec8f tanh(Vec8f const& x)
+{  // hyperbolic tangent
+  return _mm256_tanh_ps(x);
+}
+static inline Vec4d tanh(Vec4d const& x)
+{  // hyperbolic tangent
+  return _mm256_tanh_pd(x);
+}
+
+static inline Vec8f asinh(Vec8f const& x)
+{  // inverse hyperbolic sine
+  return _mm256_asinh_ps(x);
+}
+static inline Vec4d asinh(Vec4d const& x)
+{  // inverse hyperbolic sine
+  return _mm256_asinh_pd(x);
+}
+
+static inline Vec8f acosh(Vec8f const& x)
+{  // inverse hyperbolic cosine
+  return _mm256_acosh_ps(x);
+}
+static inline Vec4d acosh(Vec4d const& x)
+{  // inverse hyperbolic cosine
+  return _mm256_acosh_pd(x);
+}
+
+static inline Vec8f atanh(Vec8f const& x)
+{  // inverse hyperbolic tangent
+  return _mm256_atanh_ps(x);
+}
+static inline Vec4d atanh(Vec4d const& x)
+{  // inverse hyperbolic tangent
+  return _mm256_atanh_pd(x);
+}
+
+// error function
+static inline Vec8f erf(Vec8f const& x)
+{  // error function
+  return _mm256_erf_ps(x);
+}
+static inline Vec4d erf(Vec4d const& x)
+{  // error function
+  return _mm256_erf_pd(x);
+}
+
+static inline Vec8f erfc(Vec8f const& x)
+{  // error function complement
+  return _mm256_erfc_ps(x);
+}
+static inline Vec4d erfc(Vec4d const& x)
+{  // error function complement
+  return _mm256_erfc_pd(x);
+}
+
+static inline Vec8f erfinv(Vec8f const& x)
+{  // inverse error function
+  return _mm256_erfinv_ps(x);
+}
+static inline Vec4d erfinv(Vec4d const& x)
+{  // inverse error function
+  return _mm256_erfinv_pd(x);
+}
+
+extern "C"
+{
+  extern __m256 __svml_cdfnormf8(__m256);   // not in immintrin.h
+  extern __m256d __svml_cdfnorm4(__m256d);  // not in immintrin.h
+}
+static inline Vec8f cdfnorm(Vec8f const& x)
+{  // cumulative normal distribution function
+  return __svml_cdfnormf8(x);
+}
+static inline Vec4d cdfnorm(Vec4d const& x)
+{  // cumulative normal distribution function
+  return __svml_cdfnorm4(x);
+}
+
+static inline Vec8f cdfnorminv(Vec8f const& x)
+{  // inverse cumulative normal distribution function
+  return _mm256_cdfnorminv_ps(x);
+}
+static inline Vec4d cdfnorminv(Vec4d const& x)
+{  // inverse cumulative normal distribution function
+  return _mm256_cdfnorminv_pd(x);
+}
+
+// complex exponential function (real part in even numbered elements, imaginary part in odd numbered elements)
+static inline Vec8f cexp(Vec8f const& x)
+{  // complex exponential function
+  return Vec8f(cexp(x.get_low()), cexp(x.get_high()));
+}
+static inline Vec4d cexp(Vec4d const& x)
+{  // complex exponential function
+  return Vec4d(cexp(x.get_low()), cexp(x.get_high()));
+}
+
+#endif  // VECTORF256_H >= 2
+
+#else
+#error unknown value of VECTORMATH
+#endif  // VECTORMATH
+
+#if defined(VECTORF256_H) && VECTORF256_H == 1 && (VECTORMATH == 2 || VECTORMATH == 3)
+/*****************************************************************************
+ *
+ *      VECTORF256_H == 1. 256 bit vectors emulated as two 128-bit vectors,
+ *      SVML library
+ *
+ *****************************************************************************/
+
+#ifndef VECTORMATH_COMMON_H
+
+// exponential and power functions
+static inline Vec8f exp(Vec8f const& x)
+{  // exponential function
+  return Vec8f(exp(x.get_low()), exp(x.get_high()));
+}
+static inline Vec4d exp(Vec4d const& x)
+{  // exponential function
+  return Vec4d(exp(x.get_low()), exp(x.get_high()));
+}
+
+static inline Vec8f expm1(Vec8f const& x)
+{  // exp(x)-1. Avoids loss of precision if x is close to 0
+  return Vec8f(expm1(x.get_low()), expm1(x.get_high()));
+}
+static inline Vec4d expm1(Vec4d const& x)
+{  // exp(x)-1. Avoids loss of precision if x is close to 0
+  return Vec4d(expm1(x.get_low()), expm1(x.get_high()));
+}
+
+static inline Vec8f exp2(Vec8f const& x)
+{  // pow(2,x)
+  return Vec8f(exp2(x.get_low()), exp2(x.get_high()));
+}
+static inline Vec4d exp2(Vec4d const& x)
+{  // pow(2,x)
+  return Vec4d(exp2(x.get_low()), exp2(x.get_high()));
+}
+
+static inline Vec8f exp10(Vec8f const& x)
+{  // pow(10,x)
+  return Vec8f(exp10(x.get_low()), exp10(x.get_high()));
+}
+static inline Vec4d exp10(Vec4d const& x)
+{  // pow(10,x)
+  return Vec4d(exp10(x.get_low()), exp10(x.get_high()));
+}
+
+static inline Vec8f pow(Vec8f const& a, Vec8f const& b)
+{  // pow(a,b) = a to the power of b
+  return Vec8f(pow(a.get_low(), b.get_low()), pow(a.get_high(), b.get_high()));
+}
+static inline Vec4d pow(Vec4d const& a, Vec4d const& b)
+{  // pow(a,b) = a to the power of b
+  return Vec4d(pow(a.get_low(), b.get_low()), pow(a.get_high(), b.get_high()));
+}
+
+static inline Vec8f cbrt(Vec8f const& x)
+{  // pow(x,1/3)
+  return Vec8f(cbrt(x.get_low()), cbrt(x.get_high()));
+}
+static inline Vec4d cbrt(Vec4d const& x)
+{  // pow(x,1/3)
+  return Vec4d(cbrt(x.get_low()), cbrt(x.get_high()));
+}
+
+// logarithms
+static inline Vec8f log(Vec8f const& x)
+{  // natural logarithm
+  return Vec8f(log(x.get_low()), log(x.get_high()));
+}
+static inline Vec4d log(Vec4d const& x)
+{  // natural logarithm
+  return Vec4d(log(x.get_low()), log(x.get_high()));
+}
+
+static inline Vec8f log1p(Vec8f const& x)
+{  // log(1+x). Avoids loss of precision if 1+x is close to 1
+  return Vec8f(log1p(x.get_low()), log1p(x.get_high()));
+}
+static inline Vec4d log1p(Vec4d const& x)
+{  // log(1+x). Avoids loss of precision if 1+x is close to 1
+  return Vec4d(log1p(x.get_low()), log1p(x.get_high()));
+}
+
+static inline Vec8f log2(Vec8f const& x)
+{  // logarithm base 2
+  return Vec8f(log2(x.get_low()), log2(x.get_high()));
+}
+static inline Vec4d log2(Vec4d const& x)
+{  // logarithm base 2
+  return Vec4d(log2(x.get_low()), log2(x.get_high()));
+}
+
+static inline Vec8f log10(Vec8f const& x)
+{  // logarithm base 10
+  return Vec8f(log10(x.get_low()), log10(x.get_high()));
+}
+static inline Vec4d log10(Vec4d const& x)
+{  // logarithm base 10
+  return Vec4d(log10(x.get_low()), log10(x.get_high()));
+}
+
+// trigonometric functions (angles in radians)
+static inline Vec8f sin(Vec8f const& x)
+{  // sine
+  return Vec8f(sin(x.get_low()), sin(x.get_high()));
+}
+static inline Vec4d sin(Vec4d const& x)
+{  // sine
+  return Vec4d(sin(x.get_low()), sin(x.get_high()));
+}
+
+static inline Vec8f cos(Vec8f const& x)
+{  // cosine
+  return Vec8f(cos(x.get_low()), cos(x.get_high()));
+}
+static inline Vec4d cos(Vec4d const& x)
+{  // cosine
+  return Vec4d(cos(x.get_low()), cos(x.get_high()));
+}
+
+#if defined(__unix__) || defined(__INTEL_COMPILER) || !defined(__x86_64__) || !defined(_MSC_VER)
+// no inline assembly in 64 bit MS compiler
+static inline Vec8f sincos(Vec8f* pcos, Vec8f const& x)
+{  // sine and cosine. sin(x) returned, cos(x) in pcos
+  Vec4f r_sin0, r_sin1, r_cos0, r_cos1;
+  r_sin0 = sincos(&r_cos0, x.get_low());
+  r_sin1 = sincos(&r_cos1, x.get_high());
+  *pcos  = Vec8f(r_cos0, r_cos1);
+  return Vec8f(r_sin0, r_sin1);
+}
+static inline Vec4d sincos(Vec4d* pcos, Vec4d const& x)
+{  // sine and cosine. sin(x) returned, cos(x) in pcos
+  Vec2d r_sin0, r_sin1, r_cos0, r_cos1;
+  r_sin0 = sincos(&r_cos0, x.get_low());
+  r_sin1 = sincos(&r_cos1, x.get_high());
+  *pcos  = Vec4d(r_cos0, r_cos1);
+  return Vec4d(r_sin0, r_sin1);
+}
+#endif  // inline assembly available
+
+static inline Vec8f tan(Vec8f const& x)
+{  // tangent
+  return Vec8f(tan(x.get_low()), tan(x.get_high()));
+}
+static inline Vec4d tan(Vec4d const& x)
+{  // tangent
+  return Vec4d(tan(x.get_low()), tan(x.get_high()));
+}
+
+// inverse trigonometric functions
+static inline Vec8f asin(Vec8f const& x)
+{  // inverse sine
+  return Vec8f(asin(x.get_low()), asin(x.get_high()));
+}
+static inline Vec4d asin(Vec4d const& x)
+{  // inverse sine
+  return Vec4d(asin(x.get_low()), asin(x.get_high()));
+}
+
+static inline Vec8f acos(Vec8f const& x)
+{  // inverse cosine
+  return Vec8f(acos(x.get_low()), acos(x.get_high()));
+}
+static inline Vec4d acos(Vec4d const& x)
+{  // inverse cosine
+  return Vec4d(acos(x.get_low()), acos(x.get_high()));
+}
+
+static inline Vec8f atan(Vec8f const& x)
+{  // inverse tangent
+  return Vec8f(atan(x.get_low()), atan(x.get_high()));
+}
+static inline Vec4d atan(Vec4d const& x)
+{  // inverse tangent
+  return Vec4d(atan(x.get_low()), atan(x.get_high()));
+}
+
+static inline Vec8f atan2(Vec8f const& a, Vec8f const& b)
+{  // inverse tangent of a/b
+  return Vec8f(atan2(a.get_low(), b.get_low()), atan2(a.get_high(), b.get_high()));
+}
+static inline Vec4d atan2(Vec4d const& a, Vec4d const& b)
+{  // inverse tangent of a/b
+  return Vec4d(atan2(a.get_low(), b.get_low()), atan2(a.get_high(), b.get_high()));
+}
+
+#endif  // VECTORMATH_COMMON_H
+
+// hyperbolic functions and inverse hyperbolic functions
+static inline Vec8f sinh(Vec8f const& x)
+{  // hyperbolic sine
+  return Vec8f(sinh(x.get_low()), sinh(x.get_high()));
+}
+static inline Vec4d sinh(Vec4d const& x)
+{  // hyperbolic sine
+  return Vec4d(sinh(x.get_low()), sinh(x.get_high()));
+}
+
+static inline Vec8f cosh(Vec8f const& x)
+{  // hyperbolic cosine
+  return Vec8f(cosh(x.get_low()), cosh(x.get_high()));
+}
+static inline Vec4d cosh(Vec4d const& x)
+{  // hyperbolic cosine
+  return Vec4d(cosh(x.get_low()), cosh(x.get_high()));
+}
+
+static inline Vec8f tanh(Vec8f const& x)
+{  // hyperbolic tangent
+  return Vec8f(tanh(x.get_low()), tanh(x.get_high()));
+}
+static inline Vec4d tanh(Vec4d const& x)
+{  // hyperbolic tangent
+  return Vec4d(tanh(x.get_low()), tanh(x.get_high()));
+}
+
+static inline Vec8f asinh(Vec8f const& x)
+{  // inverse hyperbolic sine
+  return Vec8f(asinh(x.get_low()), asinh(x.get_high()));
+}
+static inline Vec4d asinh(Vec4d const& x)
+{  // inverse hyperbolic sine
+  return Vec4d(asinh(x.get_low()), asinh(x.get_high()));
+}
+
+static inline Vec8f acosh(Vec8f const& x)
+{  // inverse hyperbolic cosine
+  return Vec8f(acosh(x.get_low()), acosh(x.get_high()));
+}
+static inline Vec4d acosh(Vec4d const& x)
+{  // inverse hyperbolic cosine
+  return Vec4d(acosh(x.get_low()), acosh(x.get_high()));
+}
+
+static inline Vec8f atanh(Vec8f const& x)
+{  // inverse hyperbolic tangent
+  return Vec8f(atanh(x.get_low()), atanh(x.get_high()));
+}
+static inline Vec4d atanh(Vec4d const& x)
+{  // inverse hyperbolic tangent
+  return Vec4d(atanh(x.get_low()), atanh(x.get_high()));
+}
+
+// error function
+static inline Vec8f erf(Vec8f const& x)
+{  // error function
+  return Vec8f(erf(x.get_low()), erf(x.get_high()));
+}
+static inline Vec4d erf(Vec4d const& x)
+{  // error function
+  return Vec4d(erf(x.get_low()), erf(x.get_high()));
+}
+
+static inline Vec8f erfc(Vec8f const& x)
+{  // error function complement
+  return Vec8f(erfc(x.get_low()), erfc(x.get_high()));
+}
+static inline Vec4d erfc(Vec4d const& x)
+{  // error function complement
+  return Vec4d(erfc(x.get_low()), erfc(x.get_high()));
+}
+
+static inline Vec8f erfinv(Vec8f const& x)
+{  // inverse error function
+  return Vec8f(erfinv(x.get_low()), erfinv(x.get_high()));
+}
+static inline Vec4d erfinv(Vec4d const& x)
+{  // inverse error function
+  return Vec4d(erfinv(x.get_low()), erfinv(x.get_high()));
+}
+
+static inline Vec8f cdfnorm(Vec8f const& x)
+{  // cumulative normal distribution function
+  return Vec8f(cdfnorm(x.get_low()), cdfnorm(x.get_high()));
+}
+static inline Vec4d cdfnorm(Vec4d const& x)
+{  // cumulative normal distribution function
+  return Vec4d(cdfnorm(x.get_low()), cdfnorm(x.get_high()));
+}
+
+static inline Vec8f cdfnorminv(Vec8f const& x)
+{  // inverse cumulative normal distribution function
+  return Vec8f(cdfnorminv(x.get_low()), cdfnorminv(x.get_high()));
+}
+static inline Vec4d cdfnorminv(Vec4d const& x)
+{  // inverse cumulative normal distribution function
+  return Vec4d(cdfnorminv(x.get_low()), cdfnorminv(x.get_high()));
+}
+
+// complex exponential function (real part in even numbered elements, imaginary part in odd numbered elements)
+static inline Vec8f cexp(Vec8f const& x)
+{  // complex exponential function
+  return Vec8f(cexp(x.get_low()), cexp(x.get_high()));
+}
+static inline Vec4d cexp(Vec4d const& x)
+{  // complex exponential function
+  return Vec4d(cexp(x.get_low()), cexp(x.get_high()));
+}
+
+#endif  // VECTORF256_H == 1
+
+#ifdef VCL_NAMESPACE
+}
+#endif
+
+#endif  // VECTORMATH_LIB_H
diff --git a/src/vectorclass/vectormath_trig.h b/src/vectorclass/vectormath_trig.h
new file mode 100644
index 0000000000000000000000000000000000000000..1135dc0f0f49e7c6b546759f534732e496541000
--- /dev/null
+++ b/src/vectorclass/vectormath_trig.h
@@ -0,0 +1,973 @@
+/****************************  vectormath_trig.h   ******************************
+ * Author:        Agner Fog
+ * Date created:  2014-04-18
+ * Last modified: 2016-05-02
+ * Version:       1.22
+ * Project:       vector classes
+ * Description:
+ * Header file containing inline version of trigonometric functions
+ * and inverse trigonometric functions
+ * sin, cos, sincos, tan
+ * asin, acos, atan, atan2
+ *
+ * Theory, methods and inspiration based partially on these sources:
+ * > Moshier, Stephen Lloyd Baluk: Methods and programs for mathematical functions.
+ *   Ellis Horwood, 1989.
+ * > VDT library developed at CERN by Danilo Piparo, Thomas Hauth and
+ *   Vincenzo Innocente, 2012, https://svnweb.cern.ch/trac/vdt
+ * > Cephes math library by Stephen L. Moshier 1992,
+ *   http://www.netlib.org/cephes/
+ *
+ * For detailed instructions, see vectormath_common.h and VectorClass.pdf
+ *
+ * (c) Copyright 2014-2016 GNU General Public License http://www.gnu.org/licenses
+ ******************************************************************************/
+
+#ifndef VECTORMATH_TRIG_H
+#define VECTORMATH_TRIG_H 1
+
+#include "vectormath_common.h"
+
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE
+{
+#endif
+
+// Different overloaded functions for template resolution.
+// These are used to work around the problem that the quadrant index is a
+// vector of 32-bit integers, whose element size does not match that of the
+// 64-bit double precision vector:
+// VTYPE | ITYPE | ITYPEH
+// -----------------------
+// Vec2d | Vec2q | Vec4i
+// Vec4d | Vec4q | Vec4i
+// Vec8d | Vec8q | Vec8i
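+//
+// For example (illustrative): a Vec4d argument yields a Vec4i quadrant index
+// (four 32-bit integers), which is later widened to a Vec4q (four 64-bit
+// integers) so that it can be combined bitwise with the 64-bit double elements
+// when the sign of the result is set.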
+
+// define overloaded truncate functions
+static inline Vec4i vm_truncate_low_to_int(Vec2d const& x) { return truncate_to_int(x, x); }
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec4i vm_truncate_low_to_int(Vec4d const& x) { return truncate_to_int(x); }
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec8i vm_truncate_low_to_int(Vec8d const& x) { return truncate_to_int(x); }
+#endif  // MAX_VECTOR_SIZE >= 512
+
+// define int -> double conversions
+template <class VTYPE, class ITYPE>
+static inline VTYPE vm_half_int_vector_to_double(ITYPE const& x);
+
+template <>
+inline Vec2d vm_half_int_vector_to_double<Vec2d, Vec4i>(Vec4i const& x)
+{
+  return to_double_low(x);
+}
+
+#if MAX_VECTOR_SIZE >= 256
+template <>
+inline Vec4d vm_half_int_vector_to_double<Vec4d, Vec4i>(Vec4i const& x)
+{
+  return to_double(x);
+}
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+template <>
+inline Vec8d vm_half_int_vector_to_double<Vec8d, Vec8i>(Vec8i const& x)
+{
+  return to_double(x);
+}
+#endif  // MAX_VECTOR_SIZE >= 512
+
+// define int32_t to int64_t conversions
+template <class ITYPE, class ITYPEH>
+static inline ITYPE vm_half_int_vector_to_full(ITYPEH const& x);
+
+template <>
+inline Vec2q vm_half_int_vector_to_full<Vec2q, Vec4i>(Vec4i const& x)
+{
+  return extend_low(x);
+}
+
+#if MAX_VECTOR_SIZE >= 256
+template <>
+inline Vec4q vm_half_int_vector_to_full<Vec4q, Vec4i>(Vec4i const& x)
+{
+  return extend_low(Vec8i(x, x));
+}
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+template <>
+inline Vec8q vm_half_int_vector_to_full<Vec8q, Vec8i>(Vec8i const& x)
+{
+  return extend_low(Vec16i(x, x));
+}
+#endif  // MAX_VECTOR_SIZE >= 512
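+
+// Illustrative sketch (not part of the library): how the helpers above chain together
+// for the Vec4d row of the table, assuming MAX_VECTOR_SIZE >= 256. The concrete input
+// values are placeholders for illustration only.
+/*
+  Vec4d xa = abs(Vec4d(0.1, 1.0, 2.0, 3.0));               // |x| in radians
+  Vec4i q  = vm_truncate_low_to_int(xa * (4. / VM_PI));    // quadrant index, 32-bit lanes
+  Vec4d y  = vm_half_int_vector_to_double<Vec4d>(q);       // same index converted to double
+  Vec4q qq = vm_half_int_vector_to_full<Vec4q, Vec4i>(q);  // widened to 64-bit lanes for sign bits
+*/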
+
+// *************************************************************
+//             sincos template, double precision
+// *************************************************************
+// Template parameters:
+// VTYPE:  f.p. vector type
+// ITYPE:  integer vector type with same element size
+// ITYPEH: integer vector type with half the element size
+// BVTYPE: boolean vector type
+// SC:     1 = sin, 2 = cos, 3 = sincos
+// Parameters:
+// xx = input x (radians)
+// cosret = return pointer (only if SC = 3)
+template <class VTYPE, class ITYPE, class ITYPEH, class BVTYPE, int SC>
+static inline VTYPE sincos_d(VTYPE* cosret, VTYPE const& xx)
+{
+  // define constants
+  const double ONEOPIO4 = 4. / VM_PI;
+
+  const double P0sin = -1.66666666666666307295E-1;
+  const double P1sin = 8.33333333332211858878E-3;
+  const double P2sin = -1.98412698295895385996E-4;
+  const double P3sin = 2.75573136213857245213E-6;
+  const double P4sin = -2.50507477628578072866E-8;
+  const double P5sin = 1.58962301576546568060E-10;
+
+  const double P0cos = 4.16666666666665929218E-2;
+  const double P1cos = -1.38888888888730564116E-3;
+  const double P2cos = 2.48015872888517045348E-5;
+  const double P3cos = -2.75573141792967388112E-7;
+  const double P4cos = 2.08757008419747316778E-9;
+  const double P5cos = -1.13585365213876817300E-11;
+
+  const double DP1 = 7.853981554508209228515625E-1;
+  const double DP2 = 7.94662735614792836714E-9;
+  const double DP3 = 3.06161699786838294307E-17;
+  /*
+  const double DP1sc = 7.85398125648498535156E-1;
+  const double DP2sc = 3.77489470793079817668E-8;
+  const double DP3sc = 2.69515142907905952645E-15;
+  */
+  VTYPE xa, x, y, x2, s, c, sin1, cos1;  // data vectors
+  ITYPEH q;                              // integer vectors, 32 bit
+  ITYPE qq, signsin, signcos;            // integer vectors, 64 bit
+  BVTYPE swap, overflow;                 // boolean vectors
+
+  xa = abs(xx);
+
+  // Find quadrant
+  //      0 -   pi/4 => 0
+  //   pi/4 - 3*pi/4 => 2
+  // 3*pi/4 - 5*pi/4 => 4
+  // 5*pi/4 - 7*pi/4 => 6
+  // 7*pi/4 - 8*pi/4 => 8
+
+  // truncate to integer (magic number conversion is not faster here)
+  q = vm_truncate_low_to_int(xa * ONEOPIO4);
+  q = (q + 1) & ~1;
+
+  y = vm_half_int_vector_to_double<VTYPE>(q);  // quadrant, as double
+
+  // Reduce by extended precision modular arithmetic
+  x = nmul_add(y, DP3, nmul_add(y, DP2, nmul_add(y, DP1, xa)));  // x = ((xa - y * DP1) - y * DP2) - y * DP3;
+
+  // Expansion of sin and cos, valid for -pi/4 <= x <= pi/4
+  x2 = x * x;
+  s  = polynomial_5(x2, P0sin, P1sin, P2sin, P3sin, P4sin, P5sin);
+  c  = polynomial_5(x2, P0cos, P1cos, P2cos, P3cos, P4cos, P5cos);
+  s  = mul_add(x * x2, s, x);                        // s = x + (x * x2) * s;
+  c  = mul_add(x2 * x2, c, nmul_add(x2, 0.5, 1.0));  // c = 1.0 - x2 * 0.5 + (x2 * x2) * c;
+
+  // correct for quadrant
+  qq   = vm_half_int_vector_to_full<ITYPE, ITYPEH>(q);
+  swap = BVTYPE((qq & 2) != 0);
+
+  // check for overflow
+  if(horizontal_or(q < 0))
+    {
+      overflow = (y < 0) & is_finite(xa);
+      s        = select(overflow, 0., s);
+      c        = select(overflow, 1., c);
+    }
+
+  if(SC & 1)
+    {  // calculate sin
+      sin1    = select(swap, c, s);
+      signsin = ((qq << 61) ^ ITYPE(reinterpret_i(xx))) & ITYPE(1ULL << 63);
+      sin1 ^= reinterpret_d(signsin);
+    }
+  if(SC & 2)
+    {  // calculate cos
+      cos1    = select(swap, s, c);
+      signcos = ((qq + 2) << 61) & (1ULL << 63);
+      cos1 ^= reinterpret_d(signcos);
+    }
+  if(SC == 3)
+    {  // calculate both. cos returned through pointer
+      *cosret = cos1;
+    }
+  if(SC & 1)
+    return sin1;
+  else
+    return cos1;
+}
+
+// instantiations of sincos_d template:
+
+static inline Vec2d sin(Vec2d const& x) { return sincos_d<Vec2d, Vec2q, Vec4i, Vec2db, 1>(0, x); }
+
+static inline Vec2d cos(Vec2d const& x) { return sincos_d<Vec2d, Vec2q, Vec4i, Vec2db, 2>(0, x); }
+
+static inline Vec2d sincos(Vec2d* cosret, Vec2d const& x) { return sincos_d<Vec2d, Vec2q, Vec4i, Vec2db, 3>(cosret, x); }
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec4d sin(Vec4d const& x) { return sincos_d<Vec4d, Vec4q, Vec4i, Vec4db, 1>(0, x); }
+
+static inline Vec4d cos(Vec4d const& x) { return sincos_d<Vec4d, Vec4q, Vec4i, Vec4db, 2>(0, x); }
+
+static inline Vec4d sincos(Vec4d* cosret, Vec4d const& x) { return sincos_d<Vec4d, Vec4q, Vec4i, Vec4db, 3>(cosret, x); }
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec8d sin(Vec8d const& x) { return sincos_d<Vec8d, Vec8q, Vec8i, Vec8db, 1>(0, x); }
+
+static inline Vec8d cos(Vec8d const& x) { return sincos_d<Vec8d, Vec8q, Vec8i, Vec8db, 2>(0, x); }
+
+static inline Vec8d sincos(Vec8d* cosret, Vec8d const& x) { return sincos_d<Vec8d, Vec8q, Vec8i, Vec8db, 3>(cosret, x); }
+#endif  // MAX_VECTOR_SIZE >= 512
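+
+// Illustrative usage (not part of the library): a minimal sketch of how the double
+// precision instantiations above might be called; variable names are placeholders.
+/*
+  Vec2d x(0.5 * VM_PI, VM_PI);   // two angles in radians
+  Vec2d c;
+  Vec2d s  = sincos(&c, x);      // one range reduction yields both sin(x) and cos(x)
+  Vec2d s2 = sin(x);             // same sine values, cosine discarded
+  Vec2d c2 = cos(x);             // same cosine values, sine discarded
+*/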
+
+// *************************************************************
+//             sincos template, single precision
+// *************************************************************
+// Template parameters:
+// VTYPE:  f.p. vector type
+// ITYPE:  integer vector type with same element size
+// BVTYPE: boolean vector type
+// SC:     1 = sin, 2 = cos, 3 = sincos, 4 = tan
+// Parameters:
+// xx = input x (radians)
+// cosret = return pointer (only if SC = 3)
+template <class VTYPE, class ITYPE, class BVTYPE, int SC>
+static inline VTYPE sincos_f(VTYPE* cosret, VTYPE const& xx)
+{
+  // define constants
+  const float ONEOPIO4f = (float)(4. / VM_PI);
+
+  const float DP1F = 0.78515625f;
+  const float DP2F = 2.4187564849853515625E-4f;
+  const float DP3F = 3.77489497744594108E-8f;
+
+  const float P0sinf = -1.6666654611E-1f;
+  const float P1sinf = 8.3321608736E-3f;
+  const float P2sinf = -1.9515295891E-4f;
+
+  const float P0cosf = 4.166664568298827E-2f;
+  const float P1cosf = -1.388731625493765E-3f;
+  const float P2cosf = 2.443315711809948E-5f;
+
+  VTYPE xa, x, y, x2, s, c, sin1, cos1;  // data vectors
+  ITYPE q, signsin, signcos;             // integer vectors
+  BVTYPE swap, overflow;                 // boolean vectors
+
+  xa = abs(xx);
+
+  // Find quadrant
+  //      0 -   pi/4 => 0
+  //   pi/4 - 3*pi/4 => 2
+  // 3*pi/4 - 5*pi/4 => 4
+  // 5*pi/4 - 7*pi/4 => 6
+  // 7*pi/4 - 8*pi/4 => 8
+  q = truncate_to_int(xa * ONEOPIO4f);
+  q = (q + 1) & ~1;
+
+  y = to_float(q);  // quadrant, as float
+
+  // Reduce by extended precision modular arithmetic
+  x = nmul_add(y, DP3F, nmul_add(y, DP2F, nmul_add(y, DP1F, xa)));  // x = ((xa - y * DP1F) - y * DP2F) - y * DP3F;
+
+  // A two-step reduction saves time at the cost of precision for very big x:
+  // x = (xa - y * DP1F) - y * (DP2F+DP3F);
+
+  // Taylor expansion of sin and cos, valid for -pi/4 <= x <= pi/4
+  x2 = x * x;
+  s  = polynomial_2(x2, P0sinf, P1sinf, P2sinf) * (x * x2) + x;
+  c  = polynomial_2(x2, P0cosf, P1cosf, P2cosf) * (x2 * x2) + nmul_add(0.5f, x2, 1.0f);
+
+  // correct for quadrant
+  swap = BVTYPE((q & 2) != 0);
+
+  // check for overflow
+  overflow = BVTYPE(q < 0);  // q = 0x80000000 if overflow
+  if(horizontal_or(overflow & is_finite(xa)))
+    {
+      s = select(overflow, 0.f, s);
+      c = select(overflow, 1.f, c);
+    }
+
+  if(SC & 5)
+    {  // calculate sin
+      sin1    = select(swap, c, s);
+      signsin = ((q << 29) ^ ITYPE(reinterpret_i(xx))) & ITYPE(1 << 31);
+      sin1 ^= reinterpret_f(signsin);
+    }
+  if(SC & 6)
+    {  // calculate cos
+      cos1    = select(swap, s, c);
+      signcos = ((q + 2) << 29) & (1 << 31);
+      cos1 ^= reinterpret_f(signcos);
+    }
+  if(SC == 1)
+    return sin1;
+  else if(SC == 2)
+    return cos1;
+  else if(SC == 3)
+    {  // calculate both. cos returned through pointer
+      *cosret = cos1;
+      return sin1;
+    }
+  else /*if (SC == 4)*/
+    return sin1 / cos1;
+}
+
+// instantiations of sincos_f template:
+
+static inline Vec4f sin(Vec4f const& x) { return sincos_f<Vec4f, Vec4i, Vec4fb, 1>(0, x); }
+
+static inline Vec4f cos(Vec4f const& x) { return sincos_f<Vec4f, Vec4i, Vec4fb, 2>(0, x); }
+
+static inline Vec4f sincos(Vec4f* cosret, Vec4f const& x) { return sincos_f<Vec4f, Vec4i, Vec4fb, 3>(cosret, x); }
+
+static inline Vec4f tan(Vec4f const& x) { return sincos_f<Vec4f, Vec4i, Vec4fb, 4>(0, x); }
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec8f sin(Vec8f const& x) { return sincos_f<Vec8f, Vec8i, Vec8fb, 1>(0, x); }
+
+static inline Vec8f cos(Vec8f const& x) { return sincos_f<Vec8f, Vec8i, Vec8fb, 2>(0, x); }
+
+static inline Vec8f sincos(Vec8f* cosret, Vec8f const& x) { return sincos_f<Vec8f, Vec8i, Vec8fb, 3>(cosret, x); }
+
+static inline Vec8f tan(Vec8f const& x) { return sincos_f<Vec8f, Vec8i, Vec8fb, 4>(0, x); }
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec16f sin(Vec16f const& x) { return sincos_f<Vec16f, Vec16i, Vec16fb, 1>(0, x); }
+
+static inline Vec16f cos(Vec16f const& x) { return sincos_f<Vec16f, Vec16i, Vec16fb, 2>(0, x); }
+
+static inline Vec16f sincos(Vec16f* cosret, Vec16f const& x) { return sincos_f<Vec16f, Vec16i, Vec16fb, 3>(cosret, x); }
+
+static inline Vec16f tan(Vec16f const& x) { return sincos_f<Vec16f, Vec16i, Vec16fb, 4>(0, x); }
+#endif  // MAX_VECTOR_SIZE >= 512
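+
+// Illustrative usage (not part of the library): single precision sketch with
+// placeholder values. Note that tan is generated from the same template with
+// SC = 4 and is therefore evaluated as sin(x) / cos(x).
+/*
+  Vec4f x(0.0f, 0.25f * (float)VM_PI, 1.0f, 2.0f);
+  Vec4f c;
+  Vec4f s = sincos(&c, x);       // sin and cos from a single range reduction
+  Vec4f t = tan(x);              // internally sin(x) / cos(x)
+*/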
+
+// *************************************************************
+//             tan template, double precision
+// *************************************************************
+// Template parameters:
+// VTYPE:  f.p. vector type
+// ITYPE:  integer vector type with same element size
+// ITYPEH: integer vector type with half the element size
+// BVTYPE: boolean vector type
+// Parameters:
+// x = input x (radians)
+template <class VTYPE, class ITYPE, class ITYPEH, class BVTYPE>
+static inline VTYPE tan_d(VTYPE const& x)
+{
+  // define constants
+  const double ONEOPIO4 = 4. / VM_PI;
+
+  const double DP1 = 7.853981554508209228515625E-1;
+  const double DP2 = 7.94662735614792836714E-9;
+  const double DP3 = 3.06161699786838294307E-17;
+
+  const double P2tan = -1.30936939181383777646E4;
+  const double P1tan = 1.15351664838587416140E6;
+  const double P0tan = -1.79565251976484877988E7;
+
+  const double Q3tan = 1.36812963470692954678E4;
+  const double Q2tan = -1.32089234440210967447E6;
+  const double Q1tan = 2.50083801823357915839E7;
+  const double Q0tan = -5.38695755929454629881E7;
+
+  VTYPE xa, y, z, zz, px, qx, tn, recip;  // data vectors
+  ITYPEH q;                               // integer vector, 32 bit
+  ITYPE qq;                               // integer vector, 64 bit
+  BVTYPE doinvert, xzero, overflow;       // boolean vectors
+
+  xa = abs(x);
+
+  // Find quadrant
+  //      0 -   pi/4 => 0
+  //   pi/4 - 3*pi/4 => 2
+  // 3*pi/4 - 5*pi/4 => 4
+  // 5*pi/4 - 7*pi/4 => 6
+  // 7*pi/4 - 8*pi/4 => 8
+
+  q = vm_truncate_low_to_int(xa * ONEOPIO4);
+  q = (q + 1) & ~1;
+
+  y = vm_half_int_vector_to_double<VTYPE>(q);  // quadrant, as double
+
+  // Reduce by extended precision modular arithmetic
+  z = nmul_add(y, DP3, nmul_add(y, DP2, nmul_add(y, DP1, xa)));  // z = ((xa - y * DP1) - y * DP2) - y * DP3;
+
+  // Pade expansion of tan, valid for -pi/4 <= x <= pi/4
+  zz = z * z;
+  px = polynomial_2(zz, P0tan, P1tan, P2tan);
+  qx = polynomial_4n(zz, Q0tan, Q1tan, Q2tan, Q3tan);
+
+  // qx cannot be 0 for x <= pi/4
+  tn = mul_add(px / qx, z * zz, z);  // tn = z + z * zz * px / qx;
+
+  // if (q&2) tn = -1/tn
+  qq       = vm_half_int_vector_to_full<ITYPE, ITYPEH>(q);
+  doinvert = BVTYPE((qq & 2) != 0);
+  xzero    = (xa == 0.);
+  // avoid division by 0. We will not be using recip anyway if xa == 0.
+  // tn never becomes exactly 0 when x = pi/2 so we only have to make
+  // a special case for x == 0.
+  recip = (-1.) / select(xzero, VTYPE(-1.), tn);
+  tn    = select(doinvert, recip, tn);
+  tn    = sign_combine(tn, x);  // get original sign
+
+  // check for overflow
+  if(horizontal_or(q < 0))
+    {
+      overflow = (y < 0) & is_finite(xa);
+      tn       = select(overflow, 0., tn);
+    }
+
+  return tn;
+}
+
+// instantiations of tan_d template:
+
+static inline Vec2d tan(Vec2d const& x) { return tan_d<Vec2d, Vec2q, Vec4i, Vec2db>(x); }
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec4d tan(Vec4d const& x) { return tan_d<Vec4d, Vec4q, Vec4i, Vec4db>(x); }
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec8d tan(Vec8d const& x) { return tan_d<Vec8d, Vec8q, Vec8i, Vec8db>(x); }
+#endif  // MAX_VECTOR_SIZE >= 512
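+
+// Illustrative check (not part of the library): unlike the single precision version,
+// double precision tan uses the Pade expansion above rather than sin/cos, but the two
+// routes should agree to within rounding error. Values are placeholders.
+/*
+  Vec2d x(0.3, 1.2);
+  Vec2d c;
+  Vec2d s    = sincos(&c, x);
+  Vec2d diff = tan(x) - s / c;   // expected to be on the order of 1e-16
+*/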
+
+/*
+This is removed for the single precision version.
+It is faster to use tan(x) = sin(x)/cos(x)
+
+// *************************************************************
+//             tan template, single precision
+// *************************************************************
+// Template parameters:
+// VTYPE:  f.p. vector type
+// ITYPE:  integer vector type with same element size
+// BVTYPE: boolean vector type
+// Parameters:
+// x = input x (radians)
+template<class VTYPE, class ITYPE, class BVTYPE>
+static inline VTYPE tan_f(VTYPE const & x) {
+
+    // define constants
+    const float ONEOPIO4f = (float)(4./VM_PI);
+
+    const float DP1F = 0.78515625f;
+    const float DP2F = 2.4187564849853515625E-4f;
+    const float DP3F = 3.77489497744594108E-8f;
+
+    const float P5tanf = 9.38540185543E-3f;
+    const float P4tanf = 3.11992232697E-3f;
+    const float P3tanf = 2.44301354525E-2f;
+    const float P2tanf = 5.34112807005E-2f;
+    const float P1tanf = 1.33387994085E-1f;
+    const float P0tanf = 3.33331568548E-1f;
+
+    VTYPE  xa, y, z, zz, tn, recip;  // data vectors
+    ITYPE  q;                        // integer vector
+    BVTYPE doinvert, xzero;          // boolean vectors
+
+    xa = abs(x);
+
+    // Find quadrant
+    //      0 -   pi/4 => 0
+    //   pi/4 - 3*pi/4 => 2
+    // 3*pi/4 - 5*pi/4 => 4
+    // 5*pi/4 - 7*pi/4 => 6
+    // 7*pi/4 - 8*pi/4 => 8
+    q = truncate_to_int(xa * ONEOPIO4f);
+    q = (q + 1) & ~1;
+
+    y = to_float(q);             // quadrant, as float
+
+    // Reduce by extended precision modular arithmetic
+    z = ((xa - y * DP1F) - y * DP2F) - y * DP3F;
+    //z = (xa - y * DP1F) - y * (DP2F + DP3F);
+    zz = z * z;
+
+    // Taylor expansion
+    tn = polynomial_5(zz, P0tanf, P1tanf, P2tanf, P3tanf, P4tanf, P5tanf) * (zz * z) + z;
+
+    // if (q&2) tn = -1/tn
+    doinvert = (q & 2) != 0;
+    xzero = (xa == 0.f);
+    // avoid division by 0. We will not be using recip anyway if xa == 0.
+    // tn never becomes exactly 0 when x = pi/2 so we only have to make
+    // a special case for x == 0.
+    recip = (-1.f) / select(xzero, VTYPE(-1.f), tn);
+    tn = select(doinvert, recip, tn);
+    tn = sign_combine(tn, x);          // get original sign
+
+    return tn;
+}
+
+// instantiations of tan_f template:
+
+static inline Vec4f tan(Vec4f const & x) {
+    return tan_f<Vec4f, Vec4i, Vec4fb>(x);
+}
+
+static inline Vec8f tan(Vec8f const & x) {
+    return tan_f<Vec8f, Vec8i, Vec8fb>(x);
+}
+*/
+
+// *************************************************************
+//             asin/acos template, double precision
+// *************************************************************
+// Template parameters:
+// VTYPE:  f.p. vector type
+// BVTYPE: boolean vector type
+// AC: 0 = asin, 1 = acos
+// Parameters:
+// x = input x
+template <class VTYPE, class BVTYPE, int AC>
+static inline VTYPE asin_d(VTYPE const& x)
+{
+  // define constants
+  const double R4asin = 2.967721961301243206100E-3;
+  const double R3asin = -5.634242780008963776856E-1;
+  const double R2asin = 6.968710824104713396794E0;
+  const double R1asin = -2.556901049652824852289E1;
+  const double R0asin = 2.853665548261061424989E1;
+
+  const double S3asin = -2.194779531642920639778E1;
+  const double S2asin = 1.470656354026814941758E2;
+  const double S1asin = -3.838770957603691357202E2;
+  const double S0asin = 3.424398657913078477438E2;
+
+  const double P5asin = 4.253011369004428248960E-3;
+  const double P4asin = -6.019598008014123785661E-1;
+  const double P3asin = 5.444622390564711410273E0;
+  const double P2asin = -1.626247967210700244449E1;
+  const double P1asin = 1.956261983317594739197E1;
+  const double P0asin = -8.198089802484824371615E0;
+
+  const double Q4asin = -1.474091372988853791896E1;
+  const double Q3asin = 7.049610280856842141659E1;
+  const double Q2asin = -1.471791292232726029859E2;
+  const double Q1asin = 1.395105614657485689735E2;
+  const double Q0asin = -4.918853881490881290097E1;
+
+  VTYPE xa, xb, x1, x2, x3, x4, x5, px, qx, rx, sx, vx, wx, y1, yb, z, z1, z2;
+  BVTYPE big;
+  bool dobig, dosmall;
+
+  xa  = abs(x);
+  big = xa >= 0.625;
+
+  /*
+  Small: xa < 0.625
+  ------------------
+  x = xa * xa;
+  px = PX(x);
+  qx = QX(x);
+  y1 = x*px/qx;
+  y1 = xa * y1 + xa;
+
+  Big: xa >= 0.625
+  ------------------
+  x = 1.0 - xa;
+  rx = RX(x);
+  sx = SX(x);
+  y1 = x * rx/sx;
+  x3 = sqrt(x+x);
+  y3 = x3 * y1 - MOREBITS;
+  z = pi/2 - x3 - y3
+  */
+
+  // select a common x for all polynomials
+  // This allows sharing of powers of x through common subexpression elimination
+  x1 = select(big, 1.0 - xa, xa * xa);
+
+  // calculate powers of x1 outside branches to make sure they are only calculated once
+  x2 = x1 * x1;
+  x4 = x2 * x2;
+  x5 = x4 * x1;
+  x3 = x2 * x1;
+
+  dosmall = !horizontal_and(big);  // at least one element is small
+  dobig   = horizontal_or(big);    // at least one element is big
+
+  // calculate polynomials (reuse powers of x)
+  if(dosmall)
+    {
+      // px = polynomial_5 (x1, P0asin, P1asin, P2asin, P3asin, P4asin, P5asin);
+      // qx = polynomial_5n(x1, Q0asin, Q1asin, Q2asin, Q3asin, Q4asin);
+      px = mul_add(x3, P3asin, P0asin) + mul_add(x4, P4asin, x1 * P1asin) + mul_add(x5, P5asin, x2 * P2asin);
+      qx = mul_add(x4, Q4asin, x5) + mul_add(x3, Q3asin, x1 * Q1asin) + mul_add(x2, Q2asin, Q0asin);
+    }
+  if(dobig)
+    {
+      // rx = polynomial_4 (x1, R0asin, R1asin, R2asin, R3asin, R4asin);
+      // sx = polynomial_4n(x1, S0asin, S1asin, S2asin, S3asin);
+      rx = mul_add(x3, R3asin, x2 * R2asin) + mul_add(x4, R4asin, mul_add(x1, R1asin, R0asin));
+      sx = mul_add(x3, S3asin, x4) + mul_add(x2, S2asin, mul_add(x1, S1asin, S0asin));
+    }
+
+  // select and divide outside branches to avoid dividing twice
+  vx = select(big, rx, px);
+  wx = select(big, sx, qx);
+  y1 = vx / wx * x1;
+
+  // results for big
+  if(dobig)
+    {                            // avoid square root if all are small
+      xb = sqrt(x1 + x1);        // this produces NAN if xa > 1 so we don't need a special case for xa > 1
+      z1 = mul_add(xb, y1, xb);  // yb = xb * y1; z1 = xb + yb;
+    }
+
+  // results for small
+  z2 = mul_add(xa, y1, xa);  // z2 = xa * y1 + xa;
+
+  // correct for sign
+  if(AC)
+    {  // acos
+      z1 = select(x < 0., VM_PI - z1, z1);
+      z2 = VM_PI_2 - sign_combine(z2, x);
+      z  = select(big, z1, z2);
+    }
+  else
+    {  // asin
+      z1 = VM_PI_2 - z1;
+      z  = select(big, z1, z2);
+      z  = sign_combine(z, x);
+    }
+  return z;
+}
+
+// instantiations of asin_d template:
+
+static inline Vec2d asin(Vec2d const& x) { return asin_d<Vec2d, Vec2db, 0>(x); }
+
+static inline Vec2d acos(Vec2d const& x) { return asin_d<Vec2d, Vec2db, 1>(x); }
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec4d asin(Vec4d const& x) { return asin_d<Vec4d, Vec4db, 0>(x); }
+
+static inline Vec4d acos(Vec4d const& x) { return asin_d<Vec4d, Vec4db, 1>(x); }
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec8d asin(Vec8d const& x) { return asin_d<Vec8d, Vec8db, 0>(x); }
+
+static inline Vec8d acos(Vec8d const& x) { return asin_d<Vec8d, Vec8db, 1>(x); }
+#endif  // MAX_VECTOR_SIZE >= 512
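+
+// Illustrative usage (not part of the library): asin/acos sketch with placeholder
+// inputs in the valid domain [-1, 1]; inputs outside this range produce NAN.
+/*
+  Vec2d v   = Vec2d(-0.5, 1.0);
+  Vec2d a   = asin(v);           // -pi/6, pi/2
+  Vec2d b   = acos(v);           // 2*pi/3, 0
+  Vec2d sum = a + b;             // asin(v) + acos(v) == pi/2 per element, up to rounding
+*/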
+
+// *************************************************************
+//             asin/acos template, single precision
+// *************************************************************
+// Template parameters:
+// VTYPE:  f.p. vector type
+// BVTYPE: boolean vector type
+// AC: 0 = asin, 1 = acos
+// Parameters:
+// x = input x
+template <class VTYPE, class BVTYPE, int AC>
+static inline VTYPE asin_f(VTYPE const& x)
+{
+  // define constants
+  const float P4asinf = 4.2163199048E-2f;
+  const float P3asinf = 2.4181311049E-2f;
+  const float P2asinf = 4.5470025998E-2f;
+  const float P1asinf = 7.4953002686E-2f;
+  const float P0asinf = 1.6666752422E-1f;
+
+  VTYPE xa, x1, x2, x3, x4, xb, z, z1, z2;
+  BVTYPE big;
+
+  xa  = abs(x);
+  big = xa > 0.5f;
+
+  x1 = 0.5f * (1.0f - xa);
+  x2 = xa * xa;
+  x3 = select(big, x1, x2);
+
+  // if (horizontal_or(big))
+  {
+    xb = sqrt(x1);
+  }
+  x4 = select(big, xb, xa);
+
+  z  = polynomial_4(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
+  z  = mul_add(z, x3 * x4, x4);  // z = z * (x3*x4) + x4;
+  z1 = z + z;
+
+  // correct for sign
+  if(AC)
+    {  // acos
+      z1 = select(x < 0., float(VM_PI) - z1, z1);
+      z2 = float(VM_PI_2) - sign_combine(z, x);
+      z  = select(big, z1, z2);
+    }
+  else
+    {  // asin
+      z1 = float(VM_PI_2) - z1;
+      z  = select(big, z1, z);
+      z  = sign_combine(z, x);
+    }
+
+  return z;
+}
+
+// instantiations of asin_f template:
+
+static inline Vec4f asin(Vec4f const& x) { return asin_f<Vec4f, Vec4fb, 0>(x); }
+
+static inline Vec4f acos(Vec4f const& x) { return asin_f<Vec4f, Vec4fb, 1>(x); }
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec8f asin(Vec8f const& x) { return asin_f<Vec8f, Vec8fb, 0>(x); }
+static inline Vec8f acos(Vec8f const& x) { return asin_f<Vec8f, Vec8fb, 1>(x); }
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec16f asin(Vec16f const& x) { return asin_f<Vec16f, Vec16fb, 0>(x); }
+static inline Vec16f acos(Vec16f const& x) { return asin_f<Vec16f, Vec16fb, 1>(x); }
+#endif  // MAX_VECTOR_SIZE >= 512
+
+// *************************************************************
+//             atan template, double precision
+// *************************************************************
+// Template parameters:
+// VTYPE:  f.p. vector type
+// BVTYPE: boolean vector type
+// T2:     0 = atan, 1 = atan2
+// Parameters:
+// y, x. calculate atan(y/x)
+// result is between -pi/2 and +pi/2 when x > 0
+// result is between -pi and -pi/2 or between pi/2 and pi when x < 0 for atan2
+// atan2(0,0) returns 0 by convention (see the select() near the end)
+template <class VTYPE, class BVTYPE, int T2>
+static inline VTYPE atan_d(VTYPE const& y, VTYPE const& x)
+{
+  // define constants
+  // const double ONEOPIO4 = 4./VM_PI;
+  const double MOREBITS   = 6.123233995736765886130E-17;
+  const double MOREBITSO2 = MOREBITS * 0.5;
+  const double T3PO8      = VM_SQRT2 + 1.;  // 2.41421356237309504880;
+
+  const double P4atan = -8.750608600031904122785E-1;
+  const double P3atan = -1.615753718733365076637E1;
+  const double P2atan = -7.500855792314704667340E1;
+  const double P1atan = -1.228866684490136173410E2;
+  const double P0atan = -6.485021904942025371773E1;
+
+  const double Q4atan = 2.485846490142306297962E1;
+  const double Q3atan = 1.650270098316988542046E2;
+  const double Q2atan = 4.328810604912902668951E2;
+  const double Q1atan = 4.853903996359136964868E2;
+  const double Q0atan = 1.945506571482613964425E2;
+
+  VTYPE t, x1, x2, y1, y2, s, fac, a, b, z, zz, px, qx, re;  // data vectors
+  BVTYPE swapxy, notbig, notsmal;                            // boolean vectors
+
+  if(T2)
+    {  // atan2(y,x)
+      // move in first octant
+      x1     = abs(x);
+      y1     = abs(y);
+      swapxy = (y1 > x1);
+      // swap x and y if y1 > x1
+      x2 = select(swapxy, y1, x1);
+      y2 = select(swapxy, x1, y1);
+      t  = y2 / x2;  // x = y = 0 gives NAN here
+    }
+  else
+    {  // atan(y)
+      t = abs(y);
+    }
+
+  // small:  t < 0.66
+  // medium: 0.66 <= t <= 2.4142 (1+sqrt(2))
+  // big:    t > 2.4142
+  notbig  = t <= T3PO8;  // t <= 2.4142
+  notsmal = t >= 0.66;   // t >= 0.66
+
+  s   = select(notbig, VTYPE(VM_PI_4), VTYPE(VM_PI_2));
+  s   = notsmal & s;  // select(notsmal, s, 0.);
+  fac = select(notbig, VTYPE(MOREBITSO2), VTYPE(MOREBITS));
+  fac = notsmal & fac;  // select(notsmal, fac, 0.);
+
+  // small:  z = t / 1.0;
+  // medium: z = (t-1.0) / (t+1.0);
+  // big:    z = -1.0 / t;
+  a = notbig & t;  // select(notbig, t, 0.);
+  a = if_add(notsmal, a, -1.);
+  b = notbig & VTYPE(1.);  //  select(notbig, 1., 0.);
+  b = if_add(notsmal, b, t);
+  z = a / b;  // division by 0 will not occur unless x and y are both 0
+
+  zz = z * z;
+
+  px = polynomial_4(zz, P0atan, P1atan, P2atan, P3atan, P4atan);
+  qx = polynomial_5n(zz, Q0atan, Q1atan, Q2atan, Q3atan, Q4atan);
+
+  re = mul_add(px / qx, z * zz, z);  // re = (px / qx) * (z * zz) + z;
+  re += s + fac;
+
+  if(T2)
+    {  // atan2(y,x)
+      // move back in place
+      re = select(swapxy, VM_PI_2 - re, re);
+      re = select(x < 0., VM_PI - re, re);
+      re = select((x | y) == 0., 0., re);  // atan2(0,0) = 0 by convention
+    }
+  // get sign bit
+  re = sign_combine(re, y);
+
+  return re;
+}
+
+// instantiations of atan_d template:
+
+static inline Vec2d atan2(Vec2d const& y, Vec2d const& x) { return atan_d<Vec2d, Vec2db, 1>(y, x); }
+
+static inline Vec2d atan(Vec2d const& y) { return atan_d<Vec2d, Vec2db, 0>(y, 0.); }
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec4d atan2(Vec4d const& y, Vec4d const& x) { return atan_d<Vec4d, Vec4db, 1>(y, x); }
+
+static inline Vec4d atan(Vec4d const& y) { return atan_d<Vec4d, Vec4db, 0>(y, 0.); }
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec8d atan2(Vec8d const& y, Vec8d const& x) { return atan_d<Vec8d, Vec8db, 1>(y, x); }
+
+static inline Vec8d atan(Vec8d const& y) { return atan_d<Vec8d, Vec8db, 0>(y, 0.); }
+#endif  // MAX_VECTOR_SIZE >= 512
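+
+// Illustrative usage (not part of the library): recovering an angle from its
+// cosine/sine components with the atan2 instantiations above. Values are placeholders.
+/*
+  Vec2d phi(0.75 * VM_PI, -0.25 * VM_PI);
+  Vec2d c;
+  Vec2d s    = sincos(&c, phi);
+  Vec2d phi2 = atan2(s, c);      // reproduces phi, including the quadrant
+  Vec2d t    = atan(s / c);      // loses the quadrant when c < 0 (gives -pi/4 for 3*pi/4)
+*/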
+
+// *************************************************************
+//             atan template, single precision
+// *************************************************************
+// Template parameters:
+// VTYPE:  f.p. vector type
+// BVTYPE: boolean vector type
+// T2:     0 = atan, 1 = atan2
+// Parameters:
+// y, x. calculate atan(y/x)
+// result is between -pi/2 and +pi/2 when x > 0
+// result is between -pi and -pi/2 or between pi/2 and pi when x < 0 for atan2
+// atan2(0,0) returns 0 by convention (see the select() near the end)
+template <class VTYPE, class BVTYPE, int T2>
+static inline VTYPE atan_f(VTYPE const& y, VTYPE const& x)
+{
+  // define constants
+  const float P3atanf = 8.05374449538E-2f;
+  const float P2atanf = -1.38776856032E-1f;
+  const float P1atanf = 1.99777106478E-1f;
+  const float P0atanf = -3.33329491539E-1f;
+
+  VTYPE t, x1, x2, y1, y2, s, a, b, z, zz, re;  // data vectors
+  BVTYPE swapxy, notbig, notsmal;               // boolean vectors
+
+  if(T2)
+    {  // atan2(y,x)
+      // move in first octant
+      x1     = abs(x);
+      y1     = abs(y);
+      swapxy = (y1 > x1);
+      // swap x and y if y1 > x1
+      x2 = select(swapxy, y1, x1);
+      y2 = select(swapxy, x1, y1);
+
+      // do we need to protect against x = y = 0? It will just produce NAN, probably without delay
+      t = y2 / x2;
+    }
+  else
+    {  // atan(y)
+      t = abs(y);
+    }
+
+  // small:  t < 0.4142
+  // medium: 0.4142 <= t <= 2.4142
+  // big:    t > 2.4142  (not for atan2)
+  if(!T2)
+    {                                       // atan(y)
+      notsmal = t >= float(VM_SQRT2 - 1.);  // t >= tan  pi/8
+      notbig  = t <= float(VM_SQRT2 + 1.);  // t <= tan 3pi/8
+
+      s = select(notbig, VTYPE(float(VM_PI_4)), VTYPE(float(VM_PI_2)));
+      s = notsmal & s;  // select(notsmal, s, 0.);
+
+      // small:  z = t / 1.0;
+      // medium: z = (t-1.0) / (t+1.0);
+      // big:    z = -1.0 / t;
+      a = notbig & t;  // select(notbig, t, 0.);
+      a = if_add(notsmal, a, -1.f);
+      b = notbig & VTYPE(1.f);  //  select(notbig, 1., 0.);
+      b = if_add(notsmal, b, t);
+      z = a / b;  // division by 0 will not occur unless x and y are both 0
+    }
+  else
+    {  // atan2(y,x)
+      // small:  z = t / 1.0;
+      // medium: z = (t-1.0) / (t+1.0);
+      notsmal = t >= float(VM_SQRT2 - 1.);
+      a       = if_add(notsmal, t, -1.f);
+      b       = if_add(notsmal, 1.f, t);
+      s       = notsmal & VTYPE(float(VM_PI_4));
+      z       = a / b;
+    }
+
+  zz = z * z;
+
+  // Taylor expansion
+  re = polynomial_3(zz, P0atanf, P1atanf, P2atanf, P3atanf);
+  re = mul_add(re, zz * z, z) + s;
+
+  if(T2)
+    {  // atan2(y,x)
+      // move back in place
+      re = select(swapxy, float(VM_PI_2) - re, re);
+      re = select(x < 0., float(VM_PI) - re, re);
+      re = select((x | y) == 0.f, 0.f, re);  // atan2(0,0) = 0 by convention
+    }
+  // get sign bit
+  re = sign_combine(re, y);
+
+  return re;
+}
+
+// instantiations of atan_f template:
+
+static inline Vec4f atan2(Vec4f const& y, Vec4f const& x) { return atan_f<Vec4f, Vec4fb, 1>(y, x); }
+
+static inline Vec4f atan(Vec4f const& y) { return atan_f<Vec4f, Vec4fb, 0>(y, 0.); }
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec8f atan2(Vec8f const& y, Vec8f const& x) { return atan_f<Vec8f, Vec8fb, 1>(y, x); }
+
+static inline Vec8f atan(Vec8f const& y) { return atan_f<Vec8f, Vec8fb, 0>(y, 0.); }
+
+#endif  // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec16f atan2(Vec16f const& y, Vec16f const& x) { return atan_f<Vec16f, Vec16fb, 1>(y, x); }
+
+static inline Vec16f atan(Vec16f const& y) { return atan_f<Vec16f, Vec16fb, 0>(y, 0.); }
+
+#endif  // MAX_VECTOR_SIZE >= 512
+
+#ifdef VCL_NAMESPACE
+}
+#endif
+
+#endif  // VECTORMATH_TRIG_H