From e30e3fec7a7d72dc54238ac693dba9a7a3b82b8f Mon Sep 17 00:00:00 2001
From: "Marcel H. Schubert" <schubert@coll.mpg.de>
Date: Mon, 28 Jun 2021 16:18:51 +0200
Subject: [PATCH] fixed shell scripts; added 80-core processing

---
 Scripts/Preprocessing/preprocess.py                  |  9 +++++++--
 .../{preprocess_all.cmd => preprocess_all.sh}        |  2 +-
 ...{preprocess_creator.cmd => preprocess_creator.sh} |  8 ++++----
 ...{preprocess_manager.cmd => preprocess_manager.sh} |  8 ++++----
 ...process_performer.cmd => preprocess_performer.sh} | 12 ++++++------
 .../{preprocess_sports.cmd => preprocess_sports.sh}  |  8 ++++----
 6 files changed, 26 insertions(+), 21 deletions(-)
 rename Scripts/Preprocessing/{preprocess_all.cmd => preprocess_all.sh} (89%)
 rename Scripts/Preprocessing/{preprocess_creator.cmd => preprocess_creator.sh} (70%)
 rename Scripts/Preprocessing/{preprocess_manager.cmd => preprocess_manager.sh} (70%)
 rename Scripts/Preprocessing/{preprocess_performer.cmd => preprocess_performer.sh} (65%)
 rename Scripts/Preprocessing/{preprocess_sports.cmd => preprocess_sports.sh} (70%)

diff --git a/Scripts/Preprocessing/preprocess.py b/Scripts/Preprocessing/preprocess.py
index 033179ff..f7fab336 100644
--- a/Scripts/Preprocessing/preprocess.py
+++ b/Scripts/Preprocessing/preprocess.py
@@ -843,8 +843,13 @@ def _main(args):
     manager = mp.Manager()
     #create two queues to split work into two parts
     q = [manager.Queue(), manager.Queue()]
-
-    pool = mp.Pool(4, maxtasksperchild=1)
+    if not args['test']:
+        ncpus = mp.cpu_count()
+        if ncpus < 80:
+            print('failed to get 80 cpus - only got {}'.format(ncpus))
+    else:
+        ncpus = 4
+    pool = mp.Pool(ncpus, maxtasksperchild=1)
     #pool = mp.Pool(mp.cpu_count(), maxtasksperchild=1)
     print('create listener for saving of data...')
     sys.stdout.flush()
diff --git a/Scripts/Preprocessing/preprocess_all.cmd b/Scripts/Preprocessing/preprocess_all.sh
similarity index 89%
rename from Scripts/Preprocessing/preprocess_all.cmd
rename to Scripts/Preprocessing/preprocess_all.sh
index 193462bd..360ec2d0 100644
--- a/Scripts/Preprocessing/preprocess_all.cmd
+++ b/Scripts/Preprocessing/preprocess_all.sh
@@ -41,7 +41,7 @@ module load scikit-learn/0.24.1
 names=(creator performer sports)
 for i in ${names[@]}; do 
 	# Run the program:
-	srun -N 1 -n 1 python preprocess.py -p ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31 -f workset_$i.ndjson -s ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/preprocessed -c (1,5) -w (1,2) -t (1,3) -d (1,5) -o (1,3) --workset=workset --part=$i --rerun --both --asis --spacy --encase_list emoji emoticon
+	srun -N 1 -n 1 python preprocess.py -p ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31 -f workset_$i.ndjson -s ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/preprocessed -c "(1,5)" -w "(1,2)" -t "(1,3)" -d "(1,5)" -o "(1,3)" --workset=workset --part=$i --rerun --both --asis --spacy --encase_list emoji emoticon
 done
 wait
 echo "job finished"
diff --git a/Scripts/Preprocessing/preprocess_creator.cmd b/Scripts/Preprocessing/preprocess_creator.sh
similarity index 70%
rename from Scripts/Preprocessing/preprocess_creator.cmd
rename to Scripts/Preprocessing/preprocess_creator.sh
index 3b488e31..3912c3ad 100644
--- a/Scripts/Preprocessing/preprocess_creator.cmd
+++ b/Scripts/Preprocessing/preprocess_creator.sh
@@ -1,9 +1,9 @@
 #!/bin/bash -l
 
-typ=creator
+
 # Standard output and error:
 #SBATCH -o ./../../jobscripts/out/preprocess_${typ}_out.%j
-#SBATCH -e ./../../jobscripts/out/preprocess_${typ}_err_part_2.%j
+#SBATCH -e ./../../jobscripts/out/preprocess_${typ}.%j
 # Initial working directory:
 #SBATCH -D /ptmp/mschuber/PAN/Scripts/Preprocessing
 # Job Name:
@@ -36,10 +36,10 @@ module load anaconda/3/2020.02
 module load tensorflow/cpu/2.5.0
 module load scikit-learn/0.24.1
 
+typ=creator
 
 
 
-
-python preprocess.py -p ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31 -f workset_${typ}.ndjson -s ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/preprocessed -c "(1,5)" -w "(1,2)" -t "(1,3)" -d "(1,5)" -o "(1,3)" --workset=workset --part=${typ} --rerun --both --asis --spacy --encase_list emoji emoticon
+srun python preprocess.py -p ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31 -f workset_${typ}.ndjson -s ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/preprocessed -c "(1,5)" -w "(1,2)" -t "(1,3)" -d "(1,5)" -o "(1,3)" --workset=workset --part=${typ} --rerun --both --asis --spacy --encase_list emoji emoticon
 
 echo "job finished"
\ No newline at end of file
diff --git a/Scripts/Preprocessing/preprocess_manager.cmd b/Scripts/Preprocessing/preprocess_manager.sh
similarity index 70%
rename from Scripts/Preprocessing/preprocess_manager.cmd
rename to Scripts/Preprocessing/preprocess_manager.sh
index 33cb63cc..63d3b099 100644
--- a/Scripts/Preprocessing/preprocess_manager.cmd
+++ b/Scripts/Preprocessing/preprocess_manager.sh
@@ -1,9 +1,9 @@
 #!/bin/bash -l
 
-typ=manager
+
 # Standard output and error:
 #SBATCH -o ./../../jobscripts/out/preprocess_${typ}_out.%j
-#SBATCH -e ./../../jobscripts/out/preprocess_${typ}_err_part_2.%j
+#SBATCH -e ./../../jobscripts/out/preprocess_${typ}_err.%j
 # Initial working directory:
 #SBATCH -D /ptmp/mschuber/PAN/Scripts/Preprocessing
 # Job Name:
@@ -38,8 +38,8 @@ module load scikit-learn/0.24.1
 
 
 
+typ=manager
 
-
-python preprocess.py -p ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31 -f workset_${typ}.ndjson -s ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/preprocessed -c "(1,5)" -w "(1,2)" -t "(1,3)" -d "(1,5)" -o "(1,3)" --workset=workset --part=${typ} --rerun --both --asis --spacy --encase_list emoji emoticon
+srun python preprocess.py -p ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31 -f workset_${typ}.ndjson -s ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/preprocessed -c "(1,5)" -w "(1,2)" -t "(1,3)" -d "(1,5)" -o "(1,3)" --workset=workset --part=${typ} --rerun --both --asis --spacy --encase_list emoji emoticon
 
 echo "job finished"
\ No newline at end of file
diff --git a/Scripts/Preprocessing/preprocess_performer.cmd b/Scripts/Preprocessing/preprocess_performer.sh
similarity index 65%
rename from Scripts/Preprocessing/preprocess_performer.cmd
rename to Scripts/Preprocessing/preprocess_performer.sh
index 204f99b0..ae375ae0 100644
--- a/Scripts/Preprocessing/preprocess_performer.cmd
+++ b/Scripts/Preprocessing/preprocess_performer.sh
@@ -1,9 +1,8 @@
 #!/bin/bash -l
 
-typ=performer
 # Standard output and error:
 #SBATCH -o ./../../jobscripts/out/preprocess_${typ}_out.%j
-#SBATCH -e ./../../jobscripts/out/preprocess_${typ}_err_part_2.%j
+#SBATCH -e ./../../jobscripts/out/preprocess_${typ}_err.%j
 # Initial working directory:
 #SBATCH -D /ptmp/mschuber/PAN/Scripts/Preprocessing
 # Job Name:
@@ -20,9 +19,10 @@ typ=performer
 #
 #SBATCH --mem=180000
 #SBATCH --mail-type=none
-#SBATCH --mail-user=schubert@coll.mpg.de
+#SBATCH --mail-user=<userid>@coll.mpg.de
+#
 # Wall clock limit:
-#SBATCH --time=24:00:00
+#SBATCH --time 24:00:00
 
 ##export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}
 # For pinning threads correctly:
@@ -36,10 +36,10 @@ module load anaconda/3/2020.02
 module load tensorflow/cpu/2.5.0
 module load scikit-learn/0.24.1
 
+typ=performer
 
 
 
-
-python preprocess.py -p ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31 -f workset_${typ}.ndjson -s ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/preprocessed -c "(1,5)" -w "(1,2)" -t "(1,3)" -d "(1,5)" -o "(1,3)" --workset=workset --part=${typ} --rerun --both --asis --spacy --encase_list emoji emoticon
+srun python preprocess.py -p ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31 -f workset_${typ}.ndjson -s ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/preprocessed -c "(1,5)" -w "(1,2)" -t "(1,3)" -d "(1,5)" -o "(1,3)" --workset=workset --part=${typ} --rerun --both --asis --spacy --encase_list emoji emoticon
 
 echo "job finished"
\ No newline at end of file
diff --git a/Scripts/Preprocessing/preprocess_sports.cmd b/Scripts/Preprocessing/preprocess_sports.sh
similarity index 70%
rename from Scripts/Preprocessing/preprocess_sports.cmd
rename to Scripts/Preprocessing/preprocess_sports.sh
index 161765fa..f7825455 100644
--- a/Scripts/Preprocessing/preprocess_sports.cmd
+++ b/Scripts/Preprocessing/preprocess_sports.sh
@@ -1,9 +1,9 @@
 #!/bin/bash -l
 
-typ=sports
+
 # Standard output and error:
 #SBATCH -o ./../../jobscripts/out/preprocess_${typ}_out.%j
-#SBATCH -e ./../../jobscripts/out/preprocess_${typ}_err_part_2.%j
+#SBATCH -e ./../../jobscripts/out/preprocess_${typ}_err.%j
 # Initial working directory:
 #SBATCH -D /ptmp/mschuber/PAN/Scripts/Preprocessing
 # Job Name:
@@ -37,9 +37,9 @@ module load tensorflow/cpu/2.5.0
 module load scikit-learn/0.24.1
 
 
+typ=sports
 
 
-
-python preprocess.py -p ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31 -f workset_${typ}.ndjson -s ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/preprocessed -c "(1,5)" -w "(1,2)" -t "(1,3)" -d "(1,5)" -o "(1,3)" --workset=workset --part=${typ} --rerun --both --asis --spacy --encase_list emoji emoticon
+srun python preprocess.py -p ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31 -f workset_${typ}.ndjson -s ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/preprocessed -c "(1,5)" -w "(1,2)" -t "(1,3)" -d "(1,5)" -o "(1,3)" --workset=workset --part=${typ} --rerun --both --asis --spacy --encase_list emoji emoticon
 
 echo "job finished"
\ No newline at end of file
-- 
GitLab