From e30e3fec7a7d72dc54238ac693dba9a7a3b82b8f Mon Sep 17 00:00:00 2001 From: "Marcel H. Schubert" <schubert@coll.mpg.de> Date: Mon, 28 Jun 2021 16:18:51 +0200 Subject: [PATCH] fixed shell script; added 80 core processing --- Scripts/Preprocessing/preprocess.py | 9 +++++++-- .../{preprocess_all.cmd => preprocess_all.sh} | 2 +- ...{preprocess_creator.cmd => preprocess_creator.sh} | 8 ++++---- ...{preprocess_manager.cmd => preprocess_manager.sh} | 8 ++++---- ...process_performer.cmd => preprocess_performer.sh} | 12 ++++++------ .../{preprocess_sports.cmd => preprocess_sports.sh} | 8 ++++---- 6 files changed, 26 insertions(+), 21 deletions(-) rename Scripts/Preprocessing/{preprocess_all.cmd => preprocess_all.sh} (89%) rename Scripts/Preprocessing/{preprocess_creator.cmd => preprocess_creator.sh} (70%) rename Scripts/Preprocessing/{preprocess_manager.cmd => preprocess_manager.sh} (70%) rename Scripts/Preprocessing/{preprocess_performer.cmd => preprocess_performer.sh} (65%) rename Scripts/Preprocessing/{preprocess_sports.cmd => preprocess_sports.sh} (70%) diff --git a/Scripts/Preprocessing/preprocess.py b/Scripts/Preprocessing/preprocess.py index 033179ff..f7fab336 100644 --- a/Scripts/Preprocessing/preprocess.py +++ b/Scripts/Preprocessing/preprocess.py @@ -843,8 +843,13 @@ def _main(args): manager = mp.Manager() #create a two queues to split work into two parts q = [manager.Queue(), manager.Queue()] - - pool = mp.Pool(4, maxtasksperchild=1) + if not args['test']: + ncpus = mp.cpu_count() + if ncpus < 80: + print('failed to get 80 cpus - only got {}'.format(ncpus)) + else: + ncpus = 4 + pool = mp.Pool(ncpus, maxtasksperchild=1) #pool = mp.Pool(mp.cpu_count(), maxtasksperchild=1) print('create listener for saving of data...') sys.stdout.flush() diff --git a/Scripts/Preprocessing/preprocess_all.cmd b/Scripts/Preprocessing/preprocess_all.sh similarity index 89% rename from Scripts/Preprocessing/preprocess_all.cmd rename to Scripts/Preprocessing/preprocess_all.sh index 
193462bd..360ec2d0 100644 --- a/Scripts/Preprocessing/preprocess_all.cmd +++ b/Scripts/Preprocessing/preprocess_all.sh @@ -41,7 +41,7 @@ module load scikit-learn/0.24.1 names=(creator performer sports) for i in ${names[@]}; do # Run the program: - srun -N 1 -n 1 python preprocess.py -p ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31 -f workset_$i.ndjson -s ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/preprocessed -c (1,5) -w (1,2) -t (1,3) -d (1,5) -o (1,3) --workset=workset --part=$i --rerun --both --asis --spacy --encase_list emoji emoticon + srun -N 1 -n 1 python preprocess.py -p ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31 -f workset_$i.ndjson -s ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/preprocessed -c "(1,5)" -w "(1,2)" -t "(1,3)" -d "(1,5)" -o "(1,3)" --workset=workset --part=$i --rerun --both --asis --spacy --encase_list emoji emoticon done wait echo "job finished" diff --git a/Scripts/Preprocessing/preprocess_creator.cmd b/Scripts/Preprocessing/preprocess_creator.sh similarity index 70% rename from Scripts/Preprocessing/preprocess_creator.cmd rename to Scripts/Preprocessing/preprocess_creator.sh index 3b488e31..3912c3ad 100644 --- a/Scripts/Preprocessing/preprocess_creator.cmd +++ b/Scripts/Preprocessing/preprocess_creator.sh @@ -1,9 +1,9 @@ #!/bin/bash -l -typ=creator + # Standard output and error: #SBATCH -o ./../../jobscripts/out/preprocess_${typ}_out.%j -#SBATCH -e ./../../jobscripts/out/preprocess_${typ}_err_part_2.%j +#SBATCH -e ./../../jobscripts/out/preprocess_${typ}.%j # Initial working directory: #SBATCH -D /ptmp/mschuber/PAN/Scripts/Preprocessing # Job Name: @@ -36,10 +36,10 @@ module load anaconda/3/2020.02 module load tensorflow/cpu/2.5.0 module load scikit-learn/0.24.1 +typ=creator - -python preprocess.py -p ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31 -f workset_${typ}.ndjson -s 
../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/preprocessed -c "(1,5)" -w "(1,2)" -t "(1,3)" -d "(1,5)" -o "(1,3)" --workset=workset --part=${typ} --rerun --both --asis --spacy --encase_list emoji emoticon +srun python preprocess.py -p ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31 -f workset_${typ}.ndjson -s ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/preprocessed -c "(1,5)" -w "(1,2)" -t "(1,3)" -d "(1,5)" -o "(1,3)" --workset=workset --part=${typ} --rerun --both --asis --spacy --encase_list emoji emoticon echo "job finished" \ No newline at end of file diff --git a/Scripts/Preprocessing/preprocess_manager.cmd b/Scripts/Preprocessing/preprocess_manager.sh similarity index 70% rename from Scripts/Preprocessing/preprocess_manager.cmd rename to Scripts/Preprocessing/preprocess_manager.sh index 33cb63cc..63d3b099 100644 --- a/Scripts/Preprocessing/preprocess_manager.cmd +++ b/Scripts/Preprocessing/preprocess_manager.sh @@ -1,9 +1,9 @@ #!/bin/bash -l -typ=manager + # Standard output and error: #SBATCH -o ./../../jobscripts/out/preprocess_${typ}_out.%j -#SBATCH -e ./../../jobscripts/out/preprocess_${typ}_err_part_2.%j +#SBATCH -e ./../../jobscripts/out/preprocess_${typ}_err.%j # Initial working directory: #SBATCH -D /ptmp/mschuber/PAN/Scripts/Preprocessing # Job Name: @@ -38,8 +38,8 @@ module load scikit-learn/0.24.1 +typ=manager - -python preprocess.py -p ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31 -f workset_${typ}.ndjson -s ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/preprocessed -c "(1,5)" -w "(1,2)" -t "(1,3)" -d "(1,5)" -o "(1,3)" --workset=workset --part=${typ} --rerun --both --asis --spacy --encase_list emoji emoticon +srun python preprocess.py -p ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31 -f workset_${typ}.ndjson -s ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/preprocessed -c "(1,5)" -w "(1,2)" -t "(1,3)" -d "(1,5)" -o 
"(1,3)" --workset=workset --part=${typ} --rerun --both --asis --spacy --encase_list emoji emoticon echo "job finished" \ No newline at end of file diff --git a/Scripts/Preprocessing/preprocess_performer.cmd b/Scripts/Preprocessing/preprocess_performer.sh similarity index 65% rename from Scripts/Preprocessing/preprocess_performer.cmd rename to Scripts/Preprocessing/preprocess_performer.sh index 204f99b0..ae375ae0 100644 --- a/Scripts/Preprocessing/preprocess_performer.cmd +++ b/Scripts/Preprocessing/preprocess_performer.sh @@ -1,9 +1,8 @@ #!/bin/bash -l -typ=performer # Standard output and error: #SBATCH -o ./../../jobscripts/out/preprocess_${typ}_out.%j -#SBATCH -e ./../../jobscripts/out/preprocess_${typ}_err_part_2.%j +#SBATCH -e ./../../jobscripts/out/preprocess_${typ}_err.%j # Initial working directory: #SBATCH -D /ptmp/mschuber/PAN/Scripts/Preprocessing # Job Name: @@ -20,9 +19,10 @@ typ=performer # #SBATCH --mem=180000 #SBATCH --mail-type=none -#SBATCH --mail-user=schubert@coll.mpg.de +#SBATCH --mail-user=<userid>@coll.mpg.de +# # Wall clock limit: -#SBATCH --time=24:00:00 +#SBATCH --time 24:00:00 ##export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK} # For pinning threads correctly: @@ -36,10 +36,10 @@ module load anaconda/3/2020.02 module load tensorflow/cpu/2.5.0 module load scikit-learn/0.24.1 +typ=performer - -python preprocess.py -p ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31 -f workset_${typ}.ndjson -s ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/preprocessed -c "(1,5)" -w "(1,2)" -t "(1,3)" -d "(1,5)" -o "(1,3)" --workset=workset --part=${typ} --rerun --both --asis --spacy --encase_list emoji emoticon +srun python preprocess.py -p ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31 -f workset_${typ}.ndjson -s ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/preprocessed -c "(1,5)" -w "(1,2)" -t "(1,3)" -d "(1,5)" -o "(1,3)" --workset=workset --part=${typ} --rerun --both --asis --spacy 
--encase_list emoji emoticon echo "job finished" \ No newline at end of file diff --git a/Scripts/Preprocessing/preprocess_sports.cmd b/Scripts/Preprocessing/preprocess_sports.sh similarity index 70% rename from Scripts/Preprocessing/preprocess_sports.cmd rename to Scripts/Preprocessing/preprocess_sports.sh index 161765fa..f7825455 100644 --- a/Scripts/Preprocessing/preprocess_sports.cmd +++ b/Scripts/Preprocessing/preprocess_sports.sh @@ -1,9 +1,9 @@ #!/bin/bash -l -typ=sports + # Standard output and error: #SBATCH -o ./../../jobscripts/out/preprocess_${typ}_out.%j -#SBATCH -e ./../../jobscripts/out/preprocess_${typ}_err_part_2.%j +#SBATCH -e ./../../jobscripts/out/preprocess_${typ}_err.%j # Initial working directory: #SBATCH -D /ptmp/mschuber/PAN/Scripts/Preprocessing # Job Name: @@ -37,9 +37,9 @@ module load tensorflow/cpu/2.5.0 module load scikit-learn/0.24.1 +typ=sports - -python preprocess.py -p ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31 -f workset_${typ}.ndjson -s ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/preprocessed -c "(1,5)" -w "(1,2)" -t "(1,3)" -d "(1,5)" -o "(1,3)" --workset=workset --part=${typ} --rerun --both --asis --spacy --encase_list emoji emoticon +srun python preprocess.py -p ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31 -f workset_${typ}.ndjson -s ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/preprocessed -c "(1,5)" -w "(1,2)" -t "(1,3)" -d "(1,5)" -o "(1,3)" --workset=workset --part=${typ} --rerun --both --asis --spacy --encase_list emoji emoticon echo "job finished" \ No newline at end of file -- GitLab