Commit 6ee4eea3 authored by Marcel Henrik Schubert
Browse files

stat script

parent 7fb358de
#!/bin/bash -l
# SLURM batch job "stat": runs Scripts/stats.py once, as a single task
# with two CPUs, and exits with the status of that run.
#
# Standard output and error:
# #SBATCH --open-mode=truncate
#SBATCH -o ./out/stat.out
#SBATCH -e ./out/stat.err
# Initial working directory:
#SBATCH -D ./
# Job Name:
#SBATCH -J stat
# Queue:
#SBATCH --partition=small
# Number of nodes and MPI tasks per node:
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
# Enable Hyperthreading:
##SBATCH --ntasks-per-core=2
# for OpenMP:
#SBATCH --cpus-per-task=2
#SBATCH --mail-type=none
#SBATCH --mail-user=schubert@coll.mpg.de
# Wall clock limit:
#SBATCH --time=04:00:00

# One OpenMP thread per CPU that SLURM granted to the task.
export OMP_NUM_THREADS="${SLURM_CPUS_PER_TASK}"
# For pinning threads correctly:
export OMP_PLACES=cores

module load gcc/8
module load anaconda/3/5.1

# Run the program and remember how it ended. Without this the trailing
# echo would be the last command, the script would always exit 0 and a
# crashed run would still be reported as a successful job.
srun python /draco/u/mschuber/PAN/attributionfeatures/Scripts/stats.py
status=$?
if (( status != 0 )); then
  echo "job failed with exit status ${status}" >&2
  exit "${status}"
fi
echo "job finished"
\ No newline at end of file
#!/usr/bin/env python3
import pandas as pd
import numpy as np
import ndjson
import jsonlines
import json
import os
import sys
import random as rd
import json
import re, regex
import gc
from statistics import median, mean, stdev
import math
import time
import random
from joblib import dump, load
# Root directory of the data set; the commented line is a relative alternative.
datapath = '/draco/ptmp/mschuber/PAN/Data/pan19-celebrity-profiling-training-dataset-2019-01-31/'
#datapath = '../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/'

# Pieces of the input file name:
#   datapath + subset + '/' + filebeg + grams[0] + filend
subset = 'workset'
filebeg = 'workset_preprocessed_'
filend = '.ndjson'
grams = ['singlegram']

# author_id -> per-author record, filled while the input file is read.
tweetInfo = {}

# Group names. The order matters: dicts keep insertion order, and that order
# later determines the order of the collected sub keys and of the output.
_gender_names = ('male', 'female')
_phase_names = ('child_21', 'young_adult_35', 'adult_50', 'old_adult_65', 'retiree')

# One empty list per group; each list later receives one entry per author.
gender = {name: [] for name in _gender_names}
life_phase = {name: [] for name in _phase_names}
subgroups = {g + '_' + p: [] for g in _gender_names for p in _phase_names}
# Read the preprocessed ndjson file line by line and build one record per
# author in tweetInfo: number of lines seen, the length value of each line,
# and the author's gender / life phase / combined subgroup label.
with open(datapath+subset+'/'+filebeg+grams[0]+filend, 'r', encoding = 'utf-8') as f:
    # Exclusive upper age bound of every life phase except the last one.
    phase_limits = ((22, 'child_21'), (36, 'young_adult_35'), (51, 'adult_50'), (66, 'old_adult_65'))
    i = 0
    for line in f:
        record = ndjson.loads(line)[0]
        author = record['author_id']
        years = 2019 - record['birthyear']
        # NOTE(review): the "- 2" presumably discounts two boundary markers
        # inside the 'singlegram' field - confirm against the preprocessing.
        length = len(record['singlegram'])-2
        known = tweetInfo.get(author)
        if known is not None:
            known['amount'] += 1
            known['lengths'].append(length)
        else:
            # First line of this author: derive the life phase from the age.
            for limit, label in phase_limits:
                if years < limit:
                    phase = label
                    break
            else:
                phase = 'retiree'
            tweetInfo[author] = {
                'amount': 1,
                'lengths': [length],
                'life_phase': phase,
                'gender': record['gender'],
                'subgroup': record['gender']+'_'+phase,
            }
        # Stop once the zero-based line counter exceeds 4000, so at most
        # 4002 lines are consumed.
        # NOTE(review): looks like a leftover debugging cap - confirm.
        if i > 4000:
            break
        i += 1
# Flatten the per-author records into the lists the statistics are computed on.
amounts = []   # number of tweets per author
len_comp = []  # every single length value, across all authors
len_avg = []   # mean length per author
std_avg = []   # standard deviation of the lengths per author
for info in tweetInfo.values():
    lengths = info['lengths']
    amounts.append(info['amount'])
    len_comp.extend(lengths)
    len_avg.append(mean(lengths))
    # stdev() raises StatisticsError for fewer than two values, so authors
    # with a single tweet are left out here, exactly as the per-group
    # statistics do.
    if len(lengths) > 1:
        std_avg.append(stdev(lengths))
    # Each group container receives the author's whole list as one entry.
    gender[info['gender']].append(lengths)
    subgroups[info['subgroup']].append(lengths)
    life_phase[info['life_phase']].append(lengths)
# Corpus-wide figures. The key order is significant: it is the order in
# which the values appear in the written output.
stats = {
    # tweets per author
    'numTweets_avg_overall': mean(amounts),
    'numTweets_std_overall': stdev(amounts),
    # all length values pooled together
    'lenTweets_avg_overall': mean(len_comp),
    'lenTweets_std_overall': stdev(len_comp),
    # distribution of the per-author mean lengths
    'lenTweets_avg_avg': mean(len_avg),
    'lenTweets_std_avg': stdev(len_avg),
    # distribution of the per-author standard deviations
    'lenTweets_avg_std': mean(std_avg),
    'lenTweets_std_std': stdev(std_avg),
}
# At this point the dict holds exactly the eight statistic names, in order.
stat_names = list(stats)
# Containers that the per-group statistics are written into afterwards.
for section in ('gender', 'subgroups', 'life_phase'):
    stats[section] = {}
stats['subKeys'] = []
stats['statKeys'] = stat_names
def _mean_or_none(values):
    """Return mean(values), or None when values is empty."""
    return mean(values) if values else None


def _stdev_or_none(values):
    """Return stdev(values), or None when there are fewer than two values."""
    return stdev(values) if len(values) > 1 else None


def _summarize_group(author_lengths):
    """Compute the statistics of one group.

    author_lengths is a non-empty list with one non-empty list of length
    values per author. The returned dict uses the same eight names as the
    corpus-wide statistics; a value is None when there are too few data
    points to compute it.
    """
    tweet_counts = [len(lengths) for lengths in author_lengths]
    pooled = [value for lengths in author_lengths for value in lengths]
    author_means = [mean(lengths) for lengths in author_lengths]
    # Authors with a single tweet have no standard deviation.
    author_stds = [stdev(lengths) for lengths in author_lengths if len(lengths) > 1]

    summary = {
        'numTweets_avg_overall': mean(tweet_counts),
        'numTweets_std_overall': _stdev_or_none(tweet_counts),
        'lenTweets_avg_overall': mean(pooled),
        'lenTweets_std_overall': _stdev_or_none(pooled),
        'lenTweets_avg_avg': mean(author_means),
        'lenTweets_std_avg': _stdev_or_none(author_means),
        # May be empty when every author of the group has only one tweet.
        'lenTweets_avg_std': _mean_or_none(author_stds),
        'lenTweets_std_std': _stdev_or_none(author_stds),
    }
    # Earlier versions stored these two values under names that do not
    # appear in stats['statKeys']; the old names are kept as aliases so
    # existing readers of the output keep working.
    summary['len_avg_overall'] = summary['lenTweets_avg_overall']
    summary['lenTweets_std_'] = summary['lenTweets_std_overall']
    return summary


# Fill stats['gender'], stats['subgroups'] and stats['life_phase'] (in that
# order) and record the name of every non-empty group in stats['subKeys'].
for section, groups in (('gender', gender), ('subgroups', subgroups), ('life_phase', life_phase)):
    for name, author_lengths in groups.items():
        if not author_lengths:
            # No author fell into this group; it gets no entry at all.
            continue
        stats['subKeys'].append(str(name))
        stats[section][name] = _summarize_group(author_lengths)
# Store the statistics next to the input data twice: once through joblib's
# dump() and once as indented JSON.
out_base = datapath+subset
dump(stats, out_base+'/statistics.jlib')
with open(out_base+'/statistics.json', 'w', encoding ='utf-8') as out:
    json.dump(stats, out, indent = 4)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment