Commit 7068db75 authored by Marcel Henrik Schubert

added count vectorizer svm

parent f29f3be9
#!/bin/bash -l
# Standard output and error:
# #SBATCH --open-mode=truncate
#SBATCH -o ./out/svm_count_1000.out
#SBATCH -e ./out/svm_count_1000.err
# Initial working directory:
#SBATCH -D ./
# Job Name:
#SBATCH -J svm_count_1000
# Queue:
#SBATCH --partition=general
# Number of nodes and MPI tasks per node:
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
# Enable Hyperthreading:
#SBATCH --ntasks-per-core=2
# for OpenMP:
#SBATCH --cpus-per-task=64
#SBATCH --mail-type=none
#SBATCH --mail-user=schubert@coll.mpg.de
# Wall clock limit:
#SBATCH --time=24:00:00
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
# For pinning threads correctly:
export OMP_PLACES=threads
export SLURM_HINT=multithread
module load gcc/8
module load anaconda/3/5.1
module load scikit-learn/0.19.1
# Run the program:
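# Arguments, in the order svm_count.py reads them from sys.argv: subset, n-gram key, copies, sample size, subfolder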
srun python /draco/u/mschuber/PAN/attributionfeatures/Scripts/svm_count.py stratified_subsample bigram 4 1000 org
echo "job finished"
\ No newline at end of file
#!/bin/bash -l
# Standard output and error:
# #SBATCH --open-mode=truncate
#SBATCH -o ./out/svm_count_2000.out
#SBATCH -e ./out/svm_count_2000.err
# Initial working directory:
#SBATCH -D ./
# Job Name:
#SBATCH -J svm_count_2000
# Queue:
#SBATCH --partition=general
# Number of nodes and MPI tasks per node:
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
# Enable Hyperthreading:
#SBATCH --ntasks-per-core=2
# for OpenMP:
#SBATCH --cpus-per-task=64
#SBATCH --mail-type=none
#SBATCH --mail-user=schubert@coll.mpg.de
# Wall clock limit:
#SBATCH --time=24:00:00
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
# For pinning threads correctly:
export OMP_PLACES=threads
export SLURM_HINT=multithread
module load gcc/8
module load anaconda/3/5.1
module load scikit-learn/0.19.1
# Run the program:
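# Arguments, in the order svm_count.py reads them from sys.argv: subset, n-gram key, copies, sample size, subfolder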
srun python /draco/u/mschuber/PAN/attributionfeatures/Scripts/svm_count.py stratified_subsample bigram 3 2000 org
echo "job finished"
\ No newline at end of file
#!/bin/bash -l
# Standard output and error:
# #SBATCH --open-mode=truncate
#SBATCH -o ./out/svm_count_200.out
#SBATCH -e ./out/svm_count_200.err
# Initial working directory:
#SBATCH -D ./
# Job Name:
#SBATCH -J svm_count_200
# Queue:
#SBATCH --partition=short
# Number of nodes and MPI tasks per node:
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
# Enable Hyperthreading:
#SBATCH --ntasks-per-core=2
# for OpenMP:
#SBATCH --cpus-per-task=64
#SBATCH --mail-type=none
#SBATCH --mail-user=schubert@coll.mpg.de
# Wall clock limit:
#SBATCH --time=04:00:00
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
# For pinning threads correctly:
export OMP_PLACES=threads
export SLURM_HINT=multithread
module load gcc/8
module load anaconda/3/5.1
module load scikit-learn/0.19.1
# Run the program:
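# Arguments, in the order svm_count.py reads them from sys.argv: subset, n-gram key, copies, sample size, subfolder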
srun python /draco/u/mschuber/PAN/attributionfeatures/Scripts/svm_count.py stratified_subsample bigram 10 200 org
echo "job finished"
\ No newline at end of file
#!/bin/bash -l
# Standard output and error:
# #SBATCH --open-mode=truncate
#SBATCH -o ./out/svm_count_500.out
#SBATCH -e ./out/svm_count_500.err
# Initial working directory:
#SBATCH -D ./
# Job Name:
#SBATCH -J svm_count_500
# Queue:
#SBATCH --partition=general
# Number of nodes and MPI tasks per node:
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
# Enable Hyperthreading:
#SBATCH --ntasks-per-core=2
# for OpenMP:
#SBATCH --cpus-per-task=64
#SBATCH --mail-type=none
#SBATCH --mail-user=schubert@coll.mpg.de
# Wall clock limit:
#SBATCH --time=24:00:00
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
# For pinning threads correctly:
export OMP_PLACES=threads
export SLURM_HINT=multithread
module load gcc/8
module load anaconda/3/5.1
module load scikit-learn/0.19.1
# Run the program:
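# Arguments, in the order svm_count.py reads them from sys.argv: subset, n-gram key, copies, sample size, subfolder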
srun python /draco/u/mschuber/PAN/attributionfeatures/Scripts/svm_count.py stratified_subsample bigram 4 500 org
echo "job finished"
\ No newline at end of file
#!/usr/bin/env python3
import pandas as pd
import numpy as np
import ndjson
import jsonlines
import json
import pickle
import os
import sys
import random as rd
import multiprocessing as mp
import re, regex
import gc
import psutil
import time
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import PassiveAggressiveClassifier, RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
import scipy as sp
from joblib import dump, load
import collections
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
###dask
#from dask_ml.model_selection import GridSearchCV
#from sklearn.externals.joblib import parallel_backend
#from dklearn.grid_search import GridSearchCV
# Parameters for train-test split
test_size = 0.1
random_state = 123456
direct = '/draco/ptmp/mschuber/PAN/Data/pan19-celebrity-profiling-training-dataset-2019-01-31/'
#direct = '../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/workset/'
savedir = 'ml/'
subset = sys.argv[1]
grams = sys.argv[2]
size = sys.argv[4]
subfolder = sys.argv[5]
copies = int(sys.argv[3])
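# Command-line interface (mirrors the SLURM scripts above), e.g.:
#   python svm_count.py stratified_subsample bigram 4 1000 org
# Note that 'copies' (sys.argv[3]) is parsed but not used anywhere else in this script.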
filen = '_preprocessed_'+grams+'_'+size+'_count.ndjson'
cores = mp.cpu_count()
#cores = 16
subdir = subset +'/'+ subfolder + '/' +size
filebeg = subset
clf = SGDClassifier(penalty = 'elasticnet', l1_ratio = 0.5, n_jobs = cores, max_iter = 1000, tol = 1e-3)
#classifiernames = ["PA", "MNB", "SVM", "LOG", "Ridge", "GBC"]
classifiernames = ["SVM"]
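# With its default hinge loss, SGDClassifier fits a linear SVM, which is why the only
# classifier name listed above is "SVM"; penalty='elasticnet' with l1_ratio=0.5 mixes
# L1 and L2 regularisation in equal parts.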
def make_file_paths(dirname):
    os.makedirs(dirname, exist_ok=True)
def read_data(fname, size=np.NaN):
    text = []
    author_id = []
    gender = []
    indices = []
    age = []
    tweet_length = []
    i = 0
    if np.isnan(size):
        print('size is nan...look at single lines only')
        with open(fname, 'r', encoding='utf-8') as f:
            for line in f:
                dic = ndjson.loads(line)[0]
                text.append(dic[grams])
                tweet_length.append(round(len(text[-1])/280, 4))
                author_id.append(dic['author_id'])
                gender.append(dic['gender'])
                age.append(dic['birthyear'])
                indices.append(i)
                i += 1
    else:
        sys.exit('Chunking not yet implemented')
    return text, author_id, indices, gender, age, i, tweet_length
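# Note: tweet_length stores len(dic[grams])/280, the token count scaled by Twitter's
# 280-character limit, presumably as a rough relative-length feature; it is appended to
# the count matrix as an extra column further below.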
##function for identity since we already made ngrams and do not want any further tokenization or preprocessing
def identity_tokenizer(text):
    return text
def remove_one_tweeter(author_id, indices):
    count = collections.Counter(author_id).most_common()
    count.reverse()
    removeList = []
    for el in count:
        if el[1] < 2:
            removeList.append(el[0])
    c = 0
    for i in range(0, len(indices)):
        if author_id[indices[i-c]-c] in removeList:
            tmp = author_id.pop(indices[i-c]-c)
            tmp = indices.pop(i-c)
            c += 1  # count removals so the -c offsets above stay aligned after each pop
    print('removed {} one-tweeters'.format(len(removeList)))
    sys.stdout.flush()
    return author_id, indices
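# Authors with a single tweet are removed because the stratified train_test_split below
# requires at least two samples per class (author).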
# Turn text data into lists of integers from dictionary
def text_to_numbers(tokenlist, word_dict):
    # Initialize the returned data
    data = []
    for sentence in tokenlist:
        sentence_data = []
        # For each word, either use selected index or rare word index
        for word in sentence.split():
            if word in word_dict:
                word_ix = word_dict[word]
            else:
                word_ix = 0
            sentence_data.append(word_ix)
        data.append(sentence_data)
    return data
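# Note: text_to_numbers is not called anywhere in this script; it appears to be left over
# from an earlier variant of the pipeline.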
make_file_paths(direct+subdir+'/'+savedir)
make_file_paths(direct+subdir+'/'+'split_data')
#if True:
if not os.path.exists(direct+subdir+'/'+'split_data'+'/{}_{}_{}_test.json'.format(subset, size, grams)):
    print('read data')
    sys.stdout.flush()
    text, author_id, indices, gender, age, line_num, tweet_length = read_data(direct+subdir+'/'+filebeg+filen)
    print('The authors are in counter mode: {}'.format(list(collections.Counter(author_id).most_common())[-1]))
    if subfolder != 'min_tweet_1000':
        strat, indices = remove_one_tweeter(author_id, indices)
        print('The authors after one_tweeter removal are in counter mode: {}'.format(list(collections.Counter(strat).most_common())[-1]))
    else:
        strat = author_id
    print('make train-test split')
    sys.stdout.flush()
    train, test = train_test_split(indices, random_state=random_state, stratify=strat, test_size=test_size)
    train_set = []
    test_set = []
    y_train_author = []
    y_test_author = []
    y_train_gender = []
    y_test_gender = []
    y_train_age = []
    y_test_age = []
    tweet_test = []
    tweet_train = []
    print('subset the data accordingly')
    sys.stdout.flush()
    #### make test_ident dic for easy identification
    for el in test:
        test_set.append(text[el])
        y_test_author.append(author_id[el])
        y_test_gender.append(gender[el])
        y_test_age.append(age[el])
        tweet_test.append(tweet_length[el])
    for el in train:
        train_set.append(text[el])
        y_train_author.append(author_id[el])
        y_train_gender.append(gender[el])
        y_train_age.append(age[el])
        tweet_train.append(tweet_length[el])
    del text
    del author_id
    del gender
    del age
    del indices
    del tweet_length
    gc.collect()
    print('make vocab')
    sys.stdout.flush()
    words = [token for sublist in train_set for token in sublist]
    vocab = {}
    count = []
    count.extend(collections.Counter(words).most_common((2**18)-1))
    i = 0
    for token, tcount in count:
        vocab[token] = i
        i += 1
    del count
    del words
    with open(direct+subdir+'/'+'split_data'+'/{}_{}_{}_vocab.json'.format(subset, size, grams), 'w') as f:
        json.dump(vocab, f)
    gc.collect()
    print('save the data')
    sys.stdout.flush()
    ## save data splits
    with open(direct+subdir+'/'+'split_data'+'/{}_{}_{}_vocab.json'.format(subset, size, grams), 'w') as f:
        json.dump(vocab, f)
    with open(direct+subdir+'/'+'split_data'+'/{}_{}_{}_train.json'.format(subset, size, grams), 'w', encoding='utf-8') as f:
        json.dump(train_set, f)
    with open(direct+subdir+'/'+'split_data'+'/{}_{}_{}_test.json'.format(subset, size, grams), 'w', encoding='utf-8') as f:
        json.dump(test_set, f)
    with open(direct+subdir+'/'+'split_data'+'/{}_{}_author_train.json'.format(subset, size), 'w', encoding='utf-8') as f:
        json.dump(y_train_author, f)
    with open(direct+subdir+'/'+'split_data'+'/{}_{}_author_test.json'.format(subset, size), 'w', encoding='utf-8') as f:
        json.dump(y_test_author, f)
    with open(direct+subdir+'/'+'split_data'+'/{}_{}_gender_train.json'.format(subset, size), 'w', encoding='utf-8') as f:
        json.dump(y_train_gender, f)
    with open(direct+subdir+'/'+'split_data'+'/{}_{}_gender_test.json'.format(subset, size), 'w', encoding='utf-8') as f:
        json.dump(y_test_gender, f)
    with open(direct+subdir+'/'+'split_data'+'/{}_{}_age_train.json'.format(subset, size), 'w', encoding='utf-8') as f:
        json.dump(y_train_age, f)
    with open(direct+subdir+'/'+'split_data'+'/{}_{}_age_test.json'.format(subset, size), 'w', encoding='utf-8') as f:
        json.dump(y_test_age, f)
    with open(direct+subdir+'/'+'split_data'+'/{}_{}_length_train.json'.format(subset, size), 'w', encoding='utf-8') as f:
        json.dump(tweet_train, f)
    with open(direct+subdir+'/'+'split_data'+'/{}_{}_length_test.json'.format(subset, size), 'w', encoding='utf-8') as f:
        json.dump(tweet_test, f)
else:
    print('load data')
    sys.stdout.flush()
    train_set = json.load(open(direct+subdir+'/'+'split_data'+'/{}_{}_{}_train.json'.format(subset, size, grams)))
    test_set = json.load(open(direct+subdir+'/'+'split_data'+'/{}_{}_{}_test.json'.format(subset, size, grams)))
    y_train_author = json.load(open(direct+subdir+'/'+'split_data'+'/{}_{}_author_train.json'.format(subset, size)))
    y_test_author = json.load(open(direct+subdir+'/'+'split_data'+'/{}_{}_author_test.json'.format(subset, size)))
    y_train_gender = json.load(open(direct+subdir+'/'+'split_data'+'/{}_{}_gender_train.json'.format(subset, size)))
    y_test_gender = json.load(open(direct+subdir+'/'+'split_data'+'/{}_{}_gender_test.json'.format(subset, size)))
    y_train_age = json.load(open(direct+subdir+'/'+'split_data'+'/{}_{}_age_train.json'.format(subset, size)))
    y_test_age = json.load(open(direct+subdir+'/'+'split_data'+'/{}_{}_age_test.json'.format(subset, size)))
    tweet_train = json.load(open(direct+subdir+'/'+'split_data'+'/{}_{}_length_train.json'.format(subset, size)))
    tweet_test = json.load(open(direct+subdir+'/'+'split_data'+'/{}_{}_length_test.json'.format(subset, size)))
    if not os.path.exists(direct+subdir+'/'+'split_data'+'/{}_{}_{}_vocab.json'.format(subset, size, grams)):
        print('make vocab')
        sys.stdout.flush()
        words = [token for sublist in train_set for token in sublist]
        vocab = {}
        count = []
        count.extend(collections.Counter(words).most_common((2**18)-1))
        i = 0
        for token, tcount in count:
            vocab[token] = i
            i += 1
        with open(direct+subdir+'/'+'split_data'+'/{}_{}_{}_vocab.json'.format(subset, size, grams), 'w') as f:
            json.dump(vocab, f)
        del count
        del words
    else:
        vocab = json.load(open(direct+subdir+'/'+'split_data'+'/{}_{}_{}_vocab.json'.format(subset, size, grams)))
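# The vocabulary keeps at most 2**18 - 1 = 262,143 of the most frequent training n-grams;
# rarer tokens are simply ignored by the fixed-vocabulary CountVectorizer below.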
gc.collect()
y_list = [['author', y_train_author], ['gender', y_train_gender], ['age', y_train_age]]
if not os.path.exists(direct+subdir+'/'+'split_data'+'/{}_{}_sparse_train_count.jlib'.format(subset, size)):
    vectorizer = CountVectorizer(tokenizer=identity_tokenizer, vocabulary=vocab, lowercase=False)
    train_set = vectorizer.transform(train_set)
    #tf = TfidfTransformer(use_idf = False)
    #train_set = tf.fit_transform(train_set)
    test_set = vectorizer.transform(test_set)
    #test_set = tf.transform(test_set)
    col_train = np.array(tweet_train).reshape(-1, 1)
    col_test = np.array(tweet_test).reshape(-1, 1)
    train_set = sp.sparse.hstack((train_set, col_train))
    test_set = sp.sparse.hstack((test_set, col_test))
    dump(train_set, direct+subdir+'/'+'split_data'+'/{}_{}_sparse_train_count.jlib'.format(subset, size))
    dump(test_set, direct+subdir+'/'+'split_data'+'/{}_{}_sparse_test_count.jlib'.format(subset, size))
else:
    train_set = load(direct+subdir+'/'+'split_data'+'/{}_{}_sparse_train_count.jlib'.format(subset, size))
    test_set = load(direct+subdir+'/'+'split_data'+'/{}_{}_sparse_test_count.jlib'.format(subset, size))
    # vectorizer = CountVectorizer(tokenizer = identity_tokenizer, vocabulary = vocab, lowercase=False)
    # train_set = vectorizer.transform(train_set)
    # test_set = vectorizer.transform(test_set)
    # col_train = np.array(tweet_train).reshape(-1,1)
    # col_test = np.array(tweet_test).reshape(-1,1)
    # train_set = sp.sparse.hstack((train_set, col_train))
    # test_set = sp.sparse.hstack((test_set, col_test))
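# At this point train_set/test_set are sparse count matrices with the tweet-length value
# appended as a final column (scipy's hstack keeps the result sparse); SGDClassifier and
# cross_validate below accept this sparse input directly.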
gc.collect()
for classifier in classifiernames:
    #print('begin grid search...')
    sys.stdout.flush()
    for y in y_list:
        ## encode labels
        print('...for category {}'.format(y[0]))
        sys.stdout.flush()
        if not os.path.exists(direct+subdir+'/'+savedir+filebeg+'_{}_{}_encoder.jlib'.format(y[0], size)):
            le = LabelEncoder()
            le = le.fit(y[1])
            sys.stdout.flush()
            dump(le, direct+subdir+'/'+savedir+filebeg+'_{}_{}_encoder.jlib'.format(y[0], size))
        else:
            le = load(direct+subdir+'/'+savedir+filebeg+'_{}_{}_encoder.jlib'.format(y[0], size))
        Y_train_cat = le.transform(y[1])
        print('fit classifier {} and {}'.format(classifier, y[0]))
        start = time.time()
        sys.stdout.flush()
        clf = clf.fit(train_set, Y_train_cat)
        print('finished training...continue with predictions')
        predictions = clf.predict(test_set)
        dump(predictions, direct+subdir+'/'+savedir+filebeg+'_'+classifier+'_'+y[0]+'_'+str(size)+'_predictions_count.jlib')
        dump(clf, direct+subdir+'/'+savedir+filebeg+'_'+classifier+'_'+y[0]+'_'+str(size)+'_svm_out_count.jlib')
        print('finished for classifier {} and {} after {} hours'.format(classifier, y[0], ((time.time()-start)/3600)))
        sys.stdout.flush()
        start = time.time()
        scores = cross_validate(clf, train_set, Y_train_cat, scoring=['accuracy', 'f1_weighted', 'precision_weighted', 'recall_weighted'], cv=5)
        print('finished for cross validation for classifier {} and {} after {} hours'.format(classifier, y[0], ((time.time()-start)/3600)))
        sys.stdout.flush()
        dump(scores, direct+subdir+'/'+savedir+filebeg+'_'+classifier+'_'+y[0]+'_'+str(size)+'_cross_validation_count.jlib')
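# Note that cross_validate clones clf and refits it on each of the 5 folds, so the saved
# scores come from those fold models rather than from the clf fitted on the full training
# set above.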
print('done')
\ No newline at end of file