from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, HashingVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.pipeline import Pipeline
import tensorflow as tf
import pandas as pd
import scipy
from . import ml_utils
import json
import os
import re
def make_logistic():
# NOTE(review): no body is visible for this definition in this view; the next line already starts
# another definition. Presumably a placeholder - confirm against the original file.
def make_tfidf(featuretyp:str, minTw_gram_auth:list, path:str, id_path:str, rerun =False):
# Hashes the n-gram records of one subset into sparse count matrices (one per gram size) and
# prepares a TfidfTransformer plus a column mask for them.
#
# featuretyp      : feature family name used to build directory and file names
# minTw_gram_auth : indexable with [0] -> minTW, [1] -> iterable of gram sizes, [2] -> numAuthors
# path            : base directory, split with ml_utils.split_path_unix_win
# id_path         : JSON file with a 'uID' column; the subset name is taken from this path
# rerun           : when true, the id-based subsetting is applied again
#
# No return statement is visible in this view.
# NOTE(review): indentation is lost and several statements below look truncated (the assignment to
# `subset`, the statement after `if not gram_files['smpl_load']:`, the open() call stored under
# gram+'save', and the vstack call) - confirm against the original file before relying on them.
minTW = minTw_gram_auth[0]
numAuthors = minTw_gram_auth[2]
gram_files = {}
subset ='train|val|test', id_path).group(0)
ids = set(pd.read_json(open(id_path, 'r', encoding='utf-8'))['uID'].to_list())
vectorizer = HashingVectorizer(analyzer=ml_utils.identity_analyzer, norm=None, alternate_sign=False)
for gram in sorted(minTw_gram_auth[1]):
##access files i.e. char_grams_3_500, char_grams_4_500, char_grams_5_500
# NOTE(review): the flag is stored under 'smpl_load' here but read back as 'simpl_load' further down.
gram_files['smpl_load'] = os.path.exists(os.path.join(*(ml_utils.split_path_unix_win(path),
featuretyp, minTW ,subset,
'{}_{}_grams_{}_{}_{}_authors.ndjson'.format(subset, featuretyp, gram, minTW, numAuthors))))
if not gram_files['smpl_load']:
featuretyp, minTW ,subset, '{}_{}_grams_{}_{}.ndjson'.format(subset, featuretyp, gram, minTW))), exist_ok=True)
t = os.path.join(*(ml_utils.split_path_unix_win(path), featuretyp, minTW ,'{}_grams_{}_{}.ndjson'.format(featuretyp, gram, minTW)))
gram_files[gram+'save'] = open(os.path.join(*(ml_utils.split_path_unix_win(path),
featuretyp, minTW ,subset,
'{}_{}_grams_{}_{}_{}_authors.ndjson'.format(subset, featuretyp, gram, minTW, numAuthors))),
# NOTE(review): the format string below has five placeholders but only four arguments are passed.
t = os.path.join(*(ml_utils.split_path_unix_win(path),featuretyp, minTW ,subset,
'{}_{}_grams_{}_{}_{}_authors.ndjson'.format(subset, featuretyp, gram, minTW)))
gram_files[gram] = open(t, 'r', encoding='utf-8')
#first we check whether the id for the current line is present in ids
#if yes, we hash - we do this for every gram type
# the list is seeded with a dummy element so that the while loop is entered at least once
lines = [1]
while lines:
lines = ml_utils.batch_read(gram_files[gram], batchsize=1000, to_json=True)
if lines:
lines = pd.DataFrame.from_records(lines)
# if we have not subsetted the files yet
if not gram_files['simpl_load'] or rerun:
lines = ml_utils.select_uids(lines, ids)
#write subset to file
ml_utils.pandas_to_ndjson(lines, gram_files[gram+'save'])
#otherwise the lines are part of the subset
if gram + '_sparse' in gram_files.keys():
gram_files[gram + '_sparse'] = scipy.sparse.vstack((gram_files[gram + '_sparse'],
featuretyp, gram)]))
gram_files[gram + '_sparse'] = vectorizer.transform(lines['{}_grams_{}'.format(featuretyp, gram)])
#important: all files with their tweetids are in the same order
#we iterate over the length of grams in ascending order...consequently in each run we can make a tfidf-transformer and save it to disk
# i.e. first for chargrams1, then for chargrams 1-2, then for chargrams1-2-3 and so forth
#these here are the parameters taken from Custodio et al. 2021
trans = TfidfTransformer(norm='l2', use_idf=False, sublinear_tf=True)
#set all words to zero with a doc-frequency below 1%
# NOTE(review): colsums adds up the matrix values per column, so `prop` is the summed count per
# document; whether that equals a document frequency depends on the values being 0/1 - confirm.
numdocs = gram_files[gram + '_sparse'].get_shape()[0]
colsums = gram_files[gram + '_sparse'].sum(axis=0)
prop = colsums/numdocs
mask = prop > 0.01
#after opening files we have to iterate over main file if simply load
def find_variance_cutoff(featuretyp:str, gramd_dic:dict, path:str, n_components=-1):
#recursive function to find the n_components cutoff
# Only the entry branch is visible here: for a negative n_components, make_tfidf is called.
# NOTE(review): no recursive call and no return statement are visible in this view - the body looks truncated.
# NOTE(review): make_tfidf is declared with a fourth required parameter (id_path) that is not passed here.
if n_components <0:
make_tfidf(featuretyp, gramd_dic, path)
def make_pipeline(featuretyp:str, target:str, gram_dic:dict, path:str):
# Starts by asking find_variance_cutoff for the number of components (called with -1).
# NOTE(review): `target` is unused and nothing is built or returned in the visible lines - the body
# looks truncated; confirm against the original file.
#find the number of features for pca which explains 99% variance - this takes a while
n_components = find_variance_cutoff(featuretyp, gram_dic, path, -1)
def train(path:str):
# NOTE(review): no body is visible for this definition in this view; the lines that follow are a
# module-level import block. Presumably a placeholder - confirm against the original file.
import os
import sys
import gc
import json
import jsonlines
import pandas as pd
import tables
from scipy import sparse
import numpy as np
def store_sparse_mat(M, name, filename='store.h5'):
    """
    Store a csr matrix in HDF5

    The four arrays that define the matrix (data, indices, indptr, shape)
    are written as separate nodes named ``<name>_<attribute>`` directly
    under the file root. Nodes of the same name left over from an earlier
    call are removed first, so storing twice under one name overwrites.

    Parameters
    ----------
    M : scipy.sparse.csr_matrix
        sparse matrix to be stored
    name: str
        node prefix in HDF5 hierarchy
    filename: str
        HDF5 filename (opened in append mode)

    Raises
    ------
    AssertionError
        if M is not a csr matrix
    """
    # The public class is used instead of the deprecated ``sparse.csr`` module path.
    assert isinstance(M, sparse.csr_matrix), 'M must be a csr matrix'
    with tables.open_file(filename, 'a') as f:
        for attribute in ('data', 'indices', 'indptr', 'shape'):
            full_name = f'{name}_{attribute}'

            # remove existing nodes
            try:
                n = getattr(f.root, full_name)
            except AttributeError:
                # nothing stored under this name yet
                pass
            else:
                n._f_remove()

            # add nodes
            arr = np.array(getattr(M, attribute))
            atom = tables.Atom.from_dtype(arr.dtype)
            ds = f.create_carray(f.root, full_name, atom, arr.shape)
            ds[:] = arr
def load_sparse_mat(name, filename='store.h5'):
    """
    Load a csr matrix from HDF5

    Reads the nodes ``<name>_data``, ``<name>_indices``, ``<name>_indptr``
    and ``<name>_shape`` from the file root and rebuilds the matrix.

    Parameters
    ----------
    name: str
        node prefix in HDF5 hierarchy
    filename: str
        HDF5 filename

    Returns
    -------
    M : scipy.sparse.csr_matrix
        loaded sparse matrix
    """
    with tables.open_file(filename) as f:
        # get nodes
        data, indices, indptr, shape = (
            getattr(f.root, f'{name}_{attribute}').read()
            for attribute in ('data', 'indices', 'indptr', 'shape')
        )

    # construct sparse matrix; the stored shape comes back as an array, so
    # turn it into the plain tuple of ints the constructor documents
    M = sparse.csr_matrix((data, indices, indptr),
                          shape=tuple(int(dim) for dim in shape))
    return M
def split_path_unix_win(path):
    """Split a path string into its components, whatever separator it uses.

    A path containing a backslash is split on backslashes; otherwise a path
    containing a forward slash is split on forward slashes.

    Parameters
    ----------
    path : str
        path written with either Windows or Unix separators

    Returns
    -------
    list of str
        the path components. A path without any separator is returned as a
        one-element list, so callers can always index, concatenate or unpack
        the result as a list. Non-string input is returned unchanged.
    """
    if not isinstance(path, str):
        # e.g. an already split path: hand it back untouched
        return path
    #make windows-unix problem go away
    if '\\' in path:
        return path.split('\\')
    if '/' in path:
        return path.split('/')
    # a bare name used to come back as a str, which callers then unpacked
    # character by character or concatenated with a list
    return [path]
def batch_read(fileobj, batchsize, to_json=True):
    """Read up to ``batchsize`` lines from an open text file.

    Parameters
    ----------
    fileobj : file object
        open file; reading continues from its current position
    batchsize : int
        maximum number of lines to read in this call
    to_json : bool
        when true, every line is parsed with ``jsonize`` and gets a 'uID'
        entry; when false, the raw line strings are returned

    Returns
    -------
    list
        parsed records (or raw lines). The list is empty once the file is
        exhausted, which callers use as their stop signal.
    """
    lines = []
    for _ in range(batchsize):
        line = fileobj.readline()
        if not line:
            # readline() returns an empty string only at end of file
            break
        if to_json:
            line = jsonize(line)
            if line is False:
                # jsonize signals an unparseable line with False; skip it
                continue
            # make_uids joins the tweet ids in place and returns the record;
            # the joined string is the id. Assigning the returned record itself
            # would make the record contain itself.
            # NOTE(review): confirm the joined 'tweetIDs' string is the intended uID.
            line['uID'] = make_uids(line, 'tweetIDs')['tweetIDs']
        lines.append(line)
    return lines
def jsonize(line):
    """Parse one line of JSON, with a manual fallback for dirty lines.

    The line is first handed to ``json.loads`` as it is. If that fails, the
    text between the first '{' and the next '}' is cut out and parsed on its
    own, which rescues lines with junk around a flat object.

    Parameters
    ----------
    line : str
        one line of an ndjson file

    Returns
    -------
    object or False
        the decoded value, or ``False`` (after printing a message) when
        neither attempt produced valid JSON
    """
    try:
        return json.loads(line)
    except ValueError:
        # json.JSONDecodeError is a ValueError; try the manual route below
        pass
    try:
        inner = line.split('{')[1]
        inner = inner.split('}')[0]
        return json.loads('{' + inner + '}')
    except (IndexError, ValueError):
        # IndexError: the line contains no '{' at all
        print('manual line loading failed as well...')
        return False
def make_uids(dic, key):
    """Replace the strings stored under ``key`` by one '|'-joined string.

    The mapping is modified in place and also returned.
    """
    joined = '|'.join(dic[key])
    dic[key] = joined
    return dic
def select_uids(df, ids):
    """Return the rows of ``df`` whose 'uID' value is contained in ``ids``."""
    wanted = df['uID'].isin(ids)
    return df.loc[wanted, :]
def pandas_to_ndjson(df:pd.DataFrame, f):
    """Append the rows of a DataFrame to an open file as newline-delimited JSON.

    Parameters
    ----------
    df : pandas.DataFrame
        rows to write; every row becomes one JSON object keyed by column
        name (the index is not written)
    f : text file object
        destination, opened for writing by the caller; it is not closed here

    Notes
    -----
    The previous version built a writer around the record array and never
    wrote anything to ``f``. An empty frame writes nothing.
    """
    if not df.empty:
        # force_ascii=False keeps non-ASCII text readable; callers open their files as utf-8
        payload = df.to_json(orient='records', lines=True, force_ascii=False)
        if not payload.endswith('\n'):
            # make sure the next batch starts on a fresh line
            payload += '\n'
        f.write(payload)
    print('wrote subset to file...')
def make_save_dirs(path):
    """Create the 'models' counterpart of a directory below 'preprocessed'.

    The path is split into components, the component 'preprocessed' is
    replaced by 'models', and the resulting directory is created together
    with all missing parents.

    Parameters
    ----------
    path : str
        path containing a component named exactly 'preprocessed'

    Raises
    ------
    ValueError
        if no component of the path equals 'preprocessed'
    """
    # create files for later saving
    parts = split_path_unix_win(path)
    ind = parts.index('preprocessed')
    pieces = parts[0:ind] + ['models'] + parts[(ind+1):]
    # os.path.join drops an empty first component and treats a bare drive
    # ('C:') as drive-relative; restore the root so absolute paths stay absolute
    if pieces[0] == '' or pieces[0].endswith(':'):
        pieces[0] += os.sep
    # exist_ok: running the script a second time must not fail
    os.makedirs(os.path.join(*pieces), exist_ok=True)
    print('made models directory with subdirs...')
def identity_prepr(text):
    """Return ``text`` unchanged (no-op stand-in for a preprocessing step)."""
    return text
def identity_tokenizer(text):
    """Return ``text`` unchanged (no-op stand-in for a tokenizer)."""
    return text
def identity_analyzer(text):
    """Return ``text`` unchanged (no-op stand-in for an analyzer)."""
    return text
import shutil
import os
import argparse
import jsonlines
import ndjson
import json
import sys
def get_last_n_lines(file_name, N):
# Walks backwards from the end of file_name, one character at a time, and returns the collected
# records in file order once N of them have been gathered (or the start of the file is reached).
# NOTE(review): indentation is lost and several statements look missing in this view: the seek calls,
# the right-hand side of `new_byte =`, and every statement that adds to `buffer` or `list_of_lines`.
# Confirm against the original file before relying on this function.
# Create an empty list to keep the track of last N lines
list_of_lines = []
# Open file for reading in text mode (utf-8)
with open(file_name, 'r', encoding='utf-8') as read_obj:
# Move the cursor to the end of the file
# NOTE(review): the seek call itself is not visible; it looks fused into the comment above
# (which originally ended in ", os.SEEK_END)").
# Create a buffer to keep the last read line
buffer = list()
# Get the current position of pointer i.e eof
pointer_location = read_obj.tell()
# Loop till pointer reaches the top of the file
while pointer_location >= 0:
# Move the file pointer to the location pointed by pointer_location
# Shift pointer location by -1
pointer_location = pointer_location - 1
# read that byte / character
new_byte =
# A '}' read while more than one character is buffered is treated as the end of a record
if new_byte == '}' and len(buffer) > 1:
# Save the line in list of lines
# If the size of list reaches N, then return the reversed list
if len(list_of_lines) == N:
return list(reversed(list_of_lines))
# Reinitialize the buffer to save next line
buffer = list()
# If the last read character is neither a newline nor empty, then add it to the buffer
if not new_byte in ["\n", r'']:
# As file is read completely, if there is still data in buffer, then its first line.
if len(buffer) > 0:
# return the reversed list
return list(reversed(list_of_lines))
def read_file_list(savepath, subpart, group):
    """Collect the part files below ``savepath/subpart/group``.

    Every sub-directory except one named 'process' is scanned. File names
    containing '_part_' are grouped by the text before '_part_', and each
    group is sorted with ``key_sort_f``.

    Parameters
    ----------
    savepath, subpart, group : str
        path pieces joined to the directory that is scanned

    Returns
    -------
    dict
        ``{sub-directory path: {prefix: [sorted part file names]}}``; a
        sub-directory without part files maps to an empty dict
    """
    dirp = os.path.join(savepath, subpart, group)
    dirs = {}
    # the context manager closes the directory iterator even on errors
    with os.scandir(dirp) as entries:
        for it in entries:
            if not os.path.isdir(it.path):
                continue
            if os.path.basename(os.path.normpath(it.path)) == 'process':
                continue
            grouped = {}
            for filen in os.listdir(it.path):
                if "_part_" in filen:
                    proc = filen.split('_part_')[0]
                    # the file name has to be stored, otherwise every group stays empty
                    grouped.setdefault(proc, []).append(filen)
            dirs[it.path] = {proc: sorted(files, key=key_sort_f)
                             for proc, files in grouped.items()}
    return dirs
def key_sort_f(filen):
    """Sort key for part files: the integer between '_part_' and the next '.'."""
    after_marker = filen.split('_part_')[1]
    number_text = after_marker.split('.')[0]
    return int(number_text)
def concat_files(dirs_dic):
    """Concatenate the part files of every group into one '<group>_concat.ndjson'.

    ``dirs_dic`` maps a directory path to a dict of ``{group name: [part file
    names]}``. For every group the listed parts are copied, in the listed
    order, into ``<directory>/<group name>_concat.ndjson``, which is
    overwritten if it already exists.
    """
    for proc_typ, groups in dirs_dic.items():
        for ngrams, part_names in groups.items():
            outpath = os.path.join(proc_typ, ngrams + '_concat.ndjson')
            with open(outpath, 'w', encoding='utf-8') as wfd:
                for part_name in part_names:
                    filepath = os.path.join(proc_typ, part_name)
                    with open(filepath, 'r', encoding='utf-8') as fd:
                        shutil.copyfileobj(fd, wfd)
def make_tweet_id_dic(minchars):
    """Build an empty bookkeeping dict for every minimum-character setting.

    Returns ``{chars: {field: []}}`` with a fresh, empty list for each of the
    per-tweet fields listed below.
    """
    fields = (
        'ID',
        'tweetID',
        'init_len',
        'prepr_len',
        'mentions',
        'tags',
        'urls',
        'times',
        'emotic_num',
        'emojis_num',
        'numericals',
    )
    return {chars: {field: [] for field in fields} for chars in minchars}
def make_subsets(dirs_dic, minchars):
# Walks the concatenated 'num' file line by line, in lockstep with every other concatenated file,
# and collects tweets per author until the summed 'init_len' reaches each threshold in minchars.
#
# dirs_dic : {directory path: {group name: ...}}; each group entry is replaced here by a dict
#            holding the open '<group>_concat.ndjson' handle plus per-threshold buffers, an output
#            file '<directory>/<minc>/<group>_<minc>.ndjson' and a jsonlines.Writer on it
# minchars : iterable of thresholds; sorted before use
#
# NOTE(review): indentation is lost and many statements look missing in this view (try:/else:
# lines, the bodies of several branches, the filling of tweetID_dic and every call that writes
# `dat`). Confirm against the original file before changing any logic here.
# NOTE(review): none of the files opened here is closed in the visible lines.
#add file handles for concatenated and subgroup files
for proc_typ in dirs_dic.keys():
# the directory whose path contains 'num' is remembered; its 'num' file drives the main loop
if 'num' in proc_typ:
numident = proc_typ
for ngrams in dirs_dic[proc_typ].keys():
dirs_dic[proc_typ][ngrams] = {}
dirs_dic[proc_typ][ngrams]['concat'] = open(os.path.join(proc_typ, ngrams + '_concat.ndjson'), 'r', encoding='utf-8')
# make handles for new files
for minc in minchars:
#make dirs 100, 250, 500 in every subdir
os.makedirs(os.path.join(proc_typ, str(minc)), exist_ok=True)
dirs_dic[proc_typ][ngrams][minc] = {}
if ngrams not in ['emoticon_c', 'polarity']:
dirs_dic[proc_typ][ngrams][minc]['lines'] = []
elif ngrams == 'emoticon_c':
#special case: the emoticon file carries two fields (emoji and emoticon)
dirs_dic[proc_typ][ngrams][minc]['emoji'] = []
dirs_dic[proc_typ][ngrams][minc]['emoticon'] = []
elif ngrams == 'polarity':
dirs_dic[proc_typ][ngrams][minc]['polarity'] = []
dirs_dic[proc_typ][ngrams][minc]['subjectivity'] = []
dirs_dic[proc_typ][ngrams][minc]['ID'] = ''
dirs_dic[proc_typ][ngrams][minc]['tweetID'] = []
# NOTE(review): an 'else:' presumably separated the two mode assignments below; as shown, mode
# always ends up as 'a' - confirm.
if not os.path.exists(os.path.join(proc_typ, str(minc),ngrams + '_{}.ndjson'.format(minc))):
mode = 'w'
mode = 'a'
dirs_dic[proc_typ][ngrams][minc]['f'] = open(os.path.join(proc_typ, str(minc),ngrams + '_{}.ndjson'.format(minc)), mode, encoding='utf-8')
dirs_dic[proc_typ][ngrams][minc]['writer'] = jsonlines.Writer(dirs_dic[proc_typ][ngrams][minc]['f'])
#iterate over num to select the appropriate number of tweets to concat - save the concat info in the num/minchars/.ndjson file
#put the concatenated tweets into the respective subfolders, i.e. char/500/concated_tweets500.ndjson
#every file is in the same order, which is why we only need one pass.
minchars = sorted(minchars)
tweetID_dic = make_tweet_id_dic(minchars)
#set current id
current_id = ''
counter = 0
for line in dirs_dic[numident]['num']['concat']:
counter +=1
if counter % 10000 == 0:
print('doing line {}'.format(counter))
#sometimes last line is empty
# NOTE(review): the statement guarded by `if not line:` is not visible; presumably the JSON
# decoding below only applies to non-empty lines - confirm.
if not line:
line = json.loads(line)
if counter % 10000 == 0:
print('loaded line')
#load all lines in the other files:
for proc_typ in dirs_dic.keys():
for ngrams in dirs_dic[proc_typ].keys():
if ngrams !='num':
l = dirs_dic[proc_typ][ngrams]['concat'].readline()
l = ndjson.loads(l)[0]
if line:
for minc in minchars:
#attach the tweet info to the respective minchar part of the dic including the actual tweet text
#test whether files are truly in same order
# NOTE(review): the 'try:' before this assert and the statement re-raising the error are not visible.
assert current_id == l['ID'] or l['ID'] == line['ID']
except AssertionError as e:
e.args += (str(current_id), str(l['ID']), str(line['ID']))
#put to existing tweets if still same person
# NOTE(review): the bodies of the three branches below are not visible.
if current_id == l['ID']:
if ngrams not in ['emoticon_c', 'num', 'polarity']:
elif ngrams == 'emoticon_c':
elif ngrams == 'polarity':
#empty and add next person
if ngrams not in ['emoticon_c', 'num', 'polarity']:
dirs_dic[proc_typ][ngrams][minc]['lines'] = l[ngrams]
elif ngrams == 'emoticon_c':
dirs_dic[proc_typ][ngrams][minc]['emoji'] = l['emoji']
dirs_dic[proc_typ][ngrams][minc]['emoticon'] =l['emoticon']
elif ngrams == 'polarity':
dirs_dic[proc_typ][ngrams][minc]['polarity'] = [l['polarity']]
dirs_dic[proc_typ][ngrams][minc]['subjectivity'] = [l['subjectivity']]
dirs_dic[proc_typ][ngrams][minc]['tweetID'] = [l['tweetID']]
dirs_dic[proc_typ][ngrams][minc]['ID'] = l['ID']
#check that we always work on the same person
if current_id == '':
current_id = line['ID']
#if next person set everything to 0 and continue with new id
if not current_id == line['ID']:
tweetID_dic = make_tweet_id_dic(minchars)
current_id = line['ID']
#iterate over the minc subsets to make
for minc in minchars:
#add all the info from current tweet to our tweetID dic
# NOTE(review): the body of this loop is not visible.
for key in line.keys():
#test whether we now have enough characters in tweet to concat it
if minc <= sum(tweetID_dic[minc]['init_len']):
#iterate over all files and write the tweets to file
for proc_typ in dirs_dic.keys():
for ngrams in dirs_dic[proc_typ].keys():
#make sure that we are processing the same ids
if ngrams != 'num':
assert dirs_dic[proc_typ][ngrams][minc]['ID'] == line['ID'] and line['ID'] == tweetID_dic[minc]['ID'][0]
except AssertionError as e:
e.args += (str(dirs_dic[proc_typ][ngrams][minc]['ID']),
#make sure we are processing the same tweets over all files
if ngrams != 'num':
assert set(tweetID_dic[minc]['tweetID']) == set(dirs_dic[proc_typ][ngrams][minc]['tweetID'])
except AssertionError as e:
e.args += (str(set(tweetID_dic[minc]['tweetID'])), str(set(dirs_dic[proc_typ][ngrams][minc]['tweetID'])))
if ngrams not in ['num', 'emoticon_c', 'polarity']:
dat = {'ID': line['ID'], 'tweetIDs':tweetID_dic[minc]['tweetID'],
ngrams: dirs_dic[proc_typ][ngrams][minc]['lines']}
#empty dic from lines
dirs_dic[proc_typ][ngrams][minc]['lines'] = []
elif ngrams == 'emoticon_c':
dat = {'ID': line['ID'], 'tweetIDs': tweetID_dic[minc]['tweetID'],
'emoticon': dirs_dic[proc_typ][ngrams][minc]['emoticon'],
'emoji': dirs_dic[proc_typ][ngrams][minc]['emoji']}
#empty dic from stored emojis
dirs_dic[proc_typ][ngrams][minc]['emoticon'] = []
dirs_dic[proc_typ][ngrams][minc]['emoji'] = []
elif ngrams == 'polarity':
dat = {'ID': line['ID'], 'tweetIDs': tweetID_dic[minc]['tweetID'],
'polarity': dirs_dic[proc_typ][ngrams][minc]['polarity'],
'subjectivity': dirs_dic[proc_typ][ngrams][minc]['subjectivity']}
#empty dic from stored polarity and subjectivity values
dirs_dic[proc_typ][ngrams][minc]['polarity'] = []
dirs_dic[proc_typ][ngrams][minc]['subjectivity'] = []
# NOTE(review): presumably the remaining ('num') case: dat is rebuilt from the bookkeeping dict.
# The 'else:' and the call that writes dat to the writer are not visible.
dat = {}
for key in tweetID_dic[minc].keys():
dat[key] = tweetID_dic[minc][key]
dat['ID'] = line['ID']
#empty the dic for the current file, i.e. Char_grams_2 or so
dirs_dic[proc_typ][ngrams][minc]['tweetID'] = []
#clear the entries in the dic regarding tweet ids and everything else for the respective minc
for key in tweetID_dic[minc]:
tweetID_dic[minc][key] = []
def _main(savepath, workset, typ):
    """Build the minimum-character subsets for every requested group.

    Parameters
    ----------
    savepath : str
        directory holding the preprocessed data (relative or absolute,
        written with either '/' or '\\' separators)
    workset : str or None
        sub-directory below ``savepath``; ``None`` is treated as ''
    typ : iterable of str or None
        sub-sub-directories to process; ``None`` or an empty value means
        nothing is processed
    """
    #make windows-unix problem go away
    # Only the separators are rewritten. Splitting and re-joining the path
    # dropped the leading root of absolute paths ('/a/b' became 'a/b',
    # 'C:\\a' became the drive-relative 'C:a').
    savepath = savepath.replace('\\', os.sep).replace('/', os.sep)
    # argparse yields None for options that were not given on the command line
    workset = workset or ''
    for t in typ or ():
        dirs_dic = read_file_list(savepath, workset, t)
        make_subsets(dirs_dic, [100, 250, 500])
if __name__ == "__main__":
# Script entry point: either uses the hard-coded development settings (command = False) or reads
# the save path, workset and parts from the command line, then calls _main.
# NOTE(review): indentation is lost; an 'else:' presumably preceded the argparse section, and the
# start of the add_argument call that defines the workset option is not visible (only its help=
# line remains). Confirm against the original file.
command = True
if not command:
workset = 'workset'
typ = ['creator', 'performer']
savepath = "../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/preprocessed"
argparser = argparse.ArgumentParser(description='Arguements for preprocessing and making the ngrams')
argparser.add_argument('-s', '--save', help='Path to output directory (relative or absolute)', required=True)
help='Sub-Directory of parent input-directory (if it exists). Helpful if script is executed in loop on many worksets')
argparser.add_argument('--part', nargs="*",
help='Sub-Sub-Directory of parent input-directory (if it exists). Helpful if script is executed in loop on many types')
args = vars(argparser.parse_args())
# NOTE(review): vars() of the parsed namespace contains every declared option, so the '' defaults
# of .get() only apply to undeclared names; an option that was not passed comes back as None.
workset = args.get('workset', '')
typ = args.get('part', '')
savepath = args['save']
_main(savepath, workset, typ)
