Commit 628bde6d authored by Marcel Henrik Schubert

fixed a lot of errors

parent ba1f78e7
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, HashingVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.pipeline import Pipeline
import tensorflow as tf
import pandas as pd
import scipy
from . import ml_utils
import json
import os
import re
def make_logistic():
pass
def make_tfidf(featuretyp: str, minTw_gram_auth: list, path: str, id_path: str, rerun=False):
minTW = minTw_gram_auth[0]
numAuthors = minTw_gram_auth[2]
gram_files = {}
subset = re.search('train|val|test', id_path).group(0)
ids = set(pd.read_json(open(id_path, 'r', encoding='utf-8'))['uID'].to_list())
vectorizer = HashingVectorizer(analyzer=ml_utils.identity_analyzer, norm=None, alternate_sign=False)
for gram in sorted(minTw_gram_auth[1]):
        # access files, i.e. char_grams_3_500, char_grams_4_500, char_grams_5_500
gram_files['smpl_load'] = os.path.exists(os.path.join(*(ml_utils.split_path_unix_win(path),
featuretyp, minTW ,subset,
'{}_{}_grams_{}_{}_{}_authors.ndjson'.format(subset, featuretyp, gram, minTW, numAuthors))))
if not gram_files['smpl_load']:
            # make sure the subset directory exists
            os.makedirs(os.path.join(*(ml_utils.split_path_unix_win(path),
                                       featuretyp, minTW, subset)), exist_ok=True)
            t = os.path.join(*(ml_utils.split_path_unix_win(path), featuretyp, minTW,
                               '{}_grams_{}_{}.ndjson'.format(featuretyp, gram, minTW)))
            gram_files[gram + 'save'] = open(os.path.join(*(ml_utils.split_path_unix_win(path),
                                                            featuretyp, minTW, subset,
                                                            '{}_{}_grams_{}_{}_{}_authors.ndjson'.format(subset, featuretyp, gram, minTW, numAuthors))),
                                             'w', encoding='utf-8')
        else:
            t = os.path.join(*(ml_utils.split_path_unix_win(path), featuretyp, minTW, subset,
                               '{}_{}_grams_{}_{}_{}_authors.ndjson'.format(subset, featuretyp, gram, minTW, numAuthors)))
gram_files[gram] = open(t, 'r', encoding='utf-8')
#first we check whether the id for current line is present in ids
#if yes, we hash - we do this for every gram type
        lines = [1]  # sentinel so that the while loop below runs at least once
while lines:
lines = ml_utils.batch_read(gram_files[gram], batchsize=1000, to_json=True)
if lines:
lines = pd.DataFrame.from_records(lines)
                # if we have not subset the files yet
                if not gram_files['smpl_load'] or rerun:
lines = ml_utils.select_uids(lines, ids)
#write subset to file
ml_utils.pandas_to_ndjson(lines, gram_files[gram+'save'])
#otherwise the lines are part of the subset
if gram + '_sparse' in gram_files.keys():
gram_files[gram + '_sparse'] = scipy.sparse.vstack((gram_files[gram + '_sparse'],
vectorizer.transform(lines['{}_grams_{}'.format(
featuretyp, gram)]))
)
else:
gram_files[gram + '_sparse'] = vectorizer.transform(lines['{}_grams_{}'.format(featuretyp, gram)])
        # important: all files with their tweet IDs are in the same order
        # we iterate over the gram lengths in ascending order... consequently, in each run we can fit a tfidf-transformer and save it to disk,
        # i.e. first for chargrams 1, then for chargrams 1-2, then for chargrams 1-2-3 and so forth
        # these are the parameters taken from Custodio et al. 2021
trans = TfidfTransformer(norm='l2', use_idf=False, sublinear_tf=True)
        # build a mask of the features whose document frequency is at least 1%; everything below is to be zeroed out
        numdocs = gram_files[gram + '_sparse'].get_shape()[0]
        docfreq = (gram_files[gram + '_sparse'] > 0).sum(axis=0)
        prop = docfreq / numdocs
        mask = prop > 0.01
    # after opening the files we still have to iterate over the main file if we simply load
HashingVectorizer()
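
# Note: the `mask` computed in make_tfidf above is currently not applied and the
# TfidfTransformer is never fitted. The helper below is only an illustrative
# sketch of how the document-frequency cutoff and the transformer could be
# combined; its name and arguments are assumptions, not part of the original pipeline.
def _apply_df_cutoff_sketch(X, min_df_prop=0.01):
    import numpy as np  # numpy is not imported at module level in this file
    numdocs = X.shape[0]
    # number of documents in which each hashed feature occurs
    docfreq = (X > 0).sum(axis=0)
    keep = np.flatnonzero(np.asarray(docfreq).ravel() / numdocs >= min_df_prop)
    # keep only the sufficiently frequent columns
    return X[:, keep], keep

# usage sketch (hypothetical):
# X_cut, kept_cols = _apply_df_cutoff_sketch(gram_files[gram + '_sparse'])
# tfidf = TfidfTransformer(norm='l2', use_idf=False, sublinear_tf=True).fit(X_cut)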
def find_variance_cutoff(featuretyp: str, gram_dic: dict, path: str, n_components=-1):
    # recursive function to find the n_components cutoff
    if n_components < 0:
        make_tfidf(featuretyp, gram_dic, path)
def make_pipeline(featuretyp:str, target:str, gram_dic:dict, path:str):
#find the number of features for pca which explains 99% variance - this takes a while
n_components = find_variance_cutoff(featuretyp, gram_dic, path, -1)
def train(path:str):
ml_utils.make_save_dirs(path)
import os
import sys
import gc
import json
import jsonlines
import pandas as pd
import tables
from scipy import sparse
import numpy as np
def store_sparse_mat(M, name, filename='store.h5'):
"""
Store a csr matrix in HDF5
Parameters
----------
M : scipy.sparse.csr.csr_matrix
sparse matrix to be stored
name: str
node prefix in HDF5 hierarchy
filename: str
HDF5 filename
"""
    assert isinstance(M, sparse.csr_matrix), 'M must be a csr matrix'
with tables.open_file(filename, 'a') as f:
for attribute in ('data', 'indices', 'indptr', 'shape'):
full_name = f'{name}_{attribute}'
# remove existing nodes
try:
n = getattr(f.root, full_name)
n._f_remove()
except AttributeError:
pass
# add nodes
arr = np.array(getattr(M, attribute))
atom = tables.Atom.from_dtype(arr.dtype)
ds = f.create_carray(f.root, full_name, atom, arr.shape)
ds[:] = arr
def load_sparse_mat(name, filename='store.h5'):
"""
Load a csr matrix from HDF5
Parameters
----------
name: str
node prefix in HDF5 hierarchy
filename: str
HDF5 filename
Returns
----------
M : scipy.sparse.csr.csr_matrix
loaded sparse matrix
"""
with tables.open_file(filename) as f:
# get nodes
attributes = []
for attribute in ('data', 'indices', 'indptr', 'shape'):
attributes.append(getattr(f.root, f'{name}_{attribute}').read())
# construct sparse matrix
M = sparse.csr_matrix(tuple(attributes[:3]), shape=attributes[3])
return M
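
# A quick illustrative round trip for the two helpers above; the HDF5 file name
# used here is a placeholder and the function is not called anywhere in this module.
def _example_sparse_roundtrip(filename='example_store.h5'):
    X = sparse.csr_matrix(np.eye(3))
    store_sparse_mat(X, 'demo', filename=filename)
    Y = load_sparse_mat('demo', filename=filename)
    assert (X != Y).nnz == 0  # the loaded matrix equals the stored one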
def split_path_unix_win(path):
    # make the windows/unix separator problem go away
    if '\\' in path:
        path = path.split('\\')
    elif '/' in path:
        path = path.split('/')
    else:
        # single path component without separators
        path = [path]
    return path
def batch_read(fileobj, batchsize, to_json=True):
    lines = []
    for _ in range(batchsize):
        line = fileobj.readline()
        if not line:
            break
        if to_json:
            line = jsonize(line)
            if not line:
                # skip lines that could not be parsed
                continue
            line['uID'] = make_uids(line, 'tweetIDs')
        lines.append(line)
    return lines
def jsonize(line):
    try:
        line = json.loads(line)
    except json.JSONDecodeError:
        try:
            # salvage the content between the first '{' and the first '}'
            line = line.split('{')[1]
            line = line.split('}')[0]
            line = json.loads('{' + line + '}')
        except (IndexError, json.JSONDecodeError):
            print('manual line loading failed as well...')
            sys.stdout.flush()
            return False
    return line
def make_uids(dic, key):
    # build a unique sample id by joining the tweet ids of a concatenated sample
    return '|'.join(dic[key])
def select_uids(df, ids):
df = df.loc[df['uID'].isin(ids),:]
return df
def pandas_to_ndjson(df: pd.DataFrame, f):
    records = df.to_dict(orient='records')
    writer = jsonlines.Writer(f)
    writer.write_all(records)
    print('wrote subset to file...')
    sys.stdout.flush()
    writer.close()
def make_save_dirs(path):
    # create the directories for later saving of the models
    path = split_path_unix_win(path)
    ind = path.index('preprocessed')
    os.makedirs(os.path.join(*(path[0:ind] + ['models'] + path[(ind + 1):])), exist_ok=True)
    print('made models directory with subdirs...')
    sys.stdout.flush()
def identity_prepr(text):
return text
def identity_tokenizer(text):
return text
def identity_analyzer(text):
return text
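
# The identity helpers above let sklearn vectorizers consume n-gram lists that are
# already tokenised. A minimal sketch (not called anywhere in this module, the toy
# n-grams are made up) of the pattern used in the training code:
def _hashing_on_pretokenized_sketch():
    from sklearn.feature_extraction.text import HashingVectorizer
    vec = HashingVectorizer(analyzer=identity_analyzer, norm=None, alternate_sign=False)
    # two "documents", each already a list of character bigrams
    return vec.transform([['ab', 'bc'], ['bc', 'cd']])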
import shutil
import os
import argparse
import jsonlines
import ndjson
import json
import sys
def get_last_n_lines(file_name, N):
# Create an empty list to keep the track of last N lines
list_of_lines = []
# Open file for reading in binary mode
with open(file_name, 'r', encoding='utf-8') as read_obj:
# Move the cursor to the end of the file
read_obj.seek(0, os.SEEK_END)
# Create a buffer to keep the last read line
buffer = list()
# Get the current position of pointer i.e eof
pointer_location = read_obj.tell()
# Loop till pointer reaches the top of the file
while pointer_location >= 0:
# Move the file pointer to the location pointed by pointer_location
read_obj.seek(pointer_location)
# Shift pointer location by -1
pointer_location = pointer_location - 1
# read that byte / character
new_byte = read_obj.read(1)
            # reading backwards: when we hit a '}' and the buffer already holds a complete record, one json line has been read
            if new_byte == '}' and len(buffer) > 1:
# Save the line in list of lines
list_of_lines.append(json.loads(''.join(buffer[::-1])))
# If the size of list reaches N, then return the reversed list
if len(list_of_lines) == N:
return list(reversed(list_of_lines))
# Reinitialize the byte array to save next line
buffer = list()
buffer.append(new_byte)
else:
                # if the read character is not a newline, add it to the buffer
                if new_byte not in ("\n", ""):
buffer.append(new_byte)
        # the file has been read completely; if there is still data in the buffer, it is the first record
if len(buffer) > 0:
list_of_lines.append(json.loads(''.join(buffer[::-1])))
# return the reversed list
return list(reversed(list_of_lines))
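
# Small usage sketch for get_last_n_lines: print the author IDs of the last few
# records of an ndjson file. The path argument is a placeholder; this helper is
# not called anywhere in this script.
def _tail_ids_sketch(ndjson_path, n=5):
    for record in get_last_n_lines(ndjson_path, n):
        print(record.get('ID'))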
def read_file_list(savepath, subpart, group):
dirp = os.path.join(savepath, subpart, group)
dirs = {}
for it in os.scandir(dirp):
if os.path.isdir(it.path):
if os.path.basename(os.path.normpath(it.path)) != 'process':
dirs[it.path] = {}
for filen in os.listdir(it.path):
if "_part_" in filen:
proc = filen.split('_part_')[0]
dirs[it.path][proc] = dirs[it.path].get(proc, [])
dirs[it.path][proc].append(filen)
dirs[it.path][proc] = sorted(dirs[it.path][proc], key=key_sort_f)
return dirs
def key_sort_f(filen):
return int(filen.split('_part_')[1].split('.')[0])
def concat_files(dirs_dic):
for proc_typ in dirs_dic.keys():
for ngrams in dirs_dic[proc_typ].keys():
outpath = os.path.join(proc_typ, ngrams + '_concat.ndjson')
with open(outpath, 'w', encoding='utf-8') as wfd:
for file in dirs_dic[proc_typ][ngrams]:
filepath = os.path.join(proc_typ, file)
with open(filepath, 'r', encoding='utf-8') as fd:
shutil.copyfileobj(fd, wfd)
#print('would have copied {} to {}'.format(file, outpath))
def make_tweet_id_dic(minchars):
    # one accumulator dict per minimum-character threshold, all starting out empty
    keys = ['ID', 'tweetID', 'init_len', 'prepr_len', 'mentions', 'tags',
            'urls', 'times', 'emotic_num', 'emojis_num', 'numericals']
    return {chars: {key: [] for key in keys} for chars in minchars}
def make_subsets(dirs_dic, minchars):
    # add file handles for the concatenated and subgroup files
for proc_typ in dirs_dic.keys():
if 'num' in proc_typ:
numident = proc_typ
for ngrams in dirs_dic[proc_typ].keys():
dirs_dic[proc_typ][ngrams] = {}
dirs_dic[proc_typ][ngrams]['concat'] = open(os.path.join(proc_typ, ngrams + '_concat.ndjson'), 'r', encoding='utf-8')
# make handles for new files
for minc in minchars:
#make dirs 100, 250, 500 in every subdir
os.makedirs(os.path.join(proc_typ, str(minc)), exist_ok=True)
dirs_dic[proc_typ][ngrams][minc] = {}
if ngrams not in ['emoticon_c', 'polarity']:
dirs_dic[proc_typ][ngrams][minc]['lines'] = []
elif ngrams == 'emoticon_c':
                    # special case for the emoticon file because it stores emojis and emoticons under separate keys
dirs_dic[proc_typ][ngrams][minc]['emoji'] = []
dirs_dic[proc_typ][ngrams][minc]['emoticon'] = []
elif ngrams == 'polarity':
dirs_dic[proc_typ][ngrams][minc]['polarity'] = []
dirs_dic[proc_typ][ngrams][minc]['subjectivity'] = []
dirs_dic[proc_typ][ngrams][minc]['ID'] = ''
dirs_dic[proc_typ][ngrams][minc]['tweetID'] = []
if not os.path.exists(os.path.join(proc_typ, str(minc),ngrams + '_{}.ndjson'.format(minc))):
mode = 'w'
else:
mode = 'a'
dirs_dic[proc_typ][ngrams][minc]['f'] = open(os.path.join(proc_typ, str(minc),ngrams + '_{}.ndjson'.format(minc)), mode, encoding='utf-8')
dirs_dic[proc_typ][ngrams][minc]['writer'] = jsonlines.Writer(dirs_dic[proc_typ][ngrams][minc]['f'])
    # iterate over num to select the appropriate number of tweets to concat - save the concat info in the num/minchars/.ndjson file
    # put the concatenated tweets into the respective subfolders, i.e. char/500/concated_tweets500.ndjson
    # every file is in the same order, hence we only need one pass.
minchars = sorted(minchars)
tweetID_dic = make_tweet_id_dic(minchars)
#set current id
current_id = ''
counter = 0
for line in dirs_dic[numident]['num']['concat']:
counter +=1
if counter % 10000 == 0:
print('doing line {}'.format(counter))
#sometimes last line is empty
if not line:
continue
line = json.loads(line)
if counter % 10000 == 0:
print('loaded line')
#load all lines in the other files:
for proc_typ in dirs_dic.keys():
for ngrams in dirs_dic[proc_typ].keys():
if ngrams !='num':
l = dirs_dic[proc_typ][ngrams]['concat'].readline()
l = ndjson.loads(l)[0]
if line:
for minc in minchars:
                            # attach the tweet info to the respective minchar part of the dict, including the actual tweet text
                            # test whether the files are truly in the same order
try:
assert current_id == l['ID'] or l['ID'] == line['ID']
except AssertionError as e:
e.args += (str(current_id), str(l['ID']), str(line['ID']))
raise
#put to existing tweets if still same person
if current_id == l['ID']:
if ngrams not in ['emoticon_c', 'num', 'polarity']:
#print(minc)
#(ngrams)
#print(dirs_dic[proc_typ][ngrams][minc]['lines'])
dirs_dic[proc_typ][ngrams][minc]['lines'].extend(l[ngrams])
elif ngrams == 'emoticon_c':
dirs_dic[proc_typ][ngrams][minc]['emoji'].extend(l['emoji'])
dirs_dic[proc_typ][ngrams][minc]['emoticon'].extend(l['emoticon'])
elif ngrams == 'polarity':
dirs_dic[proc_typ][ngrams][minc]['polarity'].append(l['polarity'])
dirs_dic[proc_typ][ngrams][minc]['subjectivity'].append(l['subjectivity'])
dirs_dic[proc_typ][ngrams][minc]['tweetID'].append(l['tweetID'])
#empty and add next person
else:
if ngrams not in ['emoticon_c', 'num', 'polarity']:
dirs_dic[proc_typ][ngrams][minc]['lines'] = l[ngrams]
elif ngrams == 'emoticon_c':
dirs_dic[proc_typ][ngrams][minc]['emoji'] = l['emoji']
dirs_dic[proc_typ][ngrams][minc]['emoticon'] =l['emoticon']
elif ngrams == 'polarity':
dirs_dic[proc_typ][ngrams][minc]['polarity'] = [l['polarity']]
dirs_dic[proc_typ][ngrams][minc]['subjectivity'] = [l['subjectivity']]
dirs_dic[proc_typ][ngrams][minc]['tweetID'] = [l['tweetID']]
dirs_dic[proc_typ][ngrams][minc]['ID'] = l['ID']
#check that we always work on the same person
if current_id == '':
current_id = line['ID']
#if next person set everything to 0 and continue with new id
if not current_id == line['ID']:
tweetID_dic = make_tweet_id_dic(minchars)
current_id = line['ID']
#iterate over the minc subsets to make
for minc in minchars:
#add all the info from current tweet to our tweetID dic
for key in line.keys():
tweetID_dic[minc][key].append(line[key])
#print(sum(tweetID_dic[minc]['init_len']))
#test whether we now have enough characters in tweet to concat it
if minc <= sum(tweetID_dic[minc]['init_len']):
                # iterate over all files and write the tweets to file
for proc_typ in dirs_dic.keys():
for ngrams in dirs_dic[proc_typ].keys():
#print(proc_typ)
#print(ngrams)
                        # make sure that we are processing the same IDs
if ngrams != 'num':
try:
assert dirs_dic[proc_typ][ngrams][minc]['ID'] == line['ID'] and line['ID'] == tweetID_dic[minc]['ID'][0]
except AssertionError as e:
e.args += (str(dirs_dic[proc_typ][ngrams][minc]['ID']),
str(line['ID']),
str(tweetID_dic[minc]['ID'][0]))
raise
#make sure we are processing the same tweets over all files
#print(set(tweetID_dic[minc]['tweetID']))
#print(set(dirs_dic[proc_typ][ngrams][minc]['tweetID']))
if ngrams != 'num':
try:
assert set(tweetID_dic[minc]['tweetID']) == set(dirs_dic[proc_typ][ngrams][minc]['tweetID'])
except AssertionError as e:
e.args += (str(set(tweetID_dic[minc]['tweetID'])), str(set(dirs_dic[proc_typ][ngrams][minc]['tweetID'])))
raise
if ngrams not in ['num', 'emoticon_c', 'polarity']:
dat = {'ID': line['ID'], 'tweetIDs':tweetID_dic[minc]['tweetID'],
ngrams: dirs_dic[proc_typ][ngrams][minc]['lines']}
#empty dic from lines
dirs_dic[proc_typ][ngrams][minc]['lines'] = []
elif ngrams == 'emoticon_c':
dat = {'ID': line['ID'], 'tweetIDs': tweetID_dic[minc]['tweetID'],
'emoticon': dirs_dic[proc_typ][ngrams][minc]['emoticon'],
'emoji': dirs_dic[proc_typ][ngrams][minc]['emoji']}
#empty dic from stored emojis
dirs_dic[proc_typ][ngrams][minc]['emoticon'] = []
dirs_dic[proc_typ][ngrams][minc]['emoji'] = []
elif ngrams == 'polarity':
dat = {'ID': line['ID'], 'tweetIDs': tweetID_dic[minc]['tweetID'],
'polarity': dirs_dic[proc_typ][ngrams][minc]['polarity'],
'subjectivity': dirs_dic[proc_typ][ngrams][minc]['subjectivity']}
                            # empty the dict of the stored polarity and subjectivity values
dirs_dic[proc_typ][ngrams][minc]['polarity'] = []
dirs_dic[proc_typ][ngrams][minc]['subjectivity'] = []
else:
dat = {}
for key in tweetID_dic[minc].keys():
dat[key] = tweetID_dic[minc][key]
dat['ID'] = line['ID']
dirs_dic[proc_typ][ngrams][minc]['writer'].write(dat)
#empty the dic for the current file, i.e. Char_grams_2 or so
dirs_dic[proc_typ][ngrams][minc]['tweetID'] = []
#clear the entries in the dic regarding tweet ids and everything else for the respective minc
for key in tweetID_dic[minc]:
tweetID_dic[minc][key] = []
print('done')
def _main(savepath, workset, typ):
#make windows-unix problem go away
if '\\' in savepath:
savepath = os.path.join(*savepath.split('\\'))
elif '/' in savepath:
savepath = os.path.join(*savepath.split('/'))
for t in typ:
dirs_dic = read_file_list(savepath, workset, t)
concat_files(dirs_dic)
make_subsets(dirs_dic, [100, 250, 500])
if __name__ == "__main__":
command = True
if not command:
workset = 'workset'
typ = ['creator', 'performer']
savepath = "../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/preprocessed"
else:
        argparser = argparse.ArgumentParser(description='Arguments for preprocessing and making the ngrams')
argparser.add_argument('-s', '--save', help='Path to output directory (relative or absolute)', required=True)
argparser.add_argument('--workset',
help='Sub-Directory of parent input-directory (if it exists). Helpful if script is executed in loop on many worksets')
argparser.add_argument('--part', nargs="*",
help='Sub-Sub-Directory of parent input-directory (if it exists). Helpful if script is executed in loop on many types')
args = vars(argparser.parse_args())
        workset = args.get('workset') or ''
        typ = args.get('part') or []
savepath = args['save']
_main(savepath, workset, typ)
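
# Example invocation (the script file name here is hypothetical; the path mirrors
# the hard-coded default above):
#   python concat_subsets.py \
#       -s ../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/preprocessed \
#       --workset workset --part creator performer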
@@ -74,15 +74,29 @@ def chunkify(linebytes, chunksize):