Commit f3ec5c57 authored by Marcel Henrik Schubert
Browse files

fixed preprocess errors

parent 261e79d7
......@@ -56,7 +56,7 @@ def get_linebytes(path, filename, test = False):
nextLineByte = f.tell() # returns the location of the next line
if test:
if i == 2:
if i == 3:
break
i+=1
if not line or line == '':
......@@ -419,7 +419,7 @@ def preprocess(IDs, tweetIDs, tweets, range_CHAR = (2,5),
putter += 1
#if putter == 29473:
# print((ID, grams[ln], key.upper(), ln, tID))
q[1].put((ID, tweet['tensor'], 'VECTORS', '', tID))
#q[1].put((ID, tweet['tensor'], 'VECTORS', '', tID))
putter += 1
#if putter == 29473:
# print((ID, tweet['tensor'], 'VECTORS', '', tID))
......@@ -539,26 +539,22 @@ def make_files(savepath, typ, ident, rerun=False):
return f, writer
def listener(doInt, ranges, q, savepath, rerun=False, num_to_process =0, o=None):
#for two listeners
#for two listeners
toDo = {0:['PROCESS','POLARITY', "EMOTICON_C", "NUM", 'DIST', 'AsIS', 'CHAR', 'WORD'],
1: ['VECTORS', 'LEMMA', 'POS', 'TAG', 'DEP']}
1: ['LEMMA', 'POS', 'TAG', 'DEP']}#'VECTORS',
pid = mp.current_process()
typs = toDo[doInt]
#print('got my todo')
#print(os.getcwd())
#print(os.path.join(savepath, 'out_{}.txt'.format(doInt)))
#sys.stdout.flush()
#print('made out file')
#sys.stdout.flush()
o = open(os.path.join(savepath, 'out_{}.txt'.format(doInt)), 'w')
print('got my todo', file=o)
sys.stdout.flush()
#open/create files in dictionary for each tag which listener responsible for
filehandles = {}
for el in typs:
#print('el is {}'.format(el), file=o)
#sys.stdout.flush()
print('el is {}'.format(el), file=o)
sys.stdout.flush()
if el in ["VECTORS", "NUM", "POLARITY", "EMOTICON_C"]:
#print('Creating single start', file=o)
#sys.stdout.flush()
......@@ -645,7 +641,7 @@ def listener(doInt, ranges, q, savepath, rerun=False, num_to_process =0, o=None)
##this is for write results to file
if m[2] != 'PROCESS':
#print('got item from queue ID: {}, tweetID {} type: {} ngrams: {}'.format(m[0], m[4], m[2], m[3]), file=o)
# print('got item from queue ID: {}, tweetID {} type: {} ngrams: {}'.format(m[0], m[4], m[2], m[3]), file=o)
#now we have to write it tot the correct file and make correct key
#if last position of tuple is == '', we have a file without grams
......@@ -668,7 +664,7 @@ def listener(doInt, ranges, q, savepath, rerun=False, num_to_process =0, o=None)
#print((key in filehandles.keys()), file=o)
filehandles[key]["writer"].write(res)
#print('wrote item to file ID: {}, tweetID {}, type: {} ngrams: {}'.format(m[0], m[4], m[2], m[3])
# , file=o)
# , file=o)
#put the id into the processed queue
q[0].put((m[0], '', 'PROCESS', str(m[2]) + str(m[3]), m[4]))
del res, m
......@@ -689,7 +685,7 @@ def listener(doInt, ranges, q, savepath, rerun=False, num_to_process =0, o=None)
proc[m[0]][m[4]] = proc[m[0]].get(m[4], 0) + 1
current = proc[m[0]][m[4]]
# print(current, num_to_process, file=o)
print(current, num_to_process, file=o)
#num_to_process is number of files which is processing types
if current == num_to_process:
#print('entering filewriting process', file=o)
......@@ -764,7 +760,7 @@ def cal_num_comb(listener_dic, spacy, asis, both):
list(range(listener_dic['TAG'][0], listener_dic['TAG'][1]))+[1]+ \
list(range(listener_dic['DEP'][0], listener_dic['DEP'][1]))+[1]:
c+=1
c+=1 #for vectors
#c+=1 #for vectors
if asis:
for _ in list(range(listener_dic['AsIS'][0], listener_dic['AsIS'][1])) + [1]:
......@@ -833,15 +829,14 @@ def _main(args):
#create a two queues to split work into two parts
q = [manager.Queue(), manager.Queue()]
pool = mp.Pool(6, maxtasksperchild=1)
pool = mp.Pool(4, maxtasksperchild=1)
#pool = mp.Pool(mp.cpu_count(), maxtasksperchild=1)
print('create listener for saving of data...')
sys.stdout.flush()
#put listeners to work first
watchers = []
for i in range(len(q)):
o=open(os.path.join(savepath, 'out_{}.txt'.format(i)), 'w')
watchers.append(pool.apply_async(listener, (i, listener_dic, q, savepath, args['rerun'],c,o,)))
watchers.append(pool.apply_async(listener, (i, listener_dic, q, savepath, args['rerun'],c,)))
#time.sleep(30)
#fire off workers
......@@ -893,78 +888,60 @@ def _main(args):
if __name__ == "__main__":
#args = {}
#args['datapath'] = "../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31"
#args['save'] = "../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/preprocessed"
#args['file'] ='workset_manager.ndjson'
#args['char'] = (2,5)
#args['word'] =(1,2)
#args['tag'] =(1,3)
#args['dep'] =(1,3)
#args['pos'] =(1,3)
#args ['workset'] ='workset'
#args['part'] ='manager'
#args['rerun'] =True
#args['spacy'] =True
#args['both'] =True
#args['asis'] =True
#args['test'] =False
#args['encase_list'] = ["emoji","emoticon"]
#tag_dic = {'url': "url",
# 'hashtag': "tag",
# 'mention': "user",
# 'emoji': "emoji",
# 'emoticon': "smile",
# 'time': "time",
# 'number': "number",
# }
#args['encase_list'] = args.get('encase_list', [])
#args['encase_list'] = [tag_dic[el] for el in args['encase_list']]
# #make windows-unix problem go away
#if '\\' in args['save']:
# args['datapath'] = os.path.join(*args['datapath'].split('\\'))
#elif '/' in args['save']:
# args['save'] = os.path.join(*args['save'].split('/'))
#if '\\' in args['datapath']:
# args['datapath'] = os.path.join(*args['datapath'].split('\\'))
#elif '/' in args['datapath']:
# args['datapath'] = os.path.join(*args['datapath'].split('/'))
#parse arguements
argparser = argparse.ArgumentParser(description='Arguements for preprocessing and making the ngrams')
argparser.add_argument_group('required arguments')
argparser.add_argument('-p', '--datapath', help='Path to parent input directory (relative or absolute)', required=True)
argparser.add_argument('-f', '--file', help='Name of input file', required=True)
argparser.add_argument('-s', '--save', help='Path to output directory (relative or absolute)', required=True)
argparser.add_argument('-c', '--char', help='Range (l,u) for char ngrams', required=True)
argparser.add_argument('-w', '--word', help='Range (l,u) for word ngrams', required=True)
argparser.add_argument('-t', '--tag', help='Range (l,u) for spacy tag ngrams', required=True)
argparser.add_argument('-d', '--dep', help='Range (l,u) for spacy dep ngrams', required=True)
argparser.add_argument('-o', '--pos', help='Range (l,u) for spacy pos ngrams', required=True)
argparser.add_argument_group('optional arguments')
argparser.add_argument('--workset', help='Sub-Directory of parent input-directory (if it exists). Helpful if script is executed in loop on many worksets')
argparser.add_argument('--part', help='Sub-Sub-Directory of parent input-directory (if it exists). Helpful if script is executed in loop on many types')
argparser.add_argument('--test', help='Set this if it is a testrun only', action='store_true')
argparser.add_argument('--rerun', help='Set this if it is you want to rerun and ignore old files', action='store_true')
argparser.add_argument('--spacy', help='Set this if you want to run spacy', action='store_true')
argparser.add_argument('--both', help='Set this if spacy is set and you want to run the normal ngrams as well', action='store_true')
argparser.add_argument('--asis', help='Set this if you want to make as-is CHAR ngrams', action='store_true')
argparser.add_argument('--encase_list',
nargs='*',
type=str,
help='''Set this if you want to encase tags ("url", "hashtag", "mention",
"emoji", "emoticon", "time", "number", "date")''')
#parse arguements
args = vars(argparser.parse_args())
command = True
if not command:
args = {}
args['datapath'] = "../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31"
args['save'] = "../../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/preprocessed"
args['file'] ='workset_manager.ndjson'
args['char'] = (2,5)
args['word'] =(1,2)
args['tag'] =(1,3)
args['dep'] =(1,3)
args['pos'] =(1,3)
args ['workset'] ='workset'
args['part'] ='manager'
args['rerun'] =True
args['spacy'] =True
args['both'] =True
args['asis'] =True
args['test'] =True
args['encase_list'] = ["emoji","emoticon"]
else:
#parse arguements
argparser = argparse.ArgumentParser(description='Arguements for preprocessing and making the ngrams')
argparser.add_argument_group('required arguments')
argparser.add_argument('-p', '--datapath', help='Path to parent input directory (relative or absolute)', required=True)
argparser.add_argument('-f', '--file', help='Name of input file', required=True)
argparser.add_argument('-s', '--save', help='Path to output directory (relative or absolute)', required=True)
argparser.add_argument('-c', '--char', help='Range (l,u) for char ngrams', required=True)
argparser.add_argument('-w', '--word', help='Range (l,u) for word ngrams', required=True)
argparser.add_argument('-t', '--tag', help='Range (l,u) for spacy tag ngrams', required=True)
argparser.add_argument('-d', '--dep', help='Range (l,u) for spacy dep ngrams', required=True)
argparser.add_argument('-o', '--pos', help='Range (l,u) for spacy pos ngrams', required=True)
argparser.add_argument_group('optional arguments')
argparser.add_argument('--workset', help='Sub-Directory of parent input-directory (if it exists). Helpful if script is executed in loop on many worksets')
argparser.add_argument('--part', help='Sub-Sub-Directory of parent input-directory (if it exists). Helpful if script is executed in loop on many types')
argparser.add_argument('--test', help='Set this if it is a testrun only', action='store_true')
argparser.add_argument('--rerun', help='Set this if it is you want to rerun and ignore old files', action='store_true')
argparser.add_argument('--spacy', help='Set this if you want to run spacy', action='store_true')
argparser.add_argument('--both', help='Set this if spacy is set and you want to run the normal ngrams as well', action='store_true')
argparser.add_argument('--asis', help='Set this if you want to make as-is CHAR ngrams', action='store_true')
argparser.add_argument('--encase_list',
nargs='*',
type=str,
help='''Set this if you want to encase tags ("url", "hashtag", "mention",
"emoji", "emoticon", "time", "number", "date")''')
#parse arguements
args = vars(argparser.parse_args())
# convert to tuples
for key in ['char', 'word', 'dep', 'tag', 'pos']:
args[key] = ast.literal_eval(args[key])
#make windows-unix problem go away
if '\\' in args['save']:
......@@ -977,9 +954,6 @@ if __name__ == "__main__":
elif '/' in args['datapath']:
args['datapath'] = os.path.join(*args['datapath'].split('/'))
#convert to tuples
for key in ['char', 'word', 'dep', 'tag', 'pos']:
args[key] = ast.literal_eval(args[key])
args['workset'] = args.get('workset', '')
......@@ -989,6 +963,7 @@ if __name__ == "__main__":
args['spacy'] = args.get('spacy', False)
args['both'] = args.get('both', False)
args['asis'] = args.get('asis', False)
print(args['rerun'])
tag_dic = {'url': "url",
'hashtag': "tag",
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment