#!/usr/bin/env python
# coding: utf-8
"""Score the stored SVM predictions of the PAN19 celebrity sub-samples and plot them.

For every (subset size, sub-analysis) pair the script reads the train/test
label files, the fitted label encoders, the linear classifiers and their
stored predictions, and writes three kinds of figures below ``savedir``:

* ``heatmaps/``     - confusion matrices per life phase and per gender,
* ``barplots/``     - accuracy / precision / recall / F1 per label,
* ``featureplots/`` - heat maps of the features whose weight has the same
  sign for at least ``MIN_SHARE`` of the classes in a group.
"""

import collections
import itertools
import json
import math
import multiprocessing as mp
import os
import pickle
import random as rd
import re
import statistics
import sys

import jsonlines
import matplotlib
matplotlib.use('agg')  # non-interactive backend; has to be chosen before pyplot is imported
import matplotlib.colors as cl
import matplotlib.pyplot as plt
import ndjson
import numpy as np
import pandas as pd
import regex
import seaborn as sns  # data visualization library
from joblib import dump, load
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

path = '/draco/ptmp/mschuber/PAN/Data/pan19-celebrity-profiling-training-dataset-2019-01-31/stratified_subsample/'
savedir = '/draco/ptmp/mschuber/PAN/Data/results/'
#path = '../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/stratified_subsample/'

# Full configuration would be:
#   subana_l = ['org/', 'min_tweets_1000/', 'complete_balance/']
#   subsets_l = [200, 500, 1000, 2000]
subana_l = ['org/']
subsets_l = [200]
classifiers = ['SVM']
datafolder = 'split_data/'
ml_results = 'ml/'
filebeg = 'stratified_subsample_'
labels_l = ['age', 'gender', 'author']
phases_l = ['child_21', 'young_adult_35', 'adult_50', 'old_adult_65', 'retiree']

# colours of the overall score bar plot
cmap = cl.LinearSegmentedColormap.from_list("", ["skyblue", "cadetblue", "darkblue", "steelblue"])

REFERENCE_YEAR = 2019      # ages are computed as REFERENCE_YEAR - birth year
GENDERS = ['female', 'male']
MIN_SHARE = 0.95           # share of a group's classes that must agree on a weight's sign


def identity_tokenizer(text):
    """Return *text* unchanged.

    Not called anywhere in this file.
    NOTE(review): presumably kept so joblib can resolve pickled objects that
    reference it -- confirm before removing.
    """
    return text


def plot_confusion_matrix(cm, classes, normalize=True, title='Confusion matrix',
                          cmap=plt.cm.Blues, ax=None):
    """Draw a confusion matrix as a heat map annotated with the raw counts.

    Parameters
    ----------
    cm : array of counts, rows = true labels, columns = predicted labels.
    classes : tick labels; must have one entry per row/column of *cm*.
    normalize : if true, the cell colour is the count divided by its row sum.
        The number written into a cell is the raw count in both cases.
    title : accepted for backward compatibility; no title is drawn.
    cmap : matplotlib colour map used for the cells.
    ax : axes to draw on; the current axes are used when omitted.
    """
    if ax is None:
        ax = plt.gca()

    counts = np.asarray(cm)
    if normalize:
        row_totals = counts.sum(axis=1, keepdims=True).astype('float')
        # A label that never occurs has an all-zero row; dividing by 1 keeps
        # that row at zero instead of turning it (and cm.max()) into NaN.
        row_totals[row_totals == 0] = 1.0
        shades = counts / row_totals
        print("Normalized confusion matrix")
    else:
        shades = counts
        print('Confusion matrix, without normalization')

    image = ax.imshow(shades, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(image, ax=ax, ticks=[], label="Heat per Row (Normalized from 0 to 1)")

    n_rows, n_cols = shades.shape
    ax.set_xticks(np.arange(n_cols))
    ax.set_yticks(np.arange(n_rows))
    ax.set_xticklabels(classes)
    ax.set_yticklabels(classes)
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
    # Minor ticks on the cell borders, so the cell text sits inside the cells
    # instead of being cut off at the top and bottom edge.
    ax.set_xticks(np.arange(n_cols + 1) - .5, minor=True)
    ax.set_yticks(np.arange(n_rows + 1) - .5, minor=True)

    threshold = shades.max() / 1.5
    for i, j in itertools.product(range(n_rows), range(n_cols)):
        ax.text(j, i, format(counts[i, j], 'd'), ha="center", va="center",
                color="white" if shades[i, j] > threshold else "black")


def _life_phase(birth_year):
    """Map a birth year to the name of the life phase reached in REFERENCE_YEAR."""
    age = REFERENCE_YEAR - birth_year
    if age < 22:
        return 'child_21'
    if age < 36:
        return 'young_adult_35'
    if age < 51:
        return 'adult_50'
    if age < 66:
        return 'old_adult_65'
    return 'retiree'


def _data_file(ana, st, suffix):
    """Build the name of a file in the split-data folder of one sub-sample."""
    return path + ana + str(st) + '/' + datafolder + filebeg + str(st) + suffix


def _ml_file(ana, st, stem, suffix):
    """Build the name of a file in the machine-learning result folder of one sub-sample."""
    return path + ana + str(st) + '/' + ml_results + filebeg + stem + '_' + str(st) + suffix


def _load_json(filename):
    """Read one UTF-8 encoded JSON file."""
    with open(filename, 'r', encoding='utf-8') as handle:
        return json.load(handle)


def _save_and_close(fig, filename):
    """Write *fig* to *filename* (creating the folder if needed) and release the figure."""
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    fig.savefig(filename)
    # pyplot keeps every figure alive until it is closed explicitly
    plt.close(fig)


def _score_table(y_true, y_pred):
    """Return accuracy and weighted precision / recall / F1, rounded to three digits."""
    # round(float(...)) works whether the metric comes back as a Python float
    # or as a numpy scalar.
    return {
        'accuracy': round(float(accuracy_score(y_true, y_pred)), 3),
        'precision': round(float(precision_score(y_true, y_pred, average='weighted')), 3),
        'recall': round(float(recall_score(y_true, y_pred, average='weighted')), 3),
        'f1-score': round(float(f1_score(y_true, y_pred, average='weighted')), 3),
    }


def _save_confusion_matrix(y_true, y_pred, classes, filename):
    """Compute the confusion matrix over *classes*, plot it and save it as *filename*."""
    matrix = confusion_matrix(y_true, y_pred, labels=classes)
    fig = plt.figure()
    ax = fig.subplots()
    plot_confusion_matrix(matrix, classes=classes, title=None, ax=ax)
    fig.tight_layout()
    _save_and_close(fig, filename)


def _load_subset(st, ana, labels):
    """Load everything stored for one (subset size, sub-analysis) pair.

    Returns ``(frame, author_info, vocab_inverse, per_label)``:

    * ``frame`` - one row per test item with the true labels, the encoded and
      decoded predictions and, for ``'age'``, the true and predicted life phase,
    * ``author_info`` - training author -> ``{'life_phase', 'age', 'gender'}``,
    * ``vocab_inverse`` - feature index -> token, extended by ``'§LENGTH§'``,
    * ``per_label`` - label -> ``{'label_encoder', <clf>: coef_, 'labels'}``.

    Raises ``ValueError`` if the index reserved for the length feature is
    already used by the vocabulary.
    """
    prefix = ana.split('/')[0] + '_' + str(st)

    authors = _load_json(_data_file(ana, st, '_author_train.json'))
    genders = _load_json(_data_file(ana, st, '_gender_train.json'))
    years = _load_json(_data_file(ana, st, '_age_train.json'))
    author_info = {
        author: {'life_phase': _life_phase(year), 'age': year, 'gender': gender}
        for author, gender, year in zip(authors, genders, years)
    }

    vocab = _load_json(_data_file(ana, st, '_bigram_vocab.json'))
    vocab_inverse = {index: token for token, index in vocab.items()}
    # The tweet length is displayed as one extra feature after the vocabulary.
    length_index = len(vocab)
    if length_index in vocab_inverse:
        # Raising (instead of sys.exit) lets the error reach the parent
        # process when this runs inside a multiprocessing pool.
        raise ValueError('error; key already exists: {!r}'.format(vocab_inverse[length_index]))
    vocab_inverse[length_index] = '§LENGTH§'

    frame = pd.DataFrame()
    per_label = {}
    for label in labels:
        enc = load(_ml_file(ana, st, label, '_encoder.jlib'))
        info = {'label_encoder': enc}
        for clf in classifiers:
            model = load(_ml_file(ana, st, clf + '_' + label, '_svm_out_count.jlib'))
            truth = _load_json(_data_file(ana, st, '_' + label + '_test.json'))
            column = prefix + '_' + clf + '_' + label
            frame[column] = truth
            print(str(st) + '_' + ana + '_' + label + '_' + clf)
            if label == 'age':
                frame[prefix + '_life_phase'] = [_life_phase(year) for year in truth]
            info[clf] = model.coef_
            encoded = load(_ml_file(ana, st, clf + '_' + label, '_predictions_count.jlib'))
            decoded = list(enc.inverse_transform(encoded))
            frame[column + '_pred_enc'] = list(encoded)
            frame[column + '_pred'] = decoded
            if label == 'age':
                frame[prefix + '_life_phase_pred'] = [_life_phase(year) for year in decoded]
            # distinct test labels, in order of first appearance
            info['labels'] = list(dict.fromkeys(truth))
        per_label[label] = info
    return frame, author_info, vocab_inverse, per_label


def _plot_scores(frame, author_info, st, an, phases, labels):
    """Write the confusion matrices and the overall score bar plot of one sub-sample."""
    prefix = an + '_' + str(st)
    phase_col = prefix + '_life_phase'
    cm_name = savedir + 'heatmaps/cm_{st}_{an}_{label}_{group}.png'
    cm_false_name = savedir + 'heatmaps/cm_{st}_{an}_{label}_{group}_false.png'

    scores = []
    index = []
    for label in labels:
        truth_col = prefix + '_SVM_' + label
        pred_col = truth_col + '_pred'
        scores.append(_score_table(frame[truth_col], frame[pred_col]))
        index.append(label)

        if label == 'age':
            scores.append(_score_table(frame[phase_col], frame[phase_col + '_pred']))
            index.append('age by life_phase')
            np.set_printoptions(precision=2)
            _save_confusion_matrix(
                frame[phase_col], frame[phase_col + '_pred'], phases,
                cm_name.format(st=st, an=an, label=label, group='life_phase'))

        if label == 'author':
            gender_col = prefix + '_SVM_gender'
            # Translate every predicted author into that author's gender and
            # life phase, for all rows and for the misclassified rows only.
            wrong = frame.loc[frame[pred_col] != frame[truth_col]]
            pred_gender = [author_info[au]['gender'] for au in frame[pred_col]]
            pred_phase = [author_info[au]['life_phase'] for au in frame[pred_col]]
            wrong_gender = [author_info[au]['gender'] for au in wrong[pred_col]]
            wrong_phase = [author_info[au]['life_phase'] for au in wrong[pred_col]]

            scores.append(_score_table(frame[phase_col], pred_phase))
            index.append('author by life_phase')
            _save_confusion_matrix(
                frame[phase_col], pred_phase, phases,
                cm_name.format(st=st, an=an, label=label, group='life_phase'))

            scores.append(_score_table(frame[gender_col], pred_gender))
            index.append('author by gender')
            # The gender matrices are 2x2, so they are labelled with GENDERS.
            _save_confusion_matrix(
                frame[gender_col], pred_gender, GENDERS,
                cm_name.format(st=st, an=an, label=label, group='gender'))

            # Were misclassified authors confused with authors of the same
            # life phase / gender?
            _save_confusion_matrix(
                wrong[phase_col], wrong_phase, phases,
                cm_false_name.format(st=st, an=an, label=label, group='life_phase'))
            _save_confusion_matrix(
                wrong[gender_col], wrong_gender, GENDERS,
                cm_false_name.format(st=st, an=an, label=label, group='gender'))

    fig = plt.figure(figsize=(10, 5))
    ax = fig.gca()
    pd.DataFrame(scores, index=index).plot(kind='barh', colormap=cmap, ax=ax)
    ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))  # legend outside box
    ax.set_ylabel('Evaluation Measures for Different Subsets', fontsize='large', fontweight='roman')
    fig.tight_layout()
    _save_and_close(fig, savedir + 'barplots/overall_scores_{}_{}.png'.format(st, an))


def _class_weights(coef, class_index):
    """Return the weight vector of one class from a linear model's ``coef_``.

    With a single row (binary model) the row points towards class 1, so
    class 0 gets the negated row.
    """
    if coef.shape[0] > 1:
        return coef[class_index]
    return coef[0] if class_index > 0 else -coef[0]


def _label_entries(frame, prefix, label, info):
    """Describe every distinct test label of 'age' or 'author'.

    Returns one dict per label with its ``'feature_vec'`` and ``'life_phase'``
    and, for authors, the ``'gender'`` taken from the author's first test row.
    """
    keys = info['labels']
    coef = info['SVM']
    class_indices = info['label_encoder'].transform(keys)
    entries = []
    for key, class_index in zip(keys, class_indices):
        entry = {'feature_vec': _class_weights(coef, class_index)}
        if label == 'author':
            row = frame.loc[frame[prefix + '_SVM_author'] == key].iloc[0]
            entry['gender'] = row[prefix + '_SVM_gender']
            entry['life_phase'] = row[prefix + '_life_phase']
        else:
            entry['life_phase'] = _life_phase(key)
        entries.append(entry)
    return entries


def _summarise_group(rows):
    """Summarise the weight vectors of one group.

    Returns ``(n, heat, pos, neg)``: the number of vectors, a mapping
    feature index -> heat value, and the indices whose weight is positive
    (negative) in at least ``MIN_SHARE`` of the vectors.  Heat is the mean
    weight relative to the group's maximum (positive features) or minimum
    (negative features, reported with a minus sign).
    """
    if not rows:
        return 0, {}, [], []
    # Stack once so that a group with a single member is still 2-D.
    stacked = np.vstack(rows)
    n = stacked.shape[0]
    mean = stacked.mean(axis=0)
    pos = np.flatnonzero((stacked > 0).sum(axis=0) / n >= MIN_SHARE)
    neg = np.flatnonzero((stacked < 0).sum(axis=0) / n >= MIN_SHARE)
    heat = dict(zip(pos.tolist(), np.round(mean[pos] / stacked.max(axis=0)[pos], 3)))
    heat.update(zip(
        neg.tolist(),
        -np.round(np.abs(mean[neg]) / np.abs(stacked.min(axis=0)[neg]), 3)))
    return n, heat, pos.tolist(), neg.tolist()


def _display_token(token):
    """Turn a vocabulary token into a plain-ASCII axis label."""
    text = token.replace('§', '')
    # Line breaks first: r'\s' below would otherwise swallow them as BLANK.
    text = text.replace('\n', 'BREAK')
    text = re.sub(r'\s', 'BLANK', text)
    try:
        text.encode('ascii')
    except UnicodeEncodeError:
        # decode again so the label is a str, not a bytes object
        text = text.encode('unicode-escape').decode('ascii')
    # keep matplotlib from reading '$' as the start of math text
    return text.replace('$', r'\$')


def _save_feature_heatmap(groups, vocab_inverse, filename):
    """Plot the consistently signed features of several groups as one heat map.

    *groups* is a list of ``(name, weight_vectors)`` pairs; each pair becomes
    one column titled ``name`` plus the group size.  Rows are the features
    selected by any group (positive ones first); a cell is 0 where the
    feature was not selected for that group.
    """
    columns = {}
    positive = set()
    negative = set()
    for name, rows in groups:
        n, heat, pos, neg = _summarise_group(rows)
        columns[name + '\n({})'.format(n)] = heat
        positive.update(pos)
        negative.update(neg)
    # A feature that is positive for one group and negative for another is
    # listed once; its row carries both values either way.
    features = sorted(positive) + sorted(negative - positive)
    if not features:
        print('no consistently signed features; skipping ' + filename)
        return

    table = pd.DataFrame(
        {title: [heat.get(i, 0) for i in features] for title, heat in columns.items()},
        index=[_display_token(vocab_inverse[i]) for i in features])
    fig, ax = plt.subplots(figsize=(18, max(len(features) / 6, 1.0)))
    sns.heatmap(table, fmt='.1f', ax=ax, center=0, yticklabels=True)
    _save_and_close(fig, filename)


def _plot_predictive_features(frame, st, an, label, info, vocab_inverse, phases):
    """Write the feature heat maps of label 'age' or 'author' for one sub-sample."""
    prefix = an + '_' + str(st)
    by_phase = collections.defaultdict(list)
    by_sex = collections.defaultdict(list)
    by_phase_sex = collections.defaultdict(list)
    for entry in _label_entries(frame, prefix, label, info):
        by_phase[entry['life_phase']].append(entry['feature_vec'])
        if label == 'author':
            by_sex[entry['gender']].append(entry['feature_vec'])
            by_phase_sex[(entry['life_phase'], entry['gender'])].append(entry['feature_vec'])

    groups = [(ph, by_phase.get(ph, [])) for ph in phases]
    if label == 'author':
        groups += [(sex, by_sex.get(sex, [])) for sex in ('male', 'female')]
    _save_feature_heatmap(
        groups, vocab_inverse,
        savedir + 'featureplots/features_heat_{}_{}_{}_phases.png'.format(st, an, label))

    if label == 'author':
        groups = [(sex + '_' + ph, by_phase_sex.get((ph, sex), []))
                  for ph in phases for sex in GENDERS]
        _save_feature_heatmap(
            groups, vocab_inverse,
            savedir + 'featureplots/features_heat_{}_{}_{}_gender_phases.png'.format(st, an, label))


def plotter(subsets, subana, phases, labels):
    """Create all evaluation figures for the given sub-samples.

    Parameters
    ----------
    subsets : iterable of subset sizes (used as folder / file name parts).
    subana : iterable of sub-analysis folder names ending in ``'/'``.
    phases : life-phase names, in the order used for axes and columns.
    labels : labels to evaluate.  The ``'author'`` evaluation reads the
        ``'age'`` and ``'gender'`` columns, so those labels must be included
        whenever ``'author'`` is.

    Returns 1 when every figure has been written.
    """
    for st in subsets:
        for ana in subana:
            an = ana.split('/')[0]
            frame, author_info, vocab_inverse, per_label = _load_subset(st, ana, labels)
            _plot_scores(frame, author_info, st, an, phases, labels)
            # Plots with the most predictive features
            for label in labels:
                if label in ('age', 'author'):
                    _plot_predictive_features(
                        frame, st, an, label, per_label[label], vocab_inverse, phases)
    return 1


def main():
    """Run ``plotter`` once per (subset size, sub-analysis) pair in a process pool."""
    tasks = [([subset], [subana], phases_l, labels_l)
             for subset in subsets_l for subana in subana_l]
    # never start more workers than there are jobs
    pool = mp.Pool(max(1, min(mp.cpu_count(), len(tasks))))
    try:
        print('make job queue...')
        sys.stdout.flush()
        print('enter cycle...')
        jobs = [pool.apply_async(plotter, task) for task in tasks]
        print('collect results from jobs...')
        sys.stdout.flush()
        # get() re-raises any exception a worker ran into
        for job in jobs:
            job.get()
    finally:
        print('kill all remaining workers...')
        sys.stdout.flush()
        print('closing down the pool')
        sys.stdout.flush()
        pool.close()
        pool.join()
    print('done and exit :)')
    sys.stdout.flush()


if __name__ == "__main__":
    main()