Commit 373c275d authored by Marcel Henrik Schubert's avatar Marcel Henrik Schubert
Browse files

fixed errors due to some unpredictable pattern

parent 3ff23fad
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
import ndjson
import jsonlines
import json
import pickle
import os
import sys
import random as rd
import json
import re, regex
from joblib import dump, load
import collections
import math
import statistics
import itertools
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
```
%% Cell type:code id: tags:
``` python
# Root folder of the stratified subsamples; all file paths below are built
# as path + <sub-analysis> + <subsample size> + '/' + ...
path = '../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/stratified_subsample/'
# Sub-analysis folders to process. The trailing slash is required because the
# entries are concatenated directly into paths and split on '/' for dict keys.
subana = ['complete_balance/']
# Subsample sizes to process (used both as folder names and in file names).
# Add 2000 here to include the largest subsample as well.
subsets = [200, 500, 1000]
# Classifier tags used in the names of the stored model/prediction files.
classifiers = ['SVM']
datafolder = 'split_data/'
ml_results = 'ml/'
filebeg = 'stratified_subsample_'
# Prediction targets; one encoder/model/prediction file set exists per label.
labels = ['age', 'gender', 'author']
# Life-phase buckets, ordered from youngest to oldest.
phases = ['child_21', 'young_adult_35', 'adult_50', 'old_adult_65', 'retiree']
```
%% Cell type:code id: tags:
``` python
def identity_tokenizer(text):
    """Return ``text`` unchanged.

    Pass-through callable. NOTE(review): presumably meant as the tokenizer of
    a vectorizer that receives already-tokenized input; no call site is
    visible in this part of the notebook -- confirm before removing.
    """
    return text
def _life_phase(birth_year):
    """Return the life-phase bucket for ``birth_year``, taking age as of 2019.

    The bucket names match the entries of ``phases``; the upper age bound of
    each bucket is the number in its name (e.g. 'child_21' covers ages < 22).
    """
    age = 2019 - birth_year
    if age < 22:
        return 'child_21'
    if age < 36:
        return 'young_adult_35'
    if age < 51:
        return 'adult_50'
    if age < 66:
        return 'old_adult_65'
    return 'retiree'


# Result containers, all keyed as [subsample size][sub-analysis name]:
#   df_dic     -> {'df': DataFrame with true labels and predictions}
#   res_dic    -> vocab, inverse vocab and per-label encoder/coefficients
#   author_dic -> per-author metadata of the training split
df_dic = {}
res_dic = {}
author_dic = {}
for st in subsets:
    res_dic[st] = {}
    df_dic[st] = {}
    author_dic[st] = {}
    for ana in subana:
        an = ana.split('/')[0]
        base = path + ana + str(st) + '/'
        data_prefix = base + datafolder + filebeg + str(st)
        res_dic[st][an] = {}
        df_dic[st][an] = {}
        author_dic[st][an] = {}

        ###make dic with all authors
        with open(data_prefix + '_author_train.json', 'r', encoding='utf-8') as f:
            authors = json.load(f)
        with open(data_prefix + '_gender_train.json', 'r', encoding='utf-8') as f:
            gender = json.load(f)
        # The "age" files are treated as birth years (age is 2019 - value).
        with open(data_prefix + '_age_train.json', 'r', encoding='utf-8') as f:
            year = json.load(f)
        # Index explicitly instead of zip() so that lists of unequal length
        # still raise instead of being silently truncated.
        for i, birth_year in enumerate(year):
            author_dic[st][an][authors[i]] = {
                'life_phase': _life_phase(birth_year),
                'age': birth_year,
                'gender': gender[i],
            }

        df = pd.DataFrame()
        with open(data_prefix + '_bigram_vocab.json', 'r', encoding='utf-8') as f:
            vocab = json.load(f)
        vocab_inverse = {v: k for k, v in vocab.items()}
        res_dic[st][an]['vocab'] = vocab
        res_dic[st][an]['vocab_inverse'] = vocab_inverse
        ##update "vocab" to include the tweet length as feature to display
        # The extra feature takes the next free index, len(vocab); abort if
        # that index is already used by a real vocabulary entry.
        leng = len(vocab)
        if leng in vocab_inverse:
            print('error; key already exists')
            print(vocab_inverse[leng])
            sys.exit(1)
        vocab['§LENGTH§'] = leng
        vocab_inverse[leng] = '§LENGTH§'

        for label in labels:
            res_dic[st][an][label] = {}
            # NOTE: joblib.load unpickles; only point this at trusted files.
            enc = load(base + ml_results + filebeg + label + '_' + str(st) + '_encoder.jlib')
            res_dic[st][an][label]['label_encoder'] = enc
            for clf in classifiers:
                model_prefix = base + ml_results + filebeg + clf + '_' + label + '_' + str(st)
                col = an + '_' + str(st) + '_' + clf + '_' + label
                clf_l = load(model_prefix + '_svm_out_count.jlib')
                # Read as UTF-8 like every other JSON file of this data set.
                with open(data_prefix + '_' + label + '_test.json', 'r', encoding='utf-8') as f:
                    lab = json.load(f)
                df[col] = lab
                print(str(st)+'_'+ana+'_'+label+'_'+clf)
                if label == 'age':
                    phase = [_life_phase(el) for el in lab]
                    df[an + '_' + str(st) + '_life_phase'] = phase
                res_dic[st][an][label][clf] = clf_l.coef_
                # Load the encoded predictions once and reuse them for both
                # the raw and the decoded column.
                predictions = load(model_prefix + '_predictions_count.jlib')
                df[col + '_pred_enc'] = list(predictions)
                rev_enc = list(enc.inverse_transform(predictions))
                df[col + '_pred'] = rev_enc
                if label == 'age':
                    phase = [_life_phase(el) for el in rev_enc]
                    df[an + '_' + str(st) + '_life_phase_pred'] = phase
                # One (initially empty) result dict per distinct test label.
                res_dic[st][an][label]['labels'] = {true_label: {} for true_label in lab}
        df_dic[st][an]['df'] = df
```
%%%% Output: stream
C:\Users\schubert\AppData\Roaming\Python\Python36\site-packages\sklearn\base.py:251: UserWarning: Trying to unpickle estimator LabelEncoder from version 0.19.1 when using version 0.20.2. This might lead to breaking code or invalid results. Use at your own risk.
UserWarning)
C:\Users\schubert\AppData\Roaming\Python\Python36\site-packages\sklearn\base.py:251: UserWarning: Trying to unpickle estimator SGDClassifier from version 0.19.1 when using version 0.20.2. This might lead to breaking code or invalid results. Use at your own risk.
UserWarning)
%%%% Output: stream
200_org/_age_SVM
200_org/_gender_SVM
200_org/_author_SVM
200_min_tweets_1000/_age_SVM
200_min_tweets_1000/_gender_SVM
200_min_tweets_1000/_author_SVM
200_complete_balance/_age_SVM
200_complete_balance/_gender_SVM
200_complete_balance/_author_SVM
500_org/_age_SVM
500_org/_gender_SVM
500_org/_author_SVM
500_min_tweets_1000/_age_SVM
500_min_tweets_1000/_gender_SVM
500_min_tweets_1000/_author_SVM
500_complete_balance/_age_SVM
500_complete_balance/_gender_SVM
500_complete_balance/_author_SVM
1000_org/_age_SVM
1000_org/_gender_SVM
1000_org/_author_SVM
1000_min_tweets_1000/_age_SVM
1000_min_tweets_1000/_gender_SVM
1000_min_tweets_1000/_author_SVM
1000_complete_balance/_age_SVM
1000_complete_balance/_gender_SVM
1000_complete_balance/_author_SVM
2000_org/_age_SVM
2000_org/_gender_SVM
2000_org/_author_SVM
%%%% Output: error
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
<ipython-input-4-7f492b02077b> in <module>
46 author_dic[st][ana.split('/')[0]][authors[i]]['life_phase'] = lifePhase
47 author_dic[st][ana.split('/')[0]][authors[i]]['age'] = year[i]
---> 48 author_dic[st][ana.split('/')[0]][authors[i]]['gender'] = gender[i]
49
50
KeyboardInterrupt:
%% Cell type:code id: tags:
``` python
#df_dic[200]['org']['df']
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
# Compute overall metrics per label and, for every individual label value,
# its accuracy, its metadata and the classifier weights belonging to it.
for st in subsets:
    for ana in subana:
        an = ana.split('/')[0]
        df = df_dic[st][an]['df']
        for label in labels:
            info = res_dic[st][an][label]
            enc = info['label_encoder']
            coef = info['SVM']
            key_len = len(info['labels'])
            true_col = an + '_' + str(st) + '_SVM_' + label
            pred_col = true_col + '_pred'
            y_true = df[true_col]
            y_pred = df[pred_col]
            # Built-in round() works whether the metric comes back as a numpy
            # scalar or as a plain Python float.
            info['acc'] = round(accuracy_score(y_true, y_pred), 3)
            info['prec'] = round(precision_score(y_true, y_pred, average='weighted'), 3)
            info['rec'] = round(recall_score(y_true, y_pred, average='weighted'), 3)
            info['f1'] = round(f1_score(y_true, y_pred, average='weighted'), 3)
            for key, entry in info['labels'].items():
                key_enc = enc.transform([key])[0]
                # All test rows whose true label equals this key.
                subDf = df.loc[y_true == key]
                if label == 'author':
                    # Take gender/age/life phase from the author's first row.
                    # NOTE(review): assumes the per-label test files are
                    # row-aligned so one row describes one sample -- confirm.
                    row = subDf.iloc[0]
                    entry['gender'] = row[an + '_' + str(st) + '_SVM_gender']
                    entry['age'] = row[an + '_' + str(st) + '_SVM_age']
                    entry['life_phase'] = row[an + '_' + str(st) + '_life_phase']
                elif label == 'age':
                    # The key is treated as a birth year; age is as of 2019.
                    age = 2019 - key
                    if age < 22:
                        lifePhase = 'child_21'
                    elif age < 36:
                        lifePhase = 'young_adult_35'
                    elif age < 51:
                        lifePhase = 'adult_50'
                    elif age < 66:
                        lifePhase = 'old_adult_65'
                    else:
                        lifePhase = 'retiree'
                    entry['life_phase'] = lifePhase
                # Only accuracy is stored per label value. Per the original
                # notes, precision/recall/F1 are not meaningful on a subset
                # that contains a single true class.
                entry['acc'] = round(accuracy_score(subDf[true_col], subDf[pred_col]), 3)
                if key_len > 2:
                    # More than two label values: one coefficient row per value.
                    entry['feature_vec'] = coef[key_enc]
                elif key_enc > 0:
                    # Two (or fewer) label values: only row 0 is used, and it is
                    # attached to the value encoded as 1; the value encoded as
                    # 0 gets no feature vector.
                    entry['feature_vec'] = coef[0]
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
###save top 25 most predictive labels for each set and each subset in groups
def _new_group():
    """Return an empty accumulator for one group of feature vectors."""
    return {'feature_vecs_pos': [], 'feature_vecs_neg': [], 'val_array': []}


def _split_by_sign(vec):
    """Return feature indices with positive and with negative weight.

    Both lists are ordered by ascending weight: the strongest positive
    features are at the END of the first list, the strongest negative
    features at the START of the second list.
    """
    order = list(np.argsort(vec))
    positive = [i for i in order if vec[i] > 0]
    negative = [i for i in order if vec[i] < 0]
    return positive, negative


def _add_to_group(group, positive, negative, vec):
    """Record one feature vector and its signed index lists in ``group``."""
    group['feature_vecs_pos'].append(positive)
    group['feature_vecs_neg'].append(negative)
    group['val_array'].append(vec)


def _summarise_group(group):
    """Replace the collected vectors of ``group`` by per-feature statistics.

    Sets 'min_array', 'max_array', 'number' (count of collected vectors) and
    overwrites 'val_array' with the per-feature mean. The vectors are stacked
    into a 2-D array first, so a group holding a single vector still yields
    per-feature arrays and number == 1. An empty group gets number == 0 and
    empty arrays instead of raising.
    """
    rows = group['val_array']
    if not rows:
        group['min_array'] = np.array([])
        group['max_array'] = np.array([])
        group['number'] = 0
        group['val_array'] = np.array([])
        return
    stacked = np.vstack(rows)
    group['min_array'] = np.amin(stacked, axis=0)
    group['max_array'] = np.amax(stacked, axis=0)
    group['number'] = stacked.shape[0]
    group['val_array'] = np.mean(stacked, axis=0)


most_pred = {}
for st in subsets:
    most_pred[st] = {}
    for ana in subana:
        an = ana.split('/')[0]
        most_pred[st][an] = {}
        # Fresh copy of the vocabulary, read once per subsample/sub-analysis.
        with open(path+ana+str(st)+'/'+datafolder+filebeg+str(st)+'_bigram_vocab.json', 'r', encoding='utf-8') as f:
            most_pred[st][an]['vocab'] = json.load(f)
        for label in labels:
            groups = {'feature_vecs': []}
            most_pred[st][an][label] = groups
            for ph in phases:
                groups[ph] = _new_group()
                if label == 'author':
                    for sex in ['male', 'female']:
                        groups[ph][sex] = _new_group()
            if label == 'author':
                for sex in ['male', 'female']:
                    groups[sex] = _new_group()

            label_info = res_dic[st][an][label]['labels']
            ##author and age to have different depth than gender and can be used for life phases
            if label == 'age':
                # Group the weight vector of every age value by life phase.
                for key in label_info:
                    vec = label_info[key]['feature_vec']
                    positive, negative = _split_by_sign(vec)
                    _add_to_group(groups[label_info[key]['life_phase']], positive, negative, vec)
                for ph in phases:
                    _summarise_group(groups[ph])
            elif label == 'author':
                # Group every author's weight vector by life phase, by gender
                # and by the combination of both.
                # NOTE(review): assumes gender values are exactly 'male' or
                # 'female'; any other value raises KeyError here.
                for key in label_info:
                    ph = label_info[key]['life_phase']
                    sex = label_info[key]['gender']
                    vec = label_info[key]['feature_vec']
                    positive, negative = _split_by_sign(vec)
                    for group in (groups[ph], groups[sex], groups[ph][sex]):
                        _add_to_group(group, positive, negative, vec)
                for ph in phases:
                    _summarise_group(groups[ph])
                    for sex in ['female', 'male']:
                        _summarise_group(groups[ph][sex])
                for sex in ['female', 'male']:
                    _summarise_group(groups[sex])
            else:
                # Remaining label (gender): a single weight vector, stored
                # under the 'male' label value; keep its ascending ranking.
                groups['feature_vecs'].append(list(np.argsort(label_info['male']['feature_vec'])))
def _tally(index_lists):
    """Count how often each feature index occurs across ``index_lists``.

    Returns ``Counter.most_common()``: (index, count) pairs, most frequent
    first.
    """
    counter = collections.Counter()
    for indices in index_lists:
        counter.update(indices)
    return counter.most_common()


def _count_features(group):
    """Store feature-frequency tables for one group under 'count_*' keys.

    'count_tot_*' counts every positive/negative feature; 'count_top25_*'
    counts only the 25 strongest features of each vector. The index lists are
    sorted by ascending weight, so the strongest positive features are the
    last 25 entries and the strongest negative features the first 25.
    """
    positive = group['feature_vecs_pos']
    negative = group['feature_vecs_neg']
    group['count_tot_pos'] = _tally(positive)
    group['count_top25_pos'] = _tally(vec[-25:] for vec in positive)
    group['count_tot_neg'] = _tally(negative)
    # Most negative weights come first; taking the tail would select the
    # negative features closest to zero.
    group['count_top25_neg'] = _tally(vec[:25] for vec in negative)


for st in subsets:
    for ana in subana:
        an = ana.split('/')[0]
        for label in labels:
            groups = most_pred[st][an][label]
            if label == 'age':
                for ph in phases:
                    _count_features(groups[ph])
            if label == 'author':
                for ph in phases:
                    _count_features(groups[ph])
                    for sex in ['male', 'female']:
                        _count_features(groups[ph][sex])
                for sex in ['male', 'female']:
                    _count_features(groups[sex])
```
%% Cell type:code id: tags:
``` python
#arr = most_pred[200]['org']['author']['child_21']['min_array']
#arr
```
%% Cell type:code id: tags:
``` python
# Inspect the 'retiree' group of the author model for the first configured
# subsample size and sub-analysis (taken from the configuration so the cell
# keeps working when `subsets` / `subana` change).
retiree = most_pred[subsets[0]][subana[0].split('/')[0]]['author']['retiree']
arr = retiree['val_array']

# Most frequent positive features with their mean and maximum weight.
print(len(retiree['feature_vecs_pos']))
pos = retiree['count_tot_pos'][0:26]
maxi = retiree['max_array']
print(pos)
print([(idx, arr[idx], maxi[idx]) for idx, _count in pos])

# Most frequent negative features with their mean and minimum weight.
print(len(retiree['feature_vecs_neg']))
neg = retiree['count_tot_neg'][0:26]
mini = retiree['min_array']
print(neg)
print([(idx, arr[idx], mini[idx]) for idx, _count in neg])
arr
```
%% Cell type:code id: tags:
``` python
#df_dic[200]['org']['df'].columns
#sub = df_dic[200]['org']['df'].loc[df_dic[200]['org']['df']['org'+'_'+str(200)+'_SVM_author_pred'] != df_dic[200]['org']['df']['org'+'_'+str(200)+'_SVM_author']]
```
%% Cell type:code id: tags:
``` python
%matplotlib inline
#import matplotlib
import matplotlib.colors as cl
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
# Custom blue colormap. Note that plot_confusion_matrix() has its own `cmap`
# parameter (default plt.cm.Blues), so this one only takes effect when it is
# passed explicitly.
cmap = cl.LinearSegmentedColormap.from_list("", ["skyblue","cadetblue","darkblue", "steelblue"]) #define colors


def plot_confusion_matrix(cm, classes,
                          normalize=True,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues, ax=None):
    """Print and draw a confusion matrix as an annotated heat map.

    Parameters
    ----------
    cm : 2-D array of integer counts (rows: true label, columns: predicted).
    classes : tick labels, used for both axes.
    normalize : if True, cell colours are scaled per row (each row divided by
        its sum); the numbers written into the cells are always the raw
        counts.
    title : title of the axes.
    cmap : colormap handed to ``imshow``.
    ax : axes to draw on; the current axes are used when omitted.

    The raw matrix and a one-line status message are printed as a side
    effect. Nothing is returned.
    """
    if ax is None:
        ax = plt.gca()
    counts = np.asarray(cm)
    print(counts)
    if normalize:
        row_totals = counts.sum(axis=1, keepdims=True)
        # Rows without any sample stay at 0 instead of turning into NaN,
        # which would also make the text-colour threshold below NaN.
        shade = np.divide(counts.astype('float'), row_totals,
                          out=np.zeros(counts.shape, dtype=float),
                          where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        shade = counts
        print('Confusion matrix, without normalization')

    im = ax.imshow(shade, interpolation='nearest', cmap=cmap)
    # Title and layout are applied to the given axes/figure, not to whatever
    # pyplot currently considers active.
    ax.set_title(title)
    ax.figure.colorbar(im, ax=ax, ticks=[], label="Heat per Row (Normalized from 0 to 1)")

    n_rows, n_cols = shade.shape[0], shade.shape[1]
    ax.set(xticks=np.arange(n_cols),
           yticks=np.arange(n_rows),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           ylabel='True label',
           xlabel='Predicted label')
    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")
    # Minor ticks on the cell borders, so that the labels sit on cell centres
    # and the outer cells are not cut in half.
    ax.set_xticks(np.arange(n_cols + 1) - .5, minor=True)
    ax.set_yticks(np.arange(n_rows + 1) - .5, minor=True)

    # Dark cells get white text, light cells black text.
    thresh = shade.max() / 1.5
    for i, j in itertools.product(range(n_rows), range(n_cols)):
        ax.text(j, i, format(counts[i, j], 'd'),
                ha="center",
                va="center",
                color="white" if shade[i, j] > thresh else "black")
    ax.figure.tight_layout()
```
%% Cell type:markdown id: tags:
Start with analysis of precision and recall as well as heatmaps/confusion matrices:
%% Cell type:code id: tags:
``` python
for st in subsets:
for ana in subana:
an = ana.split('/')[0]
tmp = []
index = []