Commit 0edb30a6 authored by Marcel Henrik Schubert's avatar Marcel Henrik Schubert
Browse files

included all figures and also Brian's corrections

parent 0f328c36
This diff is collapsed.
This diff is collapsed.
import numpy as np
import scipy as sci
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import matplotlib.patches as mpatches
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
def y_coord_cal(x: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``x`` with a running row counter in column ``y``.

    The counter starts at 0 and follows the current row order, so the last
    row receives ``len(x) - 1``.  The function is used as a
    ``groupby(...).apply`` callback; pandas does not support callbacks that
    mutate the group they are handed, therefore the counter is written to a
    copy and the argument is left untouched.

    Args:
        x: Frame (typically one group) to enumerate.

    Returns:
        A new frame with the same index and columns as ``x`` plus ``y``.
    """
    out = x.copy()
    out['y'] = list(range(out.shape[0]))
    return out
def max_sorter(x: pd.DataFrame, to_plot: str) -> pd.DataFrame:
    """Renumber the ``x`` positions of one group by a per-position score.

    Every distinct ``x`` value gets one score:

    * positions that have at least one row with ``correct == False`` use
      the largest ``y`` among those rows;
    * the remaining positions use their ``correct == True`` rows: the
      smallest ``y`` when ``to_plot == 'True_False'``, otherwise
      ``1 - max(y)``.

    Positions are ordered by descending score and relabelled with
    consecutive integers starting at the smallest ``x`` of the group.

    Args:
        x: Frame with columns ``x``, ``y`` and ``correct``.
        to_plot: Plot mode; only the value ``'True_False'`` is special-cased.

    Returns:
        ``x`` after ``reset_index(drop=False)`` with the ``x`` column
        replaced by the new positions.

    Raises:
        KeyError: If a row's ``x`` received no score (for instance because
            ``correct`` is neither True nor False for all of its rows).
    """
    x = x.reset_index(drop=False)
    is_right = x['correct'] == True
    is_wrong = x['correct'] == False

    if to_plot != 'True_False':
        aggs2 = (x.loc[is_right, :]
                 .groupby(['x']).agg({'y': 'max'})
                 .sort_values(by=['y'], ascending=True)
                 .reset_index(drop=False))
        aggs2['y'] = 1 - aggs2['y']
    else:
        aggs2 = (x.loc[is_right, :]
                 .groupby(['x']).agg({'y': 'min'})
                 .sort_values(by=['y'], ascending=True)
                 .reset_index(drop=False))
    aggs = (x.loc[is_wrong, :]
            .groupby(['x']).agg({'y': 'max'})
            .sort_values(by=['y'], ascending=False)
            .reset_index(drop=False))

    # Positions that have incorrect rows are scored by those rows only.
    selects = set(aggs['x'].to_list())
    aggs2 = aggs2.loc[~aggs2.x.isin(selects), :]
    aggs = (pd.concat([aggs2, aggs], axis=0)
            .drop_duplicates(subset=['x'])
            .sort_values(by='y', ascending=False)
            .reset_index(drop=True))

    ranked = aggs['x'].to_list()
    # Best-effort diagnostic: report positions that are missing on either
    # side.  An explicit check is used instead of assert/except so that the
    # report is also produced when Python runs with -O.
    unmatched = set(ranked).symmetric_difference(set(x['x'].to_list()))
    if unmatched:
        print(x.head())
        print(aggs['x'])
        print(unmatched)

    minimum = min(ranked)
    relabel = {item: new + minimum for new, item in enumerate(ranked)}
    x['x'] = [relabel[el] for el in x['x'].to_list()]
    return x
def add_y_sec(x: pd.DataFrame) -> pd.DataFrame:
    """Shift the ``y`` of correct rows above the ``y`` of incorrect rows.

    The offset is ``max(y of incorrect rows) + 1``.  When the frame holds no
    incorrect row there is nothing to stack on, so ``y`` is returned
    unchanged (taking the maximum of an empty selection would yield NaN and
    turn every ``y`` of the correct rows into NaN).

    Args:
        x: Frame with a boolean-like ``correct`` column and a numeric ``y``.

    Returns:
        A shifted copy; the argument itself is not modified because pandas
        does not support ``groupby.apply`` callbacks that mutate their group.
    """
    out = x.copy()
    wrong = out['correct'] == False
    if not wrong.any():
        return out
    adder = out.loc[wrong, 'y'].max() + 1
    right = out['correct'] == True
    out.loc[right, 'y'] = out.loc[right, 'y'] + adder
    return out
def make_unique_x(df):
    """Attach an integer ``x`` column that numbers the distinct ``ID`` values.

    IDs are numbered from 0 in order of first appearance; the numbering is
    joined back onto ``df`` with a left merge on ``ID``.
    """
    distinct_ids = df['ID'].unique().tolist()
    positions = [pos for pos, _ in enumerate(distinct_ids)]
    lookup = pd.DataFrame({'ID': distinct_ids, 'x': positions})
    return df.merge(lookup, how='left', left_on='ID', right_on='ID')
def relative_x_y(x: pd.DataFrame) -> pd.DataFrame:
    """Scale ``y`` by ``len(x) - 1`` for the rows of one single ``ID``.

    A frame with exactly one row is divided by 1 instead of 0, so its ``y``
    stays finite instead of becoming NaN/inf.  An empty frame is returned as
    an (empty) copy.

    Args:
        x: Frame with columns ``ID`` and ``y``; all rows must share one ID.

    Returns:
        A scaled copy; the argument itself is not modified because pandas
        does not support ``groupby.apply`` callbacks that mutate their group.

    Raises:
        AssertionError: If the rows do not all carry the same ``ID``.
    """
    if x.shape[0] == 0:
        return x.copy()
    assert not (x['ID'] != x['ID'].iloc[0]).any(), 'rows of more than one ID in a single group'
    out = x.copy()
    out['y'] = out['y'] / max(out.shape[0] - 1, 1)
    return out
def make_groupings(df):
    """Label every row with the first author subset its ``x`` appears in.

    The distinct values of ``'# Authors'`` are visited in ``np.unique``
    (sorted) order and numbered 0, 1, 2, ...  Each ``x`` receives the number
    of the first subset that contains it.  Subtracting everything seen so
    far (rather than only the directly preceding subset) keeps the groups
    disjoint even when the subsets are not nested; for nested subsets the
    result is the same as subtracting the predecessor.

    Args:
        df: Frame with columns ``'# Authors'`` and ``x``.  It is modified in
            place: a ``groups`` column is added or overwritten.

    Returns:
        The same frame object, for call chaining.
    """
    # Explicit object dtype: the column starts with the placeholder string
    # 'None' and is then filled with integer group numbers.
    df['groups'] = pd.Series('None', index=df.index, dtype=object)
    seen = set()
    for position, auth in enumerate(np.unique(df['# Authors'])):
        members = set(df.loc[df['# Authors'] == auth, 'x'].unique().tolist())
        df.loc[df['x'].isin(members - seen), 'groups'] = position
        seen |= members
    return df
def props(x, sh):
    """Return ``len(x) / sh`` rounded to two decimal places."""
    share = len(x) / sh
    return round(share, 2)
def plot_distrib_graph(df: pd.DataFrame, fig=None, ax=None, to_plot='True_False', color='correct', line=False,
                       absolute=False, xticks=(0, 50, 150, 500, 1000)):
    """Plot per-author prediction outcomes as a scatter or stacked-bar chart.

    Each distinct ``ID`` becomes one x position.  Rows are enumerated per
    ``(ID, correct)`` to obtain y coordinates, optionally stacked (correct
    rows above incorrect ones) and scaled per author, and the x positions
    are re-ordered inside each author subset by ``max_sorter``.

    Args:
        df: Frame with columns ``ID``, ``correct``, ``'# Authors'`` and, if
            ``color`` is not ``'correct'``, the column named by ``color``.
            The frame is deep-copied; the caller's object is not modified.
        fig: Figure handed back unchanged when ``ax`` is given.
        ax: Axes to draw on.  When ``None`` a new 18x4 figure is created.
        to_plot: ``'True_False'`` stacks correct rows on top of incorrect
            ones; ``'True'`` / ``'False'`` keep only the correct / incorrect
            rows.
        color: Column used for colouring.  ``'correct'`` uses the
            correct/incorrect palette; any other name uses the target
            palette (age cohorts if the name contains ``'age'``, otherwise
            female/male).
        line: ``False`` draws a seaborn scatter plot, ``True`` draws bars of
            width 1 with a legend listing each category's share of rows.
        absolute: When true the y values are not scaled per author.
        xticks: Tick positions for the x-axis of the bar variant.

    Returns:
        Tuple ``(fig, ax)``.
    """
    colors_c = {True: "darkslategrey",
                False: "lightslategrey",
                'True': "darkslategrey",
                'False': "lightslategrey"
                }
    colors_t = {'female': "lightsteelblue",
                'male': "steelblue",
                '1947': 'navy',
                '1963': "darkolivegreen",
                '1985': "saddlebrown",
                '1975': 'black',
                '1995': 'lightslategrey'}
    relabel_c = {True: 'Correct',
                 False: 'Incorrect',
                 'True': 'Correct',
                 'False': 'Incorrect'}
    relabel_t = {'female': 'Female',
                 'male': 'Male',
                 '1947': '1947',
                 '1963': '1963',
                 '1975': '1975',
                 '1985': '1985',
                 '1995': '1995'
                 }
    if ax is None:
        fig, ax = plt.subplots(1, figsize=(18, 4))
    df = df.copy(deep=True)
    # x coordinates: one per ID, in order of first appearance
    df = make_unique_x(df)
    # y coordinates: running counter per (ID, correct).
    # group_keys=False keeps the original row index; without it pandas >= 2.0
    # prepends the group keys to the index and the following groupby on 'ID'
    # becomes ambiguous (index level and column share the name).
    df = df.groupby(['ID', 'correct'], group_keys=False).apply(y_coord_cal)
    if to_plot == 'True_False':
        # incorrect rows form the lower section, correct rows go on top
        df = df.groupby(['ID'], group_keys=False).apply(add_y_sec).reset_index(drop=True)
    # NOTE(review): the source lost its indentation; scaling is assumed to be
    # independent of ``to_plot`` (function level) - confirm.
    if not absolute:
        df = df.groupby(['x'], group_keys=False).apply(relative_x_y).reset_index(drop=True)
    df = make_groupings(df)
    df = df.groupby(['groups'], group_keys=False).apply(max_sorter, to_plot=to_plot).reset_index(drop=True)
    bool_sel = df['correct'] == False

    # Share of rows per legend category, computed before any row filtering.
    total_rows = df.shape[0]
    if color == 'correct':
        share_correct = round(df['correct'].sum() / total_rows, 2)
        proportions = {'True': share_correct, 'False': 1 - share_correct}
    else:
        proportions = {str(key): round(count / total_rows, 2)
                       for key, count in df.groupby(color).size().items()}

    # NOTE(review): assumed to apply to every ``color`` (function level) - confirm.
    if to_plot == 'True':
        bool_sel = df['correct'] == True
        df = df.loc[bool_sel, :].copy()
    elif to_plot == 'False':
        df = df.loc[df['correct'] == False, :].copy()

    if color == 'correct':
        relabel = relabel_c
        title = 'Prediction'
        colors = colors_c
        hue_order = [True, False]
        y_lab = "Proportion of Test Instances"
    else:
        relabel = relabel_t
        suffix = color.split('_')[-1]
        title = suffix[0].upper() + suffix[1:]
        colors = colors_t
        # palette and relabel dictionaries are keyed by strings
        df[color] = df[color].astype('str')
        if 'age' in color:
            hue_order = ['1947', '1963', '1975', '1985', '1995']
        else:
            hue_order = ['female', 'male']
        y_lab = 'Proportion Misclassified'

    if not line:
        ax = sns.scatterplot(data=df, x='x', y='y', palette=colors, hue=color, hue_order=hue_order, ax=ax, s=1)
        for text in ax.get_legend().get_texts():
            text.set_text(relabel[text.get_text()])
        ax.get_legend().get_title().set_text(title)
    else:
        # stack the bars for groupings of data
        positions = np.unique(df['x']).tolist()
        labeller = []
        if to_plot == 'True_False':
            # background bars of height 1 for every position, drawn in the
            # colour of the correct class
            ax.bar(x=positions, height=[1 for _ in positions], color=colors['True'], width=1)
            # per position keep the incorrect row with the largest y
            df = (df.loc[bool_sel, :]
                  .sort_values(by='y', ascending=False)
                  .groupby(by=['x'], as_index=False).first()
                  .reset_index(drop=True))
            labeller.append('True')
        else:
            # df already holds only the requested subset
            df = (df.sort_values(by='y', ascending=False)
                  .groupby(by=['x'], as_index=False).first()
                  .reset_index(drop=True))
        cats = df[color].unique().tolist()
        # iterate in the order of the relabel dictionary
        for cat in relabel.keys():
            if cat in cats:
                tmp = df.loc[df[color] == cat, :]
                ax.bar(x=tmp['x'].to_list(), height=tmp['y'].tolist(), color=colors[str(cat)], width=1)
                labeller.append(str(cat))
        # Fixed-precision formatting: the shares are rounded to two decimals,
        # but ``share * 100`` can still print as e.g. 29.000000000000004.
        bars = [mpatches.Patch(color=colors[cat],
                               label=relabel[cat] + ' ({:.1f}%)'.format(proportions[cat] * 100))
                for cat in labeller]
        ax.legend(handles=bars, loc='upper right', title=title)
        # NOTE(review): tick and margin setup is assumed to belong to the bar
        # variant only - confirm against the original indentation.
        ax.set_xticks(list(xticks))
        ax.margins(x=0)
    ax.tick_params(axis='both', which='major', labelsize=15)
    ax.tick_params(axis='both', which='minor', labelsize=13)
    ax.set_ylabel(y_lab, fontsize=16)
    ax.set_xlabel("Individual Authors", fontsize=16)
    plt.setp(ax.get_legend().get_texts(), fontsize='12')
    plt.setp(ax.get_legend().get_title(), fontsize='14')
    return fig, ax
def random_draw_select(x, cutoff):
    """Flag a subset whose accuracy does not exceed ``cutoff``.

    The accuracy is the mean of the ``correct`` column; the boolean result
    of ``accuracy <= cutoff`` is written to every row of ``less_draw``.

    Args:
        x: Frame with a boolean ``correct`` column (typically one group).
        cutoff: Accuracy threshold, e.g. the random-draw accuracy.

    Returns:
        A copy of ``x`` with the ``less_draw`` column added; the argument is
        not modified because pandas does not support ``groupby.apply``
        callbacks that mutate their group.
    """
    out = x.copy()
    if out.empty:
        # no rows, hence no accuracy to compare: add an empty flag column
        out['less_draw'] = pd.Series(dtype=bool)
        return out
    ratio = out['correct'].sum() / out.shape[0]
    out['less_draw'] = ratio <= cutoff
    return out
def calc_confusion_matrix(x: pd.DataFrame, target: str, preds: str, labs: list, normalize='true'):
    """Compute the confusion matrix between two columns of ``x``.

    Column ``target`` supplies the true labels and column ``preds`` the
    predicted ones; ``labs`` and ``normalize`` are forwarded unchanged to
    ``sklearn.metrics.confusion_matrix``.
    """
    observed = x[target].to_numpy()
    predicted = x[preds].to_numpy()
    return confusion_matrix(y_true=observed, y_pred=predicted, labels=labs, normalize=normalize)
def plot_heatmap(df: pd.DataFrame, target: str, pred: str, normalize='true', major_wrong=False, ax=None):
    """Draw one confusion matrix per author-subset size, side by side.

    For every distinct value of ``'# Authors'`` a confusion matrix between
    the columns ``target`` and ``pred`` is computed; the matrices are
    stacked horizontally into one heatmap, separated by white lines and
    headed by a black strip naming the subset size.

    Args:
        df: Frame with columns ``'# Authors'``, ``target`` and ``pred``;
            ``ID`` and ``correct`` are needed for ``major_wrong=True`` and
            ``correct`` for ``major_wrong='wrong'``.  The frame is
            deep-copied.
        target: Column with the true labels.  If the name contains ``'age'``
            the label set is the sorted integer values found in the column,
            otherwise ``['female', 'male']``.
        pred: Column with the predicted labels.
        normalize: Forwarded to ``calc_confusion_matrix``.  A truthy value
            fixes the colour scale to [0, 1] and annotates with two
            decimals; a falsy value uses free scaling and integer
            annotations.
        major_wrong: ``True`` keeps only the ``('# Authors', ID)`` groups
            whose accuracy is at most ``1 / len(labels)``; ``'wrong'``
            keeps only rows with ``correct == False``; anything else keeps
            all rows.
        ax: Axes to draw on.  When ``None`` a new 18x4 figure is created.

    Returns:
        The figure that owns the heatmap axes.
    """
    df = df.copy(deep=True)
    if ax is None:
        fig, ax = plt.subplots(1, figsize=(18, 4))
    else:
        # the caller supplied the axes: hand back its figure instead of
        # leaving ``fig`` unbound for the final return
        fig = ax.get_figure()
    # for the x-axis of the plot
    authors = sorted([int(el) for el in df['# Authors'].unique()])
    if 'age' in target:
        labs = sorted([int(el) for el in df[target].unique()])
    else:
        labs = ['female', 'male']
    # select only those for which we are only as good as random draw
    if major_wrong == True:
        draw = 1 / len(labs)
        df = df.groupby(by=['# Authors', 'ID'], group_keys=False).apply(func=random_draw_select, cutoff=draw)
        df = df.loc[df.less_draw == True, :]
    elif major_wrong == 'wrong':
        df = df.loc[df.correct == False, :]

    # one matrix per subset size, concatenated left to right
    matrices = [calc_confusion_matrix(x=df.loc[df['# Authors'] == aut, :], target=target, preds=pred,
                                      labs=labs, normalize=normalize)
                for aut in authors]
    cf = np.hstack(matrices)

    if normalize:
        vmin, vmax, fmt = 0, 1, '.2f'
    else:
        vmin, vmax, fmt = None, None, '.0f'
    # the label set repeats once per subset along the x-axis
    column_labels = labs * len(authors)
    ax = sns.heatmap(data=cf, vmin=vmin, vmax=vmax, annot=True, fmt=fmt, cbar=True, yticklabels=labs,
                     xticklabels=column_labels, ax=ax,
                     cmap=sns.cubehelix_palette(start=2, rot=0, dark=0, light=.95, reverse=True, as_cmap=True))
    # black header strip on top of the heatmap
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("top", size="11%", pad=0)
    # set correct limits
    cax.set_xlim(left=0, right=len(authors) * len(labs))
    cax.get_xaxis().set_visible(False)
    cax.get_yaxis().set_visible(False)
    cax.set_facecolor('black')
    # white separators between neighbouring matrices
    for boundary in range(len(labs), len(labs) * len(authors), len(labs)):
        ax.axvline(boundary, color='white', lw=4)
        cax.axvline(boundary, color='white', lw=4)
    plt.setp(ax.get_yticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")
    for i, aut in enumerate(authors):
        cax.text((1.8 / 5) * len(labs) + (len(labs) * i), 0.4, s='{} Authors'.format(aut),
                 bbox=dict(facecolor='black', boxstyle='square', ec='none'), c='white')
    return fig
\FloatBarrier
\subsection{Aggregate Overview}
\begin{figure}[!htpb]
\centering
\begin{subfigure}[b]{.7\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/gender/f1_scores_100.pdf}
\caption{Results for Target "Gender"}
\caption{Results for target \textit{gender}}
\end{subfigure}
\hfill
\begin{subfigure}[b]{.7\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/age/f1_scores_100.pdf}
\caption{Results for Target "Age"}
\caption{Results for target \textit{age}}
\end{subfigure}
\caption*{\scriptsize \textit{Notes}: The figure shows the boxplots for the extended $\rho$ of all models estimated for a given combination of featuretypes used and way of input, i.e. baseline, cumulated, or stacked.}
\caption{F1-Score for all Featuretype-Sets for an Instancelength of 100 Characters}
\caption*{\scriptsize \textit{Notes}: The figure shows the boxplots for the F1-scores of all models estimated for a given combination of feature types used and way of input, i.e., baseline, cumulated, or stacked.}
\caption{F1-Score for all feature type sets for an input instance length of 100 characters.}
\label{fig:f1_100}
\end{figure}
......@@ -22,16 +24,16 @@
\begin{subfigure}[b]{.7\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/gender/spearman_ext_100.pdf}
\caption{Results for Target "Gender"}
\caption{Results for target \textit{gender}}
\end{subfigure}
\hfill
\begin{subfigure}[b]{.7\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/age/spearman_ext_100.pdf}
\caption{Results for Target "Age"}
\caption{Results for target \textit{age}}
\end{subfigure}
\caption*{\scriptsize \textit{Notes}: The figure shows the boxplots for the extended $\rho$ of all models estimated for a given combination of featuretypes used and way of input, i.e. baseline, cumulated, or stacked.}
\caption{Extended Spearman Correlation for all Featuretype-Sets for an Instancelength of 100 Characters}
\caption*{\scriptsize \textit{Notes}: The figure shows the boxplots for the extended $\rho$ of all models estimated for a given combination of feature types used and way of input, i.e., baseline, cumulated, or stacked.}
\caption{Extended Spearman correlation for all feature type sets for an input instance length of 100 characters.}
\label{fig:ext_spearman_100}
\end{figure}
......@@ -40,15 +42,15 @@
\begin{subfigure}[b]{.7\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/gender/f1_scores_250.pdf}
\caption{Results for Target "Gender"}
\caption{Results for target \textit{gender}}
\end{subfigure}
\hfill
\begin{subfigure}[b]{.7\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/age/f1_scores_250.pdf}
\caption{Results for Target "Age"}
\caption{Results for target \textit{age}}
\end{subfigure}
\caption{F1-Score for all Featuretype-Sets for an Instancelength of 250 Characters}
\caption{F1-Score for all feature type sets for an input instance length of 250 characters.}
\label{fig:f1_250}
\end{figure}
......@@ -57,19 +59,252 @@
\begin{subfigure}[b]{.7\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/gender/spearman_ext_250.pdf}
\caption{Results for Target "Gender"}
\caption{Results for target \textit{gender}}
\end{subfigure}
\hfill
\begin{subfigure}[b]{.7\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/age/spearman_ext_250.pdf}
\caption{Results for Target "Age"}
\caption{Results for target \textit{age}}
\end{subfigure}
\caption*{\scriptsize \textit{Notes}: The figure shows the boxplots for the extended $\rho$ of all models estimated for a given combination of featuretypes used and way of input, i.e. baseline, cumulated, or stacked.}
\caption{Extended Spearman Correlation for all Featuretype-Sets for an Instancelength of 250 Characters}
\caption*{\scriptsize \textit{Notes}: The figure shows the boxplots for the extended $\rho$ of all models estimated for a given combination of feature types used and way of input, i.e., baseline, cumulated, or stacked.}
\caption{Extended Spearman correlation for all feature type sets for an input instance length of 250 characters.}
\label{fig:ext_spearman_250}
\end{figure}
\FloatBarrier
\clearpage
\subsection{Author-Level Analysis}
%authorlevel all
\begin{figure}[!htpb]
\centering
\begin{subfigure}[b]{0.8\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/gender/100/bars/proportions_cumulated_dist_char_asis_pos_tag_dep_lemma_word_emoticon_c_polarity_num_1_100_color_gender.pdf}
\caption{Author-level errors for target \textit{gender}.}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.8\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/age/100/bars/proportions_cumulated_dist_char_asis_pos_tag_dep_lemma_word_emoticon_c_polarity_num_1_100_color_centered_age.pdf}
\caption{Author-level errors for target \textit{age}.}
\end{subfigure}
\caption*{\scriptsize \textit{Notes}: The figure shows the results when using the full feature set as cumulated input. Each author is a unique instance on the x-axis. The proportion per author is then shown as the y-value. The authors are sorted by their appearance in the respective subsets (i.e., 50, 150, 500, 1000) and according to the proportion of errors within those subsets. The result per author shows the result over all subsets.}
\caption{Author-level results for the full feature set with an input instance length of 100 characters.}
\label{fig:acc_auth_100}
\end{figure}
\begin{figure}[!htpb]
\centering
\begin{subfigure}[b]{0.8\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/gender/250/bars/proportions_cumulated_dist_char_asis_pos_tag_dep_lemma_word_emoticon_c_polarity_num_1_250_color_gender.pdf}
\caption{Author-level errors for target \textit{gender}.}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.8\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/age/250/bars/proportions_cumulated_dist_char_asis_pos_tag_dep_lemma_word_emoticon_c_polarity_num_1_250_color_centered_age.pdf}
\caption{Author-level errors for target \textit{age}.}
\end{subfigure}
\caption*{\scriptsize \textit{Notes}: The figure shows the results when using the full feature set as cumulated input. Each author is a unique instance on the x-axis. The proportion per author is then shown as the y-value. The authors are sorted by their appearance in the respective subsets (i.e., 50, 150, 500, 1000) and according to the proportion of errors within those subsets. The result per author shows the result over all subsets.}
\caption{Author-level results for the full feature set with an input instance length of 250 characters.}
\label{fig:acc_auth_250}
\end{figure}
%authorlevel cwald
\begin{figure}[!htpb]
\centering
\begin{subfigure}[b]{0.8\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/gender/100/bars/proportions_cumulated_dist_char_asis_lemma_word_1_2_100_color_gender.pdf}
\caption{Author-level errors for target \textit{gender}.}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.8\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/age/100/bars/proportions_cumulated_dist_char_asis_lemma_word_1_2_100_color_centered_age.pdf}
\caption{Author-level errors for target \textit{age}.}
\end{subfigure}
\caption*{\scriptsize \textit{Notes}: The figure shows the results when using the full feature set as cumulated input. Each author is a unique instance on the x-axis. The proportion per author is then shown as the y-value. The authors are sorted by their appearance in the respective subsets (i.e., 50, 150, 500, 1000) and according to the proportion of errors within those subsets. The result per author shows the result over all subsets.}
\caption{Author-Level Results for the full feature set with an input instance length of 100 characters - ASIS-CHAR-LEMMA-WORD.}
\label{fig:acc_auth_100_cwald}
\end{figure}
\begin{figure}[!htpb]
\centering
\begin{subfigure}[b]{0.8\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/gender/250/bars/proportions_cumulated_dist_char_asis_lemma_word_1_2_250_color_gender.pdf}
\caption{Author-level errors for target \textit{gender}.}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.8\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/age/250/bars/proportions_cumulated_dist_char_asis_lemma_word_1_2_250_color_centered_age.pdf}
\caption{Author-level errors for target \textit{age}.}
\end{subfigure}
\caption*{\scriptsize \textit{Notes}: The figure shows the results when using the full feature set as cumulated input. Each author is a unique instance on the x-axis. The proportion per author is then shown as the y-value. The authors are sorted by their appearance in the respective subsets (i.e., 50, 150, 500, 1000) and according to the proportion of errors within those subsets. The result per author shows the result over all subsets.}
\caption{Author-level results for the full feature set with an input instance length of 250 characters - ASIS-CHAR-LEMMA-WORD.}
\label{fig:acc_auth_250_cwald}
\end{figure}
%confusion all
\begin{figure}[!htpb]
\centering
\begin{subfigure}[b]{0.8\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/gender/100/confusion/confusion_cumulated_dist_char_asis_pos_tag_dep_lemma_word_emoticon_c_polarity_num_1_100_mode_basic.pdf}
\caption{All authors (row-wise normalization).}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.8\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/gender/100/confusion/confusion_cumulated_dist_char_asis_pos_tag_dep_lemma_word_emoticon_c_polarity_num_1_100_mode_major_false.pdf}
\caption{Below random guess accuracy (matrix-wise normalization).}
\end{subfigure}
\caption*{\scriptsize \textit{Notes}: The figure shows confusion matrices for the results produced by using the full feature set as cumulated input on an input instance length of 100 characters. The matrix for the respective set of authors is calculated by looking at the respective set in isolation.}
\caption{Confusion matrices for target \textit{gender} with an input instance length of 100 characters - all feature types.}
\label{fig:gender_conf_auth_100}
\end{figure}
\begin{figure}[!htpb]
\centering
\begin{subfigure}[b]{0.8\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/gender/250/confusion/confusion_cumulated_dist_char_asis_pos_tag_dep_lemma_word_emoticon_c_polarity_num_1_250_mode_basic.pdf}
\caption{All authors (row-wise normalization).}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.8\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/gender/250/confusion/confusion_cumulated_dist_char_asis_pos_tag_dep_lemma_word_emoticon_c_polarity_num_1_250_mode_major_false.pdf}
\caption{Below random guess accuracy (matrix-wise normalization).}
\end{subfigure}
\caption*{\scriptsize \textit{Notes}: The figure shows confusion matrices for the results produced by using the full feature set as cumulated input on an input instance length of 250 characters. The matrix for the respective set of authors is calculated by looking at the respective set in isolation.}
\caption{Confusion matrices for target \textit{gender} with an input instance length of 250 characters - all feature types.}
\label{fig:gender_conf_auth_250}
\end{figure}
\begin{figure}[!htpb]
\centering
\begin{subfigure}[b]{0.8\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/age/100/confusion/confusion_cumulated_dist_char_asis_pos_tag_dep_lemma_word_emoticon_c_polarity_num_1_100_mode_basic.pdf}
\caption{All authors (row-wise normalization).}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.8\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/age/100/confusion/confusion_cumulated_dist_char_asis_pos_tag_dep_lemma_word_emoticon_c_polarity_num_1_100_mode_major_false.pdf}
\caption{Below random guess accuracy (matrix-wise normalization).}
\end{subfigure}
\caption*{\scriptsize \textit{Notes}: The figure shows confusion matrices for the results produced by using the full feature set as cumulated input on an input instance length of 100 characters. The matrix for the respective set of authors is calculated by looking at the respective set in isolation.}
\caption{Confusion matrices for target \textit{age} with an input instance length of 100 characters - all feature types.}
\label{fig:age_conf_auth_100}
\end{figure}
\begin{figure}[!htpb]
\centering
\begin{subfigure}[b]{0.8\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/age/250/confusion/confusion_cumulated_dist_char_asis_pos_tag_dep_lemma_word_emoticon_c_polarity_num_1_250_mode_basic.pdf}
\caption{All authors (row-wise normalization).}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.8\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/age/250/confusion/confusion_cumulated_dist_char_asis_pos_tag_dep_lemma_word_emoticon_c_polarity_num_1_250_mode_major_false.pdf}
\caption{Below random guess accuracy (matrix-wise normalization).}
\end{subfigure}
\caption*{\scriptsize \textit{Notes}: The figure shows confusion matrices for the results produced by using the full feature set as cumulated input on an input instance length of 250 characters. The matrix for the respective set of authors is calculated by looking at the respective set in isolation.}
\caption{Confusion matrices for target \textit{age} with an input instance length of 250 characters - all feature types.}
\label{fig:age_conf_auth_250}
\end{figure}
%confusion cwald
\begin{figure}[!htpb]
\centering
\begin{subfigure}[b]{0.8\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/gender/100/confusion/confusion_cumulated_dist_char_asis_lemma_word_1_2_100_mode_basic.pdf}
\caption{All authors (row-wise normalization).}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.8\textwidth}
\centering