Commit 0f328c36 authored by Marcel Henrik Schubert

changes to tex

parent a7331000
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
\ No newline at end of file
\begin{figure}[!htpb]
\centering
\begin{subfigure}[b]{.7\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/gender/f1_scores_100.pdf}
\caption{Results for Target ``Gender''}
\end{subfigure}
\hfill
\begin{subfigure}[b]{.7\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/age/f1_scores_100.pdf}
\caption{Results for Target ``Age''}
\end{subfigure}
\caption*{\scriptsize \textit{Notes}: The figure shows the boxplots of the F1-scores of all models estimated for a given combination of featuretypes used and input strategy, i.e. baseline, cumulated, or stacked.}
\caption{F1-Scores for all Featuretype-Sets for an Instance Length of 100 Characters}
\label{fig:f1_100}
\end{figure}
\begin{figure}[!htpb]
\centering
\begin{subfigure}[b]{.7\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/gender/spearman_ext_100.pdf}
\caption{Results for Target ``Gender''}
\end{subfigure}
\hfill
\begin{subfigure}[b]{.7\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/age/spearman_ext_100.pdf}
\caption{Results for Target ``Age''}
\end{subfigure}
\caption*{\scriptsize \textit{Notes}: The figure shows the boxplots of the extended $\rho$ of all models estimated for a given combination of featuretypes used and input strategy, i.e. baseline, cumulated, or stacked.}
\caption{Extended Spearman Correlation for all Featuretype-Sets for an Instance Length of 100 Characters}
\label{fig:ext_spearman_100}
\end{figure}
\begin{figure}[!htpb]
\centering
\begin{subfigure}[b]{.7\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/gender/f1_scores_250.pdf}
\caption{Results for Target ``Gender''}
\end{subfigure}
\hfill
\begin{subfigure}[b]{.7\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/age/f1_scores_250.pdf}
\caption{Results for Target ``Age''}
@@ -52,17 +54,18 @@
\begin{figure}[!htpb]
\centering
\begin{subfigure}[b]{.7\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/gender/spearman_ext_250.pdf}
\caption{Results for Target ``Gender''}
\end{subfigure}
\hfill
\begin{subfigure}[b]{.7\textwidth}
\centering
\includegraphics[width=\textwidth]{figures/age/spearman_ext_250.pdf}
\caption{Results for Target ``Age''}
\end{subfigure}
\caption*{\scriptsize \textit{Notes}: The figure shows the boxplots of the extended $\rho$ of all models estimated for a given combination of featuretypes used and input strategy, i.e. baseline, cumulated, or stacked.}
\caption{Extended Spearman Correlation for all Featuretype-Sets for an Instance Length of 250 Characters}
\label{fig:ext_spearman_250}
\end{figure}
@@ -2,25 +2,147 @@
%%baseline 250 chars
%\begin{landscape}
\subsection{Baseline}
\subsubsection{Minimum of Characters: 100}
\input{tables/gender/latex/results_gender_individual_baseline_100_acc.tex}
\input{tables/gender/latex/results_gender_individual_baseline_100_dist.tex}
\input{tables/age/latex/results_age_individual_baseline_100_acc.tex}
\input{tables/age/latex/results_age_individual_baseline_100_dist.tex}
%%dwald dynAA
\FloatBarrier
\clearpage
\subsubsection{Minimum of Characters: 250}
\input{tables/gender/latex/results_gender_individual_baseline_250_acc.tex}
\input{tables/gender/latex/results_gender_individual_baseline_250_dist.tex}
\input{tables/age/latex/results_age_individual_baseline_250_acc.tex}
\input{tables/age/latex/results_age_individual_baseline_250_dist.tex}
\FloatBarrier
\clearpage
\subsubsection{Minimum of Characters: 500}
\input{tables/gender/latex/results_gender_individual_baseline_500_acc.tex}
\input{tables/gender/latex/results_gender_individual_baseline_500_dist.tex}
\input{tables/age/latex/results_age_individual_baseline_500_acc.tex}
\input{tables/age/latex/results_age_individual_baseline_500_dist.tex}
\FloatBarrier
\clearpage
\subsection{POS, TAG, DEP, and NUM}
\subsubsection{Minimum of Characters: 100 \& Cumulated}
\input{tables/gender/latex/results_gender_dwald_direct_100_acc.tex}
\input{tables/gender/latex/results_gender_dwald_direct_100_dist.tex}
\input{tables/age/latex/results_age_dwald_direct_100_acc.tex}
\input{tables/age/latex/results_age_dwald_direct_100_dist.tex}
%%dwald dynAA
\FloatBarrier
\clearpage
\subsubsection{Minimum of Characters: 250 \& Cumulated}
\input{tables/gender/latex/results_gender_dwald_direct_250_acc.tex}
\input{tables/gender/latex/results_gender_dwald_direct_250_dist.tex}
\input{tables/age/latex/results_age_dwald_direct_250_acc.tex}
\input{tables/age/latex/results_age_dwald_direct_250_dist.tex}
%age
%%baseline 250 chars
\FloatBarrier
\clearpage
\subsubsection{Minimum of Characters: 500 \& Cumulated}
\input{tables/gender/latex/results_gender_dwald_direct_500_acc.tex}
\input{tables/gender/latex/results_gender_dwald_direct_500_dist.tex}
\input{tables/age/latex/results_age_dwald_direct_500_acc.tex}
\input{tables/age/latex/results_age_dwald_direct_500_dist.tex}
\FloatBarrier
\clearpage
\subsubsection{Minimum of Characters: 100 \& Stacked}
\input{tables/gender/latex/results_gender_dwald_direct_100_acc.tex}
\input{tables/gender/latex/results_gender_dwald_direct_100_dist.tex}
\input{tables/age/latex/results_age_dwald_direct_100_acc.tex}
\input{tables/age/latex/results_age_dwald_direct_100_dist.tex}
%%dwald dynAA
\FloatBarrier
\clearpage
\subsubsection{Minimum of Characters: 250 \& Stacked}
\input{tables/gender/latex/results_gender_dwald_direct_250_acc.tex}
\input{tables/gender/latex/results_gender_dwald_direct_250_dist.tex}
\input{tables/age/latex/results_age_dwald_direct_250_acc.tex}
\input{tables/age/latex/results_age_dwald_direct_250_dist.tex}
\FloatBarrier
\clearpage
\subsubsection{Minimum of Characters: 500 \& Stacked}
\input{tables/gender/latex/results_gender_dwald_direct_500_acc.tex}
\input{tables/gender/latex/results_gender_dwald_direct_500_dist.tex}
\input{tables/age/latex/results_age_dwald_direct_500_acc.tex}
\input{tables/age/latex/results_age_dwald_direct_500_dist.tex}
\FloatBarrier
\clearpage
\subsection{Full Set of Featuretypes}
\subsubsection{Minimum of Characters: 100 \& Cumulated}
\input{tables/gender/latex/results_gender_full_direct_100_acc.tex}
\input{tables/gender/latex/results_gender_full_direct_100_dist.tex}
\input{tables/age/latex/results_age_full_direct_100_acc.tex}
\input{tables/age/latex/results_age_full_direct_100_dist.tex}
%%full dynAA
\FloatBarrier
\clearpage
\subsubsection{Minimum of Characters: 250 \& Cumulated}
\input{tables/gender/latex/results_gender_full_direct_250_acc.tex}
\input{tables/gender/latex/results_gender_full_direct_250_dist.tex}
\input{tables/age/latex/results_age_full_direct_250_acc.tex}
\input{tables/age/latex/results_age_full_direct_250_dist.tex}
\FloatBarrier
\clearpage
\subsubsection{Minimum of Characters: 500 \& Cumulated}
\input{tables/gender/latex/results_gender_full_direct_500_acc.tex}
\input{tables/gender/latex/results_gender_full_direct_500_dist.tex}
\input{tables/age/latex/results_age_full_direct_500_acc.tex}
\input{tables/age/latex/results_age_full_direct_500_dist.tex}
\FloatBarrier
\clearpage
\subsubsection{Minimum of Characters: 100 \& Stacked}
\input{tables/gender/latex/results_gender_full_dynAA_100_acc.tex}
\input{tables/gender/latex/results_gender_full_dynAA_100_dist.tex}
\input{tables/age/latex/results_age_full_dynAA_250_acc.tex}
\input{tables/age/latex/results_age_full_dynAA_250_dist.tex}
%%full dynAA
\FloatBarrier
\clearpage
\subsubsection{Minimum of Characters: 250 \& Stacked}
\input{tables/gender/latex/results_gender_full_dynAA_250_acc.tex}
\input{tables/gender/latex/results_gender_full_dynAA_250_dist.tex}
%dwald 500 chars dynAA
\input{tables/age/latex/results_age_full_dynAA_250_acc.tex}
\input{tables/age/latex/results_age_full_dynAA_250_dist.tex}
\FloatBarrier
\clearpage
\subsubsection{Minimum of Characters: 500 \& Stacked}
\input{tables/gender/latex/results_gender_full_dynAA_500_acc.tex}
\input{tables/gender/latex/results_gender_full_dynAA_500_dist.tex}
\input{tables/age/latex/results_age_full_dynAA_500_acc.tex}
\input{tables/age/latex/results_age_full_dynAA_500_dist.tex}
%\end{landscape}
\ No newline at end of file
@@ -7,6 +7,10 @@
\renewcommand{\thesubsection}{A\arabic{section}.\arabic{subsection}}\setcounter{subsection}{0}
\section{Figures}
\label{sec:att_app_figures}
\input{app_figures.tex}
\FloatBarrier
\section{Tables}
\label{sec:att_app_tables}
\input{app_tables.tex}
\section{Discussion}
Overall, we find that the classifier makes systematic errors at the author level. For the target \textit{gender}, female authors seem to be difficult to classify in general. The underlying driver seems to be that the group as such has a very heterogeneous pattern in the features which overlaps with that of the male authors. In other words, male authors seem to be simpler to classify. That, however, may also be driven by the fact that the context they are active in is more homogeneous than for female authors.
For \textit{age}, we find slightly more stable results, as most age brackets exhibit clear patterns that make them distinguishable.
In terms of stability, however, the results are more mixed.
While the features mainly driving the prediction are not \textit{per-se} context-reliant features, increasing the available context does increase performance markedly.
This is especially evident from the fact that a wide n-gram window for character features yields the greatest relative increase in performance, outstripping the gains from including featuretypes with more contextual information.
That, in itself, is not directly surprising and not necessarily cause for raised eyebrows. However, it is relevant to point out that for a low number of authors, about 10\% of the prediction performance stems from an increase in context (e.g. for the features CHAR and ASIS when predicting gender, see \autoref{tab:gender_f1_dist_500_baseline_individual}). When looking at the dataset with the largest number of authors, additional information is also what lifts model performance from slightly above random-guess level into the performance ranges found in the literature for comparable data \parencite{wiegmann2019overview}.
Moreover, it is when a model is built on top of a composite of featuretypes (or stacked on top of them; see \autoref{tab:gender_f1_dist_500_direct_dwald} and \autoref{tab:age_acc_500_direct_dwald}) that we see additional performance increases, especially for longer input texts. Thus, giving additional context on top of non-context featuretypes yields a better decision boundary for the classifier.
These results at first seem like technical details. In practice, however, they show that the context the model is trained on and in (as simulated here by varying the number of authors) largely carries over into its predictive performance. On the one hand, that means models trained within one context may not simply be used in another. That is intuitive. What we show here, however, is that even when staying within one group of individuals (creators) and within one domain, an increase in the number of possible targets sizably changes the relevant features and the information and context they encode. That becomes evident from the fact that stability in the relevance of features simply does not exist.
For social sciences, these findings are relevant on two fronts.
First, the models using the features presented here are indeed well-suited to find a pattern connecting their use to the prediction target. However, that pattern is unstable, changing with the number of authors or features available. Consequently, this hints at the fact that these patterns are merely correlations exploited by the model. Such correlations are difficult to rely upon, as their patterns, as shown by increasing the number of authors, may change at any time.
Thus, this calls for a careful assessment of validity when employing pre-trained models within the field, especially when the prediction outcome is used as input for further models or further analysis. Put differently, a change in behavior by individuals, either over time or by choice, will render the learned context irrelevant. Thus, the environment during training must be carefully compared to the one in which the model is used.
The second aspect is tied more into the field of law and the wider debate of transparency and proportionality.
As law enforcement is faced with the problem of combing through a large amount of online content, searching and assessing such content by hand is untenable. Thus, already today algorithms are employed by law enforcement. However, especially in such environments, it must be clear how much of an algorithm's findings are merely correlational and also how stable these correlations are in different environments. Only then do law enforcement, the defendant, and also the courts have the possibility to assess the validity of a result \textit{before} acting upon it. After all, how valid is a result identifying traits of a suspect when the features are context-reliant to such a high degree that changing the use of some emojis or some words would alter the result completely? Or when, compared against a different number of authors, the result would be driven by completely different features?
Even more problematic, in real use, the instances used for training and those used for testing are removed from each other in time. Thus, a real culprit could evade identification simply because the context changes.
As the findings of this study point towards such an unstable relationship, we argue that the features used in tasks related to authorship profiling and authorship attribution need much more research. Moreover, models should be assessed with a measure defining the boundaries of their stability. Otherwise, establishing a scientifically valid link that goes beyond merely showing that the model yields good correlational predictions on some datasets might be impossible.
\ No newline at end of file
\section{Introduction}
\label{sec:intro}
In traditional studies within the social sciences, characteristics such as age or gender are key traits as they have proven to be central to understanding and modeling human behavior \parencite[e.g.][]{lahey2000age, Gneezy2004GenderAge,Charness2012StrongTaking,Booth2012GenderMatter,Sutter2014GenderPersist, Bian2017GenderInterests}.
While studies within the field have traditionally focused on lab experiments and questionnaires, large-scale datasets have, until recently, been of limited availability. Especially when it comes to text data, the analysis has often proved resource-intensive and the results difficult to assess.
Such data, however, holds a wealth of information, even more so since the amount of text data generated by individuals has increased massively with the advent of services such as Facebook\textsuperscript{\textregistered} or Twitter\textsuperscript{\textregistered} \parencite{InternetLiveStats2019TwitterStatistics}.
With such large amounts of data, new methods in machine learning and natural language processing (NLP) have gained great popularity, especially within the social sciences.
In order to mine that treasure trove, researchers increasingly turn to the application of NLP to answer prevailing questions in the social sciences \parencite[e.g.][]{bail2016combining, pavlick2016gun, costa2019analysis, burley2020nlp}.
The technology as such is, depending on the algorithms, comparatively simple to use in terms of the know-how required, considering both the textual features used for identification and the effort of setting up and training the respective algorithms \parencite{Narayanan2012OnIdentification}.
By reducing the entrance cost in such a way, this technology certainly has great potential.
Here, the subfield of authorship analysis is becoming an area of special relevance to social scientists. The reason is simply that while large amounts of text data are available, characteristics of the individual, e.g. age and gender but also their identity as such, often are not. The main goal of that particular area is therefore to infer characteristics such as age, gender, political orientation, or even the author's identity from texts written by the individual.
As such, while studies within the social sciences making use of that new type of data continue to include these characteristics due to their proven relevance in past research \parencite[e.g.][]{bail2016combining, colleoni2014echo}, authorship analysis is used to compensate for the lack of ground truth. That is often done in a layered approach, first inferring the missing characteristics with a trained classifier and then using the predictions as input to the subsequent analysis \parencite{barbera2015understanding, huang2020contribution}.
Moreover, these characteristics are not only important as an input for further research but also as identifying information. In the adjacent field where NLP and behavioral sciences intersect, the law community is making use of such analysis for the targeting and researching of incriminating online behavior, e.g. hate speech \parencite{Djuric2015HateEmbeddings,Laub2019HateComparisons,zufall2020operationalizing}.
There, the practical application of authorship analysis also becomes relevant when pursuing offenses.
Often, online users do not use their real names, and finding out their identities becomes difficult, either because the information is not available or because the companies with access to it are not willing to share it \parencite{facebook}.
Consequently, often during such an investigation \textit{forensic} authorship analysis is employed to gain additional information on the potential offender. Due to the high volume of data, these processes become increasingly automated and as such the research field of automated forensic authorship analysis is well established \parencite{Rocha2017AuthorshipForensics}.\\
However, while state-of-the-art methods regularly manage to achieve a high accuracy for authorship analysis \parencite{Rocha2017AuthorshipForensics}, some call the field's scientific character into question \parencite{Chaski2001EmpiricalTechniques}.
This is due to the fact that current research on automated authorship analysis mostly focuses on correct predictions, using a wide set of features which often varies between different papers \parencite{Rocha2017AuthorshipForensics}.
The focus seems to lie on achieving the best results, showing the viability of automated authorship analysis, often to the detriment of rigorous explainability and transparency \parencite{Chaski2012BestIdentification}. This not only concerns research employing such models in general but, when used by law enforcement, is also directly related to the admissibility of findings from automated authorship analysis in the courts, as transparency as well as rigor might not satisfy the demands before the law \parencite{Chaski2012BestIdentification}. Consequently, the technology is in need of further assessment before being used even more widely than is already the case.\\
That result is somewhat surprising as one of the more extensive surveys on literature targeting \textit{forensic} authorship analysis notes the importance of and need for a \textquote{well-defined process} \parencite{Rocha2017AuthorshipForensics} in order to satisfy the high requirements for explainability before the law.
Besides understanding the algorithm, such explainability would require two things:
One concerns the topic-independence which means that the features predictive of an author should not depend on the content of the text \parencite{Narayanan2012OnIdentification}.
The second aspect is rarely mentioned.
@@ -26,10 +28,10 @@
Naturally that is merely a snapshot of the current environment.
It does not imply that the next generation features the same pattern and consequently, any model trained on the old pattern might inadvertently misclassify when confronted with the new pattern.
For that reason, it is necessary to assess the stability of individual features used by the classifier, when the underlying data and thus the patterns change slightly.
There are only very few forays seeking to address such problems, for example \textcite{Azarbonyad2015Time-awareStreams} with their temporal weighting of features.
As stylometry is rooted in the humanities and thus the social sciences \parencite{Neal2017SurveyingApplications}, it is surprising that more efforts have not been made so far to see whether some author characteristics result in stable topic- and domain-independent features and feature importances.\\
The central aspect of our approach is therefore to help answer this question about feature stability as well as feature relevance. We seek to extend the understanding of the fundamentals of language and communication. We think that such a stability analysis would aid immensely in assessing the rigor of predictions, thereby making them safer to use in the legal context. Moreover, it would also help to better establish the boundaries of transferability and stability of models and their predictions, which is needed when they are used as input for further research.
This contribution is therefore interdisciplinary in nature as it tries to address an issue affecting multiple fields.
While the lack of contributions was already pointed out by \textcite{Rocha2017AuthorshipForensics}, only recently have there been any notable forays. In general, there has been an effort to make model predictions more explainable \parencite{Ribeiro2016WhyClassifier, samek2019explainable}. A systematic approach, however, that looks at changes when features are systematically varied is as of yet limited. The study by \textcite{koppel2011authorship} looks at authorship analysis ``in the wild'' and systematically varies the number of authors as well as the number of features in order to quantify performance gains and losses. They do not focus on featuretypes and do not extend their study towards analyzing the changes within the model. Recently, \textcite{boenninghoff2019explainable} showed a method to make a complex model based on a neural net explainable. Their approach is limited to their specific model and also does not analyze what the decisive features correspond to, i.e. how much context they encode. In that vein, \textcite{sanchez2017comparison} is closer to our approach. They also seek to limit topic-dependency and focus on featuretypes. However, their goal is to find a good subset of ngrams for their featureset with high predictive power. The paper closest to ours is the one by \textcite{sage2020investigating}. Their analysis is focused on different featuretypes and the influence of varying ngram-lengths. They systematically vary both in order to find the impact on performance. However, they do not extend their analysis to different input sets and also focus on longer news articles instead of the more common data of microblog texts. Moreover, we extend that analysis into the domain of stability, assessing whether there are shifts in feature-importance.
@@ -3,23 +3,22 @@
\usepackage[utf8]{inputenc}
\usepackage{fullpage} % changes the margin
\usepackage{url}
\usepackage{amsmath}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{titling}
\usepackage{caption}
\usepackage{graphicx}
\usepackage{adjustbox}
\usepackage{subcaption}
\usepackage{filecontents}
\usepackage{float}
\usepackage{enumitem}
\usepackage[flushleft]{threeparttable}
\usepackage{longtable, tabularx}
\usepackage[toc,page]{appendix}
\usepackage{fancyvrb} %for txt input
\usepackage[section, verbose]{placeins}
\usepackage{authblk}
@@ -29,11 +28,14 @@
\usepackage{rotating}
\usepackage{makecell}
\usepackage{color, colortbl}
\usepackage{float}
\restylefloat{table}
\definecolor{Gray}{gray}{0.9}
\definecolor{LightCyan}{rgb}{0.88,1,1}
%\renewcommand\theadfont{\small\bfseries} % for bold in table using \small
\renewcommand\theadgape{}
\title{Working Title: Feature Attribution}
\author[1]{Marcel H. Schubert\thanks{Corresponding author. schubert@coll.mpg.de}}
\affil[1]{Max-Planck Institute for Research on Collective Goods}
\date{ }
@@ -42,22 +44,27 @@
%\bibliographystyle{plain}
\usepackage[backend=biber, style=authoryear, natbib= true, url=false, doi=false, eprint=false, isbn=false]{biblatex}
\addbibresource{references.bib}
\usepackage[hidelinks]{hyperref}
\begin{document}
\maketitle
\clearpage
\input{introduction.tex}
\FloatBarrier
\input{experimental_setup.tex}
\FloatBarrier
\input{results.tex}
\FloatBarrier
\input{robustness.tex}
\FloatBarrier
\input{discussion.tex}
\clearpage
\printbibliography
\clearpage
%\begin{appendix}
\section*{Appendix}
\addcontentsline{toc}{section}{Appendix}
\input{appendix.tex}
%\end{appendix}
\end{document}
\ No newline at end of file
\section{Robustness of Feature-Importance}
In order to assess how the aggregate results from the previous section hold up when looking at individual featuretypes as well as different classifier approaches, this section presents a more fine-grained view. We focus on the input in terms of featuretypes and ngrams and analyse these aspects with a particular focus on the stability of the feature importance.
\subsection{Baseline}
First, we now have a look at the results on featureset 1, i.e. the results gained by using each featuretype individually to predict the target. \autoref{tab:gender_f1_dist_500_baseline_individual} and \autoref{tab:age_f1_dist_500_baseline_individual} show the experimental results for targets \textit{gender} and \textit{age} respectively. Each row shows the result for a featuretype and the corresponding ngrams. The featuretypes themselves are sorted in an ascending order such that featuretypes in lower rows capture more context, e.g. the type CHAR (characters) captures, in principle, less contextual information, such as topic or structural information, when compared to, for example, word-based ngrams \parencite{Rocha2017AuthorshipForensics}.
Naturally, when the ngram-window is increased, e.g. for character-based features from 2 to 4, the character-ngrams also start to capture contextual information.
Consequently, also the ngram-combinations within the individual featuretypes are sorted in an ascending fashion.\\
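To fix ideas, the following sketch illustrates what such a per-featuretype baseline could look like in code. It is a minimal illustration only, assuming a bag-of-ngrams representation and a linear classifier in scikit-learn; it is not a description of our exact pipeline, and all variable names are placeholders.
\begin{verbatim}
# Illustrative sketch of a per-featuretype baseline (placeholder names,
# assumed tooling): one bag-of-n-grams representation, one linear classifier.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score

def baseline_model(analyzer="char", ngram_range=(2, 2)):
    # analyzer="char" with ngram_range=(2, 2) corresponds to CHAR-2-grams;
    # (2, 5) to CHAR-(2,5)-grams; analyzer="word" to WORD-grams.
    return make_pipeline(TfidfVectorizer(analyzer=analyzer,
                                         ngram_range=ngram_range),
                         LinearSVC())

# Placeholder usage for one author subset:
# model = baseline_model(ngram_range=(2, 5))
# model.fit(train_texts, train_labels)
# print(f1_score(test_labels, model.predict(test_texts), average="macro"))
\end{verbatim}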
\input{tables/gender/latex/results_gender_individual_baseline_500_f1_dist.tex}
Looking at the results for the target \textit{gender}, we first find that the most predictive featuretypes are those most closely related to the words but not necessarily to the structure of the text. That can be seen from the fact that the latter is captured by POS, TAG, and DEP, which show low predictive power no matter the number of authors within the subset.
Moreover, already CHAR-2-grams perform well ($F1_{50}^{CHAR-2}: 0.80$) on the small dataset comprised of 50 authors. However, when we increase the number of authors, the performance declines markedly ($F1_{1000}^{CHAR-2}: 0.67$), especially when compared to CHAR-(2,5)-grams ($F1_{1000}^{CHAR-(2,5)}: 0.741$), which are close to the top performance ($F1_{1000}^{ASIS-5}: 0.748$). The same pattern, although on a lower overall performance level, is visible for the text-distortion features DIST capturing punctuation and other stylistic markers.
For lower ngram-sizes, the performance is only negligibly above or below the random-guess threshold, while for higher ngrams the performance is higher ($F1_{50}^{DIST-5}: 0.67$) but then decreases again in the number of authors. Consequently, the results give little cause to think that there are patterns in the style of authors related to gender.
On the other hand, CHAR-2-grams show a reliable performance ($0.67 < F1^{CHAR-2} < 0.80$), and increasing the ngram window by only 1 increases performance markedly in turn. Consequently, it can be assumed that there is a discernible pattern related to gender within the character combinations used. The underlying assumption would be that certain topics might be reflected by the use of similar words or that certain synonyms are preferred by one group over the other.
We can compare this with the result for the WORD-grams. Here we see that, while the performance is high, it is still worse than that of CHAR-(2,5)-grams. The latter would also capture words up to five characters long. However, if that overlap were the sole driver of performance, WORD-grams should not be outperformed. As such, we can conclude that there is a discernible pattern related to gender in low-context CHAR-ngrams.
In terms of the stability of the feature importance, the results are sobering. As in the aggregate before, the correlation tends towards zero when the number of authors is increased. Besides that, in some cases the correlation even flips its sign. That implies that features which were useful for predicting group A before are now either irrelevant or relevant for predicting group B (see for example \autoref{tab:gender_f1_dist_500_baseline_individual}, $\rho_{500}^{POS-2}: -0.15$). While mostly small, all correlation coefficients are significant at the 1\%-level.\\
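For reference, the rank correlation underlying these stability scores is Spearman's coefficient computed on the feature-importance rankings of two models $A$ and $B$ estimated on different author subsets,
\begin{equation*}
\rho_{B|A} = 1 - \frac{6 \sum_{i=1}^{n} d_i^{2}}{n\,(n^{2}-1)}, \qquad d_i = \operatorname{rank}_A(w_i) - \operatorname{rank}_B(w_i),
\end{equation*}
where $w_i$ denotes the importance weight of feature $i$ and $n$ the number of features ranked in both models. The extended (ext.) and reduced (red.) variants reported in the tables are modifications of this basic measure; their exact definitions are not restated here.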
When looking at age, the results shown in \autoref{tab:age_f1_dist_500_baseline_individual} reflect the overall findings for gender.
Text distortion alone, such as punctuation reflected in the features of type DIST, does hold some but not the majority of the information relevant to the prediction of age. That is evident from the stark decline towards random-guess accuracy, especially for low-order ngrams.
\input{tables/age/latex/results_age_individual_baseline_500_f1_dist.tex}
When combined with CHAR, especially higher-order ngrams (as reflected in the featuretype ASIS) hold the most information about an author's age. That seems to be in line with findings linking age to a higher adherence to linguistic rules, even in an online environment \parencite{DeJonge2012TextmessageStudents, Hovy2015TaggingAge}.
However, the content of the tweets also seems to set the age categories apart, as illustrated by the fact that TAG alone has a relatively high predictive power even for the dataset comprised of 1000 authors (F1: 0.31). The same holds true for LEMMA, implying that age groups are also set apart by the use of one set of words over another.
Here again, the feature stability is low, with a $ \rho \in [0, 0.05]$.\\
Thus, we can conclude that for singular featuresets, the model is able to extract information from the features, especially those with higher context, as evident from the increase in predictive performance when the n-gram range is increased.
However, the relevant information is not stable in the number of authors, which means that additional authors introduce a wider variation that needs to be separated differently than in the smaller set.
As the number of characters is limited overall (and thus the number of features in the lower n-gram range), that automatically implies that the content and therefore the relevant features change. That seems to lead to an overall change in the way individual features are predictive. Thus, the rank correlation is low.
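As an illustration, such a stability check can be computed directly from the weight vectors of two fitted linear models; the sketch below is an assumed implementation (aligned feature indices, absolute weights as importance), not our actual code.
\begin{verbatim}
# Illustrative computation of feature-importance stability between two models,
# e.g. trained on 50 vs. 150 authors (assumes identically ordered features).
import numpy as np
from scipy.stats import spearmanr

def importance_stability(coef_a, coef_b):
    """Spearman's rho between the absolute-weight rankings of two models."""
    rho, _pvalue = spearmanr(np.abs(coef_a), np.abs(coef_b))
    return rho

# Placeholder usage with the coefficient vectors of two fitted linear models:
# rho_150_50 = importance_stability(model_50.coef_[0], model_150.coef_[0])
\end{verbatim}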
\input{tables/gender/latex/results_gender_dwald_direct_500_f1_dist.tex}
\subsection{Cumulated}
The previous analysis has shown that some featuretypes, such as POS, TAG, DEP, and NUM, hold little relevant information. We therefore constructed a subset of featuretypes which excludes them. That subset includes the types ASIS, CHAR, LEMMA, and WORD.
Compared to the previous analysis, we now give the model the possibility to include additional information, i.e. information stemming from different featuretypes, in the model.
As shown by the results in \autoref{tab:gender_f1_dist_500_direct_dwald} and \autoref{tab:age_f1_dist_500_direct_dwald}, the additional information yields an overall increase in performance. How much additional information leads to an improvement differs by target.
For \textit{age}, we find that some additional contextual information improves the outcome. However, when the contextual information becomes larger, e.g. by including LEMMA and WORD, the result does not improve anymore. The result is consistent across different numbers of authors. Consequently, the information for \textit{age} seems to be less reliant on contextual information and content. Already single-word content and context, as captured by CHAR-(2,5) and ASIS-(2,5), is enough for a high prediction score.
When we compare the outcome for \textit{gender} with the results in \autoref{tab:gender_f1_dist_500_baseline_individual}, we see that using a cumulated input improves the results overall. It is especially important to note that when faced with a high number of individual authors, increasing the context by using additional featuretypes such as LEMMA or WORD in addition to high-level ngrams increases performance.
Taken together, our findings show that context and the underlying data structure are important drivers behind the predictions of a model, as shown by the fact that the relevant features in terms of predictiveness change.
At the same time, we show that the weight placed on individual features (and thus individual inputs reflecting certain contexts) is not stable.
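To make the distinction between the two input strategies concrete, the following sketch shows one possible implementation: the cumulated input concatenates the feature blocks of several featuretypes into a single design matrix, while the stacked input trains a meta-classifier on top of per-featuretype base models. This is an assumed, simplified setup for illustration; the stacked configuration used for the dynAA results may differ in detail.
\begin{verbatim}
# Illustrative sketch (assumed implementation): cumulated vs. stacked inputs.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.svm import LinearSVC
from sklearn.ensemble import StackingClassifier

def vectorizers():
    # One vectorizer per featuretype; the ngram ranges are placeholders.
    return [("char", TfidfVectorizer(analyzer="char", ngram_range=(2, 5))),
            ("word", TfidfVectorizer(analyzer="word", ngram_range=(1, 2)))]

# Cumulated: a single linear model on the concatenated feature blocks.
cumulated = make_pipeline(FeatureUnion(vectorizers()), LinearSVC())

# Stacked: one base model per featuretype, combined by a meta-classifier.
stacked = StackingClassifier(
    estimators=[(name, make_pipeline(vec, LinearSVC()))
                for name, vec in vectorizers()],
    final_estimator=LinearSVC())

# cumulated.fit(train_texts, train_labels)
# stacked.fit(train_texts, train_labels)
\end{verbatim}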
\input{tables/age/latex/results_age_dwald_direct_500_f1_dist.tex}
That is evident from the correlation scores across different author sets. The scores are $\rho_{150|50}^{NUM}: 0.42$ at the highest for target gender and $\rho_{150|50}^{DEP-2}: -0.1$ at their lowest.
Consequently, while the predictive accuracy is high, the model seems to rely on correlational patterns which are not only not invariant but also quite unstable when the dataset is changed only slightly.\footnote{All analyses -- stacked as well as cumulated -- were also done on the full number of featuretypes as well as the different numbers of authors and the different input lengths. The results may be found in Appendix \autoref{sec:att_app_tables}.}
\ No newline at end of file
\begin{table}[!ht]
\centering
\caption{Accuracy and F1-Scores for the Prediction of Age on a Minimal Instance Length of 100 Chars Using a Cumulated
Model on the Reduced Featureset (DIST, CHAR, LEMMA, WORD)}
\label{tab:age_acc_100_direct_dwald}
\begin{adjustbox}{max width=\textwidth}
\begin{tabular}{llcccccccc}
\toprule
& Target & \multicolumn{8}{l}{age} \\
& Min. \# Chars in Input & \multicolumn{8}{l}{100} \\
& \# Authors & \multicolumn{2}{l}{50} & \multicolumn{2}{l}{150} & \multicolumn{2}{l}{500} & \multicolumn{2}{l}{1000} \\
& Score & accuracy & f1-score & accuracy & f1-score & accuracy & f1-score & accuracy & f1-score \\
featuretypes & subgrams & & & & & & & & \\
\begin{table}[!ht]
\centering
\caption{Average Distortion of Features for the Prediction of Age on a Minimal Instance Length of 100 Chars Using a Cumulated
Model on the Reduced Featureset (DIST, CHAR, LEMMA, WORD)}
\label{tab:age_dist_100_direct_dwald}
\begin{adjustbox}{max width=\textwidth}
\begin{tabular}{llcccccc}
\toprule
& Target & \multicolumn{6}{l}{age} \\
& Min. \# Chars in Input & \multicolumn{6}{l}{100} \\
& \# Authors & \multicolumn{2}{l}{150} & \multicolumn{2}{l}{500} & \multicolumn{2}{l}{1000} \\
& Score & \thead{Avg. \\ Spearman's $\rho$ (ext.)} & \thead{Avg. \\ Spearman's $\rho$ (red.)} & \thead{Avg. \\ Spearman's $\rho$ (ext.)} & \thead{Avg. \\ Spearman's $\rho$ (red.)} & \thead{Avg. \\ Spearman's $\rho$ (ext.)} & \thead{Avg. \\ Spearman's $\rho$ (red.)} \\
featuretypes & subgrams & & & & & & \\
\midrule
\multirow{4}{*}{DIST} & 2 & 0.0348 & -0.0149 & 0.0099 & 0.0315 & -0.0141 & 0.0174 \\
@@ -6,10 +6,10 @@
\begin{adjustbox}{max width=\textwidth}
\begin{tabular}{llcccccccc}
\toprule
& Target & \multicolumn{8}{l}{age} \\
& Min. \# Chars in Input & \multicolumn{8}{l}{100} \\
& \# Authors & \multicolumn{2}{l}{50} & \multicolumn{2}{l}{150} & \multicolumn{2}{l}{500} & \multicolumn{2}{l}{1000} \\
& Score & f1-score & \thead{Avg. \\ Spearman's $\rho$ (ext.)} & f1-score & \thead{Avg. \\ Spearman's $\rho$ (ext.)} & f1-score & \thead{Avg. \\ Spearman's $\rho$ (ext.)} & f1-score & \thead{Avg. \\ Spearman's $\rho$ (ext.)} \\
featuretypes & subgrams & & & & & & & & \\
\midrule
\multirow{4}{*}{DIST\_CHAR} & 2 & 0.5994 & -- & 0.4861 & 0.0013 & 0.3555 & 0.0077 & 0.2673 & 0.0047 \\
\begin{table}[!ht]
\centering
\caption{Accuracy and F1-Scores for the Prediction of Age on a Minimal Instance Length of 250 Chars Using a Cumulated
Model on the Reduced Featureset (DIST, CHAR, LEMMA, WORD)}
\label{tab:age_acc_250_direct_dwald}
\begin{adjustbox}{max width=\textwidth}
\begin{tabular}{llcccccccc}
\toprule
& Target & \multicolumn{8}{l}{age} \\
& Min. \# Chars in Input & \multicolumn{8}{l}{250} \\
& \# Authors & \multicolumn{2}{l}{50} & \multicolumn{2}{l}{150} & \multicolumn{2}{l}{500} & \multicolumn{2}{l}{1000} \\
& Score & accuracy & f1-score & accuracy & f1-score & accuracy & f1-score & accuracy & f1-score \\