Commit fe65adbd authored by Marcel Henrik Schubert's avatar Marcel Henrik Schubert
Browse files

changed plots slightly

parent e0e3243d
This diff is collapsed.
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import ndjson\n",
"import jsonlines\n",
"import json\n",
"import pickle\n",
"import os\n",
"import sys\n",
"import random as rd\n",
"import json\n",
"import re, regex\n",
"from joblib import dump, load\n",
"import collections\n",
"import math\n",
"import statistics\n",
"\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"path = '../Data/pan19-celebrity-profiling-training-dataset-2019-01-31/stratified_subsample/org/'\n",
"\n",
"subsets = [500]\n",
"\n",
"classifiers = ['SVM']\n",
"\n",
"vect = ['_count', '']\n",
"\n",
"datafolder = 'split_data'\n",
"\n",
"ml_results = 'ml'\n",
"\n",
"filebeg = 'stratified_subsample_'\n",
"\n",
"labels = ['age', 'gender', 'author']"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
......@@ -951,10 +951,10 @@
],
"source": [
"#import matplotlib as mpl\n",
"#mpl.rcParams['font.sans-serif'] = ['Segoe UI Emoji']\n",
"#mpl.rcParams['font.serif'] = ['Segoe UI Emoji']\n",
"mpl.rcParams['font.sans-serif'] = ['Segoe UI Emoji']\n",
"mpl.rcParams['font.serif'] = ['Segoe UI Emoji']\n",
"import seaborn as sns # data visualization library \n",
"#sns.set_style({\"font.sans-serif\":['Segoe UI Emoji']}) \n",
"sns.set_style({\"font.sans-serif\":['Segoe UI Emoji']}) \n",
"for st in subsets:\n",
" for ana in subana:\n",
" an = ana.split('/')[0]\n",
......@@ -2,9 +2,21 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 17,
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "[WinError 3] Das System kann den angegebenen Pfad nicht finden: '../Data/pan19-celebrity-profiling-training-dataset-2019-01-31'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-17-cbac8facd46d>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 10\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mcollections\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 11\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 12\u001b[1;33m \u001b[0mos\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mchdir\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'../Data/pan19-celebrity-profiling-training-dataset-2019-01-31'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;31mFileNotFoundError\u001b[0m: [WinError 3] Das System kann den angegebenen Pfad nicht finden: '../Data/pan19-celebrity-profiling-training-dataset-2019-01-31'"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
......@@ -14,23 +26,77 @@
"%matplotlib inline\n",
"import matplotlib\n",
"import matplotlib.pyplot as plt\n",
"import json\n",
"import collections\n",
"\n",
"os.chdir('../Data/pan19-celebrity-profiling-training-dataset-2019-01-31')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 5,
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "NameError",
"evalue": "name 'ln' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-5-b392649f501c>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mlb\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[0mln\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtranspose\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;31mNameError\u001b[0m: name 'ln' is not defined"
]
}
],
"source": [
"# load from file-like objects\n",
"with open('labels.ndjson', 'r', encoding='utf-8') as f:\n",
" data = ndjson.load(f)\n",
"with open('labels_pre.json', 'r', encoding='utf-8') as f:\n",
" data = json.load(f)\n",
"\n",
"lb = pd.DataFrame(data)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"lb = lb.transpose()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Counter({'retiree': 2405,\n",
" 'old_adult_65': 6211,\n",
" 'adult_50': 11480,\n",
" 'young_adult_35': 13029,\n",
" 'child_21': 711})"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lb['life_phase'] = ''\n",
"lb['life_phase'] = np.where(2019 - lb['birthyear'] > 65, 'retiree', lb['life_phase'])\n",
"lb['life_phase'] = np.where(2019 - lb['birthyear'] < 66, 'old_adult_65', lb['life_phase'])\n",
"lb['life_phase'] = np.where(2019 - lb['birthyear'] < 51, 'adult_50', lb['life_phase'])\n",
"lb['life_phase'] = np.where(2019 - lb['birthyear'] < 36, 'young_adult_35', lb['life_phase'])\n",
"lb['life_phase'] = np.where(2019 - lb['birthyear'] < 22, 'child_21', lb['life_phase'])\n",
"\n",
"collections.Counter(lb.life_phase)"
]
},
{
"cell_type": "code",
"execution_count": 3,
......
This diff is collapsed.
......@@ -93,28 +93,6 @@ def chunkify(fname, gram, size=np.NaN):
else:
sys.exit('Chunking not yet implemented')
###this forces every group to have the same amount of tweets
limIds = []
i = 0
for ind, key in enumerate(ids):
leng = len(authors[key]['lineBytes'])
if leng < 600:
limIds.append(ind)
else:
if leng < minTweets:
minTweets = leng
count = 0
for i in limIds:
key = ids[i-count]
tmp = ids.pop(i-count)
tmp = strat.pop(i-count)
tmp = authors.pop(key)
count +=1
return authors, strat, ids, minTweets
def random_strat_draw(x, strat, size):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment