2019-01-17-Yearly-Top-Labels
In [1]:
import os
import pandas as pd
from text2topics import models
In [2]:
# Reload imported modules before each cell executes, so edits to local
# packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
In [3]:
# Root folder holding the model outputs and corpus metadata read below.
# Set the DISSERTATION_DATA_DIR environment variable to run the notebook on
# another machine; when it is unset or empty the original location is used.
data_dir = os.environ.get("DISSERTATION_DATA_DIR") or "/Users/jeriwieringa/Dissertation/data/"
In [4]:
# Build the model wrapper from the saved state file under data_dir/model_outputs.
model = models.MalletModel(
    os.path.join(data_dir, 'model_outputs', 'target_300_10.18497.state.gz')
)
In [5]:
# Build the model table; a later cell groups it by '#doc' and 'topic' and
# counts its 'type' column.
df = model.model()
In [6]:
# Pull the model parameters; in the cells visible here only the first element
# (params[0]) is used, as an argument to models.pivot_smooth_norm.
params = model.extract_params()
In [7]:
# Load metadata
# The CSV is read without a header row; reset_index() moves the row number
# into the first column, which is labelled doc_id below.
metadata = (
    pd.read_csv(
        os.path.join(data_dir, "corpus_metadata", "meta.csv"),
        header=None,
    )
    .reset_index()
)
metadata.columns = [
    'doc_id', 'filename', 'citation', 'author', 'periodical_name', 'volume',
    'issue', 'date', 'page', 'url', 'abrev',
]
# Keep a parsed copy of the date column for the year-based grouping later on.
metadata['date_formatted'] = pd.to_datetime(metadata['date'])
In [8]:
# Load Labels
# Topic labels are read from a Google spreadsheet using a service-account key file.
import gspread
from oauth2client.service_account import ServiceAccountCredentials
# Load data from Google Doc
# NOTE(review): only the Sheets feeds scope is requested; confirm that is
# sufficient for opening a spreadsheet by title with the installed gspread version.
scope = ['https://spreadsheets.google.com/feeds']
# NOTE(review): absolute, machine-specific path to the key file; the notebook
# cannot run elsewhere without editing it.
secrets = "/Users/jeriwieringa/Dissertation/dev/code/secrets/dissertation-881847769b13.json"
credentials = ServiceAccountCredentials.from_json_keyfile_name(secrets, scope)
gc = gspread.authorize(credentials)
# Work with the sheet1 worksheet of the 'Topic Labels' spreadsheet.
dts = gc.open('Topic Labels').sheet1
# Sheet records as a DataFrame; a later cell uses its mallet_topic_id,
# topic_category and topic_label columns.
labels = pd.DataFrame(dts.get_all_records())
In [9]:
# Count the non-null 'type' entries for every (document, topic) pair.
doc_topic = (
    df.groupby(['#doc', 'topic'])['type']
    .count()
    .reset_index(name="token_count")
)
In [10]:
# Attach the document metadata to each (document, topic) count; the left join
# keeps every count row even when no metadata row matches.
dt = pd.merge(
    doc_topic,
    metadata,
    how='left',
    left_on="#doc",
    right_on="doc_id",
)
In [11]:
# Total token count per (publication year, topic). The year key is derived
# from date_formatted, so the resulting column keeps that name.
y_dt = (
    dt.groupby([dt['date_formatted'].dt.year, "topic"])['token_count']
    .sum()
    .reset_index(name='token_count')
)
In [12]:
# Hand the yearly counts to the text2topics helper together with the first
# model parameter and the names of the year, topic and count columns.
# NOTE(review): the helper's name suggests pivot + smoothing + normalisation;
# see text2topics.models for the exact behaviour.
y_m = models.pivot_smooth_norm(
    y_dt,
    params[0],
    'date_formatted',
    'topic',
    'token_count',
)
In [13]:
# Peek at the start of the table returned by pivot_smooth_norm as a sanity check.
y_m[:2]
Out[13]:
In [14]:
# Flatten the pivoted table back to long form with the values in a
# 't_proportion' column; the next cell relies on its 'topic' and
# 'date_formatted' columns.
y_topics = (
    y_m
    .unstack()
    .reset_index(name='t_proportion')
)
Add label metadata back and aggregate by topic category
In [15]:
# Join each topic to its hand-assigned label, keep only the columns needed,
# then sum topic proportions within every (year, category) pair.
category_df = (
    y_topics
    .merge(labels, how="left", left_on="topic", right_on="mallet_topic_id")
    [['topic', 'date_formatted', 't_proportion', 'topic_category', 'topic_label']]
    .groupby(['date_formatted', 'topic_category'])['t_proportion']
    .sum()
    .reset_index(name='category_prevalence')
)
In [16]:
# Show the first 35 rows of the yearly category prevalence table.
category_df.head(35)
Out[16]:
This chart gives us an overview of the makeup of the periodical literature in a given year. These numbers seem reasonable given my reading of these periodical documents.
Browse Top Topics by Year
Interface for viewing the top 25 topics in each year. Select a starting and an ending year (both are included in the displayed data). It also includes an interactive chart of the top weights over time.
In [17]:
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
# Keep charts from syncing with Plotly servers: initialise plotly's notebook
# mode and switch cufflinks to offline plotting.
# NOTE(review): per plotly's documentation, connected=True loads plotly.js
# from a CDN, so rendering still needs network access; confirm that is intended.
init_notebook_mode(connected=True)
cf.go_offline()