2018-10-17-Yearly-Top-Topics

In [1]:
import os
import pandas as pd
from text2topics import models
In [2]:
# Auto-reload edited local modules (e.g. text2topics) before each cell
# executes, so library changes are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
In [3]:
data_dir = "/Users/jeriwieringa/Dissertation/data/"
In [4]:
model = models.MalletModel(os.path.join(data_dir, 'model_outputs', 'target_300_10.18497.state.gz'))
In [5]:
df = model.model()
In [6]:
params = model.extract_params()
In [7]:
# Load per-document metadata and key it by row number.
# Naming the columns at read time and promoting the RangeIndex to a
# 'doc_id' column gives each document a sequential id matching the
# '#doc' ids in the model state (merged below).
metadata = pd.read_csv(
    os.path.join(data_dir, "corpus_metadata", "meta.csv"),
    header=None,
    names=['filename', 'citation', 'author',
           'periodical_name', 'volume', 'issue',
           'date', 'page', 'url', 'abrev'],
).rename_axis('doc_id').reset_index()
# Parsed publication date, used for yearly grouping.
metadata['date_formatted'] = pd.to_datetime(metadata['date'])
In [8]:
# Load Labels

import gspread
from oauth2client.service_account import ServiceAccountCredentials

# Load data from Google Doc
# NOTE(review): this is the legacy Sheets "feeds" OAuth scope; current
# gspread expects 'https://www.googleapis.com/auth/spreadsheets' — confirm
# against the installed gspread/oauth2client versions.
scope = ['https://spreadsheets.google.com/feeds']
# NOTE(review): hardcoded absolute path to a service-account key file —
# keep the key out of version control; consider an environment variable.
secrets = "/Users/jeriwieringa/Dissertation/dev/code/secrets/dissertation-881847769b13.json"
credentials = ServiceAccountCredentials.from_json_keyfile_name(secrets, scope)
gc = gspread.authorize(credentials)
# First worksheet of the 'Topic Labels' spreadsheet; its rows are merged
# onto topic ids via the 'mallet_topic_id' column further below.
dts = gc.open('Topic Labels').sheet1

labels = pd.DataFrame(dts.get_all_records())
In [9]:
doc_topic = df.groupby(['#doc', 'topic'])['type'].count().reset_index(name="token_count")
In [10]:
dt = doc_topic.merge(metadata, how='left', left_on="#doc", right_on="doc_id")
In [11]:
y_dt = dt.groupby([dt.date_formatted.dt.year, "topic"])['token_count'].sum().reset_index(name='token_count')
In [12]:
y_m = models.pivot_smooth_norm(y_dt, params[0],'date_formatted', "topic", 'token_count')
In [13]:
y_m[:2]
Out[13]:
topic 0 1 2 3 4 5 6 7 8 9 ... 240 241 242 243 244 245 246 247 248 249
date_formatted
1849 0.000466 0.002586 0.001395 0.003516 0.010145 0.004582 0.001459 0.007097 0.000664 6.630487e-07 ... 0.000995 0.000532 0.000067 0.000334 7.931035e-07 8.578557e-07 0.000001 0.000001 0.000200 7.317510e-07
1850 0.000748 0.000168 0.017215 0.002281 0.017356 0.010193 0.004587 0.017163 0.001495 1.032088e-04 ... 0.004613 0.002564 0.001134 0.000954 5.169408e-05 6.459166e-05 0.000142 0.000155 0.000116 3.093819e-04

2 rows × 250 columns

In [14]:
y_topics = y_m.unstack().reset_index(name='topic_proportion')
In [15]:
def _top_n_topics(proportions, n):
    """Return the n highest-proportion topics within each year.

    proportions: long-form frame with 'date_formatted', 'topic', and
    'topic_proportion' columns. Result keeps those columns, with n rows
    per year ordered by descending proportion (index dropped).
    """
    return (proportions
            .groupby('date_formatted')
            .apply(lambda x: x.nlargest(n, 'topic_proportion'))
            .reset_index(drop=True))

# Previously these two lines duplicated the same groupby/nlargest pipeline
# inline; factored into one parameterized helper.
top_10 = _top_n_topics(y_topics, 10)
top_5 = _top_n_topics(y_topics, 5)
In [16]:
top_10[:3]
Out[16]:
topic date_formatted topic_proportion
0 155 1849 0.122260
1 103 1849 0.057418
2 131 1849 0.051848
In [17]:
def merge_labels(df):
    """Left-join human-readable topic labels onto a frame with a 'topic' column.

    Matches df['topic'] against 'mallet_topic_id' in the module-level
    `labels` frame (loaded from the 'Topic Labels' sheet above). Topics
    without a label row are kept with NaN label fields.
    """
    return df.merge(labels, how="left", left_on="topic", right_on="mallet_topic_id")

# Bug fix: top_10 previously inlined a copy of merge_labels' body instead
# of calling the helper; both frames now go through the same function.
top_10 = merge_labels(top_10)
top_5 = merge_labels(top_5)
In [18]:
# Set up offline Plotly rendering (figures embed in the notebook without
# a Plotly cloud account) and enable cufflinks' DataFrame .iplot() API.
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf

init_notebook_mode(connected=True)
cf.go_offline()