2018-10-17-Yearly-Top-Topics

In [1]:
import os
import pandas as pd
from text2topics import models
In [2]:
# Auto-reload edited local modules (e.g. text2topics) before each cell
# executes, so library changes are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
In [3]:
data_dir = "/Users/jeriwieringa/Dissertation/data/"
In [4]:
model = models.MalletModel(os.path.join(data_dir, 'model_outputs', 'target_300_10.18497.state.gz'))
In [5]:
df = model.model()
In [6]:
params = model.extract_params()
In [7]:
# Load per-document metadata and key it by row number.
# Naming the columns at read time and promoting the RangeIndex to a
# 'doc_id' column gives each document a sequential id matching the
# '#doc' ids in the model state (merged below).
metadata = pd.read_csv(
    os.path.join(data_dir, "corpus_metadata", "meta.csv"),
    header=None,
    names=['filename', 'citation', 'author',
           'periodical_name', 'volume', 'issue',
           'date', 'page', 'url', 'abrev'],
).rename_axis('doc_id').reset_index()
# Parsed publication date, used for yearly grouping.
metadata['date_formatted'] = pd.to_datetime(metadata['date'])
In [8]:
# Load Labels

import gspread
from oauth2client.service_account import ServiceAccountCredentials

# Load data from Google Doc
# NOTE(review): this is the legacy Sheets "feeds" OAuth scope; current
# gspread expects 'https://www.googleapis.com/auth/spreadsheets' — confirm
# against the installed gspread/oauth2client versions.
scope = ['https://spreadsheets.google.com/feeds']
# NOTE(review): hardcoded absolute path to a service-account key file —
# keep the key out of version control; consider an environment variable.
secrets = "/Users/jeriwieringa/Dissertation/dev/code/secrets/dissertation-881847769b13.json"
credentials = ServiceAccountCredentials.from_json_keyfile_name(secrets, scope)
gc = gspread.authorize(credentials)
# First worksheet of the 'Topic Labels' spreadsheet; its rows are merged
# onto topic ids via the 'mallet_topic_id' column further below.
dts = gc.open('Topic Labels').sheet1

labels = pd.DataFrame(dts.get_all_records())
In [9]:
doc_topic = df.groupby(['#doc', 'topic'])['type'].count().reset_index(name="token_count")
In [10]:
dt = doc_topic.merge(metadata, how='left', left_on="#doc", right_on="doc_id")
In [11]:
y_dt = dt.groupby([dt.date_formatted.dt.year, "topic"])['token_count'].sum().reset_index(name='token_count')
In [12]:
y_m = models.pivot_smooth_norm(y_dt, params[0],'date_formatted', "topic", 'token_count')
In [13]:
y_m[:2]
Out[13]:
topic 0 1 2 3 4 5 6 7 8 9 ... 240 241 242 243 244 245 246 247 248 249
date_formatted
1849 0.000466 0.002586 0.001395 0.003516 0.010145 0.004582 0.001459 0.007097 0.000664 6.630487e-07 ... 0.000995 0.000532 0.000067 0.000334 7.931035e-07 8.578557e-07 0.000001 0.000001 0.000200 7.317510e-07
1850 0.000748 0.000168 0.017215 0.002281 0.017356 0.010193 0.004587 0.017163 0.001495 1.032088e-04 ... 0.004613 0.002564 0.001134 0.000954 5.169408e-05 6.459166e-05 0.000142 0.000155 0.000116 3.093819e-04

2 rows × 250 columns

In [14]:
y_topics = y_m.unstack().reset_index(name='topic_proportion')
In [15]:
def _top_n_topics(proportions, n):
    """Return the n highest-proportion topics within each year.

    proportions: long-form frame with 'date_formatted', 'topic', and
    'topic_proportion' columns. Result keeps those columns, with n rows
    per year ordered by descending proportion (index dropped).
    """
    return (proportions
            .groupby('date_formatted')
            .apply(lambda x: x.nlargest(n, 'topic_proportion'))
            .reset_index(drop=True))

# Previously these two lines duplicated the same groupby/nlargest pipeline
# inline; factored into one parameterized helper.
top_10 = _top_n_topics(y_topics, 10)
top_5 = _top_n_topics(y_topics, 5)
In [16]:
top_10[:3]
Out[16]:
topic date_formatted topic_proportion
0 155 1849 0.122260
1 103 1849 0.057418
2 131 1849 0.051848
In [17]:
def merge_labels(df):
    """Left-join human-readable topic labels onto a frame with a 'topic' column.

    Matches df['topic'] against 'mallet_topic_id' in the module-level
    `labels` frame (loaded from the 'Topic Labels' sheet above). Topics
    without a label row are kept with NaN label fields.
    """
    return df.merge(labels, how="left", left_on="topic", right_on="mallet_topic_id")

# Bug fix: top_10 previously inlined a copy of merge_labels' body instead
# of calling the helper; both frames now go through the same function.
top_10 = merge_labels(top_10)
top_5 = merge_labels(top_5)
In [18]:
# Set up offline Plotly rendering (figures embed in the notebook without
# a Plotly cloud account) and enable cufflinks' DataFrame .iplot() API.
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf

init_notebook_mode(connected=True)
cf.go_offline()