2018-10-17-Yearly-Top-Topics

In [1]:
import os
import pandas as pd
from text2topics import models
In [2]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell is executed so edits to imported source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
In [3]:
# Root directory holding the model outputs and corpus metadata read below.
# The location can be overridden with the DISSERTATION_DATA_DIR environment
# variable so the notebook is runnable on another machine; when the variable
# is unset the original hard-coded path is used.
data_dir = os.environ.get("DISSERTATION_DATA_DIR", "/Users/jeriwieringa/Dissertation/data/")
In [4]:
# Wrap the gzipped state file found under <data_dir>/model_outputs in the
# project's MalletModel helper (presumably a MALLET topic-state dump -- confirm).
model = models.MalletModel(
    os.path.join(
        data_dir,
        'model_outputs',
        'target_300_10.18497.state.gz',
    )
)
In [5]:
# Materialise the model as a table. Later cells group this frame by its
# '#doc' and 'topic' columns and count its 'type' column, so it is presumably
# one row per token assignment -- TODO confirm against text2topics.
df = model.model()
In [6]:
# Pull the model parameters; only the first element (params[0]) is used later,
# where it is handed to models.pivot_smooth_norm (presumably as the smoothing
# value(s) -- confirm against text2topics).
params = model.extract_params()
In [7]:
# Corpus metadata: the CSV has no header row, so it is read positionally and
# the row position is promoted to an explicit column via reset_index().
metadata = pd.read_csv(
    os.path.join(data_dir, "corpus_metadata", "meta.csv"),
    header=None,
).reset_index()

# Name the columns: the positional index first (doc_id), then the CSV fields
# in file order.
metadata.columns = [
    'doc_id',
    'filename',
    'citation',
    'author',
    'periodical_name',
    'volume',
    'issue',
    'date',
    'page',
    'url',
    'abrev',
]

# Keep the raw 'date' column and add a parsed datetime version next to it.
metadata['date_formatted'] = pd.to_datetime(metadata['date'])
In [8]:
# Load Labels

import gspread
from oauth2client.service_account import ServiceAccountCredentials

# Fetch the topic labels from Google Sheets: authorise with a service-account
# key file, open the 'Topic Labels' spreadsheet and read its first worksheet.
secrets = "/Users/jeriwieringa/Dissertation/dev/code/secrets/dissertation-881847769b13.json"
scope = ['https://spreadsheets.google.com/feeds']

credentials = ServiceAccountCredentials.from_json_keyfile_name(secrets, scope)
gc = gspread.authorize(credentials)
dts = gc.open('Topic Labels').sheet1

# One DataFrame row per worksheet record.
labels = pd.DataFrame(data=dts.get_all_records())
In [9]:
# Count the non-null 'type' entries for every (document, topic) pair and
# store the result as a flat table with a 'token_count' column.
doc_topic = (
    df.groupby(['#doc', 'topic'])['type']
    .count()
    .reset_index(name="token_count")
)
In [10]:
# Attach the metadata row for each document; a left join keeps every
# (document, topic) count even when no metadata row matches.
dt = pd.merge(
    doc_topic,
    metadata,
    how='left',
    left_on="#doc",
    right_on="doc_id",
)
In [11]:
# Sum token counts per (publication year, topic). The year is taken from the
# parsed date column, so the resulting key column is still named
# 'date_formatted'.
y_dt = (
    dt.groupby([dt['date_formatted'].dt.year, "topic"])['token_count']
    .sum()
    .reset_index(name='token_count')
)
In [12]:
# Build the year-by-topic matrix with the project helper: judging by its name
# and the output below it pivots (rows = 'date_formatted', columns = 'topic',
# values = 'token_count'), smooths with params[0] and normalises -- TODO confirm
# the exact smoothing/normalisation against text2topics.models.
y_m = models.pivot_smooth_norm(y_dt, params[0],'date_formatted', "topic", 'token_count')
In [13]:
# Preview the first two rows (years) of the year-by-topic matrix.
y_m.head(2)
Out[13]:
topic 0 1 2 3 4 5 6 7 8 9 ... 240 241 242 243 244 245 246 247 248 249
date_formatted
1849 0.000466 0.002586 0.001395 0.003516 0.010145 0.004582 0.001459 0.007097 0.000664 6.630487e-07 ... 0.000995 0.000532 0.000067 0.000334 7.931035e-07 8.578557e-07 0.000001 0.000001 0.000200 7.317510e-07
1850 0.000748 0.000168 0.017215 0.002281 0.017356 0.010193 0.004587 0.017163 0.001495 1.032088e-04 ... 0.004613 0.002564 0.001134 0.000954 5.169408e-05 6.459166e-05 0.000142 0.000155 0.000116 3.093819e-04

2 rows × 250 columns

In [14]:
# Reshape the wide year-by-topic matrix into long form: one row per
# (topic, year) pair with its value in 'topic_proportion'.
y_topics = (
    y_m
    .unstack()
    .reset_index(name='topic_proportion')
)
In [15]:
def _top_n_per_year(proportions, n):
    """Return the ``n`` rows with the largest ``topic_proportion`` in each year.

    Parameters
    ----------
    proportions : pandas.DataFrame
        Long-form table with ``date_formatted`` and ``topic_proportion`` columns.
    n : int
        Number of rows to keep per ``date_formatted`` value.

    Returns
    -------
    pandas.DataFrame
        Rows ordered by year (ascending), then proportion (descending), with a
        fresh 0..N-1 index and all input columns preserved.
    """
    # Sort + groupby.head is used rather than groupby.apply(nlargest):
    # apply operating on the grouping column is deprecated in recent pandas,
    # and once removed it would drop 'date_formatted' from the result, which
    # the later cells rely on.
    ranked = proportions.sort_values(
        ['date_formatted', 'topic_proportion'],
        ascending=[True, False],
    )
    return ranked.groupby('date_formatted').head(n).reset_index(drop=True)


top_10 = _top_n_per_year(y_topics, 10)
top_5 = _top_n_per_year(y_topics, 5)
In [16]:
# Preview the first three rows of the per-year top-10 table.
top_10.head(3)
Out[16]:
topic date_formatted topic_proportion
0 155 1849 0.122260
1 103 1849 0.057418
2 131 1849 0.051848
In [17]:
def merge_labels(df):
    """Left-join the module-level ``labels`` table onto ``df``.

    Rows of ``df`` are matched on ``df['topic'] == labels['mallet_topic_id']``;
    rows without a matching label are kept. Returns the merged DataFrame and
    leaves ``df`` itself unmodified.
    """
    return pd.merge(
        df,
        labels,
        how="left",
        left_on="topic",
        right_on="mallet_topic_id",
    )


# Label both ranking tables through the same helper.
top_10 = merge_labels(top_10)
top_5 = merge_labels(top_5)
In [18]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf

# Prepare the notebook for offline plotly rendering. With connected=True
# plotly.js is presumably loaded from the CDN rather than embedded in the
# notebook -- confirm against the plotly.offline docs.
init_notebook_mode(connected=True)
# Switch cufflinks to offline mode so DataFrame.iplot() renders inline
# without a plotly account (per cufflinks' documented usage -- confirm).
cf.go_offline()

Browse Top Topics by Year

Interface for viewing the top 5 topics in each year. Select a starting and an ending year; both are included in the displayed data. The interface also includes an interactive chart of the top topic weights over time.

In [19]:
import ipywidgets as widgets
from ipywidgets import interactive
In [20]:
# Distinct years, in order of first appearance, offered as widget options.
years = top_10.loc[:, 'date_formatted'].unique().tolist()

# Table the browsing interface reads from (the per-year top-5 ranking).
frame = top_5

def top_topics(start_year='', end_year=''):
    """Show the top-topic rows and a per-label trend chart for a span of years.

    Reads the module-level ``frame`` table, keeps the rows whose
    ``date_formatted`` value lies between ``start_year`` and ``end_year``
    (both inclusive), displays them in full, and plots the mean
    ``topic_proportion`` per ``topic_label`` over ``date_formatted``.

    Parameters
    ----------
    start_year, end_year
        Inclusive bounds compared against ``frame['date_formatted']``. An
        empty string (the default) or ``None`` leaves that side unbounded,
        so calling the function without arguments shows every year instead
        of failing on a string/number comparison.

    Returns
    -------
    None
        All output is rendered through ``display`` and ``iplot``.
    """
    def _is_unset(bound):
        # Only '' and None mean "no bound"; a year of 0 would still be a bound.
        return bound is None or (isinstance(bound, str) and bound == '')

    in_range = pd.Series(True, index=frame.index)
    if not _is_unset(start_year):
        in_range &= frame['date_formatted'] >= start_year
    if not _is_unset(end_year):
        in_range &= frame['date_formatted'] <= end_year
    topics = frame[in_range]

    # Nothing to tabulate or plot (e.g. start year after end year): say so
    # instead of handing an empty table to pivot_table/iplot.
    if topics.empty:
        print("No topics found between {!r} and {!r}.".format(start_year, end_year))
        return

    # Lift pandas' row/column truncation so the whole selection is visible.
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(topics)

    # Years as rows, topic labels as columns; pivot_table averages duplicates.
    display_m = pd.pivot_table(
        topics,
        index='date_formatted',
        columns="topic_label",
        values="topic_proportion",
    )
    display_m.iplot()
In [21]:
# Two independent list selectors, each offering every available year.
start, end = (widgets.Select(options=years) for _ in range(2))

# Bind the selectors to top_topics' keyword arguments; as the cell's last
# expression the resulting interactive widget is what gets displayed.
interactive(top_topics, start_year=start, end_year=end)
interactive(children=(Select(description='start_year', options=(1849, 1850, 1851, 1852, 1853, 1854, 1855, 1856…
In [ ]:
 
In [ ]:
 
In [ ]: