2018-10-17-Yearly-Top-Topics
In [1]:
import os
import pandas as pd
from text2topics import models
In [2]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# are reloaded before each cell runs (useful while editing local packages).
%load_ext autoreload
%autoreload 2
In [3]:
# Root directory of the project data. Set the DISSERTATION_DATA_DIR environment
# variable to point the notebook at another location; when it is unset the
# original local path is used, so existing behaviour is unchanged.
data_dir = os.environ.get("DISSERTATION_DATA_DIR", "/Users/jeriwieringa/Dissertation/data/")
In [4]:
# Location of the gzipped state file, under <data_dir>/model_outputs.
state_file = os.path.join(data_dir, 'model_outputs', 'target_300_10.18497.state.gz')
model = models.MalletModel(state_file)
In [5]:
# NOTE(review): presumably parses the state file into a token-level DataFrame —
# confirm against text2topics. Later cells group it by '#doc' and 'topic' and
# count its 'type' column.
df = model.model()
In [6]:
# Model parameters as returned by text2topics; only the first element
# (params[0]) is used later, as an argument to models.pivot_smooth_norm.
# NOTE(review): meaning of each element is not visible here — confirm.
params = model.extract_params()
In [7]:
# Load the corpus metadata. The CSV has no header row, so column names are
# assigned by hand; the first name labels the column created by reset_index().
meta_csv = os.path.join(data_dir, "corpus_metadata", "meta.csv")
metadata = pd.read_csv(meta_csv, header=None).reset_index()
metadata.columns = [
    'doc_id',
    'filename', 'citation', 'author', 'periodical_name', 'volume',
    'issue', 'date', 'page', 'url', 'abrev',
]
# Parsed copy of the raw 'date' column, used for grouping by year.
metadata['date_formatted'] = pd.to_datetime(metadata['date'])
In [8]:
# Load the topic labels from a Google Sheet into the `labels` DataFrame.
# Later cells rely on its 'mallet_topic_id' and 'topic_label' columns.
import gspread
from oauth2client.service_account import ServiceAccountCredentials
# Authorise with a service-account key file, then read the first worksheet
# of the spreadsheet titled 'Topic Labels'.
# NOTE(review): `secrets` is an absolute path on the author's machine; the
# cell cannot run elsewhere without editing it.
scope = ['https://spreadsheets.google.com/feeds']
secrets = "/Users/jeriwieringa/Dissertation/dev/code/secrets/dissertation-881847769b13.json"
credentials = ServiceAccountCredentials.from_json_keyfile_name(secrets, scope)
gc = gspread.authorize(credentials)
dts = gc.open('Topic Labels').sheet1
# One DataFrame row per record returned by the worksheet.
labels = pd.DataFrame(dts.get_all_records())
In [9]:
# Count the non-null 'type' entries for every (#doc, topic) pair and store
# the result in a 'token_count' column.
doc_topic = (
    df.groupby(['#doc', 'topic'])['type']
    .count()
    .reset_index(name="token_count")
)
In [10]:
# Attach the metadata to each document/topic row. The left join keeps every
# row of doc_topic, including documents with no metadata match.
dt = pd.merge(doc_topic, metadata, how='left', left_on="#doc", right_on="doc_id")
In [11]:
# Sum token counts per (year, topic). The year is taken from 'date_formatted',
# so the grouped column keeps the name 'date_formatted'.
y_dt = (
    dt.groupby([dt['date_formatted'].dt.year, "topic"])['token_count']
    .sum()
    .reset_index(name='token_count')
)
In [12]:
# NOTE(review): presumably pivots the yearly counts into a year-by-topic table,
# smooths with params[0] and normalises — confirm against text2topics.models.
y_m = models.pivot_smooth_norm(y_dt, params[0],'date_formatted', "topic", 'token_count')
In [13]:
# Preview a two-row slice of the table as a sanity check.
y_m[:2]
Out[13]:
In [14]:
# Flatten y_m back into long form, with the values in 'topic_proportion'.
# NOTE(review): assumes y_m is a DataFrame whose axes are named 'date_formatted'
# and 'topic' (later cells use those column names) — confirm.
y_topics = y_m.unstack().reset_index(name='topic_proportion')
In [15]:
def _top_n_per_year(topics, n):
    """Return the ``n`` rows with the largest ``topic_proportion`` in each year.

    ``topics`` is grouped on ``date_formatted``; within each group the rows are
    picked with ``nlargest``, so they come back ordered from highest to lowest
    proportion. The result has a fresh 0..N-1 index.
    """
    return (
        topics.groupby('date_formatted')
        .apply(lambda year_rows: year_rows.nlargest(n, 'topic_proportion'))
        .reset_index(drop=True)
    )


# Same selection at two cut-offs; only the number of topics kept differs.
top_10 = _top_n_per_year(y_topics, 10)
top_5 = _top_n_per_year(y_topics, 5)
In [16]:
# Peek at the first three rows of the top-10 table.
top_10.head(3)
Out[16]:
In [17]:
def merge_labels(df):
    """Attach the topic labels to ``df``.

    Left-joins the module-level ``labels`` table onto ``df``, matching
    ``df['topic']`` against ``labels['mallet_topic_id']``; rows of ``df``
    without a matching label are kept.

    If ``df`` already has a ``mallet_topic_id`` column it is returned
    unchanged. This keeps the cell safe to re-run: a second merge would give
    the overlapping label columns ``_x``/``_y`` suffixes, and later code that
    reads ``topic_label`` would no longer find it.
    """
    if "mallet_topic_id" in df.columns:
        return df
    return df.merge(labels, how="left", left_on="topic", right_on="mallet_topic_id")


# Label both tables through the same helper so they cannot drift apart.
top_10 = merge_labels(top_10)
top_5 = merge_labels(top_5)
In [18]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
# Set up plotly for rendering inside the notebook and switch cufflinks to
# offline mode; DataFrame.iplot() is used later to draw the chart.
init_notebook_mode(connected=True)
cf.go_offline()
Browse Top Topics by Year
Interface for viewing the top topics in each year (currently the top 5, as set by `frame` below). Select a starting and an ending year; both are included in the displayed data. Also includes an interactive chart of the top topic weights over time.
In [19]:
import ipywidgets as widgets
from ipywidgets import interactive
In [20]:
# Years available for selection: the unique 'date_formatted' values of top_10.
years = top_10['date_formatted'].unique().tolist()
# Table that top_topics() filters and displays.
frame = top_5
def top_topics(start_year='', end_year=''):
    """Display the top topics for a span of years and chart their weights.

    Reads the module-level ``frame``, keeps the rows whose ``date_formatted``
    lies between the two bounds (both inclusive), shows all of them, and draws
    an interactive chart of ``topic_proportion`` per ``topic_label`` over time.

    Parameters
    ----------
    start_year, end_year
        Inclusive bounds compared against ``frame['date_formatted']``.
        A bound left at its default ``''`` (or passed as ``None``) leaves that
        side of the range open, so ``top_topics()`` shows every year instead of
        comparing the year column against an empty string.

    Returns
    -------
    None
        Output is rendered through ``display`` and ``iplot``.
    """
    def _is_unset(bound):
        # '' is the signature default; None is accepted as an equivalent.
        return bound is None or (isinstance(bound, str) and bound == '')

    in_range = pd.Series(True, index=frame.index)
    if not _is_unset(start_year):
        in_range &= frame['date_formatted'] >= start_year
    if not _is_unset(end_year):
        in_range &= frame['date_formatted'] <= end_year
    topics = frame[in_range]

    if topics.empty:
        # Typically the start year is later than the end year; say so rather
        # than rendering an empty table and an empty chart.
        print("No topics found between {} and {}.".format(start_year, end_year))
        return

    # Lift pandas' display limits so every selected row and column is shown.
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(topics)

    # One line per topic label, one point per year.
    display_m = pd.pivot_table(
        topics,
        index='date_formatted',
        columns="topic_label",
        values="topic_proportion",
    )
    display_m.iplot()
In [21]:
# One selector per bound; each is its own widget listing every available year.
start, end = (widgets.Select(options=years) for _ in range(2))
# Re-run top_topics whenever either selection changes.
interactive(top_topics, start_year=start, end_year=end)
In [ ]:
In [ ]:
In [ ]: