2018-10-17-Yearly-Top-Topics
In [1]:
import os
import pandas as pd
from text2topics import models
In [2]:
# Enable IPython's autoreload extension in mode 2 for this session.
%load_ext autoreload
%autoreload 2
In [3]:
# Root data directory; the model state file and the corpus metadata CSV are
# both resolved relative to it.
data_dir = "/Users/jeriwieringa/Dissertation/data/"
In [4]:
# Locate the state file for this model run, then hand it to the model wrapper.
state_file = os.path.join(data_dir, 'model_outputs', 'target_300_10.18497.state.gz')
model = models.MalletModel(state_file)
In [5]:
# Build the model table. Later cells group it by '#doc' and 'topic' and count
# the 'type' column (presumably one row per token -- confirm against text2topics).
df = model.model()
In [6]:
# Extract the model parameters; params[0] is passed to models.pivot_smooth_norm
# when the yearly matrix is built.
params = model.extract_params()
In [7]:
# Load the corpus metadata (the CSV has no header row).
metadata_path = os.path.join(data_dir, "corpus_metadata", "meta.csv")
metadata_columns = [
    'doc_id', 'filename', 'citation', 'author',
    'periodical_name', 'volume', 'issue',
    'date', 'page', 'url', 'abrev',
]
# reset_index() turns the row position into the first column, which is then
# named 'doc_id' and used as the join key against the model's '#doc' values.
metadata = pd.read_csv(metadata_path, header=None).reset_index()
metadata.columns = metadata_columns
# Parsed copy of the date column, used for grouping by year.
metadata['date_formatted'] = pd.to_datetime(metadata['date'])
In [8]:
# Load the topic labels from a Google spreadsheet into the `labels` DataFrame.
import gspread
from oauth2client.service_account import ServiceAccountCredentials
# Authorise with a service-account key file, then read the first worksheet of
# the 'Topic Labels' spreadsheet.
# NOTE(review): only the Sheets feeds scope is requested. Newer gspread releases
# may also need 'https://www.googleapis.com/auth/drive' to open a spreadsheet
# by title -- confirm against the installed version.
scope = ['https://spreadsheets.google.com/feeds']
# Absolute path to the service-account JSON key on the author's machine.
secrets = "/Users/jeriwieringa/Dissertation/dev/code/secrets/dissertation-881847769b13.json"
credentials = ServiceAccountCredentials.from_json_keyfile_name(secrets, scope)
gc = gspread.authorize(credentials)
dts = gc.open('Topic Labels').sheet1
# Build the labels table from the worksheet records; a later cell joins it on
# its 'mallet_topic_id' column.
labels = pd.DataFrame(dts.get_all_records())
In [9]:
# Count the non-null 'type' entries for each (document, topic) pair and expose
# the count as a regular 'token_count' column.
type_counts = df.groupby(['#doc', 'topic'])['type'].count()
doc_topic = type_counts.reset_index(name="token_count")
In [10]:
# Attach the metadata to every (document, topic) row. The left join keeps rows
# whose '#doc' has no matching 'doc_id'.
dt = pd.merge(doc_topic, metadata, how='left', left_on="#doc", right_on="doc_id")
In [11]:
# Total the token counts per (year, topic). The year Series keeps the name
# 'date_formatted', so that is the name of the resulting year column.
publication_year = dt.date_formatted.dt.year
yearly_totals = dt.groupby([publication_year, "topic"])['token_count'].sum()
y_dt = yearly_totals.reset_index(name='token_count')
In [12]:
# Build the yearly topic matrix from the per-year totals.
# NOTE(review): judging by its name, pivot_smooth_norm pivots on
# 'date_formatted' / 'topic', smooths with params[0] and normalises the
# 'token_count' values -- confirm against the text2topics source.
y_m = models.pivot_smooth_norm(y_dt, params[0],'date_formatted', "topic", 'token_count')
In [13]:
# Preview the first two entries of the yearly matrix.
y_m[:2]
Out[13]:
In [14]:
# Reshape the matrix to long form: one row per cell, with the cell value stored
# in a 'topic_proportion' column.
stacked_values = y_m.unstack()
y_topics = stacked_values.reset_index(name='topic_proportion')
In [15]:
def _largest_per_year(frame, n):
    """Return the n rows with the highest 'topic_proportion' in each 'date_formatted' group."""
    per_year = frame.groupby('date_formatted').apply(
        lambda group: group.nlargest(n, 'topic_proportion')
    )
    # Drop the group index produced by apply() so the result has a flat index.
    return per_year.reset_index(drop=True)


top_10 = _largest_per_year(y_topics, 10)
top_5 = _largest_per_year(y_topics, 5)
In [16]:
# Preview the first three rows of the top-10 table.
top_10[:3]
Out[16]:
In [17]:
def merge_labels(df):
    """Attach the topic labels to ``df``.

    Left-joins the module-level ``labels`` table onto ``df``, matching
    ``df['topic']`` against ``labels['mallet_topic_id']``. Rows whose topic
    has no entry in ``labels`` are kept, with missing values in the label
    columns.

    Args:
        df: DataFrame with a ``topic`` column.

    Returns:
        A new DataFrame holding the columns of ``df`` followed by the
        columns of ``labels``.
    """
    return df.merge(labels, how="left", left_on="topic", right_on="mallet_topic_id")
# Label both tables through the same helper so the join keys are defined in
# exactly one place.
top_10 = merge_labels(top_10)
top_5 = merge_labels(top_5)
In [18]:
# Set up plotly and cufflinks for plotting inside the notebook.
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
# Initialise plotly's notebook mode first, then switch cufflinks to offline mode.
# NOTE(review): connected=True presumably loads plotly.js from the network
# rather than embedding it in the notebook -- confirm against the plotly docs.
init_notebook_mode(connected=True)
cf.go_offline()