import os
import pandas as pd
from text2topics import models

%load_ext autoreload
%autoreload 2

# Root directory of the data; every path below is built from it.
data_dir = "/Users/jeriwieringa/Dissertation/data/"

# Point the model wrapper at the gzipped state file under model_outputs/
# (presumably a MALLET topic-model state, per the class and file name).
model = models.MalletModel(os.path.join(data_dir, 'model_outputs', 'target_300_10.18497.state.gz'))

# Frame produced by the model wrapper; later cells group it by '#doc' and
# 'topic' and count its 'type' column.
df = model.model()

# Model parameters; only params[0] is used later, as an argument to
# models.pivot_smooth_norm.
params = model.extract_params()

# Load the corpus metadata.
#
# The CSV has no header row. reset_index() moves the default row index into
# the first column, which is named 'doc_id' below and later used as the
# merge key against '#doc'.
metadata = pd.read_csv(
    os.path.join(data_dir, 'corpus_metadata', "meta.csv"),
    header=None,
).reset_index()
metadata.columns = [
    'doc_id',
    'filename',
    'citation',
    'author',
    'periodical_name',
    'volume',
    'issue',
    'date',
    'page',
    'url',
    'abrev',
]
# Parsed copy of 'date' so later cells can filter on .dt.year.
metadata = metadata.assign(date_formatted=pd.to_datetime(metadata['date']))

# Load Labels

import gspread
from oauth2client.service_account import ServiceAccountCredentials

# Load the topic labels from the 'Topic Labels' Google spreadsheet.
# Requires network access and the service-account key file named below.
scope = ['https://spreadsheets.google.com/feeds']
secrets = "/Users/jeriwieringa/Dissertation/dev/code/secrets/dissertation-881847769b13.json"
credentials = ServiceAccountCredentials.from_json_keyfile_name(secrets, scope)
# NOTE(review): `gc` is also the name of a standard-library module; it is not
# imported in the visible part of this file, so nothing is shadowed here.
gc = gspread.authorize(credentials)
# First worksheet of the spreadsheet.
dts = gc.open('Topic Labels').sheet1

# One row per worksheet record.
labels = pd.DataFrame(dts.get_all_records())

# Count the rows of df for every (document, topic) pair; the count of the
# 'type' column is stored as 'token_count'.
doc_topic = (
    df.groupby(['#doc', 'topic'])['type']
    .count()
    .reset_index(name="token_count")
)

doc_topic.head(3)

# Convert the topic ids to numbers in one vectorized call rather than calling
# pd.to_numeric once per element through Series.apply.
doc_topic['topic'] = pd.to_numeric(doc_topic['topic'])

# NOTE(review): pivot_smooth_norm is a text2topics helper; judging by its name
# and how `dt` is used below it returns a topic-by-document table of
# proportions, with params[0] presumably the smoothing value -- confirm
# against the library.
dt = models.pivot_smooth_norm(doc_topic, params[0], '#doc', 'topic', 'token_count')

# Back to long form: one row per entry of dt, value in 'topic_proportion'.
docs = dt.unstack().reset_index(name="topic_proportion")

# Attach the document metadata; the left join keeps every row of docs even
# when no metadata row has a matching doc_id.
docs = docs.merge(metadata, how='left', left_on="#doc", right_on="doc_id")

docs.head(3)

The question is how I want to distinguish "top" documents. For interpretation, I think I want those documents where the topic is prevalent, i.e. where it accounts for more than 20% (?) of the content. The goal here is to see the large markers for use in the historical interpretation, rather than the more subtle markers that can be used to show development over time.

Step 1 then becomes filtering this frame down to those document-topic rows where the topic has a proportion >= .20.

# Keep only the document-topic rows whose topic proportion is at least 0.2.
top_doc_topics = docs.loc[docs['topic_proportion'].ge(0.2)]

top_doc_topics.head(3)

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf

# Set up plotly for inline notebook output and switch cufflinks to offline
# plotting. Both calls are run only for their side effects.
init_notebook_mode(connected=True)
cf.go_offline()

Browse Top Documents per Topic and Year(s)

View which documents have at least 20% prevalence of a given topic in a time range.

import ipywidgets as widgets
from ipywidgets import interactive

# Distinct publication years among the top documents, in ascending order;
# used as the options of the start/end year selectors.
years = (
    top_doc_topics['date_formatted']
    .dt.year
    .sort_values()
    .unique()
    .tolist()
)
# Distinct topic ids, in order of first appearance in top_doc_topics.
topic_ids = top_doc_topics['topic'].drop_duplicates().tolist()

def top_docs(start_year='', end_year='', topic=''):
    """Display the top documents for one topic within a range of years.

    Filters the notebook-level ``top_doc_topics`` frame to the rows whose
    ``topic`` equals ``topic`` and whose ``date_formatted`` year lies between
    ``start_year`` and ``end_year`` (both inclusive), then displays every
    matching row sorted by ``date_formatted`` with pandas' row and column
    display limits lifted.

    Parameters
    ----------
    start_year, end_year : int, '' or None
        Inclusive year bounds. ``''`` (the default) or ``None`` leaves that
        side of the range open, so calling the function without year
        arguments no longer compares integer years against a string.
    topic
        Topic id compared for equality with the ``topic`` column. The default
        ``''`` is compared like any other value.

    Returns
    -------
    None
        The selection is rendered with ``display``; nothing is returned.
    """
    # Extract the year once and reuse it for both bounds.
    doc_years = top_doc_topics['date_formatted'].dt.year

    keep = top_doc_topics['topic'] == topic
    if start_year is not None and start_year != '':
        keep &= doc_years >= start_year
    if end_year is not None and end_year != '':
        keep &= doc_years <= end_year

    # Named `matches` so the notebook-level `docs` frame is not shadowed.
    matches = top_doc_topics[keep]
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(matches.sort_values('date_formatted'))

# Selection widgets for the year range and the topic, populated from the
# values present in top_doc_topics.
start = widgets.Select(options=years)
end = widgets.Select(options=years)
topic_id = widgets.Select(options=topic_ids)

# Bind the widgets to top_docs' keyword arguments. As the last expression of
# the cell, the returned widget is what the notebook renders.
interactive(top_docs, start_year=start, end_year=end, topic=topic_id)

interactive(children=(Select(description='start_year', options=(1849, 1850, 1851, 1852, 1853, 1854, 1855, 1856…

	#doc	topic_proportion	doc_id	filename	citation	author	periodical_name	volume	issue	date	page	url	abrev	date_formatted
0	0	0.000247	0	ADV18981201-V02-01-page12.txt	Training School Advocate (Vol. 02.01) Dec 01, ...	NaN	Training School Advocate	02	01	1898-12-01	12	http://documents.adventistarchives.org/Periodi...	ADV	1898-12-01
1	1	0.000279	1	ADV18981201-V02-01-page13.txt	Training School Advocate (Vol. 02.01) Dec 01, ...	NaN	Training School Advocate	02	01	1898-12-01	13	http://documents.adventistarchives.org/Periodi...	ADV	1898-12-01
2	2	0.000265	2	ADV18981201-V02-01-page15.txt	Training School Advocate (Vol. 02.01) Dec 01, ...	NaN	Training School Advocate	02	01	1898-12-01	15	http://documents.adventistarchives.org/Periodi...	ADV	1898-12-01

	#doc	topic_proportion	doc_id	filename	citation	author	periodical_name	volume	issue	date	page	url	abrev	date_formatted
213	213	0.225973	213	ADV19000501-V02-05-page19.txt	Training School Advocate (Vol. 02.05) May 01, ...	NaN	Training School Advocate	02	05	1900-05-01	19	http://documents.adventistarchives.org/Periodi...	ADV	1900-05-01
235	235	0.207354	235	ADV19000601-V02-06-page14.txt	Training School Advocate (Vol. 02.06) Jun 01, ...	NaN	Training School Advocate	02	06	1900-06-01	14	http://documents.adventistarchives.org/Periodi...	ADV	1900-06-01
326	326	0.277509	326	ADV19001001-V02-10-page19.txt	Training School Advocate (Vol. 02.10) Oct 01, ...	NaN	Training School Advocate	02	10	1900-10-01	19	http://documents.adventistarchives.org/Periodi...	ADV	1900-10-01