Aggregated End-Times Topics

In [1]:
# Reload imported modules (e.g. the local text2topics package) on every cell
# execution, so edits to the package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

TODO: Add explanatory text describing the goal of this notebook (aggregating end-times topic categories over time) and the model outputs it loads.

In [2]:
# Configure plotly + cufflinks for offline rendering inside the notebook.
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf

# connected=False embeds the plotly.js library in the notebook itself,
# so figures render without an internet connection.
init_notebook_mode(connected=False)
cf.go_offline()  # route cufflinks/pandas .iplot() calls through offline mode
In [3]:
%matplotlib inline
In [4]:
import os
import pandas as pd
from text2topics import models
In [5]:
# Base directories for model inputs and generated interactive visualizations.
# NOTE(review): hardcoded absolute, machine-specific paths — consider deriving
# these from an environment variable or a configurable base directory.
data_dir = "/Users/jeriwieringa/Dissertation/data/"
output_dir = "/Users/jeriwieringa/Dissertation/site/files/interact/"
In [6]:
data = os.path.join(data_dir, "model_outputs", "target_300_10.18497.state.gz")
In [7]:
mallet_model = models.MalletModel(data)
In [8]:
# df: token-level model state (rows carry '#doc', 'topic', and 'type' columns,
# per the groupby below). params: model parameters; params[0] is passed as the
# smoothing parameter to pivot_smooth_norm below — confirm against text2topics.
df = mallet_model.model()
params = mallet_model.extract_params()
In [9]:
doc_topic = df.groupby(['#doc', 'topic'])['type'].count().reset_index(name="token_count")
In [10]:
#sanity check
# Smooth and normalize the doc/topic token counts, then display the first 10
# rows — presumably each row is a per-document topic-proportion distribution
# (values in Out[10] are small positive fractions); verify against text2topics.
models.pivot_smooth_norm(doc_topic, params[0], '#doc', 'topic', 'token_count')[:10]
Out[10]:
topic 0 1 2 3 4 5 6 7 8 9 ... 240 241 242 243 244 245 246 247 248 249
#doc
0 0.000247 0.000077 0.000303 0.000252 0.000168 0.000858 0.000106 0.000355 0.000158 0.000083 ... 0.000093 0.000146 0.000120 0.000251 0.000099 0.008392 0.000134 0.000146 0.000133 0.000091
1 0.000279 0.000087 0.000343 0.000285 0.037680 0.010343 0.000120 0.000402 0.000178 0.000094 ... 0.000106 0.000165 0.000135 0.000284 0.000112 0.000121 0.000151 0.000165 0.000150 0.000103
2 0.000265 0.000082 0.000325 0.000269 0.000180 0.009792 0.000113 0.000381 0.159891 0.000089 ... 0.000100 0.000157 0.000128 0.000269 0.000106 0.000115 0.000143 0.000157 0.000142 0.000098
3 0.000293 0.000091 0.000360 0.000299 0.000200 0.001018 0.000125 0.000422 0.000187 0.000098 ... 0.000111 0.000173 0.000142 0.000298 0.000118 0.000127 0.000159 0.000174 0.000158 0.000109
4 0.000237 0.000074 0.055981 0.000242 0.000162 0.000824 0.000101 0.000341 0.000151 0.000080 ... 0.000090 0.000140 0.000115 0.008196 0.000095 0.000103 0.000128 0.000140 0.000128 0.000088
5 0.000253 0.000079 0.000311 0.000258 0.000173 0.000880 0.000108 0.000365 0.025651 0.000085 ... 0.000096 0.000150 0.000123 0.000257 0.000102 0.000110 0.000137 0.000150 0.000136 0.000094
6 0.000233 0.015735 0.000287 0.000238 0.000159 0.000811 0.000100 0.000336 0.000149 0.000078 ... 0.000088 0.000138 0.000113 0.000237 0.000094 0.000101 0.000126 0.000138 0.000126 0.000086
7 0.000210 0.000065 0.000258 0.000214 0.000143 0.000731 0.000090 0.007360 0.000134 0.000071 ... 0.000079 0.000125 0.000102 0.000214 0.077715 0.000091 0.000114 0.000125 0.000113 0.000078
8 0.000237 0.000074 0.000291 0.000242 0.000162 0.000824 0.000101 0.000341 0.000151 0.000080 ... 0.000090 0.000140 0.000115 0.000241 0.000095 0.000103 0.000128 0.000140 0.000128 0.000088
9 0.000218 0.000068 0.000268 0.000222 0.014780 0.000757 0.000093 0.000314 0.000139 0.000073 ... 0.000082 0.000129 0.000106 0.000221 0.000088 0.000095 0.000118 0.000129 0.000117 0.000081

10 rows × 250 columns

In [11]:
# Load metadata

# The CSV has no header row (header=None); reset_index() turns the positional
# row number into a column, which becomes 'doc_id' below.
# Assumes row order matches MALLET's '#doc' ids — TODO confirm the alignment.
metadata = pd.read_csv(os.path.join(data_dir, "corpus_metadata", "meta.csv"), header=None).reset_index()
metadata.columns = ['doc_id', 'filename', 'citation', 'author', 
                    'periodical_name', 'volume', 'issue', 
                    'date', 'page', 'url','abrev']
# Parse the raw date strings into datetimes for time-based grouping.
metadata['date_formatted'] = pd.to_datetime(metadata['date'])
In [12]:
# Load Labels

import gspread
from oauth2client.service_account import ServiceAccountCredentials

# Load data from Google Doc
# NOTE(review): this is the legacy Sheets "feeds" scope; newer Google APIs use
# 'https://www.googleapis.com/auth/spreadsheets' — verify against the gspread
# version in use.
scope = ['https://spreadsheets.google.com/feeds']
# Service-account credentials live outside the repo; path is machine-specific.
secrets = "/Users/jeriwieringa/Dissertation/dev/code/secrets/dissertation-881847769b13.json"
credentials = ServiceAccountCredentials.from_json_keyfile_name(secrets, scope)
gc = gspread.authorize(credentials)
dts = gc.open('Topic Labels').sheet1

# One row per topic; per the usage below, the sheet provides at least
# 'mallet_topic_id', 'topic_category', and 'topic_label' columns.
labels = pd.DataFrame(dts.get_all_records())
In [13]:
# Periodical abbreviations (matching metadata['abrev']) for the
# denominational-titles subset of the corpus.
denom_titles = ['RH', 'ST', 'ARAI', 'GCB', 'PTAR','TCOG']
denom = models.subset_by_titles(doc_topic, metadata, denom_titles, params[0])
In [14]:
# Chart the prophecy-category topics within the denominational titles.
prophecy_denom = models.generate_graph_data(denom, ['prophecy'], labels)
models.generate_visualizations(
    prophecy_denom, 'Denominational Titles', 'Prophecy Topics', output_dir)
In [15]:
# Chart the eschatology-category topics within the denominational titles.
esch_denom = models.generate_graph_data(denom, ['eschatology'], labels)
models.generate_visualizations(
    esch_denom, 'Denominational Titles', 'Eschatology Topics', output_dir)
In [16]:
# Chart the signs-of-the-times topics within the denominational titles.
signs_denom = models.generate_graph_data(denom, ['signs_of_the_times'], labels)
models.generate_visualizations(
    signs_denom, 'Denominational Titles', 'Signs of the Times Topics', output_dir)
In [17]:
# Chart the church-and-state topics within the denominational titles.
church_denom = models.generate_graph_data(denom, ['church_and_state'], labels)
models.generate_visualizations(
    church_denom, 'Denominational Titles', 'Church and State Topics', output_dir)
In [18]:
# Combine all four end-times topic categories into one denominational chart.
denom_end_times = ['prophecy', 'eschatology', 'signs_of_the_times', 'church_and_state']
endTimes_denom = models.generate_graph_data(denom, denom_end_times, labels)
models.generate_visualizations(
    endTimes_denom, 'Denominational Titles', 'All End-Times Topics', output_dir)

Process full corpus (excluding YI)

In [19]:
# The full corpus excludes 'YI' because of its spotty coverage and
# distinctive vocabulary.
all_titles = metadata['abrev'].unique().tolist()
all_titles.remove('YI')  # raises ValueError if 'YI' is absent, as before

full_corpus = models.subset_by_titles(doc_topic, metadata, all_titles, params[0])
In [20]:
# Chart the prophecy-category topics across the full corpus.
prophecy_full = models.generate_graph_data(full_corpus, ['prophecy'], labels)
models.generate_visualizations(
    prophecy_full, 'Full Corpus', 'Prophecy Topics', output_dir)
In [21]:
# Chart the eschatology-category topics across the full corpus.
esch_full = models.generate_graph_data(full_corpus, ['eschatology'], labels)
models.generate_visualizations(
    esch_full, 'Full Corpus', 'Eschatology Topics', output_dir)
In [22]:
# Chart the church-and-state topics across the full corpus.
church_full = models.generate_graph_data(full_corpus, ['church_and_state'], labels)
models.generate_visualizations(
    church_full, 'Full Corpus', 'Church and State Topics', output_dir)
In [23]:
# Chart the signs-of-the-times topics across the full corpus.
signs_full = models.generate_graph_data(full_corpus, ['signs_of_the_times'], labels)
models.generate_visualizations(
    signs_full, 'Full Corpus', 'Signs of the Times Topics', output_dir)
In [24]:
# Combine all four end-times topic categories into one full-corpus chart.
full_end_times = ['prophecy', 'eschatology', 'signs_of_the_times', 'church_and_state']
endTimes_full = models.generate_graph_data(full_corpus, full_end_times, labels)
models.generate_visualizations(
    endTimes_full, 'Full Corpus', 'All End-Times Topics', output_dir)

Experiment with using the topic-category labels for the all-topics chart (plotting every individual topic shows too much detail).

What data do I need?

  • aggregated topic prevalence for each label (topic category)
  • that data prepared for a bar graph

Where is the data? In full_corpus (which has already been normalized by year).

In [25]:
full_corpus[:10]
Out[25]:
topic 0 1 2 3 4 5 6 7 8 9 ... 240 241 242 243 244 245 246 247 248 249
date_formatted
1849 0.000466 0.002586 0.001395 0.003516 0.010145 0.004582 0.001459 0.007097 0.000664 6.630487e-07 ... 0.000995 0.000532 0.000067 0.000334 7.931035e-07 8.578557e-07 0.000001 0.000001 0.000200 7.317510e-07
1850 0.000748 0.000168 0.017215 0.002281 0.017356 0.010193 0.004587 0.017163 0.001495 1.032088e-04 ... 0.004613 0.002564 0.001134 0.000954 5.169408e-05 6.459166e-05 0.000142 0.000155 0.000116 3.093819e-04
1851 0.000168 0.000088 0.012642 0.000423 0.008307 0.005930 0.003075 0.026111 0.001330 1.762721e-04 ... 0.007153 0.000326 0.000326 0.001410 1.586710e-04 1.773238e-05 0.000088 0.000308 0.000027 2.291364e-04
1852 0.000167 0.000035 0.005281 0.001515 0.007970 0.005372 0.001744 0.007477 0.002126 9.241767e-04 ... 0.001591 0.000167 0.000063 0.003620 4.934035e-04 2.093442e-05 0.000042 0.000160 0.000049 3.057964e-04
1853 0.000172 0.000177 0.004123 0.005804 0.014792 0.009618 0.004503 0.007720 0.002175 7.067793e-04 ... 0.016509 0.000535 0.000231 0.001767 6.800938e-05 1.269083e-04 0.000141 0.000068 0.000168 5.119791e-04
1854 0.000371 0.000064 0.006968 0.005771 0.014992 0.006641 0.002876 0.004330 0.001102 2.825367e-04 ... 0.011030 0.002079 0.000324 0.001863 4.447696e-05 1.333580e-04 0.000248 0.000070 0.000067 1.016099e-04
1855 0.000287 0.000017 0.005469 0.006721 0.016765 0.007997 0.001198 0.009276 0.002825 2.318219e-04 ... 0.007935 0.001065 0.000259 0.002367 1.380224e-04 3.869919e-05 0.001159 0.000215 0.000088 2.759737e-04
1856 0.000356 0.000066 0.005137 0.005642 0.016413 0.007067 0.006419 0.017270 0.003977 6.013867e-04 ... 0.013017 0.000106 0.000449 0.002120 1.030340e-04 5.652370e-05 0.000498 0.000116 0.000070 1.362549e-04
1857 0.000114 0.000129 0.006746 0.007267 0.013650 0.008857 0.003065 0.014193 0.007135 1.769910e-04 ... 0.005354 0.000633 0.000360 0.003278 2.103126e-05 8.402032e-05 0.000540 0.000144 0.000099 1.709955e-04
1858 0.000206 0.000049 0.007224 0.007572 0.017155 0.008963 0.001800 0.019248 0.003200 3.876865e-04 ... 0.003412 0.000194 0.000628 0.002729 1.169488e-04 1.507948e-04 0.001061 0.000366 0.000129 3.015439e-04

10 rows × 250 columns

In [26]:
def aggregate_on_topic_category(df, topic_list, topic_labels=None):
    """Sum per-topic proportions up to their topic_category for each date.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame of topic proportions: the index is 'date_formatted', the
        column axis is named 'topic' (one column per topic id), and values
        are proportions (e.g. ``full_corpus`` above).
    topic_list : list of str
        topic_category values to keep in the result.
    topic_labels : pandas.DataFrame, optional
        Label table with 'mallet_topic_id', 'topic_category', and
        'topic_label' columns. Defaults to the notebook-global ``labels``,
        which the original version depended on implicitly.

    Returns
    -------
    pandas.DataFrame
        Columns: 'date_formatted', 'topic_category', 'agg_topic_proportion',
        restricted to the categories in ``topic_list``.
    """
    if topic_labels is None:
        # Backward-compatible fallback to the notebook-global label table.
        topic_labels = labels
    # Long format: one row per (topic, date) with its proportion, joined to
    # the label table so each topic carries its category.
    long_data = df.unstack().reset_index(name='t_proportion')\
            .merge(topic_labels, how='left', left_on="topic", right_on="mallet_topic_id")\
            [['topic', 'date_formatted', 't_proportion', 'topic_category', 'topic_label']]
    # Sum the proportions of all topics sharing a category, per date.
    agg_data = long_data.groupby(['date_formatted', 'topic_category'])['t_proportion']\
            .sum()\
            .reset_index(name="agg_topic_proportion")
    return agg_data[agg_data['topic_category'].isin(topic_list)]
In [27]:
import plotly.graph_objs as go

def generate_graph_data(df):
    """Build plotly Bar traces from aggregated topic-category data.

    Parameters
    ----------
    df : pandas.DataFrame
        Output of ``aggregate_on_topic_category``: columns 'date_formatted',
        'topic_category', and 'agg_topic_proportion'.

    Returns
    -------
    list
        One ``go.Bar`` trace per distinct topic_category, ready to pass to a
        plotly figure / models.generate_visualizations.
    """
    # Compile into form for Plotly: one bar trace per category.
    data = []
    for category in df['topic_category'].unique():
        filtered = df[df['topic_category'] == category]
        # `category` is already the single unique label for this trace; the
        # original ''.join(filtered['topic_category'].unique()) reduced to
        # the same string.
        graph_obj = go.Bar(
            x=filtered['date_formatted'],
            y=filtered['agg_topic_proportion'],
            name=category,
        )
        data.append(graph_obj)

    return data
In [28]:
# Aggregate the four end-times categories over the full corpus.
end_times_categories = [
    'prophecy',
    'eschatology',
    'signs_of_the_times',
    'church_and_state',
]
agg_end_times = aggregate_on_topic_category(full_corpus, end_times_categories)
In [29]:
agg_end_times[:5]
Out[29]:
date_formatted topic_category agg_topic_proportion
3 1849 church_and_state 0.002725
8 1849 eschatology 0.129428
21 1849 prophecy 0.003118
26 1849 signs_of_the_times 0.014663
36 1850 church_and_state 0.002488
In [30]:
# Bar chart of the aggregated end-times categories across the full corpus.
agg_traces = generate_graph_data(agg_end_times)
models.generate_visualizations(
    agg_traces, 'Full Corpus', 'All End-Times Categories', output_dir)
In [31]:
# Aggregate the same four categories over the denominational-titles subset.
denom_categories = [
    'prophecy',
    'eschatology',
    'signs_of_the_times',
    'church_and_state',
]
denom_data = aggregate_on_topic_category(denom, denom_categories)
In [32]:
# Bar chart of the aggregated end-times categories for the denominational titles.
denom_traces = generate_graph_data(denom_data)
models.generate_visualizations(
    denom_traces, 'Denominational Titles', 'All End-Times Categories', output_dir)
In [ ]: