Aggregated End-Times Topics
In [1]:
# Reload imported modules automatically whenever their source files change on
# disk, so edits to the local text2topics package are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2
TODO: Add explanatory text here describing the purpose of this notebook (aggregating the end-times topic categories) and the data it draws on.
In [2]:
# Configure Plotly and cufflinks for fully offline rendering inside the
# notebook (no Plotly cloud account needed).
# NOTE(review): `download_plotlyjs`, `plot`, and `iplot` are imported but not
# called directly anywhere visible in this notebook.
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
init_notebook_mode(connected=False)
cf.go_offline()
In [3]:
# Render any matplotlib figures inline in the notebook output.
%matplotlib inline
In [4]:
import os
import pandas as pd
from text2topics import models
In [5]:
# Root folders for the model data and the published interactive output.
# Both can be overridden with environment variables so the notebook is not
# tied to one machine; the defaults preserve the original hardcoded paths,
# so existing runs behave identically.
data_dir = os.environ.get("DISSERTATION_DATA_DIR",
                          "/Users/jeriwieringa/Dissertation/data/")
output_dir = os.environ.get("DISSERTATION_OUTPUT_DIR",
                           "/Users/jeriwieringa/Dissertation/site/files/interact/")
In [6]:
# Path to the gzipped MALLET Gibbs-sampling state file for the model
# (the filename suggests a 300-topic run -- confirm against the training log).
data = os.path.join(data_dir, "model_outputs", "target_300_10.18497.state.gz")
In [7]:
# Wrap the MALLET state file in the project's model helper class.
mallet_model = models.MalletModel(data)
In [8]:
# df: per-token topic assignments parsed from the MALLET state file
# (has at least '#doc', 'topic', and 'type' columns, used below).
# params: values extracted from the state file; params[0] is passed to
# pivot_smooth_norm / subset_by_titles -- presumably the topic smoothing
# (alpha) values; confirm in text2topics.
df = mallet_model.model()
params = mallet_model.extract_params()
In [9]:
# Count the tokens assigned to each topic within each document, yielding one
# row per (document, topic) pair with its token count.
per_doc_topic_tokens = df.groupby(['#doc', 'topic'])['type'].count()
doc_topic = per_doc_topic_tokens.reset_index(name="token_count")
In [10]:
# Sanity check: view the smoothed, normalized document-topic distribution
# for the first 10 documents.
models.pivot_smooth_norm(doc_topic, params[0], '#doc', 'topic', 'token_count')[:10]
Out[10]:
In [11]:
# Load the document metadata CSV (no header row); reset_index adds a
# sequential id column so rows can be matched to the model's document ids.
metadata = pd.read_csv(os.path.join(data_dir, "corpus_metadata", "meta.csv"), header=None).reset_index()
# NOTE(review): assumes meta.csv has exactly 10 columns; 'doc_id' names the
# index column that reset_index prepended. Verify against the file.
metadata.columns = ['doc_id', 'filename', 'citation', 'author',
'periodical_name', 'volume', 'issue',
'date', 'page', 'url','abrev']
# Parse the date strings into datetimes for time-series plotting.
metadata['date_formatted'] = pd.to_datetime(metadata['date'])
In [12]:
# Load the human-assigned topic labels.
import gspread
from oauth2client.service_account import ServiceAccountCredentials
# Fetch the label sheet from Google Sheets using a service account.
# Read-only spreadsheets feed scope.
scope = ['https://spreadsheets.google.com/feeds']
# NOTE(review): absolute local path to a service-account key file; keep this
# out of version control and consider reading the path from an env variable.
secrets = "/Users/jeriwieringa/Dissertation/dev/code/secrets/dissertation-881847769b13.json"
credentials = ServiceAccountCredentials.from_json_keyfile_name(secrets, scope)
gc = gspread.authorize(credentials)
dts = gc.open('Topic Labels').sheet1
# One row per topic; later cells use its mallet_topic_id, topic_category,
# and topic_label columns.
labels = pd.DataFrame(dts.get_all_records())
In [13]:
# Restrict the document-topic counts to the six denominational periodicals
# (abbreviations from the metadata's `abrev` column). params[0] is passed
# through to subset_by_titles -- presumably for smoothing/normalization;
# confirm in text2topics.
denom_titles = ['RH', 'ST', 'ARAI', 'GCB', 'PTAR','TCOG']
denom = models.subset_by_titles(doc_topic, metadata, denom_titles, params[0])
In [14]:
# Graph the topics labeled "prophecy" in the denominational titles and write
# the interactive chart HTML to output_dir.
# NOTE(review): cells 14-18 repeat the same generate/visualize pattern and
# could be collapsed into a loop over (label_list, chart_title) pairs.
prophecy_denom = models.generate_graph_data(denom, ['prophecy'], labels)
models.generate_visualizations(prophecy_denom,
'Denominational Titles', 'Prophecy Topics',
output_dir
)
In [15]:
# Topics labeled "eschatology".
esch_denom = models.generate_graph_data(denom, ['eschatology'], labels)
models.generate_visualizations(esch_denom,
'Denominational Titles', 'Eschatology Topics',
output_dir
)
In [16]:
# Topics labeled "signs_of_the_times".
signs_denom = models.generate_graph_data(denom, ['signs_of_the_times'], labels)
models.generate_visualizations(signs_denom,
'Denominational Titles', 'Signs of the Times Topics',
output_dir
)
In [17]:
# Topics labeled "church_and_state".
church_denom = models.generate_graph_data(denom, ['church_and_state'], labels)
models.generate_visualizations(church_denom,
'Denominational Titles', 'Church and State Topics',
output_dir
)
In [18]:
# All four end-times label groups combined into one chart.
endTimes_denom = models.generate_graph_data(denom,
['prophecy', 'eschatology', 'signs_of_the_times', 'church_and_state'],
labels)
models.generate_visualizations(endTimes_denom,
'Denominational Titles', 'All End-Times Topics',
output_dir
)
Process full corpus (excluding YI)
In [19]:
# Full Corpus excludes 'YI' because of the spotty coverage and distinctive vocabulary
# Build the title list from every abbreviation present in the metadata,
# minus 'YI', then subset/normalize the document-topic counts accordingly.
titles = metadata['abrev'].unique().tolist()
titles.remove('YI')
full_corpus = models.subset_by_titles(doc_topic, metadata, titles, params[0])
In [20]:
# Repeat the end-times charts for the full corpus; cells 20-24 mirror the
# denominational cells above.
prophecy_full = models.generate_graph_data(full_corpus, ['prophecy'], labels)
models.generate_visualizations(prophecy_full,
'Full Corpus', 'Prophecy Topics',
output_dir
)
In [21]:
# Topics labeled "eschatology".
esch_full = models.generate_graph_data(full_corpus, ['eschatology'], labels)
models.generate_visualizations(esch_full,
'Full Corpus', 'Eschatology Topics',
output_dir
)
In [22]:
# Topics labeled "church_and_state".
church_full = models.generate_graph_data(full_corpus, ['church_and_state'], labels)
models.generate_visualizations(church_full,
'Full Corpus', 'Church and State Topics',
output_dir
)
In [23]:
# Topics labeled "signs_of_the_times".
signs_full = models.generate_graph_data(full_corpus, ['signs_of_the_times'], labels)
models.generate_visualizations(signs_full,
'Full Corpus', 'Signs of the Times Topics',
output_dir
)
In [24]:
# All four end-times label groups combined into one chart.
endTimes_full = models.generate_graph_data(full_corpus,
['prophecy', 'eschatology', 'signs_of_the_times', 'church_and_state'],
labels)
models.generate_visualizations(endTimes_full,
'Full Corpus', 'All End-Times Topics',
output_dir
)
Experiment with using the category label for the all-topics chart (plotting every individual topic shows too much detail).
What data do I need?
- aggregated topic prevalence per label category
- that data reshaped for a bar graph
Where is the data? In `full_corpus`, which has already been normalized by year.
In [25]:
# Peek at the first 10 rows of the year-normalized full-corpus data to see
# the shape aggregate_on_topic_category (below) must consume.
full_corpus[:10]
Out[25]:
In [26]:
def aggregate_on_topic_category(df, topic_list, topic_labels=None):
    """Sum topic proportions by label category and keep only `topic_list`.

    Parameters
    ----------
    df : pandas.DataFrame
        Year-normalized topic proportions with a `date_formatted` index and
        one column per `topic` id (the shape shown by `full_corpus[:10]`
        above); `df.unstack()` therefore yields a Series keyed by
        (topic, date_formatted).
    topic_list : list of str
        The `topic_category` values to keep, e.g. 'prophecy'.
    topic_labels : pandas.DataFrame, optional
        Label lookup with `mallet_topic_id`, `topic_category`, and
        `topic_label` columns. Defaults to the notebook-level `labels`
        frame, preserving the original behavior for existing two-argument
        calls.

    Returns
    -------
    pandas.DataFrame
        One row per (date_formatted, topic_category) pair with the summed
        proportion in `agg_topic_proportion`, restricted to `topic_list`.
    """
    if topic_labels is None:
        # Backward compatible: fall back to the global `labels` frame the
        # original implementation depended on implicitly.
        topic_labels = labels
    # Long format: one row per (topic, date) with its proportion, joined to
    # the human-assigned category for that topic id.
    data = df.unstack().reset_index(name='t_proportion')\
        .merge(topic_labels, how='left', left_on="topic", right_on="mallet_topic_id")\
        [['topic', 'date_formatted', 't_proportion', 'topic_category', 'topic_label']]
    # Sum all topics that share a category label, per date.
    agg_data = data.groupby(['date_formatted', 'topic_category'])['t_proportion']\
        .sum()\
        .reset_index(name="agg_topic_proportion")
    return agg_data[agg_data['topic_category'].isin(topic_list)]
In [27]:
import plotly.graph_objs as go
def generate_graph_data(df):
    """Build a list of Plotly Bar traces, one per topic category in `df`.

    Expects the output shape of `aggregate_on_topic_category`: columns
    `topic_category`, `date_formatted`, and `agg_topic_proportion`.
    """
    traces = []
    for category in df['topic_category'].unique():
        subset = df[df['topic_category'] == category]
        # One bar trace per category: dates on x, summed proportion on y.
        traces.append(
            go.Bar(
                x=subset['date_formatted'],
                y=subset['agg_topic_proportion'],
                name=''.join(subset['topic_category'].unique()),
            )
        )
    return traces
In [28]:
# Aggregate the four end-times categories over the full corpus for the
# category-level bar chart.
agg_end_times = aggregate_on_topic_category(full_corpus,
['prophecy',
'eschatology',
'signs_of_the_times',
'church_and_state'
])
In [29]:
# Sanity check the aggregated category-level output.
agg_end_times[:5]
Out[29]:
In [30]:
# Bar chart of aggregated end-times category proportions, full corpus.
# Note this uses the local generate_graph_data defined above, not
# models.generate_graph_data.
models.generate_visualizations(generate_graph_data(agg_end_times),
'Full Corpus',
'All End-Times Categories',
output_dir)
In [31]:
# Same category-level aggregation for the denominational subset.
denom_data = aggregate_on_topic_category(denom,
['prophecy',
'eschatology',
'signs_of_the_times',
'church_and_state'
])
In [32]:
# Bar chart of aggregated end-times category proportions, denominational
# titles (local generate_graph_data, as above).
models.generate_visualizations(generate_graph_data(denom_data),
'Denominational Titles',
'All End-Times Categories',
output_dir
)
In [ ]: