import os
import pandas as pd
from text2topics import models

%load_ext autoreload
%autoreload 2

# Root folder that holds both the model output and the corpus metadata.
data_dir = "/Users/jeriwieringa/Dissertation/data/"

# Open the gzipped MALLET state file through the text2topics wrapper.
model = models.MalletModel(
    os.path.join(data_dir, 'model_outputs', 'target_300_10.18497.state.gz')
)

# Table read from the state file; later cells group it by its '#doc' and
# 'topic' columns and count its 'type' column (presumably one row per token).
df = model.model()

# Model parameters; params[0] is later handed to models.pivot_smooth_norm.
params = model.extract_params()

# Load metadata
# The CSV is read without a header row, and reset_index() turns the row
# position into the first column, which is named 'doc_id' below.
metadata = pd.read_csv(
    os.path.join(data_dir, "corpus_metadata", "meta.csv"),
    header=None,
).reset_index()
metadata.columns = [
    'doc_id', 'filename', 'citation', 'author',
    'periodical_name', 'volume', 'issue',
    'date', 'page', 'url', 'abrev',
]
# Parsed copy of the date column so the year can be extracted later.
metadata['date_formatted'] = pd.to_datetime(metadata['date'])

# Load Labels

import gspread
from oauth2client.service_account import ServiceAccountCredentials

# Load data from Google Doc
# The topic labels live in a Google spreadsheet named 'Topic Labels'; it is
# read through gspread using a service-account key file.
scope = ['https://spreadsheets.google.com/feeds']
# Path to the service-account JSON key (local to the author's machine).
secrets = "/Users/jeriwieringa/Dissertation/dev/code/secrets/dissertation-881847769b13.json"
credentials = ServiceAccountCredentials.from_json_keyfile_name(secrets, scope)
gc = gspread.authorize(credentials)
# Only the first worksheet of the spreadsheet is used.
dts = gc.open('Topic Labels').sheet1

# One DataFrame row per spreadsheet record. Later cells read the
# 'mallet_topic_id', 'topic_category' and 'topic_label' columns from it.
labels = pd.DataFrame(dts.get_all_records())

# Number of rows per (document, topic) pair, stored as 'token_count'.
doc_topic = (
    df.groupby(['#doc', 'topic'])['type']
    .count()
    .reset_index(name="token_count")
)

# Attach the document metadata; '#doc' lines up with the 'doc_id' column.
dt = doc_topic.merge(metadata, how='left', left_on="#doc", right_on="doc_id")

# Collapse documents into calendar years: total token count per (year, topic).
y_dt = (
    dt.groupby([dt['date_formatted'].dt.year, "topic"])['token_count']
    .sum()
    .reset_index(name='token_count')
)

# NOTE(review): pivot_smooth_norm is defined in text2topics; judging by its
# name and the use of its result below, it returns a year-by-topic table of
# normalised proportions -- confirm against the library.
y_m = models.pivot_smooth_norm(
    y_dt, params[0], 'date_formatted', 'topic', 'token_count'
)

y_m[:2]

# Back to long form: one row per (topic, year) with its proportion.
y_topics = y_m.unstack().reset_index(name='t_proportion')

Add the topic label metadata back and aggregate by topic category

# Join each topic to its hand-assigned label, keep the columns of interest,
# then sum the topic proportions within every (year, category) pair.
category_df = (
    y_topics
    .merge(labels, how="left", left_on="topic", right_on="mallet_topic_id")
    [['topic', 'date_formatted', 't_proportion', 'topic_category', 'topic_label']]
    .groupby(['date_formatted', 'topic_category'])['t_proportion']
    .sum()
    .reset_index(name='category_prevalence')
)

category_df[:35]

The chart gives us an overview of the makeup of the periodical literature in a given year. These numbers seem reasonable given my reading of these periodical documents.

Browse Top Topics by Year

Interface for viewing the top five topic categories in each year. Select a starting and ending year (both are included in the displayed data). Also includes an interactive chart of the top weights over time.

import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf

# Keep charts from syncing with Plotly servers.
# Both calls configure plotting for this notebook session: the first sets up
# plotly's notebook mode, the second switches cufflinks to offline plotting.
init_notebook_mode(connected=True)
cf.go_offline()

# Distinct year values of category_df, in order of first appearance; used
# further down to step through the corpus in fixed-size windows of years.
years = category_df['date_formatted'].unique().tolist()

def get_top_values(df, n_top):
    """Return the ``n_top`` most prevalent rows for every year.

    ``df`` must have ``date_formatted`` and ``category_prevalence`` columns.
    The result keeps all of ``df``'s columns, is ordered by year and then by
    descending prevalence, and carries a fresh 0..n-1 index.

    This deliberately avoids ``groupby(...).apply(nlargest)``: that form
    depends on the grouping column still being present inside the applied
    frame, which pandas has deprecated. Once it is excluded,
    ``reset_index(drop=True)`` would silently lose ``date_formatted``.
    """
    ranked = df.sort_values(
        ['date_formatted', 'category_prevalence'],
        ascending=[True, False],
    )
    # head() keeps the first n_top rows of each group in their sorted order
    # and returns the original columns, including the grouping column.
    return ranked.groupby('date_formatted').head(n_top).reset_index(drop=True)
    
# The five most prevalent categories of every year; this is the default data
# set charted by top_topics.
frame = get_top_values(category_df, 5)

frame[:5]

# Fixed palette used to colour the category bubbles.
colors = ["#557ee1",
          "#bca63a",
          "#9776d6",
          "#73a343",
          "#534894",
          "#54b06c",
          "#ba556d",
          "#64b694",
          "#b65147",
          "#5bbcca",
          "#a69357",
          "#7d9bd4",
          "#776e49",
          "#6f759b",
          "#648576",
          "#9a7b75",
          "#6e7c8c"]
# Give every category that occurs in `frame` a colour. The palette is cycled
# rather than zipped: zip() stops at the shorter input, so with more
# categories than colours the extra categories would get no entry and the
# chart code would fail with KeyError when looking them up.
topic_colors = {
    category: colors[position % len(colors)]
    for position, category in enumerate(frame['topic_category'].unique().tolist())
}
# print(topic_colors)

# Function for generating chart data for different time periods
# Directory that receives the saved HTML charts.
out_dir = "/Users/jeriwieringa/Dissertation/site/files/interact/"

def top_topics(start_year='',
               end_year='',
               df=frame,
               width=900, height=800,
               colors_dict=topic_colors,
               save=False):
    """Draw a bubble chart of category prevalence for a span of years.

    One scatter trace is built per topic category: x is the year, y is the
    category name, and the bubble area is the category's prevalence (as a
    percentage) in that year. A category only has a row, and therefore a
    bubble, in the years present in ``df``.

    Parameters:
        start_year, end_year: inclusive bounds on ``date_formatted``. A bound
            left as ``''`` (or passed as ``None``) falls back to the earliest
            / latest year found in ``df``.
        df: frame with ``date_formatted``, ``topic_category`` and
            ``category_prevalence`` columns.
        width, height: figure size handed to the plotly layout.
        colors_dict: maps a topic category to its marker colour; it must
            contain every category in the selected rows.
        save: when true, the figure is additionally written as an HTML file
            into ``out_dir``.

    The chart is always displayed inline; nothing is returned.
    """
    # Comparing the year column against the '' default raises TypeError, so
    # an unset bound is replaced by the matching extreme of the data.
    if start_year is None or (isinstance(start_year, str) and not start_year):
        start_year = df['date_formatted'].min()
    if end_year is None or (isinstance(end_year, str) and not end_year):
        end_year = df['date_formatted'].max()

    # Scaled against the whole frame rather than the selected span, so the
    # bubble sizes stay comparable between charts of different periods.
    sizeref = 2. * (df['category_prevalence'] * 100).max() / (110 ** 2)

    topics = df[df['date_formatted'].between(start_year, end_year)]

    # Built from adjacent literals: a backslash continuation inside one
    # literal would embed the next line's indentation in the title.
    title_template = ("Prevalence in Corpus of Top Five Categories per Year,<br>"
                      "{} to {}")
    layout1 = go.Layout(
        height=height,
        width=width,
        title=title_template.format(start_year, end_year),
        yaxis=go.layout.YAxis(
            title='Topic Categories',
            automargin=True,
        ),
    )

    traces = []
    for cat in topics['topic_category'].unique().tolist():
        cat_df = topics[topics['topic_category'] == cat]

        hover_text = [
            "Year: {}<br>Prevalence: {}%".format(year, prevalence * 100)
            for year, prevalence in zip(cat_df['date_formatted'],
                                        cat_df['category_prevalence'])
        ]

        traces.append(go.Scatter(
            x=cat_df['date_formatted'],
            y=cat_df['topic_category'],
            text=hover_text,
            mode='markers',
            marker=dict(
                symbol='circle',
                size=cat_df['category_prevalence'] * 100,
                sizemode='area',
                sizeref=sizeref,
                color=colors_dict[cat],
            ),
            showlegend=False,
        ))

    fig = go.Figure(data=traces, layout=layout1)

    iplot(fig)
    if save:
        file_name = 'Yearly_Aggreg_Categories_{}_to_{}.html'.format(start_year,
                                                                    end_year)
        plot(fig, filename=os.path.join(out_dir, file_name))

# Test run
top_topics(1848, 1855)

# One saved chart per six-year window (start year through start year + 5).
# range() excludes its stop value, so the stop is years[-1] + 1: otherwise a
# final year that lands exactly on a window boundary would never be charted.
start_years = range(years[0], years[-1] + 1, 6)
for each in start_years:
    top_topics(each, each + 5, save=True)

# A single taller chart covering 1849 through 1920.
top_topics(start_year=1849, end_year=1920, height=1200, width=1000, save=True)

topic	0	1	2	3	4	5	6	7	8	9	...	240	241	242	243	244	245	246	247	248	249
date_formatted
1849	0.000466	0.002586	0.001395	0.003516	0.010145	0.004582	0.001459	0.007097	0.000664	6.630487e-07	...	0.000995	0.000532	0.000067	0.000334	7.931035e-07	8.578557e-07	0.000001	0.000001	0.000200	7.317510e-07
1850	0.000748	0.000168	0.017215	0.002281	0.017356	0.010193	0.004587	0.017163	0.001495	1.032088e-04	...	0.004613	0.002564	0.001134	0.000954	5.169408e-05	6.459166e-05	0.000142	0.000155	0.000116	3.093819e-04

	date_formatted	topic_category	category_prevalence
0	1849	advertisements	0.006380
1	1849	apologetics	0.025667
2	1849	bible	0.145878
3	1849	church_and_state	0.002725
4	1849	community_news	0.000069
5	1849	conference_proceedings	0.000665
6	1849	correspondence	0.050922
7	1849	education	0.005384
8	1849	eschatology	0.129428
9	1849	general_interest	0.000670
10	1849	health	0.003933
11	1849	history	0.017441
12	1849	meeting_reports	0.001668
13	1849	missions	0.010301
14	1849	nature	0.004182
15	1849	nutrition	0.007234
16	1849	obituaries	0.000001
17	1849	organization	0.003995
18	1849	periodicals	0.010147
19	1849	piety	0.019305
20	1849	politics	0.000534
21	1849	prophecy	0.003118
22	1849	religious_commentary	0.017840
23	1849	reports_on_the_cause	0.032896
24	1849	scan_errors	0.008492
25	1849	sermons	0.008428
26	1849	signs_of_the_times	0.014663
27	1849	social_commentary	0.029118
28	1849	spiritual_growth	0.035099
29	1849	stories	0.028924
30	1849	theology	0.123143
31	1849	theology_sabbath	0.250820
32	1849	transportation	0.000932
33	1850	advertisements	0.006922
34	1850	apologetics	0.052418

	date_formatted	topic_category	category_prevalence
0	1849	theology_sabbath	0.250820
1	1849	bible	0.145878
2	1849	eschatology	0.129428
3	1849	theology	0.123143
4	1849	correspondence	0.050922