2019-01-17-Yearly-Top-Labels

In [1]:
import os
import pandas as pd
from text2topics import models
In [2]:
%load_ext autoreload
%autoreload 2
In [3]:
data_dir = "/Users/jeriwieringa/Dissertation/data/"
In [4]:
model = models.MalletModel(os.path.join(data_dir,
                                        'model_outputs',
                                        'target_300_10.18497.state.gz'))
In [5]:
df = model.model()
In [6]:
params = model.extract_params()
In [7]:
# Load metadata

metadata = pd.read_csv(os.path.join(data_dir, "corpus_metadata", "meta.csv"), header=None)\
            .reset_index()
metadata.columns = ['doc_id', 'filename', 'citation', 'author', 
                    'periodical_name', 'volume', 'issue', 
                    'date', 'page', 'url','abrev']
metadata['date_formatted'] = pd.to_datetime(metadata['date'])
In [8]:
# Load Labels

import gspread
from oauth2client.service_account import ServiceAccountCredentials

# Load label data from the 'Topic Labels' Google Sheet
scope = ['https://spreadsheets.google.com/feeds']
secrets = "/Users/jeriwieringa/Dissertation/dev/code/secrets/dissertation-881847769b13.json"
credentials = ServiceAccountCredentials.from_json_keyfile_name(secrets, scope)
gc = gspread.authorize(credentials)
dts = gc.open('Topic Labels').sheet1

labels = pd.DataFrame(dts.get_all_records())
In [9]:
# Count the tokens assigned to each topic in each document
doc_topic = df.groupby(['#doc', 'topic'])['type'].count().reset_index(name="token_count")
In [10]:
# Attach document metadata to the per-document topic counts
dt = doc_topic.merge(metadata, how='left', left_on="#doc", right_on="doc_id")
In [11]:
# Aggregate token counts by year and topic
y_dt = dt.groupby([dt.date_formatted.dt.year, "topic"])['token_count']\
        .sum()\
        .reset_index(name='token_count')
In [12]:
# Pivot to a year-by-topic matrix of smoothed, normalized topic proportions
y_m = models.pivot_smooth_norm(y_dt, params[0],
                               'date_formatted', 'topic', 'token_count')
In [13]:
y_m[:2]
Out[13]:
topic 0 1 2 3 4 5 6 7 8 9 ... 240 241 242 243 244 245 246 247 248 249
date_formatted
1849 0.000466 0.002586 0.001395 0.003516 0.010145 0.004582 0.001459 0.007097 0.000664 6.630487e-07 ... 0.000995 0.000532 0.000067 0.000334 7.931035e-07 8.578557e-07 0.000001 0.000001 0.000200 7.317510e-07
1850 0.000748 0.000168 0.017215 0.002281 0.017356 0.010193 0.004587 0.017163 0.001495 1.032088e-04 ... 0.004613 0.002564 0.001134 0.000954 5.169408e-05 6.459166e-05 0.000142 0.000155 0.000116 3.093819e-04

2 rows × 250 columns

In [14]:
# Reshape to long format: one row per (topic, year) pair with its proportion
y_topics = y_m.unstack().reset_index(name='t_proportion')

Add metadata back and aggregate by labels

In [15]:
# Merge the topic labels onto the yearly topic proportions, then sum the
# proportions by category within each year
category_df = y_topics.merge(labels, how="left", left_on="topic", right_on="mallet_topic_id")\
    [['topic', 'date_formatted', 't_proportion', 'topic_category', 'topic_label']]\
    .groupby(['date_formatted', 'topic_category'])['t_proportion']\
    .sum().reset_index(name='category_prevalence')
In [16]:
category_df[:35]
Out[16]:
date_formatted topic_category category_prevalence
0 1849 advertisements 0.006380
1 1849 apologetics 0.025667
2 1849 bible 0.145878
3 1849 church_and_state 0.002725
4 1849 community_news 0.000069
5 1849 conference_proceedings 0.000665
6 1849 correspondence 0.050922
7 1849 education 0.005384
8 1849 eschatology 0.129428
9 1849 general_interest 0.000670
10 1849 health 0.003933
11 1849 history 0.017441
12 1849 meeting_reports 0.001668
13 1849 missions 0.010301
14 1849 nature 0.004182
15 1849 nutrition 0.007234
16 1849 obituaries 0.000001
17 1849 organization 0.003995
18 1849 periodicals 0.010147
19 1849 piety 0.019305
20 1849 politics 0.000534
21 1849 prophecy 0.003118
22 1849 religious_commentary 0.017840
23 1849 reports_on_the_cause 0.032896
24 1849 scan_errors 0.008492
25 1849 sermons 0.008428
26 1849 signs_of_the_times 0.014663
27 1849 social_commentary 0.029118
28 1849 spiritual_growth 0.035099
29 1849 stories 0.028924
30 1849 theology 0.123143
31 1849 theology_sabbath 0.250820
32 1849 transportation 0.000932
33 1850 advertisements 0.006922
34 1850 apologetics 0.052418

This chart gives an overview of the makeup of the periodical literature in a given year. These numbers seem reasonable given my reading of these periodicals.
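
As a quick sanity check on these proportions (a minimal sketch; it assumes every topic in the model received a category label in the spreadsheet, so the per-year sums should come out close to 1):

yearly_totals = category_df.groupby('date_formatted')['category_prevalence'].sum()
print(yearly_totals.head())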

Browse Top Topics by Year

Interface for viewing the top topic categories in each year (the top five per year, as computed below). Select a starting and ending year (both are included in the displayed data). It also produces an interactive chart of the top category weights over time.
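
The number of categories kept per year is controlled by the n_top argument of the get_top_values helper defined below. For example, a top-ten frame could be built as follows (frame10 is just an illustrative name):

frame10 = get_top_values(category_df, 10)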

In [17]:
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf

# Keep charts from syncing with Plotly servers.
init_notebook_mode(connected=True)
cf.go_offline()
In [18]:
years = category_df['date_formatted'].unique().tolist()

def get_top_values(df, n_top):
    """Return the n_top most prevalent categories within each year."""
    return df.groupby('date_formatted')\
             .apply(lambda x: x.nlargest(n_top, 'category_prevalence'))\
             .reset_index(drop=True)

frame = get_top_values(category_df, 5)
In [19]:
frame[:5]
Out[19]:
date_formatted topic_category category_prevalence
0 1849 theology_sabbath 0.250820
1 1849 bible 0.145878
2 1849 eschatology 0.129428
3 1849 theology 0.123143
4 1849 correspondence 0.050922
In [20]:
colors = ["#557ee1",
        "#bca63a",
        "#9776d6",
        "#73a343",
        "#534894",
        "#54b06c",
        "#ba556d",
        "#64b694",
        "#b65147",
        "#5bbcca",
        "#a69357",
        "#7d9bd4",
        "#776e49",
        "#6f759b",
        "#648576",
        "#9a7b75",
         "#6e7c8c"]
topic_colors = dict(zip(frame['topic_category'].unique().tolist(), colors))
# print(topic_colors)
In [21]:
# Function for generating chart data for different time periods
out_dir = "/Users/jeriwieringa/Dissertation/site/files/interact/"

def top_topics(start_year='',
               end_year='',
               df=frame,
               width=900, height=800,
               colors_dict=topic_colors,
               save=False):
    
    sizeref = 2. * max(df['category_prevalence']*100) / (110 ** 2)
    
    topics = df[(df['date_formatted'] >= start_year) & (df['date_formatted'] <= end_year)]

    layout1 = go.Layout(
        height=height,
        width=width,
        title="Prevalence in Corpus of Top Five Categories per Year,<br>\
                {} to {}".format(start_year, end_year),
        yaxis=go.layout.YAxis(
            title='Topic Categories',
            automargin=True,
        ),
    )

    # Chart logic: the bubble chart for a span of years is a compilation of one
    # scatter trace per topic category. For each trace, the x series is the year
    # and the y series is the topic category; marker size encodes the category's
    # prevalence and color identifies the category. A category that does not appear
    # in a year's top five simply has no marker for that year.
    
    traces = []

    for cat in topics['topic_category'].unique().tolist():
        cat_df = topics[topics['topic_category'] == cat]
        
        hover_text = []
        for index, row in cat_df.iterrows():
            hover_text.append("Year: {}<br>Prevalence: {}%".format(
                row['date_formatted'], row['category_prevalence'] * 100))
        
        graph_data = go.Scatter(
            x = cat_df['date_formatted'],
            y = cat_df['topic_category'],
            text = hover_text,
            mode='markers',
            marker=dict(
                symbol='circle',
                size=cat_df['category_prevalence']*100,
                sizemode='area',
                sizeref=sizeref,
                color=colors_dict[cat]
                ),
            showlegend=False
            )

        traces.append(graph_data)
        
    fig = go.Figure(data=traces, layout=layout1)
    
    iplot(fig)
    if save:
        plot(fig, filename=os.path.join(
            out_dir,
            'Yearly_Aggreg_Categories_{}_to_{}.html'.format(start_year, end_year)))
In [22]:
# Test run
top_topics(1848, 1855)
In [23]:
start_years = range(years[0], years[-1], 6)
for each in start_years:
    top_topics(each, each+5, save=True)
In [24]:
top_topics(start_year=1849, end_year=1920, height=1200, width=1000, save=True)