Charts-of-corpus-statistics

In [1]:
import os
import pandas as pd
In [2]:
# Input and output locations. The defaults are the original machine-specific paths;
# set DISSERTATION_DATA_DIR / DISSERTATION_VIZ_DIR to run the notebook elsewhere
# without editing this cell.
dataDir = os.environ.get("DISSERTATION_DATA_DIR", "/Users/jeriwieringa/Dissertation/data/")
vizDir = os.environ.get("DISSERTATION_VIZ_DIR", "/Users/jeriwieringa/Dissertation/site/files/interact/")
In [3]:
# Token statistics, one row per file (see the preview in the next cell).
stats_csv_path = os.path.join(dataDir, 'corpus_metadata', 'yearlyStats.csv')
stats = pd.read_csv(stats_csv_path)
In [4]:
# Preview the first ten rows of the loaded statistics.
stats.head(10)
Out[4]:
filename title_abbrev year issue total_tokens total_unique_tokens counts
0 RH18950924-V72-39-page8.txt RH 1895 RH18950924-V72-39 1716 595 Counter({'the': 164, 'of': 70, 'and': 63, 'to'...
1 RH19100217-V87-07-page6.txt RH 1910 RH19100217-V87-07 1337 525 Counter({'the': 74, 'to': 53, 'of': 47, 'and':...
2 YI19080804-V56-31-page3.txt YI 1908 YI19080804-V56-31 867 365 Counter({'the': 114, 'of': 47, 'in': 34, 'is':...
3 YI19141124-V62-47-page14.txt YI 1914 YI19141124-V62-47 1266 515 Counter({'the': 84, 'of': 50, 'to': 40, 'and':...
4 LB19000101-V02-11-page16.txt LB 1900 LB19000101-V02-11 318 230 Counter({'a': 9, 'and': 8, 'for': 6, 'copies':...
5 LB19001201-V03-10-page4.txt LB 1900 LB19001201-V03-10 1256 472 Counter({'the': 63, 'to': 54, 'and': 43, 'a': ...
6 LibM19120701-V07-03-page31.txt LibM 1912 LibM19120701-V07-03 594 289 Counter({'the': 57, 'of': 42, 'to': 21, 'and':...
7 PHJ18890501-V04-05-page7.txt PHJ 1889 PHJ18890501-V04-05 760 407 Counter({'the': 51, 'of': 31, 'and': 22, 'a': ...
8 LB19031201-V06-12-page29.txt LB 1903 LB19031201-V06-12 642 335 Counter({'the': 37, 'in': 25, 'to': 17, 'and':...
9 PUR19150422-V14-37-page6.txt PUR 1915 PUR19150422-V14-37 1119 546 Counter({'the': 48, 'to': 35, 'and': 33, 'in':...
In [5]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import cufflinks as cf
In [6]:
# Prepare plotly for offline rendering inside the notebook.
# connected=False: presumably loads plotly.js from the installed package rather
# than from a CDN -- confirm against the plotly.offline docs for the version in use.
init_notebook_mode(connected=False)
# Put cufflinks in offline mode; the DataFrame.iplot(...) calls in the chart cells
# go through cufflinks.
cf.go_offline()
In [7]:
# Summary statistics for the numeric columns (year and the two token counts).
stats.describe()
Out[7]:
year total_tokens total_unique_tokens
count 197943.000000 197943.000000 197943.000000
mean 1900.878834 1235.530734 497.310918
std 14.940996 700.018828 227.594774
min 1849.000000 0.000000 0.000000
25% 1892.000000 725.000000 338.000000
50% 1904.000000 1088.000000 473.000000
75% 1913.000000 1676.000000 633.000000
max 1920.000000 5801.000000 2344.000000
In [8]:
# Aggregate the per-file counts into one row per (year, title).
# The value columns are selected with a list (double brackets): selecting GroupBy
# columns with a bare tuple of names is deprecated and raises in pandas 2.0+.
# NOTE: 'aggregated_total_unique_tokens' is the SUM of each file's unique-token
# count, not the number of distinct tokens across the year/title.
yearly = (
    stats
    .groupby(['year', 'title_abbrev'])[['total_tokens', 'total_unique_tokens']]
    .sum()
    .reset_index()
    .rename(columns={
        'total_tokens': 'aggregated_total_tokens',
        'total_unique_tokens': 'aggregated_total_unique_tokens',
    })
)
In [9]:
# Preview the first ten (year, title) aggregates.
yearly.head(10)
Out[9]:
year title_abbrev aggregated_total_tokens aggregated_total_unique_tokens
0 1849 PTAR 42265 16256
1 1850 PTAR 184930 74837
2 1850 RH 42368 14700
3 1851 RH 329053 106268
4 1852 RH 397682 124892
5 1852 YI 22892 8749
6 1853 RH 613592 204923
7 1853 YI 71383 28042
8 1854 RH 851408 282081
9 1854 YI 92622 38963
In [10]:
# Reshape to wide form for plotting: one row per year, one column per title,
# cells holding that title's token total for the year (NaN where there is no row).
y_pivot = yearly.pivot(
    index='year',
    columns='title_abbrev',
    values="aggregated_total_tokens",
)
In [11]:
# Preview the first ten years of the wide table.
y_pivot.head(10)
Out[11]:
title_abbrev ADV ARAI AmSn CE CUV EDU GCB GH GOH GS ... PUR RH SOL ST SUW Sligo TCOG TMM WMH YI
year
1849 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1850 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN 42368.0 NaN NaN NaN NaN NaN NaN NaN NaN
1851 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN 329053.0 NaN NaN NaN NaN NaN NaN NaN NaN
1852 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN 397682.0 NaN NaN NaN NaN NaN NaN NaN 22892.0
1853 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN 613592.0 NaN NaN NaN NaN NaN NaN NaN 71383.0
1854 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN 851408.0 NaN NaN NaN NaN NaN NaN NaN 92622.0
1855 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN 486843.0 NaN NaN NaN NaN NaN NaN NaN 81764.0
1856 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN 838180.0 NaN NaN NaN NaN NaN NaN NaN NaN
1857 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN 902060.0 NaN NaN NaN NaN NaN NaN NaN NaN
1858 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN 866581.0 NaN NaN NaN NaN NaN NaN NaN NaN

10 rows × 30 columns

In [12]:
# Stacked bars: one bar per year, one segment per periodical title.
title_chart_options = dict(
    kind='bar',
    barmode='stack',
    yTitle='Total Tokens in Corpus',
    title='Tokens in Corpus per Year and Title',
    asFigure=True,  # hand back the figure so it can be both shown and saved
)
fig = y_pivot.iplot(**title_chart_options)

# Show inline, then write a standalone HTML copy into vizDir.
iplot(fig)
plot(fig, filename=os.path.join(vizDir, "tokens_title_year.html"))
Out[12]:
'file:///Users/jeriwieringa/Dissertation/site/files/interact/tokens_title_year.html'
In [13]:
# %load /Users/jeriwieringa/Dissertation/dev/code/shared_elements/load_from_gsheets.py
# Load Labels
# (The line above is the %load magic that pulled this cell's body in from the shared script.)
# Authorizes a gspread client with a service-account key and opens the spreadsheet
# titled 'Topics'; the resulting `dts` handle is read by the next cell.

import gspread
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd

# Load data from Google Doc
# NOTE(review): only the spreadsheets feed scope is requested. Newer gspread releases
# may also need a Drive scope to open a spreadsheet by title -- confirm for the
# installed version.
scope = ['https://spreadsheets.google.com/feeds']
# Machine-specific absolute path to the service-account JSON key file.
# Keep that file out of version control.
secrets = "/Users/jeriwieringa/Dissertation/dev/code/secrets/dissertation-881847769b13.json"
credentials = ServiceAccountCredentials.from_json_keyfile_name(secrets, scope)
gc = gspread.authorize(credentials)

# Open the spreadsheet by its title.
doc_name = 'Topics'
dts = gc.open(doc_name)
In [14]:
# Read every record from the spreadsheet's first worksheet into a DataFrame.
topic_records = dts.sheet1.get_all_records()
frame = pd.DataFrame(topic_records)
In [15]:
# Attach the spreadsheet's per-title metadata to each (year, title) row.
# how='left' keeps every row of `yearly`; a title with no match in `frame`
# gets NaN in the metadata columns.
title_topics = yearly.merge(
    frame,
    how='left',
    left_on='title_abbrev',
    right_on='title',
)
In [16]:
# Preview the first ten merged rows.
title_topics.head(10)
Out[16]:
year title_abbrev aggregated_total_tokens aggregated_total_unique_tokens endYear initialPubLocation periodicalTitle startYear title topic
0 1849 PTAR 42265 16256 1850 Middletown, CT Present Truth (Advent Review) 1849 PTAR Denominational
1 1850 PTAR 184930 74837 1850 Middletown, CT Present Truth (Advent Review) 1849 PTAR Denominational
2 1850 RH 42368 14700 1920 Paris, ME Review and Herald 1850 RH Denominational
3 1851 RH 329053 106268 1920 Paris, ME Review and Herald 1850 RH Denominational
4 1852 RH 397682 124892 1920 Paris, ME Review and Herald 1850 RH Denominational
5 1852 YI 22892 8749 1920 Rochester, NY Youth's Instructor 1852 YI Denominational
6 1853 RH 613592 204923 1920 Paris, ME Review and Herald 1850 RH Denominational
7 1853 YI 71383 28042 1920 Rochester, NY Youth's Instructor 1852 YI Denominational
8 1854 RH 851408 282081 1920 Paris, ME Review and Herald 1850 RH Denominational
9 1854 YI 92622 38963 1920 Rochester, NY Youth's Instructor 1852 YI Denominational
In [17]:
# Token totals per (year, topic), summed over every title that shares the topic.
# NOTE(review): groupby drops NaN keys by default, so rows whose topic is missing
# after the merge would be left out of these totals -- confirm every title is labelled.
titleTopics = (
    title_topics
    .groupby(['year', 'topic'])['aggregated_total_tokens']
    .sum()
    .reset_index()
)
In [18]:
# Reshape to wide form for plotting: one row per year, one column per topic.
y_titleTopics = titleTopics.pivot(
    index='year',
    columns='topic',
    values='aggregated_total_tokens',
)
In [19]:
# Stacked bars: one bar per year, one segment per topic.
topic_chart_options = dict(
    kind='bar',
    barmode='stack',
    yTitle='Total Tokens in Corpus',
    title="Tokens in Corpus by Year and Subject Area",
    asFigure=True,  # hand back the figure so it can be both shown and saved
)
fig2 = y_titleTopics.iplot(**topic_chart_options)

# Show inline, then write a standalone HTML copy into vizDir.
iplot(fig2)
plot(fig2, filename=os.path.join(vizDir, 'tokens-subject-year.html'))
Out[19]:
'file:///Users/jeriwieringa/Dissertation/site/files/interact/tokens-subject-year.html'
In [20]:
# Total tokens per periodical title across all years.
tokensTitle = (
    stats
    .groupby(['title_abbrev'])['total_tokens']
    .sum()
    .reset_index()
)
In [21]:
# One bar per periodical title; bar height is that title's total token count.
data = [
    go.Bar(
        x=tokensTitle['title_abbrev'],
        y=tokensTitle['total_tokens']
    )
]
# Give the chart a title and axis labels so the saved HTML stands alone,
# as the two stacked charts do.
layout = go.Layout(
    title='Tokens in Corpus per Title',
    xaxis=dict(title='Title'),
    yaxis=dict(title='Total Tokens in Corpus')
)
fig3 = go.Figure(data=data, layout=layout)

# Show inline (image='png' as in the original call), then write the standalone HTML copy.
iplot(fig3, image='png')
plot(fig3, filename=os.path.join(vizDir, "tokens-per-title.html"))
Out[21]:
'file:///Users/jeriwieringa/Dissertation/site/files/interact/tokens-per-title.html'

Total Tokens

In [22]:
# Corpus-wide totals of the two aggregated columns.
# NOTE: the unique-token figure adds up the per-(year, title) sums of per-file unique
# counts, so it is not the size of the corpus vocabulary.
yearly[['aggregated_total_tokens', 'aggregated_total_unique_tokens']].sum()
Out[22]:
aggregated_total_tokens           244564660
aggregated_total_unique_tokens     98439215
dtype: int64
In [ ]: