Charts-of-corpus-statistics
In [1]:
import os
import pandas as pd
In [2]:
# Locations of the corpus data and of the generated interactive charts.
# Each path can be overridden with an environment variable so the notebook runs
# on other machines; the defaults are the original author's local paths.
dataDir = os.environ.get("DISSERTATION_DATA_DIR", "/Users/jeriwieringa/Dissertation/data/")
vizDir = os.environ.get("DISSERTATION_VIZ_DIR", "/Users/jeriwieringa/Dissertation/site/files/interact/")
In [3]:
# Read the yearly statistics table from the corpus metadata folder.
yearly_stats_csv = os.path.join(dataDir, 'corpus_metadata', 'yearlyStats.csv')
stats = pd.read_csv(yearly_stats_csv)
In [4]:
# Preview the first ten rows of the loaded table.
stats.head(10)
Out[4]:
In [5]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import cufflinks as cf
In [6]:
# Prepare Plotly for rendering figures inside the notebook.
# NOTE(review): connected=False presumably bundles plotly.js into the notebook
# instead of loading it from a CDN — confirm against the installed plotly version.
init_notebook_mode(connected=False)
# Put cufflinks in offline mode too; the DataFrame `.iplot(...)` calls in later
# cells appear to rely on it — TODO confirm.
cf.go_offline()
In [7]:
# Summary statistics for the columns of the loaded table.
stats.describe()
Out[7]:
In [8]:
# Sum the token counts for every (year, title) pair.
# The columns are selected with a *list*: indexing a GroupBy with a bare tuple
# of names (`[...]['a', 'b']`) was deprecated in pandas 1.0 and is rejected by
# pandas 2.x.
# NOTE(review): adding up per-row `total_unique_tokens` does not give the number
# of distinct tokens in the group if rows share vocabulary — confirm this is the
# intended measure.
yearly = (
    stats
    .groupby(['year', 'title_abbrev'])[['total_tokens', 'total_unique_tokens']]
    .sum()
    .reset_index()
)
# Rename the summed columns so downstream cells can tell them apart from the raw ones.
yearly.columns = ['year', 'title_abbrev', 'aggregated_total_tokens', 'aggregated_total_unique_tokens']
In [9]:
# Preview the first ten aggregated rows.
yearly.head(10)
Out[9]:
In [10]:
# Reshape to wide form: one row per year, one column per title,
# cells holding the aggregated token totals.
y_pivot = yearly.pivot(
    index='year',
    columns='title_abbrev',
    values="aggregated_total_tokens",
)
In [11]:
# Preview the first ten rows of the year-by-title table.
y_pivot.head(10)
Out[11]:
In [12]:
# Stacked bar chart built from the year-by-title table.
title_chart_options = dict(
    kind='bar',
    barmode='stack',
    yTitle='Total Tokens in Corpus',
    title='Tokens in Corpus per Year and Title',
    asFigure=True,
)
fig = y_pivot.iplot(**title_chart_options)

# Show the figure inline, then write a standalone HTML copy into vizDir.
iplot(fig)
title_chart_path = os.path.join(vizDir, "tokens_title_year.html")
plot(fig, filename=title_chart_path)
Out[12]:
In [13]:
# %load /Users/jeriwieringa/Dissertation/dev/code/shared_elements/load_from_gsheets.py
# Load the labels spreadsheet from Google Sheets using a service account.
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd

# OAuth scopes requested for the service account. Opening a spreadsheet by its
# title (`gc.open` below) looks the file up through the Drive API in current
# gspread releases, so the Drive scope is requested alongside the Sheets feed scope.
scope = [
    'https://spreadsheets.google.com/feeds',
    'https://www.googleapis.com/auth/drive',
]

# Path to the service-account key file; the key itself is kept outside the notebook.
secrets = "/Users/jeriwieringa/Dissertation/dev/code/secrets/dissertation-881847769b13.json"
credentials = ServiceAccountCredentials.from_json_keyfile_name(secrets, scope)
gc = gspread.authorize(credentials)

# Open the spreadsheet by title; `dts` is read by the next cell.
doc_name = 'Topics'
dts = gc.open(doc_name)
In [14]:
# Pull every record from the first worksheet and load the records into a DataFrame.
topic_records = dts.sheet1.get_all_records()
frame = pd.DataFrame(topic_records)
In [15]:
# Attach the spreadsheet columns to the yearly totals. The left join keeps every
# yearly row, matching `title_abbrev` against the sheet's `title` column.
title_topics = yearly.merge(
    frame,
    how='left',
    left_on='title_abbrev',
    right_on='title',
)
In [16]:
# Preview the first ten merged rows.
title_topics.head(10)
Out[16]:
In [17]:
# Total tokens for each (year, topic) pair, flattened back to ordinary columns.
tokens_by_year_topic = title_topics.groupby(['year', 'topic'])['aggregated_total_tokens'].sum()
titleTopics = tokens_by_year_topic.reset_index()
In [18]:
# Reshape to wide form: one row per year, one column per topic.
y_titleTopics = titleTopics.pivot(
    index='year',
    columns='topic',
    values='aggregated_total_tokens',
)
In [19]:
# Stacked bar chart built from the year-by-topic table.
subject_chart_options = dict(
    kind='bar',
    barmode='stack',
    yTitle='Total Tokens in Corpus',
    title="Tokens in Corpus by Year and Subject Area",
    asFigure=True,
)
fig2 = y_titleTopics.iplot(**subject_chart_options)

# Show the figure inline, then write a standalone HTML copy into vizDir.
iplot(fig2)
subject_chart_path = os.path.join(vizDir, 'tokens-subject-year.html')
plot(fig2, filename=subject_chart_path)
Out[19]:
In [20]:
# Total tokens per title across all years, as a two-column frame.
tokens_grouped_by_title = stats.groupby(['title_abbrev'])
tokensTitle = tokens_grouped_by_title['total_tokens'].sum().reset_index()
In [21]:
# Single bar trace: titles on the x axis, their token totals on the y axis.
title_totals_trace = go.Bar(
    x=tokensTitle['title_abbrev'],
    y=tokensTitle['total_tokens'],
)
data = [title_totals_trace]

# Show the chart inline (requesting a PNG image), then write a standalone HTML copy.
iplot(data, image='png')
plot(data, filename=os.path.join(vizDir, "tokens-per-title.html"))
Out[21]:
Total Tokens
In [22]:
# Grand totals of the two aggregated token columns across all years and titles.
token_total_columns = ['aggregated_total_tokens', 'aggregated_total_unique_tokens']
yearly[token_total_columns].sum()
Out[22]:
In [ ]: