Save_Title_Stats
In [1]:
import os
import pandas as pd
In [2]:
# Directory whose files are read (in text mode) and summarised by gather_stats.
text_dir = "/Users/jeriwieringa/Dissertation/sources/text/2017-04-Final-Corpus/"
In [3]:
from collections import Counter
import csv
import glob
from text2topics import utilities
import string
def gather_stats(dir, out_dir):
    """Write per-file token statistics for a directory of text files to a CSV.

    Each regular file directly inside ``dir`` is read, stripped of the
    characters in ``string.punctuation``, passed through
    ``utilities.tokenize_text`` and ``utilities.to_lower``, and summarised
    as one CSV row.

    Args:
        dir: Directory containing the text files. (The name shadows the
            builtin but is kept so existing callers keep working.)
        out_dir: Path of the CSV *file* to create or overwrite. Despite the
            name, this is a file path, not a directory.

    Returns:
        None. One row per input file is written to ``out_dir`` with the
        columns ``filename``, ``title_abbrev``, ``year``, ``issue`` (the
        first three hyphen-separated fields of the filename),
        ``total_tokens``, ``total_unique_tokens`` and ``counts``. The
        ``counts`` cell holds the ``str()`` form of a ``collections.Counter``
        (``Counter({...})``), because ``csv.writer`` stringifies non-string
        values.
    """
    transl = str.maketrans('', '', string.punctuation)

    # Build the file list before opening the output so that rows come out in
    # a reproducible (sorted) order, and skip sub-directories, which would
    # otherwise make open() raise IsADirectoryError.
    file_list = sorted(
        path
        for path in glob.iglob(os.path.join(dir, '*'))
        if os.path.isfile(path)
    )

    # newline='' is required by the csv module so that the writer's own
    # '\r\n' terminators are not translated again (avoids '\r\r\n' on
    # Windows). The encoding is pinned so output does not depend on locale.
    with open(out_dir, 'w', newline='', encoding='utf-8') as out:
        writer = csv.writer(out)
        writer.writerow(['filename', 'title_abbrev', 'year', 'issue', 'total_tokens', 'total_unique_tokens', 'counts'])

        for each in file_list:
            filename = os.path.basename(each)
            abbrev = utilities.get_title(filename)
            year = utilities.get_year(filename)
            issue = '-'.join(filename.split('-')[:3])

            with open(each, encoding='utf-8') as f:
                content = f.read()

            tokens = utilities.to_lower(utilities.tokenize_text(content.translate(transl)))
            counts = Counter(tokens)

            # len(counts) is the number of distinct tokens, i.e. the same
            # value as len(set(tokens)) without building a second container.
            writer.writerow([filename, abbrev, year, issue, len(tokens), len(counts), counts])
In [5]:
# Write the token statistics for the files in text_dir to yearlyStats.csv.
gather_stats(text_dir, os.path.join('/Users/jeriwieringa/Dissertation/', 'data', 'corpus_metadata', 'yearlyStats.csv'))
In [ ]: