In [1]:
import os
import pandas as pd
In [2]:
# Directory of corpus text files; passed to gather_stats() below as its input directory.
# NOTE(review): absolute, machine-specific path — change it before running on another machine.
text_dir = "/Users/jeriwieringa/Dissertation/sources/text/2017-04-Final-Corpus/"
In [3]:
from collections import Counter
import csv
import glob
from text2topics import utilities
import string

def gather_stats(dir, out_dir):
    """Write per-file token statistics for every file in ``dir`` to a CSV file.

    One header row is written, followed by one row per regular file found
    directly inside ``dir`` (sorted by path so the output is reproducible).
    Each row holds: filename, title_abbrev, year, issue, total_tokens,
    total_unique_tokens, counts.

    Args:
        dir: Path of the directory whose files are read. (The name shadows the
            ``dir`` builtin; it is kept so existing keyword callers still work.)
        out_dir: Path of the CSV *file* to write, despite the name. It is
            opened in ``'w'`` mode, so an existing file is overwritten.

    Returns:
        None. The results are only written to ``out_dir``.

    Notes:
        - ``title_abbrev`` and ``year`` come from ``utilities.get_title`` and
          ``utilities.get_year`` applied to the file's base name; ``issue`` is
          the first three ``-``-separated parts of the base name.
        - ``counts`` is a ``collections.Counter``; ``csv.writer`` stores it as
          its ``str()`` form, e.g. ``Counter({'the': 3, 'of': 1})``.
        - Assumes ``utilities.to_lower`` returns a sized collection of
          hashable tokens (``len()`` and ``set()`` are applied to it) — TODO confirm.
    """
    # Translation table that deletes ASCII punctuation before tokenizing.
    transl = str.maketrans('', '', string.punctuation)

    # glob returns paths in arbitrary order; sort for a stable row order.
    file_list = sorted(glob.iglob(os.path.join(dir, '*')))

    # newline='' is what the csv module requires of files it writes to;
    # without it some platforms emit an extra blank line after every row.
    with open(out_dir, 'w', newline='') as out:
        writer = csv.writer(out)
        writer.writerow(['filename', 'title_abbrev', 'year', 'issue', 'total_tokens', 'total_unique_tokens', 'counts'])
        for each in file_list:
            # Sub-directories cannot be opened as text files; skip them.
            if not os.path.isfile(each):
                continue

            filename = os.path.basename(each)
            abbrev = utilities.get_title(filename)
            year = utilities.get_year(filename)
            issue = '-'.join(filename.split('-')[:3])

            with open(each) as f:
                content = f.read()

            # Strip punctuation, tokenize, then lowercase before counting.
            tokens = utilities.to_lower(utilities.tokenize_text(content.translate(transl)))
            counts = Counter(tokens)
            writer.writerow([filename, abbrev, year, issue, len(tokens), len(set(tokens)), counts])
In [5]:
# Compute token statistics for every file in text_dir and write them to
# data/corpus_metadata/yearlyStats.csv (the file is overwritten if it already exists).
# NOTE(review): the output location is an absolute, machine-specific path.
gather_stats(text_dir, os.path.join('/Users/jeriwieringa/Dissertation/', 'data', 'corpus_metadata', 'yearlyStats.csv'))
In [ ]: