initial-ocr-error-rates

In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
import csv
from text2topics import reports
from text2topics import utilities
import os
from os import listdir
from os.path import isfile, join
In [3]:
# Directory that holds the word-list files named in `wordlists`.
wordlist_dir = "/Users/jeriwieringa/Dissertation/drafts/data/word-lists"

# Word-list file names; these are handed, together with `wordlist_dir`, to
# utilities.create_spelling_dictionary to build the spelling dictionary.
wordlists = [
    "2016-12-07-SDA-last-names.txt",
    "2016-12-07-SDA-place-names.txt",
    "2016-12-08-SDA-Vocabulary.txt",
    "2017-01-03-place-names.txt",
    "2017-02-14-Base-Word-List-SCOWL&KJV.txt",
    "2017-03-01-Additional-Approved-Words.txt",
    "2017-02-14-Roman-Numerals.txt",
]
In [4]:
# Output directory; the 'initial_verified_rates.csv' report is written here.
out_dir = "/Users/jeriwieringa/Dissertation/drafts/data/module-2/"
In [5]:
# Build the spelling dictionary from the word-list files in `wordlist_dir`.
# It is later passed to reports.overview_statistics for every title.
# NOTE(review): presumably a set-like collection of accepted tokens — confirm
# against text2topics.utilities.create_spelling_dictionary.
spelling_dictionary = utilities.create_spelling_dictionary(wordlist_dir, wordlists)
In [6]:
# Periodical title IDs to report on. Each ID is also the name of that title's
# sub-directory inside the corpus directory (it is substituted into the corpus
# path when the statistics are computed).
titles = [
    "ADV", "AmSn", "ARAI", "CE",
    "CUV", "EDU", "GCB", "GH",
    "GOH", "GS", "HM", "HR",
    "IR", "LB", "LH", "LibM",
    "LUH", "NMN", "PHJ", "PTAR",
    "PUR", "RH1850-1889", "RH1890-1920", "Sligo",
    "SOL", "ST", "SUW", "TCOG",
    "TMM", "WMH", "YI",
]
In [7]:
# Write one CSV row per periodical title: the title ID and the average
# verified rate computed from that title's 'baseline' directory.
#
# newline='' is what the csv module requires for files handed to csv.writer:
# the writer emits its own '\r\n' line terminator, and without newline='' a
# platform that translates '\n' on write produces '\r\r\n' (blank lines
# between rows when the file is opened in a spreadsheet).
with open(os.path.join(out_dir, 'initial_verified_rates.csv'), 'w', newline='') as out:
    csv_writer = csv.writer(out, delimiter=',')
    csv_writer.writerow(['Title', 'Initial Average Verified Rate'])
    for title in titles:
        # Each title has its own directory inside the cleaned corpus.
        base_dir = "/Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/{}/".format(title)
        # NOTE(review): overview_statistics presumably checks the documents in
        # the 'baseline' directory against the spelling dictionary — confirm
        # the shape of `stats` in text2topics.reports.
        stats = reports.overview_statistics(join(base_dir, 'baseline'), spelling_dictionary, title)
        verified_rate = reports.average_verified_rate(stats)
        csv_writer.writerow([title, verified_rate])
In [ ]: