final-ocr-error-rates

In [1]:
# Enable IPython's autoreload extension so that edits to imported modules
# (e.g. text2topics) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
In [2]:
import csv
from text2topics import reports
from text2topics import utilities
import os
from os import listdir
from os.path import isfile, join
In [3]:
# Directory holding the word-list files used to build the spelling dictionary.
wordlist_dir = "/Users/jeriwieringa/Dissertation/drafts/data/word-lists"

# File names (relative to wordlist_dir) of the word lists to load.
wordlists = [
    "2016-12-07-SDA-last-names.txt",
    "2016-12-07-SDA-place-names.txt",
    "2016-12-08-SDA-Vocabulary.txt",
    "2017-01-03-place-names.txt",
    "2017-02-14-Base-Word-List-SCOWL&KJV.txt",
    "2017-03-01-Additional-Approved-Words.txt",
    "2017-02-14-Roman-Numerals.txt",
]
In [4]:
# Directory that receives the CSV report written at the end of this notebook.
out_dir = "/Users/jeriwieringa/Dissertation/drafts/data/module-2/"
In [5]:
# Build the spelling dictionary from the word-list files configured above.
# The result is handed to reports.overview_statistics for every title below.
# NOTE(review): the returned type is defined in text2topics.utilities and is
# not visible from this notebook — confirm there if it matters.
spelling_dictionary = utilities.create_spelling_dictionary(wordlist_dir, wordlists)
In [6]:
# Periodical title abbreviations, in the order they are paired (by position)
# with the correction directories read from the final-correction-files list.
# NOTE(review): "GOH" is upper-case here while the matching directory printed
# in the output is "GoH/correction9" — confirm which spelling is intended.
titles = [
    "ADV", "AmSn", "ARAI", "CE",
    "CUV", "EDU", "GCB", "GH",
    "GOH", "GS", "HM", "HR",
    "IR", "LB", "LH", "LibM",
    "LUH", "NMN", "PHJ", "PTAR",
    "PUR", "RH1850-1889", "RH1890-1920", "Sligo",
    "SOL", "ST", "SUW", "TCOG",
    "TMM", "WMH", "YI",
]
In [7]:
# Text file listing one "<title>/correctionN" directory per line.
final_dirs_file = '/Users/jeriwieringa/Dissertation/drafts/data/2017-03-24-final-correction-files.txt'

# Read the whole listing and split it into lines (line endings removed).
with open(final_dirs_file, 'r') as f:
    final_dirs = f.read().splitlines()
In [8]:
# Titles and correction directories are matched purely by position, and zip()
# silently stops at the shorter input. A length mismatch would therefore drop
# titles and could associate the remaining ones with the wrong directory, so
# fail loudly instead of producing a quietly wrong report.
if len(titles) != len(final_dirs):
    raise ValueError(
        "Expected one correction directory per title, got {} titles and {} directories".format(
            len(titles), len(final_dirs)
        )
    )

# List of (title, correction_dir) tuples consumed by the report loop.
pairs = list(zip(titles, final_dirs))
In [9]:
# Root of the cleaned corpus; every entry of final_dirs is resolved against it.
# Defined once here because it does not change between loop iterations.
base_dir = "/Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning"

# Write one row per title with its average verified rate.
# newline='' is what the csv module asks for when opening a file for a writer:
# without it, platforms that translate line endings emit an extra '\r' per row.
with open(os.path.join(out_dir, 'final_verified_rates.csv'), 'w', newline='') as out:
    csv_writer = csv.writer(out, delimiter=',')
    # NOTE(review): the header says "Initial" although this file holds the
    # final rates. Left unchanged because downstream readers may key on the
    # column name — confirm before renaming.
    csv_writer.writerow(['Title', 'Initial Average Verified Rate'])
    for title, final_dir in pairs:
        # Progress indicator: echo the directory currently being processed.
        print(final_dir)
        stats = reports.overview_statistics(join(base_dir, final_dir), spelling_dictionary, title)
        verified_rate = reports.average_verified_rate(stats)
        csv_writer.writerow([title, verified_rate])
ADV/correction15
AmSn/correction9
ARAI/correction7
CE/correction9
CUV/correction9
EDU/correction9
GCB/correction9
GH/correction8
GoH/correction9
GS/correction7
HM/correction9
HR/correction9
IR/correction9
LB/correction9
LH/correction9
LibM/correction10
LUH/correction7
NMN/correction8
PHJ/correction9
PTAR/correction7
PUR/correction8
RH1850-1889/correction8
RH1890-1920/correction9
Sligo/correction8
SOL/correction6
ST/correction9
SUW/correction8
TCOG/correction6
TMM/correction6
WMH/correction8
YI/correction10
In [ ]: