initial-ocr-error-rates
In [1]:
# IPython magics: load the autoreload extension and select mode 2, which
# re-imports modules before each cell runs so that edits to the local
# text2topics package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
In [2]:
import csv
from text2topics import reports
from text2topics import utilities
import os
from os import listdir
from os.path import isfile, join
In [3]:
# Directory that is handed, together with `wordlists`, to
# utilities.create_spelling_dictionary.
wordlist_dir = "/Users/jeriwieringa/Dissertation/drafts/data/word-lists"

# Word-list file names, one per line; the order is kept exactly as
# originally written.
wordlists = [
    "2016-12-07-SDA-last-names.txt",
    "2016-12-07-SDA-place-names.txt",
    "2016-12-08-SDA-Vocabulary.txt",
    "2017-01-03-place-names.txt",
    "2017-02-14-Base-Word-List-SCOWL&KJV.txt",
    "2017-03-01-Additional-Approved-Words.txt",
    "2017-02-14-Roman-Numerals.txt",
]
In [4]:
# Directory that receives the CSV report written later in this notebook.
out_dir = "/Users/jeriwieringa/Dissertation/drafts/data/module-2/"
In [5]:
# Build the spelling dictionary from the word-list files named in `wordlists`
# under `wordlist_dir`.
# NOTE(review): the returned type is defined by text2topics.utilities;
# presumably a collection of accepted tokens -- confirm against that module.
spelling_dictionary = utilities.create_spelling_dictionary(wordlist_dir, wordlists)
In [6]:
# Title identifiers to process. Each one is substituted into the corpus path
# as a sub-directory name and written as the row label in the output CSV.
titles = [
    "ADV", "AmSn", "ARAI", "CE",
    "CUV", "EDU", "GCB", "GH",
    "GOH", "GS", "HM", "HR",
    "IR", "LB", "LH", "LibM",
    "LUH", "NMN", "PHJ", "PTAR",
    "PUR", "RH1850-1889", "RH1890-1920", "Sligo",
    "SOL", "ST", "SUW", "TCOG",
    "TMM", "WMH", "YI",
]
In [7]:
# Write one CSV row per title: the title identifier and the average verified
# rate computed over that title's 'baseline' directory.
#
# newline='' is required by the csv module for files passed to csv.writer;
# without it, platforms that translate '\n' on write (Windows) emit '\r\r\n'
# row endings, which show up as blank lines between rows.
with open(os.path.join(out_dir, 'initial_verified_rates.csv'), 'w', newline='') as out:
    csv_writer = csv.writer(out, delimiter=',')
    csv_writer.writerow(['Title', 'Initial Average Verified Rate'])

    for title in titles:
        # Each title has its own directory inside the split corpus.
        base_dir = "/Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/{}/".format(title)

        # NOTE(review): overview_statistics / average_verified_rate come from
        # text2topics.reports; presumably they score the documents' tokens
        # against spelling_dictionary -- confirm against that module.
        stats = reports.overview_statistics(join(base_dir, 'baseline'), spelling_dictionary, title)
        verified_rate = reports.average_verified_rate(stats)

        csv_writer.writerow([title, verified_rate])
In [ ]: