IR-OCR-Evaluation-and-Correction

In [1]:
%load_ext autoreload
In [2]:
%autoreload 2
In [3]:
from text2topics import reports
from text2topics import utilities
from text2topics import clean
import re
import os
from os import listdir
from os.path import isfile, join
import collections
In [4]:
%matplotlib inline
In [5]:
wordlist_dir = "/Users/jeriwieringa/Dissertation/drafts/data/word-lists"
wordlists = ["2016-12-07-SDA-last-names.txt", 
             "2016-12-07-SDA-place-names.txt", 
             "2016-12-08-SDA-Vocabulary.txt", 
             "2017-01-03-place-names.txt", 
             "2017-02-14-Base-Word-List-SCOWL&KJV.txt",
             "2017-02-14-Roman-Numerals.txt",
             "2017-03-01-Additional-Approved-Words.txt"
            ]
In [6]:
spelling_dictionary = utilities.create_spelling_dictionary(wordlist_dir, wordlists)
In [7]:
title = "IR"
In [8]:
base_dir = "/Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/{}/".format(title)

Baseline

In [9]:
cycle = 'baseline'
In [10]:
stats = reports.overview_report(join(base_dir, cycle), spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/IR/baseline

Average verified rate: 0.9209845429108203

Average of error rates: 0.08383389544688026

Total token count: 1176871

In [11]:
errors_summary = reports.get_errors_summary( stats )
reports.top_errors( errors_summary, 100 )
Out[11]:
[('w', 4512),
 ('e', 1999),
 ('m', 1961),
 ('-', 1701),
 ('r', 1655),
 ('f', 1614),
 ('¥', 1404),
 ('d', 1356),
 ('n', 1037),
 ('t', 984),
 ('re-', 939),
 (')', 866),
 ('g', 852),
 ("'", 837),
 ('con-', 837),
 ('(', 767),
 ('in-', 718),
 ('tion', 661),
 ('th', 567),
 ('co', 488),
 ('be-', 387),
 ('mt', 375),
 ('com-', 366),
 ('de-', 349),
 ('ñ', 348),
 ('rd', 336),
 ('ence', 323),
 ('u', 318),
 ('en-', 287),
 ('*', 278),
 ('meet-', 271),
 ('confer-', 268),
 ('wm', 254),
 ('ex-', 254),
 ('ad-', 253),
 ('ference', 248),
 ('tions', 245),
 ('ment', 245),
 ('indi-', 242),
 ('k', 211),
 ('mo', 206),
 ('mis-', 200),
 ('_', 198),
 ('ac-', 197),
 ('sab-', 194),
 ('*mrs', 188),
 ('at-', 186),
 ('ber', 180),
 ('pro-', 179),
 ('pre-', 178),
 ('presidentñw', 175),
 ('camp-', 172),
 ('ple', 168),
 ('ers', 163),
 ('peo-', 162),
 ('im-', 161),
 ('ap-', 157),
 ('ly', 157),
 ('an-', 148),
 ('dis-', 148),
 ('%', 144),
 ('un-', 138),
 ('rocklane', 137),
 ('seventh-', 132),
 ('ren', 129),
 ('side)', 127),
 ('per-', 127),
 ('/', 126),
 ('sionary', 124),
 ('inter-', 122),
 ('adiana', 121),
 ('secretaryñw', 120),
 ('num-', 118),
 ('sub-', 118),
 ('ary', 116),
 ('ance', 112),
 ('z', 108),
 ('breth-', 108),
 ('can-', 107),
 ('mem-', 106),
 ('mes-', 104),
 ('pr', 104),
 ('sabbath-', 102),
 ('for-', 101),
 ('ent', 101),
 ("canvassers'", 101)]

Check Special Character Use

In [12]:
reports.tokens_with_special_characters(errors_summary)[:500]
Out[12]:
[('¥', 1404),
 (')', 866),
 ('(', 767),
 ('ñ', 348),
 ('*', 278),
 ('_', 198),
 ('*mrs', 188),
 ('presidentñw', 175),
 ('%', 144),
 ('side)', 127),
 ('/', 126),
 ('secretaryñw', 120),
 ('*a', 99),
 ('ñc', 89),
 ('\\v', 74),
 ('committeeñw', 72),
 ('=', 72),
 ('ña', 71),
 ('secretaryñmrs', 65),
 ('ñthe', 60),
 ('treasurerñw', 58),
 ('street)', 58),
 ('*w', 57),
 ('secretariesña', 56),
 ('(west', 56),
 ('treasurerña', 56),
 ('treasurerñt', 55),
 ('*c', 54),
 ('missionaryñr', 53),
 ('*j', 50),
 ('¥¥', 50),
 ('*e', 45),
 ('ô', 41),
 ('(east', 39),
 ('¡', 38),
 ('i)', 38),
 ('ã', 34),
 ('*john', 33),
 ('libertyña', 32),
 ('))', 31),
 ('medicalñdr', 31),
 (']', 30),
 ('**', 28),
 ('secretaryñj', 28),
 ('agentñwm', 28),
 ('*r', 28),
 ('wantedña', 27),
 ('`', 27),
 ('a)', 25),
 ('*s', 25),
 ('agentñj', 24),
 ('ñtest', 24),
 ('[', 24),
 ('educationalñmrs', 23),
 ('%v', 22),
 ('(the', 21),
 ('fieldñarcher', 21),
 ('+', 20),
 ('(e', 19),
 ('*henry', 19),
 ('(a)', 19),
 ('¥¥¥', 19),
 ('¥the', 18),
 ('secretaryñc', 18),
 ('*b', 18),
 ('sceretaryñw', 17),
 ('__', 17),
 ('(to', 17),
 ('*charles', 17),
 ('*carrie', 16),
 ('¥mrs', 16),
 ('presidentñmorris', 15),
 ('•', 15),
 ('girl)', 14),
 ('fieldña', 14),
 ('(i', 14),
 ('*g', 13),
 ('*n', 13),
 ('\\ve', 13),
 ('*p', 13),
 ('*d', 13),
 ('*emmanuel', 12),
 ('*martin', 12),
 ('*dr', 12),
 ('*sarah', 12),
 ('¥e', 12),
 ('*l', 12),
 ('*george', 12),
 ('(b)', 12),
 ('*rhoda', 12),
 ('*martha', 12),
 ('(continued', 12),
 ('(and', 12),
 ('ñb', 12),
 ('*frank', 12),
 ('*james', 11),
 ('*samuel', 11),
 ('medicalñda', 11),
 ('*u', 11),
 ('a¥', 11),
 ('¥a', 11),
 ('*fannie', 11),
 ('it)', 11),
 ('*stephen', 11),
 ('(if', 11),
 ('ñto', 11),
 ('ñdied', 11),
 ('*archer', 11),
 ('treasurerñjno', 11),
 ('*thomas', 11),
 ('their¥', 10),
 ('*walter', 10),
 ('nd)', 10),
 ('>', 10),
 ('-¥', 10),
 ('*blanche', 10),
 ('~', 10),
 ('*robert', 10),
 ('*margaret', 10),
 ('(paid)', 10),
 ('(see', 10),
 ('¥w', 10),
 ('(c)', 10),
 ('*geo', 10),
 ('for¥', 10),
 ('#', 9),
 ('*nellie', 9),
 ('*celia', 9),
 ('¥d', 9),
 ('()', 9),
 ('(luring', 9),
 ('(mal', 9),
 ('%%', 9),
 ('\\', 9),
 ('fieldñ', 8),
 ('¥j', 8),
 ('the¥', 8),
 ('¥h', 8),
 ('(or', 8),
 ('(e)', 8),
 ('ñyes', 8),
 ('ñbut', 8),
 ('departmentñdr', 8),
 ('ñwe', 8),
 ('•mrs', 8),
 ('ñin', 8),
 ('(concluded', 8),
 ('*victor', 8),
 ('seventh¥day', 8),
 ('ñmrs', 8),
 ("'¥", 7),
 ('¥-', 7),
 ('st)', 7),
 ('/%', 7),
 ('ñi', 7),
 ('twenty¥five', 7),
 ('*mary', 7),
 ('¥of', 7),
 ('-)', 7),
 ('~~', 7),
 ('ñj', 7),
 ('in¥', 7),
 ('ó', 7),
 ('¥m', 7),
 ('(for', 7),
 ('(lev', 6),
 ('ñan', 6),
 ('(d)', 6),
 ('\ufeff', 6),
 ('***', 6),
 ('(page', 6),
 ('c)', 6),
 ('(heb', 6),
 ('(a', 6),
 ('•e', 6),
 ('ñselected', 6),
 ('¥c', 6),
 ('fieldñburt', 6),
 ('our¥', 6),
 ('ñelder', 6),
 ('*glesner', 6),
 ('*¥', 6),
 ('<', 6),
 ('ñand', 6),
 ('ñthat', 6),
 ('ñed', 6),
 ('¥s', 6),
 ('ñit', 6),
 ('*chris', 6),
 ('ñall', 6),
 ('ñprov', 6),
 ('*raymond', 6),
 ('*clara', 6),
 ('^', 6),
 ('ñvol', 6),
 ('i¥', 6),
 ('ñr', 6),
 ('ñps', 6),
 ('û', 6),
 ('ôô', 5),
 ('¤', 5),
 ('i/', 5),
 ('*lydia', 5),
 ('t)', 5),
 ('¥¥¥¥¥', 5),
 ('wantedñan', 5),
 ('rebecca__', 5),
 ('(gen', 5),
 ('ñeditor', 5),
 ('(in', 5),
 ('¥la', 5),
 ('sabbath-schoolñmrs', 5),
 ('medicalñdu', 5),
 ('¥i', 5),
 ('departmentñda', 5),
 ('si)', 5),
 ('*millie', 5),
 ('((', 5),
 ('¥r', 5),
 ('=mrs', 5),
 ('*adelia', 5),
 ('(not', 5),
 ('(sr', 5),
 ('w¥', 5),
 ('(lays', 5),
 ('educationalñmae', 5),
 ("')", 5),
 ('camp¥meeting', 5),
 ('¥we', 5),
 ('•m', 4),
 ('ñt', 4),
 ('ci)', 4),
 ('an¥', 4),
 ('(from', 4),
 ('auditorñw', 4),
 ('secretaryña', 4),
 ('j¥', 4),
 ('%c', 4),
 (')r', 4),
 ('elder¥', 4),
 ('cents)', 4),
 ('ñthis', 4),
 ('•a', 4),
 ('\\i', 4),
 ('*bertha', 4),
 ('(is', 4),
 ('to¥', 4),
 ('¥wallace', 4),
 ('saleñmy', 4),
 ('(which', 4),
 ('¥l', 4),
 ('ñnot', 4),
 ('(mt', 4),
 ('presidentñi', 4),
 ('partmentñclara', 4),
 ('[the', 4),
 ('*edgar', 4),
 ('(as', 4),
 ('*gabriella', 4),
 ('it>', 4),
 ('¥mary', 4),
 ('¥xa', 4),
 ('forgetñlest', 4),
 ('(p', 4),
 ('ñm', 4),
 ('¥t', 4),
 ('*gleaner', 4),
 ('ñso', 4),
 ('secretaryñ', 4),
 ('s)', 4),
 ('****', 4),
 ('(hiring', 4),
 ('(matt', 4),
 ('*olive', 4),
 ('l¥', 4),
 ('*church', 4),
 ('educationalñmits', 4),
 ('wantedñgood', 4),
 ('(late', 4),
 (')(', 4),
 ('brotherñyour', 4),
 ('mr¥s', 4),
 ('*everett', 4),
 ('w*', 4),
 ('\\vest', 4),
 ('(new', 4),
 ('ñreview', 4),
 ('medicalñdn', 4),
 ('*wm', 4),
 ('(c', 4),
 ('ñdr', 4),
 ('ñyou', 4),
 ('[to', 4),
 ('ñare', 3),
 ('(v', 3),
 ('(two', 3),
 ('c=', 3),
 ('+d', 3),
 ('allñ', 3),
 ('beastñno', 3),
 ('¥-¥', 3),
 ('+a', 3),
 ('indã', 3),
 ('*cella', 3),
 ('year¥', 3),
 ("sec'yñmrs", 3),
 ('months)', 3),
 ('number¥', 3),
 ('¥hogan', 3),
 ('}', 3),
 ('page)', 3),
 ('other¥', 3),
 ('at¥', 3),
 ('cornett]', 3),
 ('ñby', 3),
 ('(nebraska)', 3),
 ('s¥', 3),
 ('tithe)', 3),
 ('ãndiana', 3),
 ('ñfrom', 3),
 ('supplement)', 3),
 ('c¥', 3),
 ('li)', 3),
 ('departmentñdn', 3),
 ('di)', 3),
 ('(it', 3),
 ('(lone', 3),
 ('a¥nd', 3),
 ('church)', 3),
 ('and¥', 3),
 ('(lay', 3),
 ("('", 3),
 ('of¥', 3),
 ('(ibid', 3),
 ('•c', 3),
 ('(in-', 3),
 ('e***', 3),
 ('(james', 3),
 ('•d', 3),
 ('(at', 3),
 ('ñid', 3),
 ("('reek", 3),
 ('*cash', 3),
 ('committeeñi', 3),
 ('ñtwo', 3),
 ('ñfor', 3),
 ('book)', 3),
 ('r¥', 3),
 ('[paid]', 3),
 ('*vv', 3),
 ('¥he', 3),
 ('___', 3),
 ('ñwas', 3),
 ('(name', 3),
 ('re¥', 3),
 ('(i()', 3),
 ('missionaryñit', 3),
 ('(inventory)', 3),
 ('¥¥¥¥', 3),
 ('ñw', 3),
 ('treasurerñjso', 3),
 ('ñbecause', 3),
 ('¥to', 3),
 ('ñmary', 3),
 ('mccum¡', 3),
 ('et)', 3),
 ('¥*', 3),
 ('è', 3),
 ('pages)', 3),
 ('£', 3),
 ('iña', 3),
 ('brother¥', 3),
 ('t¥', 3),
 ('a_', 3),
 ('ñn', 3),
 ('reporter)', 3),
 ('ñsouthern', 3),
 ('ñsigns', 3),
 ('ñis', 3),
 ('(john', 3),
 ('(f)', 3),
 ('ñthey', 3),
 ('¥one', 3),
 (')ndiana', 3),
 ('maggie__', 3),
 ('number)', 3),
 ('p¥', 3),
 ('ñ-', 3),
 ('en¥', 3),
 ('¥theime', 3),
 ('(lied', 3),
 ('•wallace', 2),
 ('(this', 2),
 ('(august', 2),
 ('¥bass', 2),
 ('ñpaper', 2),
 ('(show', 2),
 ('readings)', 2),
 ('*burt-', 2),
 ('brotherñi', 2),
 ('[continued', 2),
 ('v)', 2),
 ('(eastside)', 2),
 ('n()', 2),
 ('ñthese', 2),
 ('ñwhile', 2),
 ('`no', 2),
 ('_w', 2),
 ('`lo', 2),
 ('¥--', 2),
 ('[or', 2),
 ('(now', 2),
 ('ñministry', 2),
 ('ôn', 2),
 ('twenty¥pive', 2),
 ('identñw', 2),
 ('¥it', 2),
 ('tñ', 2),
 ('beast)', 2),
 ('(nov', 2),
 ('sister¥', 2),
 ("•merchants'", 2),
 ('reidñarcher', 2),
 ('issue)', 2),
 ('ñii', 2),
 ('•wolever', 2),
 ('sô', 2),
 ('saleñthe', 2),
 ('`go', 2),
 ('indianapolis__', 2),
 ('/¥', 2),
 ('readñ', 2),
 ('ct*', 2),
 ('will)', 2),
 ('%vas', 2),
 ('educationalñmils', 2),
 ("+'", 2),
 ("'['he", 2),
 ('o¥', 2),
 ('foldñ', 2),
 ('(inwood)', 2),
 ('¥f', 2),
 ('ñyoung', 2),
 ('a=', 2),
 ('(f', 2),
 ('*churches', 2),
 ('(tuesday)', 2),
 ('~e', 2),
 ('(ia', 2),
 ('booksñand', 2),
 ('camp¥meetings', 2),
 ('ý', 2),
 ('ñada', 2),
 ('(by', 2),
 ('bl)', 2),
 ('indiana)', 2),
 ('(n)', 2),
 ('paper¥', 2),
 ('ini)', 2),
 ('betterñ', 2),
 ('¥they', 2),
 ('itñwho', 2),
 ('\\vords', 2),
 ('(hi', 2),
 ('[from', 2),
 ('¥wm', 2),
 ('ñsarah', 2),
 ('ñsome', 2),
 ('•w', 2),
 ('v%', 2),
 ('+g', 2),
 ('bi)', 2),
 ('it~~', 2),
 ('%veil', 2),
 ('wa¥y', 2),
 ('(laughter', 2),
 ('w)', 2),
 ('saleñeight', 2),
 ('con¥', 2),
 ('—', 2),
 ('is)', 2),
 ('•howe', 2),
 ('ñone', 2),
 ('us)', 2),
 ('teacher)', 2),
 ('`they', 2),
 ('president—w', 2),
 ('c(', 2),
 ('songñ', 2),
 ('g)', 2),
 ('(letter', 2),
 ('in)', 2),
 ('thursday)', 2),
 ('clair)', 2),
 ('ñg', 2),
 ('or¥', 2),
 ('(same', 2),
 ('ha¥', 2),
 ('iã', 2),
 ('presidentñnv', 2)]

Correction 1 -- Normalize Characters

In [13]:
# %load shared_elements/normalize_characters.py
prev = "baseline"
cycle = "correction1"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    # Substitute for all other dashes
    content = re.sub(r"—-—–‑", r"-", content)

    # Substitute formatted apostrophe
    content = re.sub(r"\’\’\‘\'\‛\´", r"'", content)
    
    # Replace all special characters with a space (as these tend to occur at the end of lines)
    content = re.sub(r"[^a-zA-Z0-9\s,.!?$:;\-&\'\"]", r" ", content)
    
    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [14]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/IR/correction1

Average verified rate: 0.9293155255922436

Average of error rates: 0.07463153456998314

Total token count: 1174770

In [15]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[15]:
[('w', 5080),
 ('e', 2163),
 ('m', 1991),
 ('r', 1809),
 ('-', 1798),
 ('f', 1642),
 ('d', 1407),
 ('t', 1102),
 ('n', 1090),
 ('re-', 939),
 ("'", 897),
 ('g', 887),
 ('con-', 837),
 ('in-', 722),
 ('tion', 663),
 ('th', 574),
 ('co', 495),
 ('be-', 388),
 ('mt', 380),
 ('com-', 366),
 ('de-', 349),
 ('rd', 339),
 ('u', 334),
 ('ence', 323),
 ('wm', 291),
 ('en-', 287),
 ('meet-', 272),
 ('confer-', 268),
 ('ex-', 255),
 ('ad-', 253),
 ('ment', 249),
 ('ference', 248),
 ('tions', 246),
 ('indi-', 242),
 ('k', 217),
 ('mo', 208),
 ('mis-', 201),
 ('ac-', 197),
 ('sab-', 194),
 ('at-', 189),
 ('ber', 183),
 ('pro-', 180),
 ('pre-', 179),
 ('camp-', 172),
 ('ple', 168),
 ('ers', 165),
 ('im-', 162),
 ('peo-', 162),
 ('ap-', 158),
 ('ly', 157)]

Correction 2 -- Fix Line Endings

In [16]:
# %load shared_elements/correct_line_endings.py
prev = "correction1"
cycle = "correction2"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    content = re.sub(r"(\w+)(\-\s{1,})([a-z]+)", r"\1\3", content)

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [17]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/IR/correction2

Average verified rate: 0.9417987630580531

Average of error rates: 0.06224536256323778

Total token count: 1154460

In [18]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[18]:
[('w', 5076),
 ('e', 2155),
 ('m', 1990),
 ('r', 1807),
 ('-', 1770),
 ('f', 1640),
 ('d', 1402),
 ('t', 1095),
 ('n', 1089),
 ("'", 897),
 ('g', 887),
 ('th', 572),
 ('co', 493),
 ('mt', 380),
 ('tion', 361),
 ('rd', 339),
 ('u', 334),
 ('wm', 291),
 ('k', 217),
 ('mo', 207),
 ('ence', 177),
 ('re-', 175),
 ('con-', 156),
 ('tions', 147),
 ('in-', 142),
 ('ference', 139),
 ('rocklane', 139),
 ('ment', 131),
 ('adiana', 121),
 ('z', 114),
 ('ber', 110),
 ("canvassers'", 106),
 ('pr', 105),
 ('seventh-', 97),
 ('ple', 88),
 ('geporter', 86),
 ('ers', 86),
 ('x', 83),
 ('horlacher', 81),
 ('ly', 80),
 ('q', 79),
 ('boze', 79),
 ('nd', 78),
 ('iu', 77),
 ('ren', 76),
 ('indpls', 75),
 ('al', 75),
 ('id', 73),
 ('sionary', 71),
 ('com-', 69)]

Correction 3 -- Remove Extra Dashes

In [19]:
# %load shared_elements/remove_extra_dashes.py
prev = "correction2"
cycle = "correction3"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    replacements = []
    for token in tokens:
        if token[0] is "-":
            replacements.append((token, token[1:]))
            
        elif token[-1] is "-":
            replacements.append((token, token[:-1]))
        else:
            pass
        
    if len(replacements) > 0:
#         print("{}: {}".format(filename, replacements))
        
        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [20]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/IR/correction3

Average verified rate: 0.9471213441022522

Average of error rates: 0.05673187183811129

Total token count: 1156043

In [21]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[21]:
[('w', 5094),
 ('e', 2181),
 ('m', 1998),
 ('r', 1828),
 ('f', 1647),
 ('d', 1406),
 ('t', 1126),
 ('n', 1097),
 ("'", 926),
 ('g', 891),
 ('co', 605),
 ('th', 577),
 ('mt', 381),
 ('tion', 363),
 ('rd', 340),
 ('u', 336),
 ('wm', 295),
 ('re', 250),
 ('k', 219),
 ('mo', 210),
 ('ence', 178),
 ('tions', 147),
 ('ference', 139),
 ('rocklane', 139),
 ('ment', 131),
 ('adiana', 121),
 ('z', 120),
 ('ber', 111),
 ("canvassers'", 106),
 ('pr', 105),
 ('ple', 88),
 ('x', 86),
 ('geporter', 86),
 ('ers', 86),
 ('al', 85),
 ('horlacher', 81),
 ('q', 81),
 ('ly', 81),
 ('ren', 79),
 ('boze', 79),
 ('nd', 78),
 ('iu', 77),
 ('indpls', 75),
 ('mc', 75),
 ('ex', 75),
 ('id', 73),
 ('sionary', 71),
 ('ft', 69),
 ('minnick', 67),
 ('ent', 66)]

Correction 4 -- Remove Extra Quotation Marks

In [22]:
# %load shared_elements/replace_extra_quotation_marks.py
prev = "correction3"
cycle = "correction4"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    corrections = []
    for token in tokens:
        token_list = list(token)
        last_char = token_list[-1]

        if last_char is "'":
            if len(token) > 1:
                if token_list[-2] is 's' or 'S':
                    pass
                else:
                    corrections.append((token, re.sub(r"'", r"", token)))
            else:
                pass
        elif token[0] is "'":
            corrections.append((token, re.sub(r"'", r"", token)))   
        else:
            pass
    
    if len(corrections) > 0:
#         print('{}: {}'.format(filename, corrections))

        for correction in corrections:
            content = clean.replace_pair(correction, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [23]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/IR/correction4

Average verified rate: 0.9480189176259711

Average of error rates: 0.05579258010118044

Total token count: 1155959

In [24]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[24]:
[('w', 5095),
 ('e', 2195),
 ('m', 1999),
 ('r', 1834),
 ('f', 1649),
 ('d', 1416),
 ('t', 1132),
 ('n', 1106),
 ('g', 892),
 ("'", 825),
 ('co', 605),
 ('th', 577),
 ('mt', 381),
 ('tion', 363),
 ('rd', 340),
 ('u', 336),
 ('wm', 297),
 ('re', 251),
 ('k', 220),
 ('mo', 210),
 ('ence', 178),
 ('tions', 147),
 ('ference', 139),
 ('rocklane', 139),
 ('ment', 131),
 ('z', 123),
 ('adiana', 121),
 ('ber', 111),
 ('pr', 105),
 ("canvassers'", 103),
 ('ple', 88),
 ('x', 87),
 ('geporter', 86),
 ('ers', 86),
 ('al', 85),
 ('q', 82),
 ('horlacher', 81),
 ('ly', 81),
 ('nd', 79),
 ('ren', 79),
 ('boze', 79),
 ('iu', 77),
 ('indpls', 75),
 ('mc', 75),
 ('ex', 75),
 ('id', 74),
 ('sionary', 71),
 ('ft', 71),
 ('minnick', 67),
 ('ent', 66)]

Correction 5 -- Rejoin Burst Words

In [25]:
# %load shared_elements/rejoin_burst_words.py
prev = "correction4"
cycle = "correction5"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    pattern = re.compile("(\s(\w{1,2}\s){5,})")
    
    replacements = []
    clean.check_splits(pattern, spelling_dictionary, content, replacements)
    
    if len(replacements) > 0:
#         print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()

Check Correction 5

In [26]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/IR/correction5

Average verified rate: 0.9480089308962817

Average of error rates: 0.055806913996627315

Total token count: 1155987

In [27]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[27]:
[('w', 5094),
 ('e', 2194),
 ('m', 1999),
 ('r', 1834),
 ('f', 1649),
 ('d', 1416),
 ('t', 1131),
 ('n', 1105),
 ('g', 892),
 ("'", 825),
 ('co', 605),
 ('th', 577),
 ('mt', 381),
 ('tion', 363),
 ('rd', 340),
 ('u', 336),
 ('wm', 297),
 ('re', 251),
 ('k', 220),
 ('mo', 210),
 ('ence', 178),
 ('tions', 147),
 ('ference', 139),
 ('rocklane', 139),
 ('ment', 131),
 ('z', 123),
 ('adiana', 121),
 ('ber', 111),
 ('pr', 105),
 ("canvassers'", 103),
 ('ple', 88),
 ('x', 87),
 ('geporter', 86),
 ('ers', 86),
 ('al', 85),
 ('q', 82),
 ('horlacher', 81),
 ('ly', 81),
 ('nd', 79),
 ('ren', 79),
 ('boze', 79),
 ('iu', 77),
 ('indpls', 75),
 ('mc', 75),
 ('ex', 75),
 ('id', 74),
 ('sionary', 71),
 ('ft', 71),
 ('minnick', 67),
 ('ent', 66)]

Correction 6 -- Rejoin Split Words

In [28]:
# %load shared_elements/rejoin_split_words.py
prev = "correction5"
cycle = "correction6"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    errors = reports.identify_errors(tokens, spelling_dictionary)

    replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=False)
    
    if len(replacements) > 0:
#         print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_split_words(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [29]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/IR/correction6

Average verified rate: 0.9487000605025607

Average of error rates: 0.05507588532883642

Total token count: 1155323

In [30]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[30]:
[('w', 5087),
 ('e', 2176),
 ('m', 1994),
 ('r', 1825),
 ('f', 1647),
 ('d', 1412),
 ('t', 1111),
 ('n', 1098),
 ('g', 886),
 ("'", 825),
 ('th', 555),
 ('co', 494),
 ('mt', 381),
 ('tion', 355),
 ('rd', 339),
 ('u', 335),
 ('wm', 297),
 ('k', 219),
 ('mo', 208),
 ('re', 187),
 ('ence', 178),
 ('tions', 146),
 ('ference', 139),
 ('rocklane', 139),
 ('ment', 130),
 ('z', 122),
 ('adiana', 121),
 ('ber', 108),
 ('pr', 104),
 ("canvassers'", 103),
 ('x', 87),
 ('ple', 87),
 ('geporter', 86),
 ('ers', 86),
 ('q', 82),
 ('horlacher', 81),
 ('ly', 80),
 ('nd', 79),
 ('al', 79),
 ('boze', 78),
 ('iu', 77),
 ('indpls', 75),
 ('ren', 75),
 ('ft', 71),
 ('sionary', 68),
 ('ex', 68),
 ('minnick', 67),
 ('ary', 66),
 ('id', 66),
 ('ent', 65)]

Correction 7 -- Rejoin Split Words II

In [31]:
# %load shared_elements/rejoin_split_words.py
prev = "correction6"
cycle = "correction7"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    errors = reports.identify_errors(tokens, spelling_dictionary)

    replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=True)
    
    if len(replacements) > 0:
#         print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_split_words(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [32]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/IR/correction7

Average verified rate: 0.9490016816153205

Average of error rates: 0.05474789207419899

Total token count: 1154842

In [33]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[33]:
[('w', 5066),
 ('e', 2174),
 ('m', 1976),
 ('r', 1823),
 ('f', 1647),
 ('d', 1409),
 ('n', 1097),
 ('t', 1096),
 ('g', 885),
 ("'", 825),
 ('th', 551),
 ('co', 491),
 ('mt', 380),
 ('tion', 353),
 ('rd', 336),
 ('u', 332),
 ('wm', 297),
 ('k', 218),
 ('mo', 207),
 ('ence', 174),
 ('tions', 146),
 ('re', 139),
 ('rocklane', 139),
 ('ference', 137),
 ('z', 122),
 ('ment', 122),
 ('adiana', 121),
 ('ber', 106),
 ("canvassers'", 103),
 ('pr', 100),
 ('x', 87),
 ('ple', 87),
 ('geporter', 86),
 ('q', 82),
 ('ers', 82),
 ('horlacher', 81),
 ('boze', 78),
 ('iu', 77),
 ('al', 77),
 ('indpls', 75),
 ('ren', 75),
 ('nd', 73),
 ('ly', 73),
 ('ft', 69),
 ('sionary', 68),
 ('ex', 68),
 ('minnick', 67),
 ('id', 66),
 ('sr', 65),
 ('ent', 63)]

Review Remaining Errors

In [34]:
reports.docs_with_high_error_rate(summary)
Out[34]:
[('IR19100511-V16-10-page6.txt', 0.571),
 ('IR19100511-V16-10-page3.txt', 0.5),
 ('IR19041109-V10-23-page3.txt', 0.305),
 ('IR19040413-V10-08-page5.txt', 0.295),
 ('IR19081125-V14-25-page7.txt', 0.277),
 ('IR19040511-V10-10-page5.txt', 0.258),
 ('IR19081209-V14-26-page7.txt', 0.255),
 ('IR19040302-V10-05-page5.txt', 0.255),
 ('IR19081223-V14-27-page5.txt', 0.253),
 ('IR19081111-V14-24-page7.txt', 0.245),
 ('IR19041012-V10-21-page3.txt', 0.227),
 ('IR19080401-V14-07-page9.txt', 0.225),
 ('IR19080318-V14-06-page3.txt', 0.225),
 ('IR19040203-V10-03-page5.txt', 0.222),
 ('IR19050104-V11-01-page6.txt', 0.221),
 ('IR19081014-V14-22-page7.txt', 0.221),
 ('IR19090106-V15-01-page5.txt', 0.217),
 ('IR19070102-V13-01-page1.txt', 0.215),
 ('IR19090203-V15-03-page3.txt', 0.21),
 ('IR19040106-V10-01-page6.txt', 0.209),
 ('IR19080513-V14-10-page7.txt', 0.208),
 ('IR19030204-V09-03-page3.txt', 0.207)]
In [35]:
# %load shared_elements/high_error_rates.py
doc_keys = [x[0] for x in reports.docs_with_high_error_rate(summary) if x[1] > 0.25]

# utilities.open_original_docs(doc_keys, directories['cycle'])

High error documents are charts of pledges for the local training school.

In [36]:
reports.long_errors(errors_summary, min_length=15)[:100]
Out[36]:
(['bartdemonstrated',
  'black-stringleave',
  'certificateentitles',
  'followrecunimendations',
  'sivaranwseswavieni',
  'carefulsuccession',
  'corresponmembers',
  'ontheirbacksonabitofrushmatting',
  'childrenintelligent',
  'logansubscriptions',
  'threattemptation',
  'septemencouraging',
  'inauguratintroduced',
  'meansperformance',
  'finaldifficulties',
  'harmonionsdevelopment',
  'withdenominational',
  'wedding-ring-rub',
  'conunprecedented',
  'eeeeeereeeetoteeeeet',
  'churchmembersneglecttopay',
  'enlightenpamphlets',
  'cheerdevelopement',
  'raffifitsmowiwkwiewaresie',
  'overcomcooperation',
  'peoplethroughont',
  'seeeeeeeeereeeee',
  'positiveposition',
  'improveappearance',
  "whichisever'tobealoyal",
  'systematchildren',
  'virteen-year-old',
  'circumstanterrible',
  'celebratannouncement',
  "etriiiiibiyikeib's",
  'subscripchildren',
  'unnecesmissionary',
  'medicarmissionary',
  'meetorganization',
  'canvassextensively',
  'thesewillcertainlyhavea',
  'self-gratification',
  'eeieeeeeeeceeeee',
  'helaprwfriothmmoetahnesr',
  'reresponsibility',
  'presentlocationorcircumstancesof',
  'subscriberstothe',
  'missionthree-color',
  'indianapsibilities',
  'recommendcontributions',
  'toherhomecaresanddutiesas',
  'east-indianapolis',
  'subscripencouraged',
  'tnisrepresentatrons',
  'brethdiscontinued',
  'considergreatest',
  'testicontributed',
  'circumfercomodation',
  'approachingperil',
  'undersurprisingly',
  'financialcondition',
  'roberetsisassisting',
  'camp-meetbilities',
  'aaaasaaaasetaaaav',
  'enteradvancement',
  'proclaimemployment',
  'faithfulbettering',
  'transvaaiconference',
  'interurcompelled',
  'encourageapprove',
  'educationwritten',
  'chickestablished',
  'wednescorrespondence',
  'philadelersetshire',
  'showeitthatprophecyi',
  'organizasponsibility',
  'representligently',
  'progsponsibility',
  'sanitarinvesting',
  'believrepresentation',
  'indictioleeeeeeeecieeeeteeeeeett',
  'self-sacrichildren',
  'president--morris',
  'institugatherings',
  'prosperouslourney',
  'recomrecommendations',
  'commodationsareexcellent',
  'long-to-be-remembered',
  'acceptdistributing',
  'instriimentalities',
  'oursttbbathschool',
  'entiouslyobserve',
  'conferencouragement',
  'southlegislation',
  "confersix-weeks'",
  'canvassearnestly',
  'estabindifferent',
  'sanitarpresented',
  'encouragescriptions',
  'donadenominational',
  'convenbeautifully',
  'legislaimportant',
  'vaitievnlueorders',
  'alertillefliellmomm',
  'desiropportunity',
  'churchschoolteachers',
  'thoroughnersville',
  'necessaryprerequisite',
  'estedfamilyatcolumbus',
  'thewaytoouroffice',
  'meetconsiderably',
  'convenperiodicals',
  'superintendentcy',
  'tabernalightened',
  'camp-meetcitizens',
  'unioncaongference',
  "self-supporting'",
  'continudianapolis',
  'surroundstudents',
  'semieniiminnelyip',
  'presentaquestion',
  'aaaaaaataaaaaaaaabat',
  'sabbathillustrated',
  "giveb'exhaustive",
  'meetanticipatian',
  'assumpone-fourth',
  'inseparstimulated',
  'expectedtoacthere',
  'camp-meetfuneral',
  'arrangeportunity',
  'propcommandments',
  'intercommandment',
  'cansuperintendent',
  'missionincreased',
  'missionconducting',
  'reportofcanvassing',
  'unadministration',
  'ministradetermination',
  'accommodationscan',
  'followsuggestions',
  'alphabetvincingly',
  'fellow-passengers',
  'interestingexperiences',
  'mechanicspartment',
  'accomplishstrong',
  'undoubtorganization',
  'conferfinancially',
  'great-grandchildren',
  'congregationservice',
  'andceasetolookbackin',
  'regardconsiderably',
  'unfortuunderstanding',
  'continuingthrough',
  'eeeeeeeieeeeecge',
  'departredemption',
  'indiommendations',
  'whereverpresented',
  'ccaannvvaassesresrs',
  'thousatisfactorily',
  'gatherinstruction',
  'andmarthamarshall',
  'grocerconsecration',
  'heaven-appointed',
  'nreiesematitectict',
  'willnotthosewhohave',
  'ofourbooksinthisterritorysofar',
  'distribuoverthrowing',
  'understandrefining',
  'missionunderstand',
  'forty-by-sixty-foot',
  'one-hundred-thousand-dollar',
  'deliveringevenings',
  'executivecommittee',
  'double-distilled',
  'health-restoring',
  'mmoimimmininniiimm',
  'encouragewestern',
  'washingopposition',
  'romanconsciousness',
  'aaaaasaaaaillaaa',
  'canvassingworkfortwoweeks',
  'yourrecentletterisathand',
  'spisponsibilities',
  'responsichildren',
  'attemptbeautiful',
  'periodattractions',
  'denominationcause',
  'testimonperishing',
  'termofthisschoolyearhas',
  'appreciattemperance',
  'mahamissionaries',
  'confidsanitarium',
  'frequently-given',
  "reasonable'wages",
  'veeeeeeeeeeeeeeeeeeeek',
  'thecauvassing-work',
  'attenddetermination',
  'capabilidaughter',
  'church-membership',
  'particudifferent',
  'developprinciples',
  'thereareprospects',
  'fursubscriptions',
  'peocircumstances',
  'feeeeeeeeefieeeeettes',
  'neighencroachments',
  'conferencecomtnittee',
  'largeconferences',
  'washingneglected',
  'rlaaaaaaaaaawaaaaa',
  'grand-chilreceiving',
  'carryrighteousness',
  'theprivilegesand',
  'easttwenty-thirdstreet',
  'correspondnamely',
  'furnishconference',
  'commerassistance',
  'interestcheerful',
  'self-pronouneing',
  'veeeeeeeeeeeeeeteeeteeeiseeeeeeeeeeeeeee',
  'liegeeeeeeeeeeeeeeeeee',
  'interremembrances',
  'andheisverymuchinterested',
  'miimirmimirimmimplete',
  'duringtheweekofprayerbrother',
  'firstcirbelievers',
  'eoceeeeeeeeleceee',
  'self-examination',
  'canvasssacrifice',
  'valuevalueofrdelrs',
  'religio-political',
  'individbackground',
  'concontributions',
  'ten-cents-a-week',
  'ccieeeeeeeeeeeeeviirzie',
  'knowledgeforstill',
  'eeeeeeeeliseeecteeees',
  'convenwithstanding',
  'momppoimnimiparewn',
  'iiwwwwwvaithafrws',
  'patricksblessing',
  'ijoineieldersimpsonat',
  'wwwritgrjrnminkswpw',
  'providlegislation',
  'danish-norwegian',
  'superintendmitted',
  'eeeeeeieeetieeei',
  'concernministers',
  'reportofcanvassingworkfortwo',
  'sulphur-bag-wearing',
  'worpsurroundings',
  'peoteiiresebturrg',
  'demonstrageneral',
  'florence-spurgeon',
  'eeeeeeeeeeeeeeel',
  'afterthelmeiness',
  'sinscisytenrthia',
  'employconference',
  'niinsionmediately',
  'campcongregation',
  'portableappliance',
  'seeeseeetiefeeeee',
  'accomplishgiving',
  'ummessimeeimesse',
  'correspondcamp-meeting',
  'atgiaoseoseaaasek',
  'huntingprovisions',
  'npeeeeiceeeeeeee',
  "hisjudgment'scome",
  'accomplishperish',
  'simplestatepropose',
  'considwillingness',
  'unfaiththreatened',
  'tjiiviiiimmiiiiper',
  'opportunidownfall',
  'imillcrynunivaivi',
  'reportofcanvassingwork',
  'missioninstrumentalities',
  'trainpossibilities',
  'unoriiiiiiimmmumssaimms',
  'thursphotographs',
  'largecongregation',
  'considappraisement',
  'superindeavoring',
  'ecceeekieeeeticieeee',
  'spartansdifferent',
  'werenotannouncedpubliclyuntili',
  'busicongregation',
  'easttwentythirdstreet',
  'conferpartnership',
  'mimirilfirali-piv',
  'canvasssentation',
  'missioncarpenters',
  'readingslmoonnier',
  'hard-heartedness',
  'martinsfurnished',
  'encouragrevealed',
  'publishsubscribing',
  'localdenominational',
  'percorrespondence',
  'indiansideration',
  'seventhhereafter',
  'advancemendations',
  'superconsideration',
  'connnunicatoward',
  'metieyttiltiometit',
  'andselltheminyourcommunity',
  'accomplishprospect',
  'much-to-be-desired',
  'camp-meetacknowledged',
  'unadulteratunite',
  'seaaaaaaaasarwposaa',
  'adventdianapolis',
  'keeeeeeeeeeeeeeee',
  'discuscouragement',
  'vilieeefeeeeeeeilieee',
  'disinfecinterest',
  'conferopportunity',
  'camp-meetbaptized',
  'ritliglitennient',
  'self-examone-tenth',
  'wiriniimpirrmiptown',
  'washingcessitating',
  'officeregulations',
  'remarkcommandment-keepers',
  'indianapoinstructions',
  'administragreater',
  'certificate-holders',
  'riireeeeeeeeficeeeeeeeeiee',
  'provicongregation',
  'sanitaplications',
  'particprogressing',
  'indianapbeautiful',
  'departmentalsecretaries',
  'netweeseeeeeeeereoweee',
  'gatherinterurban',
  "housandi'dollars",
  'self-sucricolored',
  "translation'etnd",
  "children'sreceiving",
  'martinsquantities',
  'establishprincipal',
  'instrucfurnished',
  'indianapolismiscellaneous',
  'linenresurrection',
  'buildliberations',
  'avnedrywghloadwitlol',
  'feeeeeeeeeieteeeeeeeeetc',
  'imporreplenishing',
  'commandaddresses',
  'associacalendars',
  'clappeirnjsejnfnie',
  'favorablecircumstances',
  'walfillwittlfilll',
  'sabbathrecommendation',
  'recotumendauring',
  'liisiaembiameeresi',
  'responsibilisomewhat',
  'recommendaopened',
  'leadorganization',
  'regardtreasurers',
  'ecietieeeeieeeec',
  'satisfacreadings',
  'sabbathinstitute',
  'sevefiirliiifidred',
  'morestisceptible',
  "iv'et'itilrltiiceksburg",
  'temperconferences',
  'denominamissionaries',
  'consecratburdens',
  'sabbath-afternoon',
  'eeeeeeeeeeeeeeeeeet',
  'opportuattendance',
  'faithfulcanvassers',
  'financamp-meeting',
  'theall-important',
  'co-opbath-school',
  'wipmptiniireiiisshm',
  'opportunclaration',
  'aaaaaassiwaesaaaaeoaetaan',
  'eeeeeseeeeeeeeeeeee',
  'tippippipsyaktnr',
  'reveladevelopments'],
 15)

Correction 8 -- Remove long error tokens

In [37]:
# %load shared_elements/remove-tokens-with-long-strings-of-characters.py
prev = cycle
cycle = "correction8"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    replacements = []
    replacements.append(clean.check_for_repeating_characters(tokens, "e|E"))
    replacements.append(clean.check_for_repeating_characters(tokens, "a|A"))
    replacements.append(clean.check_for_repeating_characters(tokens, ""))
    
    replacements = [item for sublist in replacements for item in sublist]
            
    if len(replacements) > 0:
        print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    else:
        pass
    
    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
IR19020618-V08-13-page3.txt: [('veeeeeeeeeeeeeeeeeeeek', ' ')]
IR19020702-V08-14-page2.txt: [('eeeeeseeeEeeeeeeeee', ' ')]
IR19040120-V10-02-page5.txt: [('EeeeeeEeeeeeE', ' ')]
IR19060328-V12-07-page5.txt: [('feeeeeeeeefieeeeettes', ' ')]
IR19060815-V12-17-page3.txt: [('.seeeseeetiefeeeee', ' ')]
IR19061107-V12-23-page5.txt: [('NPeeeeiceeeeeeee', ' ')]
IR19061205-V12-25-page5.txt: [('aaaasaaaasetaaaav', ' ')]
IR19070410-V13-08-page5.txt: [('veeeeeeeeeeeeeeteeeteeeiseeeeeeeeeeeeEee', ' '), ('seaaaaaaaasarwposaa', ' ')]
IR19070410-V13-08-page7.txt: [('aaaaaassiwaesaaaaeoaetaaN', ' ')]
IR19070424-V13-09-page7.txt: [('ccieeeeeeeeeeeeeviirzie', ' ')]
IR19070424-V13-09-page8.txt: [('eeeeeeeeeeeeeeeeeet', ' ')]
IR19070508-V13-10-page7.txt: [('liegeeeeeeeeeeeeeeeeee', ' '), ('Netweeseeeeeeeereoweee', ' ')]
IR19070605-V13-12-page3.txt: [('Eoceeeeeeeeleceee', ' ')]
IR19070619-V13-13-page8.txt: [('eeeeeeeeefeeeci', ' ')]
IR19070703-V13-14-page3.txt: [('.seeeeeeeweee', ' ')]
IR19070918-V13-19-page3.txt: [('keeeeeeeeeeeeeeee', ' ')]
IR19071016-V13-21-page7.txt: [('.kieeeseeeeeeee', ' '), ('eaaaaaaaaaaaaaa', ' ')]
IR19071113-V13-23-page3.txt: [('eeeeeeeeeeeeeeel', ' ')]
IR19071225-V13-26-page4.txt: [('Indictioleeeeeeeecieeeeteeeeeett', ' ')]
IR19080108-V14-01-page5.txt: [('opeeefieeeeeeJ', ' ')]
IR19080318-V14-06-page4.txt: [('Vilieeefeeeeeeeilieee', ' ')]
IR19080722-V14-16-page5.txt: [('eeeeeeeeeeeteee', ' ')]
IR19080805-V14-17-page5.txt: [('.seeeeeeeeereeeee', ' ')]
IR19080916-V14-20-page6.txt: [('eeeeeereeeetoteeeeet', ' ')]
IR19080930-V14-21-page5.txt: [('Feeeeeeeeeieteeeeeeeeetc', ' ')]
IR19080930-V14-21-page6.txt: [('eeeeeeeeecceeee', ' ')]
IR19081125-V14-25-page6.txt: [('eeeeeeeeeeeeeee', ' ')]
IR19090203-V15-03-page2.txt: [('Niteegieeeeeee', ' ')]
IR19090303-V15-05-page5.txt: [('eeeeeEieeetieeei', ' ')]
IR19090317-V15-06-page3.txt: [('eeeeeeeeliseeecteeees', ' ')]
IR19090414-V15-08-page5.txt: [('eteeieeeeeeek', ' ')]
IR19090623-V15-12-page5.txt: [('..eEieeeeeeeceeeee', ' '), ('aaaaoiNgoaaaaaa', ' '), ('iaaaaaaaaaaaaaa', ' ')]
IR19090818-V15-16-page5.txt: [('eeeeeetieceeee', ' '), ('aaaaaaataaaaaaaaabat', ' ')]
IR19091027-V15-21-page5.txt: [('aaaaasaaaaillaaa', ' ')]
IR19091222-V15-25-page2.txt: [('aaagoaaataaasAl', ' ')]
IR19100105-V16-01-page5.txt: [('rlaaaaaaaaaawaaaaa', ' ')]
IR19100105-V16-01-page6.txt: [('k.eeeeeeeeeteee', ' ')]
IR19100119-V16-02-page5.txt: [('eeeeeeEieeeeecge', ' ')]
IR19100216-V16-04-page3.txt: [('eeeeeeeeeeeee', ' ')]
IR19100316-V16-06-page4.txt: [('riireeeeeeeeficeeeeeeeeiee', ' ')]
IR19100413-V16-08-page3.txt: [('ecceeekieeeEticieeee', ' ')]
In [38]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/IR/correction8

Average verified rate: 0.9490411276816472

Average of error rates: 0.05470489038785834

Total token count: 1154794

In [39]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[39]:
[('w', 5066),
 ('e', 2174),
 ('m', 1976),
 ('r', 1823),
 ('f', 1647),
 ('d', 1409),
 ('n', 1097),
 ('t', 1096),
 ('g', 885),
 ("'", 825),
 ('th', 551),
 ('co', 491),
 ('mt', 380),
 ('tion', 353),
 ('rd', 336),
 ('u', 332),
 ('wm', 297),
 ('k', 217),
 ('mo', 207),
 ('ence', 174),
 ('tions', 146),
 ('re', 139),
 ('rocklane', 139),
 ('ference', 137),
 ('z', 122),
 ('ment', 122),
 ('adiana', 121),
 ('ber', 106),
 ("canvassers'", 103),
 ('pr', 100),
 ('x', 87),
 ('ple', 87),
 ('geporter', 86),
 ('q', 82),
 ('ers', 82),
 ('horlacher', 81),
 ('boze', 78),
 ('iu', 77),
 ('al', 77),
 ('indpls', 75),
 ('ren', 75),
 ('nd', 73),
 ('ly', 73),
 ('ft', 69),
 ('sionary', 68),
 ('ex', 68),
 ('minnick', 67),
 ('id', 66),
 ('sr', 65),
 ('ent', 63)]

Correction 9 -- Separate Squash Words

In [40]:
# %load shared_elements/separate_squashed_words.py
import pandas as pd
from math import log

prev = cycle
cycle = "correction9"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

verified_tokens = []

for filename in corpus:  
    content = utilities.readfile(directories['prev'], filename)
    clean.get_approved_tokens(content, spelling_dictionary, verified_tokens)

tokens_with_freq = dict(collections.Counter(verified_tokens))
words = pd.DataFrame(list(tokens_with_freq.items()), columns=['token','freq'])
words_sorted = words.sort_values('freq', ascending=False)
words_sorted_short = words_sorted[words_sorted.freq > 2]

sorted_list_of_words = list(words_sorted_short['token'])

wordcost = dict((k, log((i+1)*log(len(sorted_list_of_words)))) for i,k in enumerate(sorted_list_of_words))
maxword = max(len(x) for x in sorted_list_of_words)

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    text = utilities.strip_punct(content)
    tokens = utilities.tokenize_text(text)
    
    replacements = []
    
    for token in tokens:
        if not token.lower() in spelling_dictionary:
            if len(token) > 17:
                if re.search(r"[\-\-\'\"]", token):
                    pass
                else:
                    split_string = clean.infer_spaces(token, wordcost, maxword)
                    list_split_string = split_string.split()
                    
                    if clean.verify_split_string(list_split_string, spelling_dictionary):
                        replacements.append((token, split_string))
                    else:
                        pass
            else:
                pass
        else:
            pass
        
    if len(replacements) > 0:
        print("{}: {}".format(filename, replacements))
        
        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
IR19011009-V07-20-page8.txt: [('estedfamilyatColumbus', 'est ed family at Columbus'), ('andheisverymuchinterested', 'and he is very much interested')]
IR19020212-V08-04-page3.txt: [('trainpossibilities', 'train possibilities'), ('ChurchSchoolTeachers', 'Church School Teachers')]
IR19020226-V08-05-page3.txt: [('werenotannouncedpubliclyuntilI', 'were not announced publicly until I')]
IR19020604-V08-12-page3.txt: [('faithfulcanvassers', 'faithful canvassers')]
IR19020702-V08-14-page3.txt: [('deliveringevenings', 'delivering evenings'), ('simplestatepropose', 'simple state propose')]
IR19021001-V08-19-page2.txt: [('childrenintelligent', 'children intelligent')]
IR19021029-V08-21-page3.txt: [('ontheirbacksonabitofrushmatting', 'on their back son a bit of rush mat ting'), ('REPORTOFCANVASSINGWORKFORTWO', 'REPORT OF CANVASSING WORK FOR TWO')]
IR19021112-V08-22-page3.txt: [('CANVASSINGWORKFORTWOWEEKS', 'CANVASSING WORK FOR TWO WEEKS')]
IR19030204-V09-03-page4.txt: [('REPORTOFCANVASSING', 'REPORT OF CANVASSING')]
IR19030415-V09-08-page4.txt: [('REPORTOFCANVASSINGWORK', 'REPORT OF CANVASSING WORK')]
IR19031014-V09-21-page2.txt: [('acceptdistributing', 'accept distributing')]
IR19040302-V10-05-page6.txt: [('regardconsiderably', 'regard considerably')]
IR19040330-V10-07-page3.txt: [('favorablecircumstances', 'favorable circumstances')]
IR19040831-V10-18-page3.txt: [('recommendcontributions', 'recommend contributions')]
IR19041221-V10-26-page3.txt: [('Thesewillcertainlyhavea', 'These will certainly have a')]
IR19050201-V11-03-page4.txt: [('Logansubscriptions', 'Logan subscriptions')]
IR19050329-V11-07-page3.txt: [('appreciattemperance', 'app rec i at temperance')]
IR19050412-V11-08-page4.txt: [('toherhomecaresanddutiesas', 'to her home cares and duties as')]
IR19050607-V11-12-page2.txt: [('accomplishprospect', 'accomplish prospect')]
IR19050621-V11-13-page3.txt: [('enlightenpamphlets', 'enlighten pamphlets')]
IR19050802-V11-16-page2.txt: [('canvassextensively', 'canvass extensively')]
IR19050830-V11-18-page2.txt: [('Sabbathrecommendation', 'Sabbath recommendation')]
IR19050830-V11-18-page3.txt: [('IndianapolisMiscellaneous', 'Indianapolis Miscellaneous')]
IR19060103-V12-01-page4.txt: [('termofthisschoolyearhas', 'term of this school year has'), ('DURINGtheweekofprayerBrother', 'DURING the week of prayer Brother')]
IR19060214-V12-04-page8.txt: [('andceasetolookbackin', 'and cease to look back in')]
IR19060801-V12-16-page2.txt: [('attenddetermination', 'attend determination')]
IR19061219-V12-26-page6.txt: [('establishprincipal', 'establish principal')]
IR19070130-V13-03-page4.txt: [('Sabbathillustrated', 'Sabbath illustrated')]
IR19070313-V13-06-page6.txt: [('superconsideration', 'sup er consideration')]
IR19070508-V13-10-page3.txt: [('YOURrecentletterisathand', 'YOUR recent letter is at hand'), ('presentlocationorcircumstancesof', 'present location or circumstances of')]
IR19070717-V13-15-page5.txt: [('publishsubscribing', 'publish subscribing')]
IR19071002-V13-20-page8.txt: [('carryrighteousness', 'carry righteousness')]
IR19071211-V13-25-page1.txt: [('financialcondition', 'financial condition')]
IR19071211-V13-25-page6.txt: [('Conferencouragement', 'Confer encouragement')]
IR19071211-V13-25-page7.txt: [('andselltheminyourcommunity', 'and sell them in your community')]
IR19080108-V14-01-page3.txt: [('ofourbooksinthisterritorysofar', 'of our books in this territory so far')]
IR19080122-V14-02-page4.txt: [('ministradetermination', 'minis trade term i nation')]
IR19080205-V14-03-page7.txt: [('Washingcessitating', 'Washing cess it at ing')]
IR19080304-V14-05-page4.txt: [('proclaimemployment', 'proclaim employment')]
IR19080318-V14-06-page1.txt: [('convenwithstanding', 'con ven with standing')]
IR19080513-V14-10-page5.txt: [('understandrefining', 'understand ref in ing')]
IR19081028-V14-23-page6.txt: [('withdenominational', 'with denominational')]
IR19081111-V14-24-page6.txt: [('donadenominational', 'dona denominational')]
IR19090512-V15-10-page1.txt: [('ExecutiveCommittee', 'Executive Committee')]
IR19090804-V15-15-page2.txt: [('missioninstrumentalities', 'mission instrumentalities')]
IR19090818-V15-16-page4.txt: [('overcomcooperation', 'over com cooperation')]
IR19090915-V15-18-page1.txt: [('Romanconsciousness', 'Roman consciousness')]
IR19091124-V15-23-page8.txt: [('harmonionsdevelopment', 'harm onions development')]
IR19091222-V15-25-page5.txt: [('thousatisfactorily', 'thou satisfactorily')]
IR19100105-V16-01-page3.txt: [('localdenominational', 'local denominational')]
IR19100202-V16-03-page7.txt: [('interestingexperiences', 'interesting experiences')]
IR19100216-V16-04-page1.txt: [('DepartmentalSecretaries', 'Departmental Secretaries')]
IR19100302-V16-05-page1.txt: [('churchmembersneglecttopay', 'church members neglect to pay'), ('Willnotthosewhohave', 'Will not those who have')]
IR19100413-V16-08-page1.txt: [('EASTTWENTYTHIRDSTREET', 'EAST TWENTY THIRD STREET')]
IR19100511-V16-10-page7.txt: [('followrecunimendations', 'follow rec uni mend at ions')]
In [41]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/IR/correction9

Average verified rate: 0.9491017808532303

Average of error rates: 0.0546433389544688

Total token count: 1154952

In [42]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[42]:
[('w', 5066),
 ('e', 2174),
 ('m', 1976),
 ('r', 1823),
 ('f', 1647),
 ('d', 1409),
 ('n', 1097),
 ('t', 1096),
 ('g', 885),
 ("'", 825),
 ('th', 551),
 ('co', 491),
 ('mt', 380),
 ('tion', 353),
 ('rd', 336),
 ('u', 332),
 ('wm', 297),
 ('k', 217),
 ('mo', 207),
 ('ence', 174),
 ('tions', 146),
 ('re', 139),
 ('rocklane', 139),
 ('ference', 137),
 ('z', 122),
 ('ment', 122),
 ('adiana', 121),
 ('ber', 106),
 ("canvassers'", 103),
 ('pr', 100),
 ('x', 87),
 ('ple', 87),
 ('geporter', 86),
 ('q', 82),
 ('ers', 82),
 ('horlacher', 81),
 ('boze', 78),
 ('iu', 77),
 ('al', 77),
 ('indpls', 75),
 ('ren', 75),
 ('nd', 73),
 ('ly', 73),
 ('ft', 69),
 ('sionary', 68),
 ('ex', 68),
 ('minnick', 67),
 ('id', 66),
 ('sr', 65),
 ('ent', 63)]
In [ ]: