CUV-OCR-Evaluation-and-Correction

Columbia Union Visitor

Overall, the OCR for this title is messy -- it appears that the OCR engine had trouble with the column breaks.

In [1]:
%load_ext autoreload
In [2]:
%autoreload 2
In [3]:
from text2topics import reports
from text2topics import utilities
from text2topics import clean
import re
import os
from os import listdir
from os.path import isfile, join
import collections
In [4]:
%matplotlib inline
In [5]:
wordlist_dir = "/Users/jeriwieringa/Dissertation/drafts/data/word-lists"
wordlists = ["2016-12-07-SDA-last-names.txt", 
             "2016-12-07-SDA-place-names.txt", 
             "2016-12-08-SDA-Vocabulary.txt", 
             "2017-01-03-place-names.txt", 
             "2017-02-14-Base-Word-List-SCOWL&KJV.txt",
             "2017-02-14-Roman-Numerals.txt",
             "2017-03-01-Additional-Approved-Words.txt"
            ]
In [6]:
spelling_dictionary = utilities.create_spelling_dictionary(wordlist_dir, wordlists)
In [7]:
title = "CUV"
In [8]:
base_dir = "/Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/{}/".format(title)

Baseline

In [9]:
cycle = 'baseline'
In [10]:
stats = reports.overview_report(join(base_dir, cycle), spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CUV/baseline

Average verified rate: 0.9174753500833346

Average of error rates: 0.09053696

Total token count: 6484099

In [11]:
errors_summary = reports.get_errors_summary( stats )
reports.top_errors( errors_summary, 500 )
Out[11]:
[('co', 19949),
 ('w', 14081),
 ('-', 13680),
 ('e', 13268),
 ('br', 9834),
 ('m', 8834),
 ('d', 7654),
 ('f', 7357),
 ('r', 7342),
 ('g', 7131),
 ("'", 6768),
 ('¥', 5964),
 ('re-', 5019),
 ('t', 4720),
 ('gc', 4452),
 ('con-', 4254),
 ('od', 4219),
 ('n', 3875),
 ('ñ', 3845),
 ('in-', 3327),
 ('tion', 3144),
 ('ck', 2989),
 ('be-', 2628),
 ('de-', 2034),
 ('pa', 1917),
 (')', 1884),
 ('ex-', 1788),
 ('k', 1740),
 ('ence', 1724),
 ('com-', 1721),
 ('mt', 1684),
 ('bf', 1518),
 ('en-', 1514),
 ('va', 1463),
 ('confer-', 1379),
 ('ment', 1358),
 ('ference', 1348),
 ('sab-', 1241),
 ('th', 1161),
 ('meet-', 1138),
 ('mis-', 1108),
 ('ad-', 1095),
 ('*', 1094),
 ('pro-', 1069),
 ("canvassers'", 1061),
 ('pre-', 1017),
 ('ers', 1016),
 ('peo-', 994),
 ('_', 970),
 ('at-', 967),
 ('ple', 961),
 ('ber', 922),
 ('ac-', 911),
 ('tions', 910),
 ('es', 892),
 ('un-', 868),
 ('col-', 839),
 ('im-', 818),
 ('dis-', 802),
 ('or-', 798),
 ('mem-', 794),
 ('(', 785),
 ('to-', 785),
 ('per-', 780),
 ('can-', 775),
 ('an-', 697),
 ('inter-', 680),
 ('ap-', 660),
 ('wm', 656),
 ("'the", 652),
 ('mes-', 643),
 ('for-', 636),
 ('ful', 626),
 ('u', 616),
 ('sionary', 611),
 ('ance', 599),
 ('ments', 597),
 ('ary', 588),
 ('al-', 580),
 ('--', 577),
 ('pg', 545),
 ('bers', 545),
 ('ser-', 539),
 ('camp-', 532),
 ('ent', 527),
 ('/', 520),
 ('mission-', 504),
 ('work-', 501)]

Correction 1 -- Check for special character use

In [12]:
reports.tokens_with_special_characters(errors_summary)[:100]
Out[12]:
[('¥', 5964),
 ('ñ', 3845),
 (')', 1884),
 ('*', 1094),
 ('_', 970),
 ('(', 785),
 ('/', 520),
 ('%', 470),
 ('ña', 312),
 ('ã', 297),
 ('¥¥', 275),
 ('ñthe', 272),
 ('(a)', 227),
 ('(b)', 212),
 ('*two', 196),
 ('=', 186),
 ('(for', 185),
 ('ñselected', 178),
 ('¡', 171),
 ('ñmrs', 148),
 ('`', 141),
 ('ñcom-', 134),
 ('(c)', 127),
 ('•', 126),
 ('+', 115),
 ('ñh', 114),
 ('(to', 110),
 ('ñcoming', 108),
 ('(academia', 102),
 ('(the', 101),
 (']', 90),
 ('ñw', 89),
 ('(columbia', 88),
 ('\\', 84),
 ('conferenceñmission', 84),
 ('(d)', 80),
 ('ñbible', 79),
 ('ñr', 78),
 ('❑', 78),
 ('(concluded)', 75),
 ('(continued', 74),
 ('¥the', 73),
 ('\ufeff', 72),
 ('a)', 72),
 ('ô', 68),
 ('second¥class', 68),
 ('*j', 64),
 ('(colored)', 63),
 ('(a', 63),
 ('¥¥¥', 59),
 ('[entered', 59),
 ('andñ', 59),
 ('continued)', 59),
 ('[', 58),
 ('*barnesville', 56),
 ('ñthat', 56),
 ('*broughton', 55),
 ('ñgreat', 55),
 ('_the', 55),
 ('the¥', 54),
 ('ñf', 54),
 ('the_', 54),
 ('>', 53),
 ('ñdied', 52),
 ('*west', 51),
 ('¤', 50),
 ('*companies', 50),
 ('ñin', 50),
 ('ñj', 49),
 ('*reedsville', 49),
 ('*jackson', 49),
 ('(continued)', 49),
 ('(german)', 48),
 ('-¥', 46),
 ('ñb', 45),
 ('ñto', 45),
 ('seventh¥day', 45),
 ('ñif', 45),
 ('ñe', 44),
 ('*a', 44),
 ('(e)', 44),
 ('ñreview', 43),
 ('wantedña', 43),
 ('ñvol', 42),
 ('*week', 41),
 ('(and', 41),
 ('(or', 41),
 ('—', 41),
 ('ý', 40),
 ('¥and', 40),
 ('¥of', 37),
 ('ñelder', 37),
 ('*gilboa', 37),
 ('¥¥¥¥', 36),
 ('*conant', 35),
 ('camp¥meeting', 35),
 ("'¥", 34),
 ('ñwe', 34),
 ('ohio)', 33),
 ('ñand', 33)]
In [13]:
# %load shared_elements/normalize_characters.py
prev = "baseline"
cycle = "correction1"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    # Substitute for all other dashes
    content = re.sub(r"—-—–‑", r"-", content)

    # Substitute formatted apostrophe
    content = re.sub(r"\’\’\‘\'\‛\´", r"'", content)
    
    # Replace all special characters with a space (as these tend to occur at the end of lines)
    content = re.sub(r"[^a-zA-Z0-9\s,.!?$:;\-&\'\"]", r" ", content)
    
    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()

Check Correction 1

In [14]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CUV/correction1

Average verified rate: 0.9234446884710339

Average of error rates: 0.08394623999999999

Total token count: 6471517

In [15]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[15]:
[('co', 19981),
 ('w', 14250),
 ('-', 14205),
 ('e', 13469),
 ('br', 9842),
 ('m', 8888),
 ('d', 7814),
 ('r', 7520),
 ('f', 7507),
 ('g', 7228),
 ("'", 7009),
 ('re-', 5029),
 ('t', 4825),
 ('gc', 4454),
 ('con-', 4258),
 ('od', 4229),
 ('n', 3927),
 ('in-', 3334),
 ('tion', 3152),
 ('ck', 2990),
 ('be-', 2632),
 ('de-', 2045),
 ('pa', 1925),
 ('com-', 1858),
 ('ex-', 1795),
 ('k', 1758),
 ('ence', 1726),
 ('mt', 1690),
 ('bf', 1519),
 ('en-', 1517),
 ('va', 1467),
 ('confer-', 1380),
 ('ment', 1362),
 ('ference', 1353),
 ('sab-', 1246),
 ('th', 1182),
 ('meet-', 1140),
 ('mis-', 1123),
 ('ad-', 1095),
 ('pro-', 1070),
 ("canvassers'", 1062),
 ('ers', 1021),
 ('pre-', 1019),
 ('peo-', 995),
 ('at-', 969),
 ('ple', 962),
 ('ber', 927),
 ('tions', 915),
 ('ac-', 913),
 ('es', 902)]

Correction 2 -- Fix line endings

In [16]:
# %load shared_elements/correct_line_endings.py
prev = cycle
cycle = "correction2"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    content = re.sub(r"(\w+)(\-\s{1,})([a-z]+)", r"\1\3", content)

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()

Check Correction 2

In [17]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CUV/correction2

Average verified rate: 0.9530517833206363

Average of error rates: 0.05505456

Total token count: 6332232

In [18]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[18]:
[('co', 19976),
 ('w', 14245),
 ('-', 14088),
 ('e', 13462),
 ('br', 9842),
 ('m', 8882),
 ('d', 7809),
 ('r', 7510),
 ('f', 7493),
 ('g', 7221),
 ("'", 7009),
 ('t', 4811),
 ('gc', 4454),
 ('od', 4224),
 ('n', 3924),
 ('ck', 2990),
 ('pa', 1926),
 ('k', 1757),
 ('mt', 1690),
 ('bf', 1519),
 ('va', 1467),
 ('th', 1181),
 ("canvassers'", 1100),
 ('es', 823),
 ('wm', 671),
 ("'the", 657),
 ('--', 633),
 ('u', 631),
 ('pg', 545),
 ('-the', 491),
 ('z', 481),
 ("the'", 396),
 ('sp', 371),
 ('hm', 362),
 ("'of", 347),
 ('sabbathschool', 329),
 ('x', 324),
 ('reichenbach', 307),
 ('ok', 292),
 ('mcelphatrick', 291),
 ('-of', 288),
 ('pp', 249),
 ('seventhday', 247),
 ("colporteurs'", 247),
 ("'and", 246),
 ('-and', 236),
 ('-to', 234),
 ('buttermore', 230),
 ('al', 222),
 ('-a', 218)]

Correction 3 -- Remove extra quotation characters

In [19]:
# %load shared_elements/replace_extra_quotation_marks.py
prev = cycle
cycle = "correction3"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    corrections = []
    for token in tokens:
        token_list = list(token)
        last_char = token_list[-1]

        if last_char is "'":
            if len(token) > 1:
                if token_list[-2] is 's' or 'S':
                    pass
                else:
                    corrections.append((token, re.sub(r"'", r"", token)))
            else:
                pass
        elif token[0] is "'":
            corrections.append((token, re.sub(r"'", r"", token)))   
        else:
            pass
    
    if len(corrections) > 0:
#         print('{}: {}'.format(filename, corrections))

        for correction in corrections:
            content = clean.replace_pair(correction, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()

Correction 4 -- Address extra dashes

In [20]:
# %load shared_elements/remove_extra_dashes.py
prev = cycle
cycle = "correction4"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    replacements = []
    for token in tokens:
        if token[0] is "-":
            replacements.append((token, token[1:]))
            
        elif token[-1] is "-":
            replacements.append((token, token[:-1]))
        else:
            pass
        
    if len(replacements) > 0:
#         print("{}: {}".format(filename, replacements))
        
        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()

Check Correction 4

In [21]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CUV/correction4

Average verified rate: 0.9602815412218036

Average of error rates: 0.047562560000000004

Total token count: 6340528

In [22]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[22]:
[('co', 20599),
 ('w', 14345),
 ('e', 13710),
 ('br', 9883),
 ('m', 8952),
 ('d', 7884),
 ('r', 7616),
 ('f', 7574),
 ('g', 7285),
 ("'", 6382),
 ('t', 4989),
 ('gc', 4462),
 ('od', 4234),
 ('n', 4006),
 ('ck', 2995),
 ('pa', 1946),
 ('k', 1779),
 ('mt', 1704),
 ('bf', 1522),
 ('va', 1472),
 ('th', 1236),
 ("canvassers'", 1079),
 ('es', 848),
 ('wm', 676),
 ('u', 656),
 ('pg', 545),
 ('z', 489),
 ('re', 474),
 ("the'", 385),
 ('sp', 375),
 ('hm', 365),
 ('x', 339),
 ('sabbathschool', 332),
 ('reichenbach', 307),
 ('ok', 297),
 ('mcelphatrick', 291),
 ('al', 257),
 ('pp', 253),
 ('seventhday', 251),
 ("colporteurs'", 247),
 ('buttermore', 230),
 ('nd', 226),
 ('cc', 213),
 ('ce', 210),
 ('bfl', 206),
 ('barto', 204),
 ("to'", 196),
 ('oertley', 189),
 ('ca', 187),
 ('wc', 186)]

Correction 5 -- Address Burst Words

In [23]:
# %load shared_elements/rejoin_burst_words.py
prev = cycle
cycle = "correction5"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    pattern = re.compile("(\s(\w{1,2}\s){5,})")
    
    replacements = []
    clean.check_splits(pattern, spelling_dictionary, content, replacements)
    
    if len(replacements) > 0:
#         print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()

Check Correction 5

In [24]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CUV/correction5

Average verified rate: 0.9602856941881112

Average of error rates: 0.047561599999999996

Total token count: 6340486

In [25]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[25]:
[('co', 20599),
 ('w', 14340),
 ('e', 13687),
 ('br', 9883),
 ('m', 8946),
 ('d', 7880),
 ('r', 7597),
 ('f', 7572),
 ('g', 7280),
 ("'", 6382),
 ('t', 4968),
 ('gc', 4462),
 ('od', 4234),
 ('n', 3996),
 ('ck', 2995),
 ('pa', 1946),
 ('k', 1776),
 ('mt', 1704),
 ('bf', 1522),
 ('va', 1472),
 ('th', 1236),
 ("canvassers'", 1079),
 ('es', 848),
 ('wm', 676),
 ('u', 653),
 ('pg', 545),
 ('z', 489),
 ('re', 475),
 ("the'", 385),
 ('sp', 375),
 ('hm', 365),
 ('x', 339),
 ('sabbathschool', 332),
 ('reichenbach', 307),
 ('ok', 297),
 ('mcelphatrick', 291),
 ('al', 256),
 ('pp', 253),
 ('seventhday', 251),
 ("colporteurs'", 247),
 ('buttermore', 230),
 ('nd', 226),
 ('cc', 213),
 ('ce', 210),
 ('bfl', 206),
 ('barto', 204),
 ("to'", 196),
 ('oertley', 189),
 ('ca', 187),
 ('wc', 186)]

Correction 6 -- Address Split Words I

In [26]:
# %load shared_elements/rejoin_split_words.py
prev = cycle
cycle = "correction6"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    errors = reports.identify_errors(tokens, spelling_dictionary)

    replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=False)
    
    if len(replacements) > 0:
#         print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_split_words(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()

Check Correction 6

In [27]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CUV/correction6

Average verified rate: 0.9608287928828794

Average of error rates: 0.04697536

Total token count: 6337844

In [28]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[28]:
[('co', 20033),
 ('w', 14323),
 ('e', 13528),
 ('br', 9806),
 ('m', 8931),
 ('d', 7855),
 ('f', 7567),
 ('r', 7546),
 ('g', 7259),
 ("'", 6382),
 ('t', 4932),
 ('gc', 4423),
 ('od', 4173),
 ('n', 3966),
 ('ck', 2989),
 ('pa', 1934),
 ('k', 1764),
 ('mt', 1703),
 ('bf', 1522),
 ('va', 1466),
 ('th', 1150),
 ("canvassers'", 1079),
 ('es', 826),
 ('wm', 675),
 ('u', 647),
 ('pg', 545),
 ('z', 488),
 ("the'", 385),
 ('hm', 365),
 ('sp', 363),
 ('x', 338),
 ('sabbathschool', 332),
 ('reichenbach', 307),
 ('mcelphatrick', 291),
 ('ok', 287),
 ('re', 281),
 ('seventhday', 251),
 ('pp', 251),
 ("colporteurs'", 247),
 ('buttermore', 230),
 ('nd', 221),
 ('cc', 213),
 ('bfl', 206),
 ('barto', 204),
 ('al', 203),
 ('ce', 201),
 ("to'", 196),
 ('oertley', 189),
 ('wc', 186),
 ('q', 185)]

Correction 7 -- Address Split Words II

In [29]:
# %load shared_elements/rejoin_split_words.py
prev = cycle
cycle = "correction7"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    errors = reports.identify_errors(tokens, spelling_dictionary)

    replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=True)
    
    if len(replacements) > 0:
#         print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_split_words(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()

Check Correction 7

In [30]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CUV/correction7

Average verified rate: 0.9610930375444612

Average of error rates: 0.04669136

Total token count: 6336141

In [31]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[31]:
[('co', 20014),
 ('w', 14213),
 ('e', 13523),
 ('br', 9797),
 ('m', 8886),
 ('d', 7845),
 ('f', 7543),
 ('r', 7538),
 ('g', 7249),
 ("'", 6381),
 ('t', 4915),
 ('gc', 4423),
 ('od', 4168),
 ('n', 3956),
 ('ck', 2988),
 ('pa', 1933),
 ('k', 1760),
 ('mt', 1702),
 ('bf', 1522),
 ('va', 1465),
 ('th', 1134),
 ("canvassers'", 1077),
 ('es', 803),
 ('wm', 675),
 ('u', 611),
 ('pg', 545),
 ('z', 486),
 ("the'", 385),
 ('hm', 365),
 ('sp', 363),
 ('x', 338),
 ('sabbathschool', 332),
 ('reichenbach', 307),
 ('mcelphatrick', 291),
 ('ok', 285),
 ('seventhday', 251),
 ('pp', 250),
 ("colporteurs'", 247),
 ('buttermore', 230),
 ('cc', 208),
 ('bfl', 206),
 ('barto', 204),
 ("to'", 196),
 ('re', 195),
 ('oertley', 189),
 ('wc', 186),
 ('al', 186),
 ('q', 185),
 ('syphers', 182),
 ("''", 180)]

Survey remaining errors

Get docs with high error rate

In [32]:
messy_docs = reports.docs_with_high_error_rate( summary, min_error_rate = .2 )
In [33]:
docs_2_check = [x[0] for x in messy_docs if x[1] > 0.3]
In [34]:
docs_2_check
Out[34]:
['CUV19120207-V17-06-page6.txt',
 'CUV19131224-V18-50-page2.txt',
 'CUV19131105-V18-44-page2.txt',
 'CUV19131210-V18-49-page4.txt',
 'CUV19131203-V18-48-page4.txt',
 'CUV19140211-V19-07-page4.txt',
 'CUV19160615-V21-24-page6.txt',
 'CUV19150819-V20-33-page6.txt']
In [35]:
# utilities.open_original_docs(docs_2_check, directories['cycle'])

The documents with high error rates are tables with canvasser information and images.

Get long errors

In [36]:
reports.long_errors(errors_summary, min_length=15)
Out[36]:
(['fairbanks-roosevelt',
  'raisingthelighthigher',
  'celestialvlidkiff',
  "north'philadelphia",
  'mionmenimmommmemmemmemmmom',
  'seventh-dayadventist',
  'ommemmomommommommummemn',
  "cenference'office",
  'oirecognitintiof',
  'vrafilaqdraining',
  'provirigtelaialvenderful',
  'idatinctibriihat',
  'recruiting-ground',
  'responsibilitity',
  'mmemmummmumnmemounmemmomm',
  'fellowpassengers',
  'has-treatment-rooms',
  "treasurer'sannualreport",
  "ofrathirn'alktlive",
  'thirteen-fourteen',
  'ethrindanclitient',
  'addre-ast-iviount',
  'colored-seventh-day',
  'otsligatoryunder',
  "the'determination",
  'amongunbelievers',
  'prayerlletlicated',
  'morningrollewing',
  'theresponsibility',
  'tolinnhleinadlaa',
  'frinnuittprigsfr',
  'memminimmemmemenmennmmemill',
  "the'mom'teffient'",
  'prograntprepared',
  'girtrudebillington',
  'mormiiiiiiiiiimiiiiii',
  'forweekendingmay',
  'generalqonference',
  'accomplishmentof',
  'positioniegarding',
  'nmemmememommemememmemmimi',
  'mmummmmuummenmemmenemmorm',
  "teiriptertion's'",
  "cenferendei'should",
  'strongconviction',
  'threshing-floors',
  'dimisailliaolosetkimesole',
  'notalwayspossible',
  'educationassists',
  'lowerpropensities',
  'mommemmemsommemiimmini',
  'therealpossessionoftherealinher',
  'outsideinfluences',
  "thatphineerinthe'workof",
  'sabbath-offering',
  'pennsylvanialconference',
  "and'home'departnient",
  'mminimminimmememennumumemmi',
  'ragerwestmorelandco',
  'austrian-serviin',
  'tatelminispoitan',
  'nfeetingatssentbies',
  'experimentalknowledge',
  'spanish-american',
  "ivicelphatrick's",
  'wednesdaymorning',
  'numbersincreased',
  'fullrepresentation',
  "have'really'deterrnihed",
  'alsotestimonials',
  'attractionshould',
  'memmememmemmeemmummememmm',
  'atfirsteotnmttnded',
  'traitsoreliaracter',
  'deiinififnations',
  'memmormormemminimumummipow',
  'eimmemmemememmmemememmemm',
  'mminimmummememmemmumminimme',
  'memmememminimmemmemminimmme',
  'dictator-president',
  'thetare-l-its-a-fen',
  'twenty-cents-per-week',
  'yrogrampqmmittee',
  'babyloncomnrandments',
  'major-generalsickles',
  'societysecretary',
  'disease-carrying',
  'schoolconvention',
  'atijeitljranaithitis',
  'cigarettesmoking',
  'mminimeinimiiiiiiiiii',
  'twenty-eent-aweek',
  'ingatheririgwork',
  'comitittiteallithe',
  'betweenfourandfive',
  'havecertificates',
  'watchengtheeducation',
  'ellwangerworcester',
  'encourconsecration',
  'inunselfiehservice',
  'mimiiiiiiimmiiiiiim',
  "brother'buttermore's",
  'quarter-distance',
  "church'school'for",
  'mommmemmemminommeminimm',
  'emmemmemmmmemmmemmemmmum',
  'personalattention',
  'fdrreatabjishirtg',
  'induatchtlacwdenfty',
  'personalsoliciting',
  'largestcamprmeetiogaohio',
  'greatlyifacilitate',
  'wilmingtoilidelbe',
  'religieuallherty',
  'individual-account',
  "adventist'establishment",
  're-creation-redemption',
  'respcnsibilities',
  "christ'sobjectlessons",
  'istoaeltishshadow',
  'artist-architects',
  'pictorially-treated',
  "this'boliference",
  'isevilleelviedina',
  'ourootuorlateletrir',
  'charrlelestsotno',
  'earnestfirst-day',
  'seventh-day-adventist',
  'visitorornmalorganof',
  'before-ordination',
  'officialorganofthe',
  'minennenommirmenee',
  'responsibilinstead',
  'recommentlations',
  'sanitarium-management',
  'wearesurethatyou',
  'libertyfdepartment',
  'followingtlinesltroni',
  'considerableliterary',
  'hackensack-paterson',
  'missionstatement',
  'lighthousekeeper',
  'rtvelirtafguntil',
  'consecratdistinct',
  'conferenceoffice',
  'sanitaritheirdonations',
  'mommemmommmmmummmemmmem',
  'superintendent-and',
  'sandwithedibetween',
  'showethforthrighteousness',
  'notesfromtheconference',
  "representatives'",
  'presentindebtedness',
  'privilege-except',
  'disciplinedintel',
  'after-these-dear',
  'coluivrbfk-union',
  'workindustriously',
  'missrobbiesutphin',
  'alladvertisements',
  'fiften-cent-a-week',
  "oheelipthiehld'get",
  'disappointaiiint',
  'attsiergailigsmiritg',
  'indianapolis-southern',
  'workedfifty-seven',
  'therighteousness',
  'characteristicthat',
  'high-ceilingedroom',
  'atthehomeofthebride',
  'generaldeportment',
  'accoinplishmeats',
  'josephinebauerlein',
  'oriental-countries',
  'ted-with-thecounty',
  'pleasure-seeking',
  'sittlicitityhone',
  'mminiummismemmomemmesiiiii',
  'conferencecomposing',
  'uncoinplaiitingly',
  'quiterimpossible',
  'spiritual--dondition',
  'affectionatefather',
  'westpennsylvania',
  "training'schools",
  'instructivesermon',
  'comingcumstances',
  'departurefronrit',
  "harvestfleld'where",
  'childrenotherwise',
  "that'prace'cluring'phe",
  'mmiimmemommemmiummemmemems',
  'witliattiortivii',
  'missionenterprise',
  'mmommumionmemmemmemeremmim',
  'life-companionship',
  'church-jnembership',
  'interest-gripping',
  'delinquentchurches',
  'missittoankmrereises',
  'miniumsemommmiimmemiim',
  'twenty-centsa-week',
  'donatingheartily',
  'ucationalmeeting',
  'insidevtheleityilitsla',
  'jemwoisuhnvternoncollegefund',
  'spirituallessons',
  'dollar-a-week-fund',
  'anti-organization',
  'rhawiltdertronotrovithmsau',
  'mmiiiirmiiiiiiii',
  'curiosity-consumed',
  'everydollarwillaid',
  'methodistepiscopal',
  "hisfather'siliness'",
  'cocliotyrfeodrweigonrk',
  'churchrmembership',
  'continuallyunder',
  'mmummememommmummemmu',
  'medically-trained',
  'ommeensmemmeemmumemomm',
  'danish-norwegians',
  'springfield--elder',
  "sundown'isaturday",
  "present'signified",
  'ordainedministera',
  'presents-another',
  'geneatiordhcation',
  'weestmidiedligmys',
  'westliennsylvanin',
  'accompanyfaithful',
  'sietieisoeerfully',
  'emmenemmummommemirmnernmme',
  'isolatedbelievers',
  'westernhemisphere',
  'iamengagedinaline',
  'seriousconsideration',
  'iimmimmemmennemmemmenommem',
  'quarter-a-quarter',
  'erfurbonsiderati',
  'norththumberland',
  "stanley'ssabbath",
  "nmetviltliwfte'end",
  'mommummumummensimmum',
  "alreptly'accoinplishedwill",
  'exhibitions--one',
  'minimemememenomemme',
  'weearnestlypraythat',
  'thelrefiittitlittidettaffeleilititi',
  'legislationnutside',
  'thingeforthyselfseek',
  'health-destroying',
  "canvassers'reports",
  'important-numbers',
  'religiouscoviction',
  'minimmemminimmemmummenimmem',
  'tellingyouthowthelord',
  'theirliereitoeyas',
  'commander-inchief',
  'seventhday-adventists',
  "professor'fredrick",
  'mommommommommirlimmummom',
  'laboredialthfully',
  'asteepiidtjestia',
  'ithpresaivebaptis',
  'preeispirthatwhich',
  'colporteur-evangelists',
  'earnestlydesired',
  'exeellentinumber',
  'twenty-centa-week',
  'redthatallwhowereinattendance',
  'shalibepreadlied',
  'indescriminately',
  'princefredricktown',
  'instruinefitality',
  'dubtetittheservice',
  'eighteeirkeepipi',
  'christlessgraves',
  'fifteen-cents-a-week',
  'stick-to-it-ive-ness',
  'churehtreasurers',
  'menummemeneumemmkpmememum',
  'holinesso-igliteousness',
  'arrangementshave',
  'twenty-cent-a-week-fund',
  'pennsyldistribution',
  'minimmmemmememmmummumemomm',
  "nearing'thishome",
  'heart-wanderings',
  'emmemmemmusiimem',
  'circulatapproval',
  "canvassers'institute",
  'organizsucceeding',
  'reirresentatives',
  'immediatecontact',
  "tract'society'at",
  'considerationfrom',
  'medicalevangelists',
  'standard-bearers',
  "experienned'eanvassers",
  'hitontibligiting',
  'zimmermanmorrisonscove',
  'sinakerepresents',
  'constituencylwill',
  'fifty-cent-a-week',
  'hospital-sanitarium',
  'diseaseproducing',
  'greatgrandchildren',
  "earnest'desireto",
  '-------------------------',
  'cent-a-week-fund',
  'thousandsvisiting',
  'missionliterature',
  'thirtyfivecounties',
  'memmemmempenememmemememem',
  'inproportiontothetimeyoucan',
  'onniimmemminiminomommememm',
  'witilltiteispirit',
  'largeintroduction',
  'principaljeature',
  'missionarygiconvention',
  'sarveracknowledge',
  'ileretnittalfested',
  'minmemmemmemoimmommmememomm',
  'mmemmmemmememmommommeminin',
  'iiiiumumilumliii',
  'imminsimmummommumm',
  'mmemememememmememmemeemee',
  'minimemmemmommeolmmemminimm',
  'interest-bearing',
  'possisignificance',
  'honsiciiiiiimeiromminiiiimmill',
  'mmixemmemmemminommirm',
  'ememememememummememennemm',
  'traitstruthfulness',
  'righteousnessunto',
  'mantle-enshrouded',
  'gospelremedyforpresent-day',
  'demonstratedthat',
  'dedemonstrations',
  'innationlplinber',
  'wouldbewithessed',
  'eduoationaladvancement',
  "nebuchadnezzer's",
  'eatforeign-bread',
  're-establishment',
  'mfmniimmemmoimmemminmemewomm',
  'societytreasurers',
  'uselessexpenditures',
  'experiencesduring',
  "become'acquainted",
  "this'enlargement",
  "faithful'efforts",
  "colporteurs'institutes",
  "payinent'otieskhof",
  "longedintensely'",
  'twenty-five-cent-a-week',
  "andheaven'shelpimplore",
  "theirbonsevi'anti",
  'cause-of-education-lies-near-to-your',
  'amongtheteacrhes',
  'reicraibanglitediftliraisifort',
  'fifteencent-a-week',
  'itibildriptiorielorfiut',
  "testament'ilistory",
  'sitnilitadecision',
  'accountsreceivable',
  'mmullonommummummmuung',
  'msmememmummemmummemememem',
  'thecircumstances',
  'emiiiiiiremereedieme',
  'seibertdiscussed',
  'health-restoring',
  'conference-t-mission',
  "petipleit'itinpaigt",
  'hisquestionnaire',
  'wehavereadwithprofound',
  'memmemmmemmemmmommommumm',
  'whichsheregularlyattended',
  'iniminimmmummummommeminimmem',
  'characteristic-s',
  "pleastrui''year's",
  'advertiseneglected',
  'ineasurehelonging',
  'stipposhilitraining',
  'commandment-keeping',
  "evebittgrattene'elock",
  'fourteen-thousand',
  'secretary-treasurer',
  'accomplishinints',
  'keligiousliberty',
  'selcsupportingaiiiisionary',
  'stiperintendeiff',
  'familiesgathered',
  "third'angel's'mesdage",
  'secnremayschtflarship',
  'whentheworldarounduswillgive',
  'mememeriviiiiiiiiii',
  'conferenceoommittee',
  'tsofinterestpresen',
  "eewidl'ree'venttdollars",
  'mumemmummiimummormomummene',
  "hsailsettiessakelromiliihe'lliord",
  'iovtinglilleavenly',
  'ever-appreciated',
  'responsibilitiei',
  'mememememmemmemommemminom',
  'beingaccomplished',
  "p'ncefrederickt'n",
  'zimmerlystrenton',
  'inter-denominational',
  'conductedoespecially',
  'october-november',
  "photogrophers'and",
  'beingperpetrated',
  'and-thanksgiving',
  'bibleygood-clwistian',
  'giventoncerningthe',
  'ammummamontommok',
  'shenandoalivalley',
  'respowsibilities',
  'imummompsimempommaimomp',
  "suell'ilnestions",
  'accompliatnetits',
  'twenty-cents-a-week',
  'mmmemmemmmemmemememmim',
  'church-and-state',
  'clarareichenbach',
  'religiousliberty',
  'jerseyconference',
  'mmemmemmemmememummemommo',
  'twenty-cents-aweek',
  'thoroughlyinvestigating',
  'anti-tuberculosialeagues',
  'characterbuilding',
  'unrighteousriess',
  'publishingaesociation',
  'gatheringbeviews',
  'arepeedgesailack',
  'mmemenummumummummenmermibm',
  'beautifutgrounds',
  'symptornsofpohoning',
  'convertinginfluence',
  "prineegeorge'sco",
  "whole'conferencecomfnittee",
  'mmiiiiimminumbinimmisminimmimuni',
  'sistersthroughout',
  'addresseikturnished',
  'eiiiiiiiiimiiiiimini',
  'spiritualization',
  'columbiallitiottillinitori',
  'tvgiblegsitlifunint',
  'miiiiiiiiiiiiiiiii',
  'andwithaneatappearanceinthe',
  "cornmandmentits'pnrpose",
  'eighty-eightoneedy',
  'misimmemminimumminimmemmeur',
  'mminimmememmemmemememmemen',
  'wanted-something',
  'differencebetween',
  'gospelopportunity',
  'contributedduring',
  'emmemomummemmememmememmem',
  'long-to-beremembered',
  'acquaintitairies',
  'accountancy-offered',
  'otsevexalfyikatt',
  "glylor'giielielp",
  'self-sacrificingly',
  'canvasser-evangelists',
  'missionarycollege',
  'viottiomone-tenth',
  'tporsjawititinghaptism',
  'important-business',
  'thenorwalkehurch',
  'thirteenth-sabbath',
  'vicallisterville',
  'enjoyedlistening',
  'mmemememomemmemmmeemmoimmin',
  'anionntofpepetry',
  'pleyeoaciistitie',
  'assenibleoeerned',
  'studentcanvassers',
  'oldacquaiutences',
  'memmememmememmemmemmemomm',
  'anumunnursmiunimummunino',
  'imummummummummummum',
  'this-characteristic',
  'life-lengtheners',
  'mmommumerrimmuemmemmeinme',
  'missionaryspirit',
  'mightiefittmeitaagethat',
  'peoplelienderecl',
  'strong-sensation',
  'magistratesoffice',
  'commandment-loving',
  'coronationnumber',
  'seresponsibility',
  "accompanied'her'to-the'southern",
  'harvest-ingathering',
  'ourrnissionaries',
  'emmemmemmwommemmemmiiiimmo',
  'stick-to-it-iveness',
  "readltheibible'through",
  'havingcommittees',
  'mioughittxtualos',
  'aratelxvissionary',
  'interestingexperiess',
  'killinyldelkeditteeds',
  'religio-political',
  'eibininiiiiiiiiiiie',
  'anlutheranpeople',
  'punctuality-compulsion',
  'importantmeeting',
  'indiserittinately',
  'liliscelyarneoris',
  'criticismagainst',
  'brethrenandmyselfhadtheprivilegeof',
  'eighteoustiessiwhose',
  'foreign-speaking',
  'ebtapailioiiehip',
  'instrumentalites',
  'unostentatiousness',
  'tirnlessandustry',
  'couldconscientiously',
  'business-stenographic',
  'firatresurrection',
  'partners--higher',
  'offfioditeditivni',
  'yainqiidereeither',
  'adjtisinietitair',
  'thirteen-hundred-dollar',
  "aggressive'workfor",
  'odoodoodoolljdooo',
  'southwesternittnionirecord',
  'besidesherimmediatefamilysheleavesto',
  'ininiiiiimendmenisem',
  'tifeatchnidndlor',
  'selfjustification',
  'local-conferences',
  'unappreciatively',
  'throwsbitriselcupon',
  'interestinterwoven',
  'missionaryagenelee',
  'hydrotherapeutist',
  'fivecent-per-copy',
  'consciencestricken',
  'iniiiiimiiiiiiimill',
  "nibilewe'liee'p'enr'e'",
  'half-scholarships',
  'statepenitentiary',
  'momminimmemmemmeminimmemnom',
  "shouldloeate'all",
  'joint-resolution',
  'anold-fastionedwoman',
  'earnestlirequested',
  'mmemmommmomememmemmemo',
  'theannottneetnerit',
  'valuableassistance',
  "the'deed'iarecorded",
  'thedentenitiatioual',
  'committee-meeting',
  'fcleiridiploatiop',
  'oiirnhnreheswill',
  'withinthetimespecified',
  'apositionwithbrother',
  'hydrotherapeutie',
  'individualmembers',
  'accrediteddelegates',
  'sabbathechoulefeuretary',
  'pastor-preachers',
  'sabbathofferings',
  "brotherarohiebrownlee'grandtotal",
  'statementcalculated',
  'iminienemeneniiimmin',
  'chambersblirgmag',
  'mmememeneemeemmemememmemm',
  'practicallyevery',
  'imminmemsmommommimummilms',
  'ortheonnierelide',
  "continuiie'until",
  'unummemmemommemommemem',
  'progressingnicely',
  'been-considerable',
  'schooldepartment',
  'mimmmommummemmmememmommme',
  'mmemmiuminimminimmummem',
  'fifteen-cent-a-week',
  'cottonplantation',
  'othersconsecrating',
  'unioirconference',
  'scandinaviancompany',
  'cornmunicatienthrough',
  "iluarterly'meethig",
  'fhtilitirvisitied',
  'respectfultoward',
  'thatexpressionfmade',
  'beenbaptizedsince',
  "suninier'scampaign",
  "sabbath-keeper's",
  'readingsregularly',
  "dedicatory'services",
  'tabernacleswassheld',
  "writtem'arficles",
  'subscriptionsand',
  'mmiimmememmememinimmommememi',
  'tolinisittliegreat',
  'appreciatecitheirtimely',
  'missionarylcollege',
  'necessarybusiness',
  'ingatheringreviews',
  "summer'scampaign",
  'resppiisittilitiesr',
  "on'closer'investigation",
  'monthspastnurses',
  'instituterecently',
  "christ's'example",
  'immumminsmummummainsmsnm',
  'world-circulating',
  'organizationoftheohioconference',
  'faithfulthatpromised',
  'viffieelersistrg',
  'coaference--mission',
  'fifteen-cent-a-wk',
  'sateanancialpoliey',
  'isbeginningaseries',
  'student-colporteur',
  "their'enthusiasm",
  'becominginterested',
  'encourageinentthat',
  'approvedadvertisements',
  'mmemmummummemmmemmmemomme',
  'generalebversight',
  'danish-norwegian',
  'self-sufficiency',
  'dilapidated-looking',
  'takethisekarrtination',
  'ciliiilalessoctna',
  'rounded-outmessage-filled',
  'selfrighteousness',
  'composition-rhetoric',
  'dollarper-member',
  'different-plaees',
  'emmoimmemmemmmmmommememmem',
  'momminammummummummum',
  "canvisers'report",
  'distinguishbetween',
  'confidentlyexpect',
  'austro-hungarian',
  "f'lkorowrdokhlii",
  'constantinopleor',
  'principlofftaitierican',
  'perseverancesuch',
  "laboes'seductive",
  'blessingswithout',
  'otkidopelwithahe',
  'ourcarnpmeetings',
  'followingquotation',
  "governor-general's",
  'trust-worthiness',
  'stgusakiklwavakes',
  'reresponsibility',
  'companrherewhoare',
  'mistmderstanding',
  'momenizeimismnomisminiimiif',
  'erickson-andross',
  'cleanseourselves',
  'memummemmrommemmommeminime',
  'isighedukttreath',
  'thicklypopulated',
  'nicationsintended',
  'beautifulgarments',
  'heantifulidlewood',
  "offered'atnaller",
  'literaturerprepared',
  'senditigotapaidr',
  'nemmemmemmememmememmmemen',
  'americanbrethren',
  'columbiasunivernity',
  'founteetiripktelikets',
  'middietownournet',
  'heart-expression',
  'expressioncontrary',
  'writterrtirttlftimerttifeltrovingatrri',
  'self-sacrificebasis',
  "cincinnatilviiiirefi'",
  'speciariltanksgiving',
  'ten-cents-a-week',
  'newkriaistiontedi',
  "secretary-treasurer's",
  'anyysanyosanyyynyos',
  'wegoduswthdiochouisz',
  'encounteringopposition',
  'backslidgentleman',
  'reversecendition',
  'advertisementfor',
  'greatlystrengthened',
  'emphasized-the-importance',
  'fincrilienrirdifferently',
  'selfforgetfulness',
  "biennial'session",
  'sufferinghumanity',
  'thoroughsstablished',
  'thoroughlyfurnished',
  'virtualacknowledgment',
  'severalconferinneain',
  'disease-destroying',
  "liberty'secretary",
  "s'arrival'tithoihe",
  "standing'eommittees",
  'theseactualities',
  'iimmemmememememomminimmmem',
  'moimmommommummmumm',
  'andfonorgetioallyi',
  'approachableness',
  "conference'laborers",
  'shenandoahvalleyacademy',
  "sabbathschools'in",
  "the'tenipirairee",
  'aannddwedesireyouto',
  "doubtless'approved",
  "babylonish-medo-persiani'",
  'fifteen-per-cent',
  "thalinterest'will'",
  'preparedliterature',
  'perseverancewould',
  "tha'olitiveliralid",
  'brethrenandsisters',
  'preparetheitisolves',
  'totalliabilities',
  'minimminimmemmmimemmemmemom',
  'should-beencouraged',
  'minimmememememememmmummemo',
  'vandergriff-nellie',
  'hadbeenlookingafterthechurchat',
  'mmuummuummememusemomminimm',
  'preachingservices',
  'teen-cent-a-week',
  "'''''''''''''''''",
  'roumaniariniussian',
  'frorilikivoutittetthands',
  'accomplishedhere',
  'micointnendationi',
  'andwillgoforthfromthathomerealiz',
  'intelligentlittle',
  'iimminiminimemininememenememm',
  'excellentsafeguard',
  'medicallsecretary',
  'istotviiilistanding',
  'effectiveservice',
  'goothcongregations',
  'chisanrishathaim',
  'thewestphiladelphiastation',
  "we'reilliistrating",
  'expressingsympathy',
  "aryangetnerits'shall",
  'memmenemmummommummomminimm',
  'hawley-honesdale',
  "superintendents'",
  'sabbathschoalmissioriary',
  'responsibilityand',
  "tvaining'sohoolin",
  'mearlklopfenstein',
  'prophecy-fulfilling',
  'indeetitgratilyipg',
  'airizifigarirtgar',
  'cternmatidmelits',
  "recognition'have",
  "itls'these-feite",
  "sisterulery'sgreatestdelightwas",
  'orstbuyistisiires',
  'minimminimmimmemmmemoimmenum',
  '------------------',
  'conferencestosee',
  'just-ordereolhis',
  'lrfolglhfliathnontei',
  'memimmuummummemmemmionmemm',
  "and'unendinglife",
  'one-dollar-aweek',
  'mniumposommiummmummint',
  'delegates-at-large',
  'sttmonconcerning',
  'withglongingdesireto',
  'columbiaunionconference',
  'farmingdistricts',
  "sitfroililferour'",
  "distilevm'eeting",
  "the'snperintendants",
  "pansigrnier'people",
  'linesofotinstruction',
  'mumminimminimmummemmummoimms',
  "popula'rsubseription",
  'specialblessings',
  'minimmuminimmemmummeummorre',
  'magazine-reading',
  'missionarieswhile',
  "princegeorge'sco",
  'southern-illinois',
  'neighborswithout',
  'mememmeminimmeememe',
  'one-dollar-a-week',
  'mmommermummemmernimmummem',
  'differenvemanifestati',
  'propermithisteriat',
  'thanksgivingsfor',
  'twenty-five-cents-a-week',
  'systemspreviously',
  'zoolertstarmravaymieeohl',
  'iiiiiiiiooelieeedoom',
  'rrepresentatives',
  'churchschoolteachers',
  'inimmemmemmemmemmem',
  'conferincapacity',
  'slavish-bohemian',
  'studenticolporteurs',
  'emmememmemommmememmommme',
  'earnest-christains',
  'christmas-vacation',
  'formerlyattended',
  'iiiiiiiiiiiiiiiiiiiiiiiiinmiiiiiiiiiiiiiiii',
  'fifteen-centa-week',
  'mminimmememminimmeemmemmemm',
  'beginningjanuary',
  'applicatluntnaile',
  'canvasser-evangelist',
  'tilliunrighteoustiess',
  'vwehrseanlmitlyn',
  'filteen-cent-a-week',
  'civil-engineering',
  "name'withoufflinching",
  'withrviffeeilient',
  'ntermediateleiisons',
  "thisour'only'institation",
  "naerloan'neighbors",
  'imeineineeeniniinne',
  'subscriptionincluding',
  'doyoueverthinkofchina',
  'developcharacter',
  'faithfulcolporters',
  'verlagsgesellchaft',
  'temperanceperiodical',
  "influential'people",
  'arreffbifndrawoontliititil',
  'attention-please',
  'peaceful-looking',
  'disfellowshiping',
  'selfconsciousness',
  'secretarytreasurer',
  'maromemmemonommoodoolid',
  'ingatheringliervices',
  "short'discourses",
  "froin'aecintiniating",
  'interestingothers',
  "tiltweritonthly'",
  'portanceofacleandiet',
  'persecutionsbegin',
  "mybrother'skeeper",
  'faciisqualttligiladits',
  'messengersrecognize',
  "professor'graham",
  'buttohealfromplagueandsickness',
  'sabbath-schoolsgave',
  'coinmunicationproceed',
  "colporteurs'iepqrts",
  'fifteen-eenta-week',
  'endencouragement',
  'superintenelentaskink',
  'self-gratification',
  "transgressions'are",
  'professorsclymer',
  'these-experiences',
  'jewisittypewrier',
  'sabbathschool-convention',
  'sanitarium-hospital',
  'mummimmuummommemmememmommh',
  'stoop-shouldered',
  'differentdepartments',
  'success-structure',
  'thoumissionaries',
  'thitigciyertiagaio',
  'superintendent-hewed',
  'intelligentlooking',
  'faithfullycaring',
  'jacobwestmoreland',
  'deniandedexorbitant',
  'johnsonalleghney',
  'medicalsecretary',
  'isonhiswaytothewest',
  'altsscumeditohbelieye',
  'withoutdecreasing',
  'mommommiummommmmommiumwm',
  'medicafibeparfment',
  'rommermeminimmemmommemminim',
  'exaggeratedclaims',
  'lrielpticraeclaisinr',
  'thesabbathschool',
  'lookadtorrwesley',
  'zimmermanlvlorrison',
  "pennsylvania'sanitarium",
  'iiiiidayafillnoligja',
  'superintenderitto',
  'mmemommemminimmmemimm',
  "eleveti'hiffeettls",
  'mportantessentia',
  "opening'nfeetfug",
  'inventoriesexpense',
  'mummiumusummunismammiums',
  'opportunitiesfor',
  'healthycondition',
  'missionaryrcollege',
  'neveriuttereclia',
  "tarryingofthe'bridegroom",
  'sanitarium-seminary',
  "observant'magazine",
  'mmimuniniiiiiiiiiiiiiin',
  "uniform'examinations",
  'conferencemissiciit',
  'wiamifirytortsariergoftlaitp',
  'twentycent-a-week',
  'constitutionoftheohioconference',
  'responsibilifles',
  'danisb-norwegians',
  'chifichseernedanxious',
  'gerhattiburlington',
  'commandmentkeeping',
  'everytrue-believer',
  'charlottesvillebr',
  "contenient'clistance",
  'notwithstandinethe',
  'thedrearfamhouse',
  'heaven-appointed',
  'consideringaccepting',
  "ezekiel'aprophecy",
  'btottghtlfiltlir',
  'memminimmemmiummememminimen',
  'remittanspiritual',
  'andsingtothepeoplethu',
  'gongregationalist',
  'adventistaernoone',
  "willimanhouston's",
  'everythingseemed',
  'establishedtracts',
  'regularexamination',
  'ismegyssfeesietkeioso',
  'mmemmemmemmemememmmmemerm',
  'yis-da-e-ne-thie',
  'minummumommummilmimummamm',
  'ertethiragernent',
  'carelessobserver',
  'health-improving',
  'emmemmomemmummemmmommeme',
  'minimiiiimmeimmememe',
  'someorganization',
  'surnamedthemselves',
  'amemmemmermiimmemomminimmem',
  'responslbiiities',
  'memumumimmememummommemoms',
  'educational-fund',
  'procureadditional',
  'harvestingatheringin',
  'appointeclpresident',
  'miniwommominimmil',
  'self-forgetfulness',
  'incubatoricellar',
  "etiti'leainirlove",
  'mmommemememmememminimmomme',
  'butgodtsizeening',
  'letthelioinejand',
  'pledgesamounting',
  'personal-neatness',
  "agreeably'suprisod",
  'miimmemememmummmemmimmem',
  'pleasure-grounds',
  'muussesumersimmomemmem',
  'everysetenthrigi',
  'wretched-looking',
  'aymelltalarentart',
  'fifteen-cent-aweek',
  'greatdisadvantages',
  'prograthipublished',
  'mememmemmemmennmenemememe',
  'withinleathercovers',
  'blessedrichlymally',
  'mommemmemmemminmemmeememme',
  'inenimennimennineini',
  'issionarycollegepressatberrein',
  'wereiappreelated',
  'warningrisiftraiftio',
  'waftedheavenward',
  'great-grandchildren',
  'nationsandforsal',
  'instumentalitities',
  'goodrepresentation',
  'eimembissiesiiiiiiiii',
  'responsibilityof',
  'temperance-dynamite',
  'membership-black',
  'bahadfrealerithe',
  'preciousexperiences',
  'judgmentnommenced',
  ...],
 15)
  • mommemmommmmmummmemmmem and other variations with long strings of "m". remove these to avoid noise. Locate a series of "m"s within a token -- findall(r'([m+]{2,})') -- and check the length of the results -- if len(findall) > 3.

  • Split of long words will be difficult as there seems to be a combination of spelling errors and conjoined words.

Correction 8 -- Remove long error tokens

In [37]:
# %load shared_elements/remove-tokens-with-long-strings-of-characters.py
prev = cycle
cycle = "correction8"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    sub_list = ["m|M", "i|I"]
    
    replacements = []
    for sub in sub_list:
        replacements.append(clean.check_for_repeating_characters(tokens, sub))
    
    replacements = [item for sublist in replacements for item in sublist]
            
    if len(replacements) > 0:
        print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    else:
        pass
    
    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
CUV19060613-V10-24-page4.txt: [('Ammummamontommok', ' ')]
CUV19160921-V21-37-page8.txt: [('EIMIIIIIIIIII', ' ')]
CUV19160928-V21-38-page8.txt: [('HONSICIIIIIIMEIROMMINIIIIMMIll', ' ')]
CUV19161005-V21-39-page8.txt: [('OMMEMMOMOMMOMMOMMUMMEMN', ' ')]
CUV19161019-V21-41-page8.txt: [('MMIIIIIMMINUMBINIMMISMINIMMIMUNI', ' '), ('MMINIMEMIMMINIMMINIMMNIMMIMMIM', ' ')]
CUV19161116-V21-45-page8.txt: [('MEMOMMEMMEMMOD', ' '), ('MMEMEMENEEMEEMMEMEMEMMEMM', ' '), ('MMEMEMEMEMEMMEMEMMEMEEMEE', ' ')]
CUV19170104-V22-01-page8.txt: [('MMIIMMEMEMMEMEMINIMMOMMEMEMI', ' '), ('EMMEMMEMMWOMMEMMEMMIIIIMMO', ' ')]
CUV19170111-V22-02-page8.txt: [('IIIIIIIIIIIIIIIIIIIIIIIIINMIIIIIIIIIIIIIIII', ' ')]
CUV19170215-V22-07-page8.txt: [('INIMMEMMEMMEMMEMMEM', ' '), ('MMINIMMUMMEMEMMEMMUMMINIMME', ' '), ('MUMMINIMMINIMMUMMEMMUMMOIMMS', ' '), ('imummummummummummum', ' ')]
CUV19170222-V22-08-page8.txt: [('IIIMMOMMEMMENEMMEMOMMEMEMOM', ' '), ('MIRIMMOMMINIWORMEMEMEMMEMM', ' '), ('MOMMMEMMEMMINOMMEMINIMM', ' ')]
CUV19170301-V22-09-page8.txt: [('MEMMOMMEMMEMMEMEMMINIMIIMMO', ' '), ('MMEMMEMMEMMEMEMUMMEMOMMO', ' '), ('MEMMEMEMMINIMMEMMEMMINIMMME', ' ')]
CUV19170308-V22-10-page8.txt: [('MIMIIIIIIIMMIIIIIIM', ' ')]
CUV19170322-V22-12-page8.txt: [('INIIIIIMIIIIIIIMIll', ' ')]
CUV19170329-V22-13-page8.txt: [('IIMIIIIIIIIMIEM', ' ')]
CUV19170405-V22-14-page10.txt: [('MMOMMINIMMEMMEMEMEMMOMMEMM', ' '), ('MMOMMEMEMEMMEMEMMINIMMOMME', ' '), ('IIMIOMMEMOMMEMMEMMEMIIMMIMME', ' ')]
CUV19170419-V22-16-page8.txt: [('EMMEMMEMMUSIIMEM', ' '), ('IIMMEMMEMEMEMEMOMMINIMMMEM', ' '), ('MISIMMEMMINIMUMMINIMMEMMEUR', ' '), ('ROMMEMENUMMINIMMIMMEMMEMW', ' '), ('MEMMORMORMEMMINIMUMUMMIPOW', ' ')]
CUV19170607-V22-23-page8.txt: [('MINIMMUMINIMMEMMUMMEUMMORRE', ' '), ('MMEMMOMMMOMEMEMMEMMEMO', ' '), ('INIMINIMMMUMMUMMOMMEMINIMMEM', ' '), ('EMEMEMEMMUMOMMEMUMMEMMEMO', ' '), ('MEMUMMEMMROMMEMMOMMEMINIME', ' '), ('MEMUMUMIMMEMEMUMMOMMEMOMS', ' ')]
CUV19170705-V22-27-page8.txt: [('MIIIIIIIIIIIIIIIII', ' ')]
CUV19170719-V22-29-page8.txt: [('MMIMUNINIIIIIIIIIIIIIIN', ' ')]
CUV19170809-V22-32-page8.txt: [('MINIMMEMEMEMEMEMEMMMUMMEMO', ' '), ('immumminsmummummainsmsnm', ' '), ('mummiumusummunismammiums', ' '), ('AMMENUMMEMEMEMEMMENMEMMEM', ' '), ('moommummummummummumummis', ' ')]
CUV19170823-V22-34-page8.txt: [('MORMIIIIIIIIIIMIIIIII', ' '), ('ESIMENEMEMMIIIIIIIIIIII', ' ')]
CUV19170913-V22-36-page8.txt: [('MUMMIMMUUMMOMMEMMEMEMMOMMH', ' '), ('ROMMERMEMINIMMEMMOMMEMMINIM', ' '), ('mommummumummensimmum', ' '), ('MINIMMINIMMIMMEMMMEMOIMMENUM', ' '), ('MUMEMMUMMIIMUMMORMOMUMMENE', ' '), ('MMUUMMUUMMEMEMUSEMOMMINIMM', ' '), ('mniumposommiummmummint', ' ')]
CUV19171004-V22-39-page8.txt: [('milsommummummom', ' '), ('MUMMINIUMNSUMUMMOMMEM', ' '), ('UNUMMEMMEMOMMEMOMMEMEM', ' '), ('MMOMMEMUMOMMOMMOMMEMM', ' '), ('moimmommommummmumm', ' '), ('imminsimmummommumm', ' ')]
CUV19171018-V22-41-page8.txt: [('lommuntimmunimmumminmemma', ' ')]
CUV19171025-V22-42-page8.txt: [('MMINIMEINIMIIIIIIIIII', ' ')]
CUV19171101-V22-43-page8.txt: [('MEMEMERIVIIIIIIIIII', ' ')]
CUV19171108-V22-44-page8.txt: [('MEMEMMEMMEMMENNMENEMEMEME', ' '), ('IIMMIMMEMMENNEMMEMMENOMMEM', ' '), ('MEMMINIMMEMMEMENMENNMMEMill', ' '), ('MMINIMMEMEMMEMMEMEMEMMEMEN', ' '), ('MMEMMMEMMEMEMMOMMOMMEMININ', ' '), ('MMEMMEMMEMMEMEMEMMMMEMERM', ' '), ('MMOMMUMIONMEMMEMMEMEREMMIM', ' '), ('EMMEMOMUMMEMMEMEMMEMEMMEM', ' '), ('MINIMMMEMMEMEMMMUMMUMEMOMM', ' ')]
CUV19171115-V22-45-page8.txt: [('MMIIMMEMOMMEMMIUMMEMMEMEMS', ' '), ('MMINIMMEMMEMEMMINIMEMMEMMME', ' '), ('MMMEMMEMMMEMMEMEMEMMIM', ' '), ('ONNIIMMEMMINIMINOMOMMEMEMM', ' '), ('MINMEMMEMMEMOIMMOMMMEMEMOMM', ' '), ('MEMEMEMEMMEMMEMOMMEMMINOM', ' '), ('MIONMENIMMOMMMEMMEMMEMMMOM', ' '), ('MEMMENEMMUMMOMMUMMOMMINIMM', ' '), ('momminammummummummum', ' ')]
CUV19171129-V22-47-page8.txt: [('INIIIMINEIMIIIIIIMMOWNIPIRET', ' ')]
CUV19171206-V22-48-page8.txt: [('EIIIIIIIIIMIIIIIMINI', ' ')]
CUV19180103-V23-01-page8.txt: [('EIBININIIIIIIIIIIIE', ' ')]
CUV19180110-V23-02-page8.txt: [('MMIIIIRMIIIIIIII', ' ')]
CUV19180117-V23-03-page8.txt: [('EMMEMMEMMMMEMMMEMMEMMMUM', ' '), ('EMMOIMMEMMEMMMMMOMMEMEMMEM', ' '), ('EMMEMMOMEMMUMMEMMMOMMEME', ' '), ('EMMEMMMMEMMUMMEMM', ' '), ('MEMMEMMMEMMEMMMOMMOMMUMM', ' '), ('MIMMMOMMUMMEMMMEMEMMOMMME', ' '), ('MIIMMEMEMEMMUMMMEMMIMMEM', ' '), ('MOMMEMMOMMMMMUMMMEMMMEM', ' '), ('MMEMMUMMUMMEMMMEMMMEMOMME', ' ')]
CUV19180124-V23-04-page8.txt: [('mAnummommummin', ' ')]
CUV19180131-V23-05-page8.txt: [('MMEMENUMMUMUMMUMMENMERMIBM', ' '), ('MMOMMUMERRIMMUEMMEMMEINME', ' '), ('MMOMMERMUMMEMMERNIMMUMMEM', ' '), ('MMUMMMMUUMMENMEMMENEMMORM', ' '), ('OMMEENSMEMMEEMMUMEMOMM', ' ')]
CUV19180214-V23-07-page8.txt: [('MINIMEMMEMMOMMEOLMMEMMINIMM', ' '), ('MMUMMEMEMOMMMUMMEMMU', ' '), ('MMIXEMMEMMEMMINOMMIRM', ' '), ('mommommommommirlimmummom', ' '), ('MOMMEMMEMSOMMEMIIMMINI', ' '), ('MMINIUMMISMEMMOMEMMESIIIII', ' '), ('MMEMOMMEMMINIMMMEMIMM', ' '), ('MEMIMMUUMMUMMEMMEMMIONMEMM', ' ')]
CUV19180221-V23-08-page8.txt: [('MOMMEMMEMMEMMEMMEMMEMME', ' '), ('EMMENEMMUMMOMMEMIRMNERNMME', ' '), ('MMEMMUMMMUMNMEMOUNMEMMOMM', ' '), ('MEMMEMEMMEMMEMEMAMMEMMOUR', ' '), ('MEMMEMEMMEMEMMEMMEMMEMOMM', ' '), ('MfMNIIMMEMMOIMMEMMINMEMEWOMM', ' '), ('MEMEMOREMMEMMEEMEENEEMMEM', ' '), ('MEMMEMMEMPENEMEMMEMEMEMEM', ' ')]
CUV19180228-V23-09-page8.txt: [('MINIMMINIMMEMMMIMEMMEMMEMOM', ' '), ('MMINIMMEMEMMINIMMEEMMEMMEMM', ' '), ('mmimmemummomminummisummom', ' '), ('mommommiummommmmommiumwm', ' '), ('mmummomummismursammmemm', ' '), ('minummumommummilmimummamm', ' '), ('manummommammimmismiammis', ' '), ('MMEMEMEMOMEMMEMMMEEMMOIMMIN', ' '), ('Imminmemsmommommimummilms', ' ')]
CUV19180307-V23-10-page8.txt: [('MMINIMMINIMMEMEMENNUMUMEMMI', ' '), ('MEMMEMEMMEMMEEMMUMMEMEMMM', ' '), ('MOMMEMMEMMEMMINMEMMEEMEMME', ' '), ('MERMMEEMMEMEMEMEMEMMEMMOM', ' '), ('EMINIEMEMMEEMMEMMEENNUMMME', ' '), ('EIMMEMMEMEMEMMMEMEMEMMEMM', ' '), ('emmonsmmomminmemmammummm', ' '), ('MSMEMEMMUMMEMMUMMEMEMEMEM', ' ')]
CUV19180314-V23-11-page8.txt: [('MOMMINIMMEMMEMMEMINIMMEMNOM', ' '), ('NMEMMEMEMOMMEMEMEMMEMMIMI', ' '), ('NEMMEMMEMMEMEMMEMEMMMEMEN', ' '), ('EMMEMEMMEMOMMMEMEMMOMMME', ' '), ('AMEMMEMMERMIIMMEMOMMINIMMEM', ' '), ('NOMMIMMINEMINIMOMMEIMMINIMM', ' '), ('MEMMINIMMEMMIUMMEMEMMINIMEN', ' '), ('MINIMMEMMINIMMEMMUMMENIMMEM', ' '), ('MMEMMIUMINIMMINIMMUMMEM', ' ')]
CUV19200408-V25-15-page5.txt: [('JIIIIIIIIIIITI', ' ')]
CUV19201028-V25-43-page1.txt: [('mmullonommummummmuung', ' ')]

Check Correction 8

In [38]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CUV/correction8

Average verified rate: 0.961115032269591

Average of error rates: 0.046653120000000006

Total token count: 6335996

In [39]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[39]:
[('co', 20014),
 ('w', 14213),
 ('e', 13523),
 ('br', 9797),
 ('m', 8886),
 ('d', 7845),
 ('f', 7543),
 ('r', 7538),
 ('g', 7249),
 ("'", 6381),
 ('t', 4915),
 ('gc', 4423),
 ('od', 4168),
 ('n', 3956),
 ('ck', 2988),
 ('pa', 1933),
 ('k', 1760),
 ('mt', 1702),
 ('bf', 1522),
 ('va', 1465),
 ('th', 1134),
 ("canvassers'", 1077),
 ('es', 803),
 ('wm', 675),
 ('u', 611),
 ('pg', 545),
 ('z', 486),
 ("the'", 385),
 ('hm', 365),
 ('sp', 363),
 ('x', 338),
 ('sabbathschool', 332),
 ('reichenbach', 307),
 ('mcelphatrick', 291),
 ('ok', 285),
 ('seventhday', 251),
 ('pp', 250),
 ("colporteurs'", 247),
 ('buttermore', 230),
 ('cc', 208),
 ('bfl', 206),
 ('barto', 204),
 ("to'", 196),
 ('re', 195),
 ('oertley', 189),
 ('wc', 186),
 ('al', 186),
 ('q', 185),
 ('syphers', 182),
 ("''", 180)]

Correction 9 -- Separate squashed words

In [40]:
# %load shared_elements/separate_squashed_words.py
import pandas as pd
from math import log

prev = cycle
cycle = "correction9"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

verified_tokens = []

for filename in corpus:  
    content = utilities.readfile(directories['prev'], filename)
    clean.get_approved_tokens(content, spelling_dictionary, verified_tokens)

tokens_with_freq = dict(collections.Counter(verified_tokens))
words = pd.DataFrame(list(tokens_with_freq.items()), columns=['token','freq'])
words_sorted = words.sort_values('freq', ascending=False)
words_sorted_short = words_sorted[words_sorted.freq > 2]

sorted_list_of_words = list(words_sorted_short['token'])

wordcost = dict((k, log((i+1)*log(len(sorted_list_of_words)))) for i,k in enumerate(sorted_list_of_words))
maxword = max(len(x) for x in sorted_list_of_words)

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    text = utilities.strip_punct(content)
    tokens = utilities.tokenize_text(text)
    
    replacements = []
    
    for token in tokens:
        if not token.lower() in spelling_dictionary:
            if len(token) > 17:
                if re.search(r"[\-\-\'\"]", token):
                    pass
                else:
                    split_string = clean.infer_spaces(token, wordcost, maxword)
                    list_split_string = split_string.split()
                    
                    if clean.verify_split_string(list_split_string, spelling_dictionary):
                        replacements.append((token, split_string))
                    else:
                        pass
            else:
                pass
        else:
            pass
        
    if len(replacements) > 0:
        print("{}: {}".format(filename, replacements))
        
        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
CUV19030114-V07-01-page1.txt: [('faithfulthatpromised', 'faithful that promised')]
CUV19030225-V07-04-page1.txt: [('valuableassistance', 'valuable assistance')]
CUV19030520-V07-10-page4.txt: [('therealpossessionoftherealinher', 'the real possession of the real in her')]
CUV19030916-V07-25-page3.txt: [('hundredvisitseachweek', 'hundred visits each week')]
CUV19031104-V07-31-page3.txt: [('surroundingcountry', 'surrounding country')]
CUV19031216-V07-37-page1.txt: [('whentheworldarounduswillgive', 'when the world around us will give')]
CUV19040203-V08-05-page4.txt: [('Atthehomeofthebride', 'At the home of the bride')]
CUV19040217-V08-07-page3.txt: [('ourspiritualindolence', 'our spiritual indolence'), ('followingquotation', 'following quotation')]
CUV19040330-V08-13-page1.txt: [('delinquentchurches', 'delinquent churches')]
CUV19040413-V08-15-page2.txt: [('muchdifferentmotive', 'much different motive')]
CUV19040518-V08-20-page2.txt: [('couldconscientiously', 'could conscientiously')]
CUV19040629-V08-26-page2.txt: [('provirigtelaialvenderful', 'pro vi rig tel ai a l ven der f u l')]
CUV19040706-V08-27-page1.txt: [('Everydollarwillaid', 'Every dollar will aid')]
CUV19040706-V08-27-page3.txt: [('seriousconsideration', 'serious consideration')]
CUV19040713-V08-28-page2.txt: [('MethodistEpiscopal', 'Methodist Episcopal')]
CUV19040824-V08-33-page2.txt: [('frointheinconvenience', 'fro in the inconvenience')]
CUV19041123-V08-45-page1.txt: [('mecordeertainlyshows', 'me cor deer tain l y shows')]
CUV19041228-V08-50-page1.txt: [('ORGANIZATIONOFTHEOHIOCONFERENCE', 'ORGANIZATION OF THE OHIO CONFERENCE')]
CUV19050322-V09-12-page2.txt: [('traitstruthfulness', 'traits truthfulness')]
CUV19050405-V09-14-page4.txt: [('affectionatefather', 'affectionate father')]
CUV19050719-V09-29-page3.txt: [('tellingyouthowtheLord', 'telling y out how the Lord')]
CUV19050802-V09-31-page2.txt: [('tilliunrighteoustiess', 'till i unrighteous ties s')]
CUV19050830-V09-34-page2.txt: [('statementcalculated', 'statement calculated')]
CUV19051011-V09-39-page2.txt: [('Encounteringopposition', 'En counter ing opposition')]
CUV19051108-V09-43-page3.txt: [('withinleathercovers', 'within leather covers')]
CUV19051206-V09-47-page2.txt: [('respoitObillirtiesof', 'res poi tO bill ir ties of')]
CUV19051220-V09-49-page3.txt: [('Christiangentlemen', 'Christian gentlemen'), ('showethforthrighteousness', 'showeth forth righteousness')]
CUV19060214-V10-07-page3.txt: [('excellentsafeguard', 'excellent safeguard')]
CUV19060221-V10-08-page2.txt: [('churchorganization', 'church organization')]
CUV19060307-V10-10-page4.txt: [('subscriptionincluding', 'subscription including')]
CUV19060321-V10-12-page3.txt: [('andsingtothepeoplethu', 'and sing to the people thu')]
CUV19060321-V10-12-page4.txt: [('othersconsecrating', 'others consecrating')]
CUV19060328-V10-13-page2.txt: [('expressioncontrary', 'expression contrary')]
CUV19060502-V10-18-page4.txt: [('hislovingheavenlyrather', 'his loving heavenly rather')]
CUV19060516-V10-20-page3.txt: [('thelrefiittitlittidettAffeleilititi', 'the l ref i it tit lit tide t t A f f e l e i l i t i t i')]
CUV19060516-V10-20-page4.txt: [('NOTESFROMTHECONFERENCE', 'NOTES FROM THE CONFERENCE')]
CUV19060523-V10-21-page3.txt: [('conditiondescribed', 'condition described')]
CUV19060801-V10-31-page4.txt: [('WEhavereadwithprofound', 'WE have read with profound')]
CUV19060912-V10-35-page4.txt: [('andwithaneatappearanceinthe', 'and with a neat appearance in the')]
CUV19060919-V10-36-page1.txt: [('forrichirmiginatiOn', 'for rich ir m i g i n a t i O n')]
CUV19061003-V10-38-page1.txt: [('watchEngtheeducation', 'watch Eng the education')]
CUV19061003-V10-38-page3.txt: [('Tabernacleswassheld', 'Tabernacles was s held')]
CUV19061010-V10-39-page1.txt: [('conferencecomposing', 'conference composing')]
CUV19061010-V10-39-page4.txt: [('expressingsympathy', 'expressing sympathy')]
CUV19061212-V10-48-page3.txt: [('deniandedexorbitant', 'den i and ed exorbitant')]
CUV19061226-V10-50-page3.txt: [('traitsoreliaracter', 'traits ore liar act er'), ('greatdisadvantages', 'great disadvantages')]
CUV19070123-V11-04-page4.txt: [('accrediteddelegates', 'accredited delegates')]
CUV19070220-V11-07-page2.txt: [('regularexamination', 'regular examination')]
CUV19070306-V11-09-page3.txt: [('preciousexperiences', 'precious experiences')]
CUV19070313-V11-10-page4.txt: [('whichsheregularlyattended', 'which she regularly attended')]
CUV19070327-V11-12-page2.txt: [('redthatallwhowereinattendance', 'red that all who were in attendance'), ('blessedrichlymally', 'blessed richly mall y')]
CUV19070410-V11-14-page2.txt: [('independenceincreases', 'independence increases')]
CUV19070605-V11-22-page1.txt: [('interestinterwoven', 'interest interwoven')]
CUV19071106-V11-43-page3.txt: [('MOUNTVERNONCOLLEGE', 'MOUNT VERNON COLLEGE')]
CUV19080101-V12-01-page3.txt: [('preparetheitisolves', 'prepare the it i solves')]
CUV19080226-V12-07-page6.txt: [('conflictingpositions', 'conflicting positions')]
CUV19080506-V12-17-page1.txt: [('mathematicaltriumphs', 'mathematical triumphs')]
CUV19080624-V13-24-page6.txt: [('andwillgoforthfromthathomerealiz', 'and will go forth from that home real i z')]
CUV19080826-V13-32-page7.txt: [('ZimmermanMorrisonsCove', 'Zimmerman Morrison s Cove')]
CUV19080909-V13-34-page6.txt: [('withinthetimespecified', 'within the time specified')]
CUV19081007-V13-38-page6.txt: [('thattimenoonehadcomprehended', 'that time no one had comprehended')]
CUV19081028-V13-41-page3.txt: [('becominginterested', 'becoming interested')]
CUV19081028-V13-41-page6.txt: [('brethrenandmyselfhadtheprivilegeof', 'brethren and myself had the privilege of')]
CUV19081111-V13-43-page6.txt: [('PrinceFredricktown', 'Prince Fredrick town')]
CUV19090303-V14-08-page2.txt: [('characteristicthat', 'characteristic that')]
CUV19090505-V14-17-page4.txt: [('standingappointment', 'standing appointment')]
CUV19090519-V14-19-page7.txt: [('ScandinavianCompany', 'Scandinavian Company')]
CUV19090609-V14-21-page6.txt: [('churchschoolteachers', 'church school teachers')]
CUV19090908-V14-34-page8.txt: [('isonhiswaytotheWest', 'is on his way to the West')]
CUV19091013-V14-39-page2.txt: [('missionaryagenelee', 'missionary a gen el e e')]
CUV19091020-V14-40-page8.txt: [('COLUMBIAUNIONCONFERENCE', 'COLUMBIA UNION CONFERENCE'), ('uselessexpenditures', 'useless expenditures')]
CUV19091110-V14-43-page2.txt: [('thoroughlyfurnished', 'thoroughly furnished')]
CUV19091215-V14-48-page8.txt: [('preparedliterature', 'prepared literature')]
CUV19091222-V14-49-page4.txt: [('goodrepresentation', 'good representation')]
CUV19100302-V15-09-page7.txt: [('Weearnestlypraythat', 'We earnestly pray that')]
CUV19100323-V15-12-page8.txt: [('severalconferinneain', 'several confer inn e a i n')]
CUV19100504-V15-18-page1.txt: [('messengersrecognize', 'messengers recognize')]
CUV19100601-V15-22-page8.txt: [('legislationnutside', 'legislation nut side')]
CUV19100817-V15-32-page8.txt: [('faithfulcolporters', 'faithful colporters')]
CUV19100914-V15-36-page5.txt: [('fullrepresentation', 'full representation')]
CUV19101026-V15-42-page6.txt: [('andgreaterblessings', 'and greater blessings')]
CUV19101116-V15-45-page8.txt: [('considerableliterary', 'considerable literary')]
CUV19101214-V15-49-page8.txt: [('greatlyifacilitate', 'greatly i facilitate'), ('MedicalEvangelists', 'Medical Evangelists')]
CUV19110215-V16-07-page3.txt: [('convertinginfluence', 'converting influence')]
CUV19110412-V16-15-page8.txt: [('theWestPhiladelphiastation', 'the West Philadelphia station'), ('Besidesherimmediatefamilysheleavesto', 'Besides her immediate family she leaves to')]
CUV19110531-V16-22-page8.txt: [('OFFICIALORGANOFTHE', 'OFFICIAL ORGAN OF THE')]
CUV19110823-V16-34-page1.txt: [('theirLyoungichildren', 'their L young i children')]
CUV19110913-V16-36-page14.txt: [('ConstitutionoftheOhioConference', 'Constitution of the Ohio Conference')]
CUV19111206-V16-48-page5.txt: [('secretarytreasurer', 'secretary treasurer')]
CUV19120214-V17-07-page2.txt: [('secretarytreasurer', 'secretary treasurer')]
CUV19130305-V18-10-page6.txt: [('surnamedthemselves', 'sur named themselves')]
CUV19130430-V18-18-page3.txt: [('commandmentkeeping', 'commandment keeping')]
CUV19130903-V18-35-page6.txt: [('GerhattiBurlington', 'Ger h att i Burlington')]
CUV19130910-V18-36-page2.txt: [('presentindebtedness', 'present indebtedness')]
CUV19140114-V19-03-page1.txt: [('denominationaldebts', 'denominational debts')]
CUV19140318-V19-12-page2.txt: [('personalsoliciting', 'personal soliciting')]
CUV19140401-V19-14-page7.txt: [('ByMissionaryVolunteer', 'By Missionary Volunteer')]
CUV19140513-V19-20-page12.txt: [('MissionarylCollege', 'Missionary l College')]
CUV19140610-V19-24-page1.txt: [('effectuallyorganized', 'effectually organized')]
CUV19140624-V19-26-page2.txt: [('thirtyfivecounties', 'thirty five counties')]
CUV19140708-V19-28-page4.txt: [('specialcorrespondent', 'special correspondent')]
CUV19140930-V19-39-page2.txt: [('IngatheringReviews', 'Ingathering Reviews')]
CUV19150114-V20-02-page2.txt: [('consideringaccepting', 'considering accepting')]
CUV19150211-V20-06-page4.txt: [('Temperanceperiodical', 'Temperance periodical')]
CUV19150311-V20-10-page5.txt: [('intereststereopticon', 'interest stereopticon')]
CUV19150401-V20-13-page1.txt: [('yourcontinualeffort', 'your continual effort')]
CUV19150415-V20-15-page6.txt: [('secretarytreasurer', 'secretary treasurer')]
CUV19151223-V20-50-page2.txt: [('brethrenandsisters', 'brethren and sisters')]
CUV19160427-V21-17-page8.txt: [('greatgrandchildren', 'great grandchildren')]
CUV19160615-V21-24-page5.txt: [('studentIcolporteurs', 'student I colporteurs')]
CUV19161207-V21-48-page5.txt: [('accountsreceivable', 'accounts receivable')]
CUV19170111-V22-02-page1.txt: [('greatlystrengthened', 'greatly strengthened')]
CUV19170215-V22-07-page1.txt: [('appreciatecitheirtimely', 'appreciate cit heir timely')]
CUV19170301-V22-09-page3.txt: [('InventoriesExpense', 'Inventories Expense')]
CUV19170503-V22-18-page8.txt: [('DoyoueverthinkofChina', 'Do you ever think of China'), ('Buttohealfromplagueandsickness', 'But to heal from plague and sickness')]
CUV19170712-V22-28-page5.txt: [('distinguishbetWeen', 'distinguish betWeen')]
CUV19170712-V22-28-page8.txt: [('IIIIIIIIOOELIEEEDOOM', 'III III II O O ELI E E E D O O M')]
CUV19170809-V22-32-page8.txt: [('RaisingtheLightHigher', 'Raising the Light Higher')]
CUV19171004-V22-39-page8.txt: [('learnbakinginAdventist', 'learn baking in Adventist')]
CUV19171220-V22-50-page2.txt: [('cultivatedtendencies', 'cultivated tendencies')]
CUV19180103-V23-01-page6.txt: [('throwsbitriselcupon', 'throws bit rise l cup on'), ('thatihiMOrdittanee', 'that i hiM Ord it tan e e')]
CUV19180214-V23-07-page2.txt: [('sufferingfellowmen', 'suffering fellowmen')]
CUV19180314-V23-11-page4.txt: [('consciencestricken', 'conscience stricken')]
CUV19180502-V23-18-page8.txt: [('COLUMBIAUNIONVISITOR', 'COLUMBIA UNION VISITOR')]
CUV19180606-V23-23-page2.txt: [('ShenandoahValleyAcademy', 'Shenandoah Valley Academy')]
CUV19180718-V23-29-page3.txt: [('PennsylvanialConference', 'Pennsylvania l Conference')]
CUV19180822-V23-34-page8.txt: [('hadbeenlookingafterthechurchat', 'had been looking after the church at')]
CUV19180905-V23-35-page8.txt: [('advertiseneglected', 'advertise neglected')]
CUV19181024-V23-42-page5.txt: [('Harvestingathering', 'Harvest ingathering')]
CUV19181024-V23-42-page8.txt: [('inproportiontothetimeyoucan', 'in proportion to the time you can')]
CUV19181031-V23-43-page7.txt: [('secretarytreasurer', 'secretary treasurer')]
CUV19181107-V23-44-page5.txt: [('experimentalknowledge', 'experimental knowledge')]
CUV19190109-V24-02s-page4.txt: [('conductedoespecially', 'conducted o especially')]
CUV19190717-V24-29-page4.txt: [('placeinPhiladelphia', 'place in Philadelphia')]
CUV19190717-V24-29-page6.txt: [('betweenfourandfive', 'between four and five')]
CUV19190904-V24-35-page7.txt: [('superintenelentaskink', 'super in ten el en task ink')]
CUV19191211-V24-49-page6.txt: [('intelligentlooking', 'intelligent looking')]
CUV19200101-V25-01-page4.txt: [('HarvestIngatheringin', 'Harvest Ingathering in')]
CUV19200205-V25-06-page10.txt: [('differentdepartments', 'different departments')]
CUV19200520-V25-21-page5.txt: [('thoroughlyinvestigating', 'thoroughly investigating')]
CUV19200902-V25-35-page10.txt: [('EllwangerWorcester', 'Ell wan ger Worcester')]
CUV19201125-V25-47-page8.txt: [('Approvedadvertisements', 'Approved advertisements')]

Check Correction 9

In [41]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CUV/correction9

Average verified rate: 0.9611371116587403

Average of error rates: 0.04662848

Total token count: 6336405

In [42]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:100]
Out[42]:
[('co', 20014),
 ('w', 14213),
 ('e', 13533),
 ('br', 9797),
 ('m', 8888),
 ('d', 7846),
 ('f', 7546),
 ('r', 7538),
 ('g', 7250),
 ("'", 6381),
 ('t', 4920),
 ('gc', 4423),
 ('od', 4168),
 ('n', 3959),
 ('ck', 2988),
 ('pa', 1933),
 ('k', 1760),
 ('mt', 1702),
 ('bf', 1522),
 ('va', 1465),
 ('th', 1134),
 ("canvassers'", 1077),
 ('es', 803),
 ('wm', 675),
 ('u', 612),
 ('pg', 545),
 ('z', 487),
 ("the'", 385),
 ('hm', 365),
 ('sp', 363),
 ('x', 338),
 ('sabbathschool', 332),
 ('reichenbach', 307),
 ('mcelphatrick', 291),
 ('ok', 285),
 ('seventhday', 251),
 ('pp', 250),
 ("colporteurs'", 247),
 ('buttermore', 230),
 ('cc', 208),
 ('bfl', 206),
 ('barto', 204),
 ("to'", 196),
 ('re', 195),
 ('oertley', 189),
 ('wc', 186),
 ('al', 186),
 ('q', 185),
 ('syphers', 182),
 ("''", 180),
 ('ce', 178),
 ('nd', 174),
 ('phila', 172),
 ('tolliver', 169),
 ('charloe', 167),
 ('ca', 156),
 ('pengelly', 156),
 ('da', 155),
 ('dunkinson', 150),
 ("and'", 148),
 ('apsley', 145),
 ('silber', 144),
 ('ti', 142),
 ('ex', 141),
 ('gerhart', 139),
 ('tion', 138),
 ('ga', 134),
 ('midkiff', 132),
 ('ww', 128),
 ('id', 128),
 ('kohr', 126),
 ('harford', 125),
 ('il', 117),
 ("in'", 114),
 ('cd', 114),
 ('zimmerly', 113),
 ('maloney', 110),
 ('-', 109),
 ('eusey', 108),
 ('mahoning', 105),
 ('cabell', 103),
 ('muskingum', 101),
 ('greenspring', 97),
 ('mo', 97),
 ('pickaway', 94),
 ('tt', 93),
 ('bassler', 90),
 ('bentz', 90),
 ("a'", 90),
 ("officers'", 88),
 ('fairhill', 88),
 ('lb', 86),
 ('ia', 86),
 ("colporteur's", 85),
 ('se', 85),
 ('lh', 84),
 ('miscl', 84),
 ('sd', 83),
 ('twentyfive', 82),
 ('monongalia', 82)]
In [ ]: