
Columbia Union Visitor

Overall, the OCR for this title is messy -- it appears that the OCR engine had trouble with the column breaks.

In [1]:
%load_ext autoreload
In [2]:
%autoreload 2
In [3]:
from text2topics import reports
from text2topics import utilities
from text2topics import clean
import re
import os
from os import listdir
from os.path import isfile, join
import collections
In [4]:
%matplotlib inline
In [5]:
wordlist_dir = "/Users/jeriwieringa/Dissertation/drafts/data/word-lists"
wordlists = ["2016-12-07-SDA-last-names.txt", 
In [6]:
spelling_dictionary = utilities.create_spelling_dictionary(wordlist_dir, wordlists)
In [7]:
title = "CUV"
In [8]:
base_dir = "/Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/{}/".format(title)


In [9]:
cycle = 'baseline'
In [10]:
stats = reports.overview_report(join(base_dir, cycle), spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CUV/baseline

Average verified rate: 0.9174753500833346

Average of error rates: 0.09053696

Total token count: 6484099

In [11]:
errors_summary = reports.get_errors_summary( stats )
reports.top_errors( errors_summary, 500 )
[('co', 19949),
 ('w', 14081),
 ('-', 13680),
 ('e', 13268),
 ('br', 9834),
 ('m', 8834),
 ('d', 7654),
 ('f', 7357),
 ('r', 7342),
 ('g', 7131),
 ("'", 6768),
 ('¥', 5964),
 ('re-', 5019),
 ('t', 4720),
 ('gc', 4452),
 ('con-', 4254),
 ('od', 4219),
 ('n', 3875),
 ('ñ', 3845),
 ('in-', 3327),
 ('tion', 3144),
 ('ck', 2989),
 ('be-', 2628),
 ('de-', 2034),
 ('pa', 1917),
 (')', 1884),
 ('ex-', 1788),
 ('k', 1740),
 ('ence', 1724),
 ('com-', 1721),
 ('mt', 1684),
 ('bf', 1518),
 ('en-', 1514),
 ('va', 1463),
 ('confer-', 1379),
 ('ment', 1358),
 ('ference', 1348),
 ('sab-', 1241),
 ('th', 1161),
 ('meet-', 1138),
 ('mis-', 1108),
 ('ad-', 1095),
 ('*', 1094),
 ('pro-', 1069),
 ("canvassers'", 1061),
 ('pre-', 1017),
 ('ers', 1016),
 ('peo-', 994),
 ('_', 970),
 ('at-', 967),
 ('ple', 961),
 ('ber', 922),
 ('ac-', 911),
 ('tions', 910),
 ('es', 892),
 ('un-', 868),
 ('col-', 839),
 ('im-', 818),
 ('dis-', 802),
 ('or-', 798),
 ('mem-', 794),
 ('(', 785),
 ('to-', 785),
 ('per-', 780),
 ('can-', 775),
 ('an-', 697),
 ('inter-', 680),
 ('ap-', 660),
 ('wm', 656),
 ("'the", 652),
 ('mes-', 643),
 ('for-', 636),
 ('ful', 626),
 ('u', 616),
 ('sionary', 611),
 ('ance', 599),
 ('ments', 597),
 ('ary', 588),
 ('al-', 580),
 ('--', 577),
 ('pg', 545),
 ('bers', 545),
 ('ser-', 539),
 ('camp-', 532),
 ('ent', 527),
 ('/', 520),
 ('mission-', 504),
 ('work-', 501)]

Correction 1 -- Check for special character use

In [12]:
[('¥', 5964),
 ('ñ', 3845),
 (')', 1884),
 ('*', 1094),
 ('_', 970),
 ('(', 785),
 ('/', 520),
 ('%', 470),
 ('ña', 312),
 ('ã', 297),
 ('¥¥', 275),
 ('ñthe', 272),
 ('(a)', 227),
 ('(b)', 212),
 ('*two', 196),
 ('=', 186),
 ('(for', 185),
 ('ñselected', 178),
 ('¡', 171),
 ('ñmrs', 148),
 ('`', 141),
 ('ñcom-', 134),
 ('(c)', 127),
 ('•', 126),
 ('+', 115),
 ('ñh', 114),
 ('(to', 110),
 ('ñcoming', 108),
 ('(academia', 102),
 ('(the', 101),
 (']', 90),
 ('ñw', 89),
 ('(columbia', 88),
 ('\\', 84),
 ('conferenceñmission', 84),
 ('(d)', 80),
 ('ñbible', 79),
 ('ñr', 78),
 ('❑', 78),
 ('(concluded)', 75),
 ('(continued', 74),
 ('¥the', 73),
 ('\ufeff', 72),
 ('a)', 72),
 ('ô', 68),
 ('second¥class', 68),
 ('*j', 64),
 ('(colored)', 63),
 ('(a', 63),
 ('¥¥¥', 59),
 ('[entered', 59),
 ('andñ', 59),
 ('continued)', 59),
 ('[', 58),
 ('*barnesville', 56),
 ('ñthat', 56),
 ('*broughton', 55),
 ('ñgreat', 55),
 ('_the', 55),
 ('the¥', 54),
 ('ñf', 54),
 ('the_', 54),
 ('>', 53),
 ('ñdied', 52),
 ('*west', 51),
 ('¤', 50),
 ('*companies', 50),
 ('ñin', 50),
 ('ñj', 49),
 ('*reedsville', 49),
 ('*jackson', 49),
 ('(continued)', 49),
 ('(german)', 48),
 ('-¥', 46),
 ('ñb', 45),
 ('ñto', 45),
 ('seventh¥day', 45),
 ('ñif', 45),
 ('ñe', 44),
 ('*a', 44),
 ('(e)', 44),
 ('ñreview', 43),
 ('wantedña', 43),
 ('ñvol', 42),
 ('*week', 41),
 ('(and', 41),
 ('(or', 41),
 ('—', 41),
 ('ý', 40),
 ('¥and', 40),
 ('¥of', 37),
 ('ñelder', 37),
 ('*gilboa', 37),
 ('¥¥¥¥', 36),
 ('*conant', 35),
 ('camp¥meeting', 35),
 ("'¥", 34),
 ('ñwe', 34),
 ('ohio)', 33),
 ('ñand', 33)]
In [13]:
# %load shared_elements/
prev = "baseline"
cycle = "correction1"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    # Substitute for all other dashes
    content = re.sub(r"—-—–‑", r"-", content)

    # Substitute formatted apostrophe
    content = re.sub(r"\’\’\‘\'\‛\´", r"'", content)
    # Replace all special characters with a space (as these tend to occur at the end of lines)
    content = re.sub(r"[^a-zA-Z0-9\s,.!?$:;\-&\'\"]", r" ", content)
    with open(join(directories['cycle'], filename), mode="w") as o:

Check Correction 1

In [14]:
# %load shared_elements/
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CUV/correction1

Average verified rate: 0.9234446884710339

Average of error rates: 0.08394623999999999

Total token count: 6471517

In [15]:
# %load shared_elements/
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
[('co', 19981),
 ('w', 14250),
 ('-', 14205),
 ('e', 13469),
 ('br', 9842),
 ('m', 8888),
 ('d', 7814),
 ('r', 7520),
 ('f', 7507),
 ('g', 7228),
 ("'", 7009),
 ('re-', 5029),
 ('t', 4825),
 ('gc', 4454),
 ('con-', 4258),
 ('od', 4229),
 ('n', 3927),
 ('in-', 3334),
 ('tion', 3152),
 ('ck', 2990),
 ('be-', 2632),
 ('de-', 2045),
 ('pa', 1925),
 ('com-', 1858),
 ('ex-', 1795),
 ('k', 1758),
 ('ence', 1726),
 ('mt', 1690),
 ('bf', 1519),
 ('en-', 1517),
 ('va', 1467),
 ('confer-', 1380),
 ('ment', 1362),
 ('ference', 1353),
 ('sab-', 1246),
 ('th', 1182),
 ('meet-', 1140),
 ('mis-', 1123),
 ('ad-', 1095),
 ('pro-', 1070),
 ("canvassers'", 1062),
 ('ers', 1021),
 ('pre-', 1019),
 ('peo-', 995),
 ('at-', 969),
 ('ple', 962),
 ('ber', 927),
 ('tions', 915),
 ('ac-', 913),
 ('es', 902)]

Correction 2 -- Fix line endings

In [16]:
# %load shared_elements/
prev = cycle
cycle = "correction2"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    content = re.sub(r"(\w+)(\-\s{1,})([a-z]+)", r"\1\3", content)

    with open(join(directories['cycle'], filename), mode="w") as o:

Check Correction 2

In [17]:
# %load shared_elements/
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CUV/correction2

Average verified rate: 0.9530517833206363

Average of error rates: 0.05505456

Total token count: 6332232

In [18]:
# %load shared_elements/
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
[('co', 19976),
 ('w', 14245),
 ('-', 14088),
 ('e', 13462),
 ('br', 9842),
 ('m', 8882),
 ('d', 7809),
 ('r', 7510),
 ('f', 7493),
 ('g', 7221),
 ("'", 7009),
 ('t', 4811),
 ('gc', 4454),
 ('od', 4224),
 ('n', 3924),
 ('ck', 2990),
 ('pa', 1926),
 ('k', 1757),
 ('mt', 1690),
 ('bf', 1519),
 ('va', 1467),
 ('th', 1181),
 ("canvassers'", 1100),
 ('es', 823),
 ('wm', 671),
 ("'the", 657),
 ('--', 633),
 ('u', 631),
 ('pg', 545),
 ('-the', 491),
 ('z', 481),
 ("the'", 396),
 ('sp', 371),
 ('hm', 362),
 ("'of", 347),
 ('sabbathschool', 329),
 ('x', 324),
 ('reichenbach', 307),
 ('ok', 292),
 ('mcelphatrick', 291),
 ('-of', 288),
 ('pp', 249),
 ('seventhday', 247),
 ("colporteurs'", 247),
 ("'and", 246),
 ('-and', 236),
 ('-to', 234),
 ('buttermore', 230),
 ('al', 222),
 ('-a', 218)]

Correction 3 -- Remove extra quotation characters

In [19]:
# %load shared_elements/
prev = cycle
cycle = "correction3"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    corrections = []
    for token in tokens:
        token_list = list(token)
        last_char = token_list[-1]

        if last_char is "'":
            if len(token) > 1:
                if token_list[-2] is 's' or 'S':
                    corrections.append((token, re.sub(r"'", r"", token)))
        elif token[0] is "'":
            corrections.append((token, re.sub(r"'", r"", token)))   
    if len(corrections) > 0:
#         print('{}: {}'.format(filename, corrections))

        for correction in corrections:
            content = clean.replace_pair(correction, content)

    with open(join(directories['cycle'], filename), mode="w") as o:

Correction 4 -- Address extra dashes

In [20]:
# %load shared_elements/
prev = cycle
cycle = "correction4"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    replacements = []
    for token in tokens:
        if token[0] is "-":
            replacements.append((token, token[1:]))
        elif token[-1] is "-":
            replacements.append((token, token[:-1]))
    if len(replacements) > 0:
#         print("{}: {}".format(filename, replacements))
        for replacement in replacements:
            content = clean.replace_pair(replacement, content)

    with open(join(directories['cycle'], filename), mode="w") as o:

Check Correction 4

In [21]:
# %load shared_elements/
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CUV/correction4

Average verified rate: 0.9602815412218036

Average of error rates: 0.047562560000000004

Total token count: 6340528

In [22]:
# %load shared_elements/
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
[('co', 20599),
 ('w', 14345),
 ('e', 13710),
 ('br', 9883),
 ('m', 8952),
 ('d', 7884),
 ('r', 7616),
 ('f', 7574),
 ('g', 7285),
 ("'", 6382),
 ('t', 4989),
 ('gc', 4462),
 ('od', 4234),
 ('n', 4006),
 ('ck', 2995),
 ('pa', 1946),
 ('k', 1779),
 ('mt', 1704),
 ('bf', 1522),
 ('va', 1472),
 ('th', 1236),
 ("canvassers'", 1079),
 ('es', 848),
 ('wm', 676),
 ('u', 656),
 ('pg', 545),
 ('z', 489),
 ('re', 474),
 ("the'", 385),
 ('sp', 375),
 ('hm', 365),
 ('x', 339),
 ('sabbathschool', 332),
 ('reichenbach', 307),
 ('ok', 297),
 ('mcelphatrick', 291),
 ('al', 257),
 ('pp', 253),
 ('seventhday', 251),
 ("colporteurs'", 247),
 ('buttermore', 230),
 ('nd', 226),
 ('cc', 213),
 ('ce', 210),
 ('bfl', 206),
 ('barto', 204),
 ("to'", 196),
 ('oertley', 189),
 ('ca', 187),
 ('wc', 186)]

Correction 5 -- Address Burst Words

In [23]:
# %load shared_elements/
prev = cycle
cycle = "correction5"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    pattern = re.compile("(\s(\w{1,2}\s){5,})")
    replacements = []
    clean.check_splits(pattern, spelling_dictionary, content, replacements)
    if len(replacements) > 0:
#         print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_pair(replacement, content)

    with open(join(directories['cycle'], filename), mode="w") as o:

Check Correction 5

In [24]:
# %load shared_elements/
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CUV/correction5

Average verified rate: 0.9602856941881112

Average of error rates: 0.047561599999999996

Total token count: 6340486

In [25]:
# %load shared_elements/
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
[('co', 20599),
 ('w', 14340),
 ('e', 13687),
 ('br', 9883),
 ('m', 8946),
 ('d', 7880),
 ('r', 7597),
 ('f', 7572),
 ('g', 7280),
 ("'", 6382),
 ('t', 4968),
 ('gc', 4462),
 ('od', 4234),
 ('n', 3996),
 ('ck', 2995),
 ('pa', 1946),
 ('k', 1776),
 ('mt', 1704),
 ('bf', 1522),
 ('va', 1472),
 ('th', 1236),
 ("canvassers'", 1079),
 ('es', 848),
 ('wm', 676),
 ('u', 653),
 ('pg', 545),
 ('z', 489),
 ('re', 475),
 ("the'", 385),
 ('sp', 375),
 ('hm', 365),
 ('x', 339),
 ('sabbathschool', 332),
 ('reichenbach', 307),
 ('ok', 297),
 ('mcelphatrick', 291),
 ('al', 256),
 ('pp', 253),
 ('seventhday', 251),
 ("colporteurs'", 247),
 ('buttermore', 230),
 ('nd', 226),
 ('cc', 213),
 ('ce', 210),
 ('bfl', 206),
 ('barto', 204),
 ("to'", 196),
 ('oertley', 189),
 ('ca', 187),
 ('wc', 186)]

Correction 6 -- Address Split Words I

In [26]:
# %load shared_elements/
prev = cycle
cycle = "correction6"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    errors = reports.identify_errors(tokens, spelling_dictionary)

    replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=False)
    if len(replacements) > 0:
#         print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_split_words(replacement, content)

    with open(join(directories['cycle'], filename), mode="w") as o:

Check Correction 6

In [27]:
# %load shared_elements/
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CUV/correction6

Average verified rate: 0.9608287928828794

Average of error rates: 0.04697536

Total token count: 6337844

In [28]:
# %load shared_elements/
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
[('co', 20033),
 ('w', 14323),
 ('e', 13528),
 ('br', 9806),
 ('m', 8931),
 ('d', 7855),
 ('f', 7567),
 ('r', 7546),
 ('g', 7259),
 ("'", 6382),
 ('t', 4932),
 ('gc', 4423),
 ('od', 4173),
 ('n', 3966),
 ('ck', 2989),
 ('pa', 1934),
 ('k', 1764),
 ('mt', 1703),
 ('bf', 1522),
 ('va', 1466),
 ('th', 1150),
 ("canvassers'", 1079),
 ('es', 826),
 ('wm', 675),
 ('u', 647),
 ('pg', 545),
 ('z', 488),
 ("the'", 385),
 ('hm', 365),
 ('sp', 363),
 ('x', 338),
 ('sabbathschool', 332),
 ('reichenbach', 307),
 ('mcelphatrick', 291),
 ('ok', 287),
 ('re', 281),
 ('seventhday', 251),
 ('pp', 251),
 ("colporteurs'", 247),
 ('buttermore', 230),
 ('nd', 221),
 ('cc', 213),
 ('bfl', 206),
 ('barto', 204),
 ('al', 203),
 ('ce', 201),
 ("to'", 196),
 ('oertley', 189),
 ('wc', 186),
 ('q', 185)]

Correction 7 -- Address Split Words II

In [29]:
# %load shared_elements/
prev = cycle
cycle = "correction7"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    errors = reports.identify_errors(tokens, spelling_dictionary)

    replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=True)
    if len(replacements) > 0:
#         print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_split_words(replacement, content)

    with open(join(directories['cycle'], filename), mode="w") as o:

Check Correction 7

In [30]:
# %load shared_elements/
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CUV/correction7

Average verified rate: 0.9610930375444612

Average of error rates: 0.04669136

Total token count: 6336141

In [31]:
# %load shared_elements/
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
[('co', 20014),
 ('w', 14213),
 ('e', 13523),
 ('br', 9797),
 ('m', 8886),
 ('d', 7845),
 ('f', 7543),
 ('r', 7538),
 ('g', 7249),
 ("'", 6381),
 ('t', 4915),
 ('gc', 4423),
 ('od', 4168),
 ('n', 3956),
 ('ck', 2988),
 ('pa', 1933),
 ('k', 1760),
 ('mt', 1702),
 ('bf', 1522),
 ('va', 1465),
 ('th', 1134),
 ("canvassers'", 1077),
 ('es', 803),
 ('wm', 675),
 ('u', 611),
 ('pg', 545),
 ('z', 486),
 ("the'", 385),
 ('hm', 365),
 ('sp', 363),
 ('x', 338),
 ('sabbathschool', 332),
 ('reichenbach', 307),
 ('mcelphatrick', 291),
 ('ok', 285),
 ('seventhday', 251),
 ('pp', 250),
 ("colporteurs'", 247),
 ('buttermore', 230),
 ('cc', 208),
 ('bfl', 206),
 ('barto', 204),
 ("to'", 196),
 ('re', 195),
 ('oertley', 189),
 ('wc', 186),
 ('al', 186),
 ('q', 185),
 ('syphers', 182),
 ("''", 180)]

Survey remaining errors

Get docs with high error rate

In [32]:
messy_docs = reports.docs_with_high_error_rate( summary, min_error_rate = .2 )
In [33]:
docs_2_check = [x[0] for x in messy_docs if x[1] > 0.3]
In [34]:
In [35]:
# utilities.open_original_docs(docs_2_check, directories['cycle'])

The documents with high error rates are tables with canvasser information and images.

Get long errors

In [36]:
reports.long_errors(errors_summary, min_length=15)
  • mommemmommmmmummmemmmem and other variations with long strings of "m". remove these to avoid noise. Locate a series of "m"s within a token -- findall(r'([m+]{2,})') -- and check the length of the results -- if len(findall) > 3.

  • Split of long words will be difficult as there seems to be a combination of spelling errors and conjoined words.

Correction 8 -- Remove long error tokens

In [37]:
# %load shared_elements/
prev = cycle
cycle = "correction8"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    sub_list = ["m|M", "i|I"]
    replacements = []
    for sub in sub_list:
        replacements.append(clean.check_for_repeating_characters(tokens, sub))
    replacements = [item for sublist in replacements for item in sublist]
    if len(replacements) > 0:
        print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    with open(join(directories['cycle'], filename), mode="w") as o:
CUV19060613-V10-24-page4.txt: [('Ammummamontommok', ' ')]
CUV19160921-V21-37-page8.txt: [('EIMIIIIIIIIII', ' ')]
CUV19160928-V21-38-page8.txt: [('HONSICIIIIIIMEIROMMINIIIIMMIll', ' ')]
CUV19161005-V21-39-page8.txt: [('OMMEMMOMOMMOMMOMMUMMEMN', ' ')]
CUV19170215-V22-07-page8.txt: [('INIMMEMMEMMEMMEMMEM', ' '), ('MMINIMMUMMEMEMMEMMUMMINIMME', ' '), ('MUMMINIMMINIMMUMMEMMUMMOIMMS', ' '), ('imummummummummummum', ' ')]
CUV19170308-V22-10-page8.txt: [('MIMIIIIIIIMMIIIIIIM', ' ')]
CUV19170322-V22-12-page8.txt: [('INIIIIIMIIIIIIIMIll', ' ')]
CUV19170329-V22-13-page8.txt: [('IIMIIIIIIIIMIEM', ' ')]
CUV19170705-V22-27-page8.txt: [('MIIIIIIIIIIIIIIIII', ' ')]
CUV19170719-V22-29-page8.txt: [('MMIMUNINIIIIIIIIIIIIIIN', ' ')]
CUV19170809-V22-32-page8.txt: [('MINIMMEMEMEMEMEMEMMMUMMEMO', ' '), ('immumminsmummummainsmsnm', ' '), ('mummiumusummunismammiums', ' '), ('AMMENUMMEMEMEMEMMENMEMMEM', ' '), ('moommummummummummumummis', ' ')]
CUV19170823-V22-34-page8.txt: [('MORMIIIIIIIIIIMIIIIII', ' '), ('ESIMENEMEMMIIIIIIIIIIII', ' ')]
CUV19170913-V22-36-page8.txt: [('MUMMIMMUUMMOMMEMMEMEMMOMMH', ' '), ('ROMMERMEMINIMMEMMOMMEMMINIM', ' '), ('mommummumummensimmum', ' '), ('MINIMMINIMMIMMEMMMEMOIMMENUM', ' '), ('MUMEMMUMMIIMUMMORMOMUMMENE', ' '), ('MMUUMMUUMMEMEMUSEMOMMINIMM', ' '), ('mniumposommiummmummint', ' ')]
CUV19171004-V22-39-page8.txt: [('milsommummummom', ' '), ('MUMMINIUMNSUMUMMOMMEM', ' '), ('UNUMMEMMEMOMMEMOMMEMEM', ' '), ('MMOMMEMUMOMMOMMOMMEMM', ' '), ('moimmommommummmumm', ' '), ('imminsimmummommumm', ' ')]
CUV19171018-V22-41-page8.txt: [('lommuntimmunimmumminmemma', ' ')]
CUV19171025-V22-42-page8.txt: [('MMINIMEINIMIIIIIIIIII', ' ')]
CUV19171101-V22-43-page8.txt: [('MEMEMERIVIIIIIIIIII', ' ')]
CUV19171129-V22-47-page8.txt: [('INIIIMINEIMIIIIIIMMOWNIPIRET', ' ')]
CUV19171206-V22-48-page8.txt: [('EIIIIIIIIIMIIIIIMINI', ' ')]
CUV19180103-V23-01-page8.txt: [('EIBININIIIIIIIIIIIE', ' ')]
CUV19180110-V23-02-page8.txt: [('MMIIIIRMIIIIIIII', ' ')]
CUV19180124-V23-04-page8.txt: [('mAnummommummin', ' ')]
CUV19180228-V23-09-page8.txt: [('MINIMMINIMMEMMMIMEMMEMMEMOM', ' '), ('MMINIMMEMEMMINIMMEEMMEMMEMM', ' '), ('mmimmemummomminummisummom', ' '), ('mommommiummommmmommiumwm', ' '), ('mmummomummismursammmemm', ' '), ('minummumommummilmimummamm', ' '), ('manummommammimmismiammis', ' '), ('MMEMEMEMOMEMMEMMMEEMMOIMMIN', ' '), ('Imminmemsmommommimummilms', ' ')]
CUV19200408-V25-15-page5.txt: [('JIIIIIIIIIIITI', ' ')]
CUV19201028-V25-43-page1.txt: [('mmullonommummummmuung', ' ')]

Check Correction 8

In [38]:
# %load shared_elements/
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CUV/correction8

Average verified rate: 0.961115032269591

Average of error rates: 0.046653120000000006

Total token count: 6335996

In [39]:
# %load shared_elements/
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
[('co', 20014),
 ('w', 14213),
 ('e', 13523),
 ('br', 9797),
 ('m', 8886),
 ('d', 7845),
 ('f', 7543),
 ('r', 7538),
 ('g', 7249),
 ("'", 6381),
 ('t', 4915),
 ('gc', 4423),
 ('od', 4168),
 ('n', 3956),
 ('ck', 2988),
 ('pa', 1933),
 ('k', 1760),
 ('mt', 1702),
 ('bf', 1522),
 ('va', 1465),
 ('th', 1134),
 ("canvassers'", 1077),
 ('es', 803),
 ('wm', 675),
 ('u', 611),
 ('pg', 545),
 ('z', 486),
 ("the'", 385),
 ('hm', 365),
 ('sp', 363),
 ('x', 338),
 ('sabbathschool', 332),
 ('reichenbach', 307),
 ('mcelphatrick', 291),
 ('ok', 285),
 ('seventhday', 251),
 ('pp', 250),
 ("colporteurs'", 247),
 ('buttermore', 230),
 ('cc', 208),
 ('bfl', 206),
 ('barto', 204),
 ("to'", 196),
 ('re', 195),
 ('oertley', 189),
 ('wc', 186),
 ('al', 186),
 ('q', 185),
 ('syphers', 182),
 ("''", 180)]

Correction 9 -- Separate squashed words

In [40]:
# %load shared_elements/
import pandas as pd
from math import log

prev = cycle
cycle = "correction9"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

verified_tokens = []

for filename in corpus:  
    content = utilities.readfile(directories['prev'], filename)
    clean.get_approved_tokens(content, spelling_dictionary, verified_tokens)

tokens_with_freq = dict(collections.Counter(verified_tokens))
words = pd.DataFrame(list(tokens_with_freq.items()), columns=['token','freq'])
words_sorted = words.sort_values('freq', ascending=False)
words_sorted_short = words_sorted[words_sorted.freq > 2]

sorted_list_of_words = list(words_sorted_short['token'])

wordcost = dict((k, log((i+1)*log(len(sorted_list_of_words)))) for i,k in enumerate(sorted_list_of_words))
maxword = max(len(x) for x in sorted_list_of_words)

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    text = utilities.strip_punct(content)
    tokens = utilities.tokenize_text(text)
    replacements = []
    for token in tokens:
        if not token.lower() in spelling_dictionary:
            if len(token) > 17:
                if"[\-\-\'\"]", token):
                    split_string = clean.infer_spaces(token, wordcost, maxword)
                    list_split_string = split_string.split()
                    if clean.verify_split_string(list_split_string, spelling_dictionary):
                        replacements.append((token, split_string))
    if len(replacements) > 0:
        print("{}: {}".format(filename, replacements))
        for replacement in replacements:
            content = clean.replace_pair(replacement, content)

    with open(join(directories['cycle'], filename), mode="w") as o:
CUV19030114-V07-01-page1.txt: [('faithfulthatpromised', 'faithful that promised')]
CUV19030225-V07-04-page1.txt: [('valuableassistance', 'valuable assistance')]
CUV19030520-V07-10-page4.txt: [('therealpossessionoftherealinher', 'the real possession of the real in her')]
CUV19030916-V07-25-page3.txt: [('hundredvisitseachweek', 'hundred visits each week')]
CUV19031104-V07-31-page3.txt: [('surroundingcountry', 'surrounding country')]
CUV19031216-V07-37-page1.txt: [('whentheworldarounduswillgive', 'when the world around us will give')]
CUV19040203-V08-05-page4.txt: [('Atthehomeofthebride', 'At the home of the bride')]
CUV19040217-V08-07-page3.txt: [('ourspiritualindolence', 'our spiritual indolence'), ('followingquotation', 'following quotation')]
CUV19040330-V08-13-page1.txt: [('delinquentchurches', 'delinquent churches')]
CUV19040413-V08-15-page2.txt: [('muchdifferentmotive', 'much different motive')]
CUV19040518-V08-20-page2.txt: [('couldconscientiously', 'could conscientiously')]
CUV19040629-V08-26-page2.txt: [('provirigtelaialvenderful', 'pro vi rig tel ai a l ven der f u l')]
CUV19040706-V08-27-page1.txt: [('Everydollarwillaid', 'Every dollar will aid')]
CUV19040706-V08-27-page3.txt: [('seriousconsideration', 'serious consideration')]
CUV19040713-V08-28-page2.txt: [('MethodistEpiscopal', 'Methodist Episcopal')]
CUV19040824-V08-33-page2.txt: [('frointheinconvenience', 'fro in the inconvenience')]
CUV19041123-V08-45-page1.txt: [('mecordeertainlyshows', 'me cor deer tain l y shows')]
CUV19050322-V09-12-page2.txt: [('traitstruthfulness', 'traits truthfulness')]
CUV19050405-V09-14-page4.txt: [('affectionatefather', 'affectionate father')]
CUV19050719-V09-29-page3.txt: [('tellingyouthowtheLord', 'telling y out how the Lord')]
CUV19050802-V09-31-page2.txt: [('tilliunrighteoustiess', 'till i unrighteous ties s')]
CUV19050830-V09-34-page2.txt: [('statementcalculated', 'statement calculated')]
CUV19051011-V09-39-page2.txt: [('Encounteringopposition', 'En counter ing opposition')]
CUV19051108-V09-43-page3.txt: [('withinleathercovers', 'within leather covers')]
CUV19051206-V09-47-page2.txt: [('respoitObillirtiesof', 'res poi tO bill ir ties of')]
CUV19051220-V09-49-page3.txt: [('Christiangentlemen', 'Christian gentlemen'), ('showethforthrighteousness', 'showeth forth righteousness')]
CUV19060214-V10-07-page3.txt: [('excellentsafeguard', 'excellent safeguard')]
CUV19060221-V10-08-page2.txt: [('churchorganization', 'church organization')]
CUV19060307-V10-10-page4.txt: [('subscriptionincluding', 'subscription including')]
CUV19060321-V10-12-page3.txt: [('andsingtothepeoplethu', 'and sing to the people thu')]
CUV19060321-V10-12-page4.txt: [('othersconsecrating', 'others consecrating')]
CUV19060328-V10-13-page2.txt: [('expressioncontrary', 'expression contrary')]
CUV19060502-V10-18-page4.txt: [('hislovingheavenlyrather', 'his loving heavenly rather')]
CUV19060516-V10-20-page3.txt: [('thelrefiittitlittidettAffeleilititi', 'the l ref i it tit lit tide t t A f f e l e i l i t i t i')]
CUV19060523-V10-21-page3.txt: [('conditiondescribed', 'condition described')]
CUV19060801-V10-31-page4.txt: [('WEhavereadwithprofound', 'WE have read with profound')]
CUV19060912-V10-35-page4.txt: [('andwithaneatappearanceinthe', 'and with a neat appearance in the')]
CUV19060919-V10-36-page1.txt: [('forrichirmiginatiOn', 'for rich ir m i g i n a t i O n')]
CUV19061003-V10-38-page1.txt: [('watchEngtheeducation', 'watch Eng the education')]
CUV19061003-V10-38-page3.txt: [('Tabernacleswassheld', 'Tabernacles was s held')]
CUV19061010-V10-39-page1.txt: [('conferencecomposing', 'conference composing')]
CUV19061010-V10-39-page4.txt: [('expressingsympathy', 'expressing sympathy')]
CUV19061212-V10-48-page3.txt: [('deniandedexorbitant', 'den i and ed exorbitant')]
CUV19061226-V10-50-page3.txt: [('traitsoreliaracter', 'traits ore liar act er'), ('greatdisadvantages', 'great disadvantages')]
CUV19070123-V11-04-page4.txt: [('accrediteddelegates', 'accredited delegates')]
CUV19070220-V11-07-page2.txt: [('regularexamination', 'regular examination')]
CUV19070306-V11-09-page3.txt: [('preciousexperiences', 'precious experiences')]
CUV19070313-V11-10-page4.txt: [('whichsheregularlyattended', 'which she regularly attended')]
CUV19070327-V11-12-page2.txt: [('redthatallwhowereinattendance', 'red that all who were in attendance'), ('blessedrichlymally', 'blessed richly mall y')]
CUV19070410-V11-14-page2.txt: [('independenceincreases', 'independence increases')]
CUV19070605-V11-22-page1.txt: [('interestinterwoven', 'interest interwoven')]
CUV19080101-V12-01-page3.txt: [('preparetheitisolves', 'prepare the it i solves')]
CUV19080226-V12-07-page6.txt: [('conflictingpositions', 'conflicting positions')]
CUV19080506-V12-17-page1.txt: [('mathematicaltriumphs', 'mathematical triumphs')]
CUV19080624-V13-24-page6.txt: [('andwillgoforthfromthathomerealiz', 'and will go forth from that home real i z')]
CUV19080826-V13-32-page7.txt: [('ZimmermanMorrisonsCove', 'Zimmerman Morrison s Cove')]
CUV19080909-V13-34-page6.txt: [('withinthetimespecified', 'within the time specified')]
CUV19081007-V13-38-page6.txt: [('thattimenoonehadcomprehended', 'that time no one had comprehended')]
CUV19081028-V13-41-page3.txt: [('becominginterested', 'becoming interested')]
CUV19081028-V13-41-page6.txt: [('brethrenandmyselfhadtheprivilegeof', 'brethren and myself had the privilege of')]
CUV19081111-V13-43-page6.txt: [('PrinceFredricktown', 'Prince Fredrick town')]
CUV19090303-V14-08-page2.txt: [('characteristicthat', 'characteristic that')]
CUV19090505-V14-17-page4.txt: [('standingappointment', 'standing appointment')]
CUV19090519-V14-19-page7.txt: [('ScandinavianCompany', 'Scandinavian Company')]
CUV19090609-V14-21-page6.txt: [('churchschoolteachers', 'church school teachers')]
CUV19090908-V14-34-page8.txt: [('isonhiswaytotheWest', 'is on his way to the West')]
CUV19091013-V14-39-page2.txt: [('missionaryagenelee', 'missionary a gen el e e')]
CUV19091020-V14-40-page8.txt: [('COLUMBIAUNIONCONFERENCE', 'COLUMBIA UNION CONFERENCE'), ('uselessexpenditures', 'useless expenditures')]
CUV19091110-V14-43-page2.txt: [('thoroughlyfurnished', 'thoroughly furnished')]
CUV19091215-V14-48-page8.txt: [('preparedliterature', 'prepared literature')]
CUV19091222-V14-49-page4.txt: [('goodrepresentation', 'good representation')]
CUV19100302-V15-09-page7.txt: [('Weearnestlypraythat', 'We earnestly pray that')]
CUV19100323-V15-12-page8.txt: [('severalconferinneain', 'several confer inn e a i n')]
CUV19100504-V15-18-page1.txt: [('messengersrecognize', 'messengers recognize')]
CUV19100601-V15-22-page8.txt: [('legislationnutside', 'legislation nut side')]
CUV19100817-V15-32-page8.txt: [('faithfulcolporters', 'faithful colporters')]
CUV19100914-V15-36-page5.txt: [('fullrepresentation', 'full representation')]
CUV19101026-V15-42-page6.txt: [('andgreaterblessings', 'and greater blessings')]
CUV19101116-V15-45-page8.txt: [('considerableliterary', 'considerable literary')]
CUV19101214-V15-49-page8.txt: [('greatlyifacilitate', 'greatly i facilitate'), ('MedicalEvangelists', 'Medical Evangelists')]
CUV19110215-V16-07-page3.txt: [('convertinginfluence', 'converting influence')]
CUV19110412-V16-15-page8.txt: [('theWestPhiladelphiastation', 'the West Philadelphia station'), ('Besidesherimmediatefamilysheleavesto', 'Besides her immediate family she leaves to')]
CUV19110823-V16-34-page1.txt: [('theirLyoungichildren', 'their L young i children')]
CUV19110913-V16-36-page14.txt: [('ConstitutionoftheOhioConference', 'Constitution of the Ohio Conference')]
CUV19111206-V16-48-page5.txt: [('secretarytreasurer', 'secretary treasurer')]
CUV19120214-V17-07-page2.txt: [('secretarytreasurer', 'secretary treasurer')]
CUV19130305-V18-10-page6.txt: [('surnamedthemselves', 'sur named themselves')]
CUV19130430-V18-18-page3.txt: [('commandmentkeeping', 'commandment keeping')]
CUV19130903-V18-35-page6.txt: [('GerhattiBurlington', 'Ger h att i Burlington')]
CUV19130910-V18-36-page2.txt: [('presentindebtedness', 'present indebtedness')]
CUV19140114-V19-03-page1.txt: [('denominationaldebts', 'denominational debts')]
CUV19140318-V19-12-page2.txt: [('personalsoliciting', 'personal soliciting')]
CUV19140401-V19-14-page7.txt: [('ByMissionaryVolunteer', 'By Missionary Volunteer')]
CUV19140513-V19-20-page12.txt: [('MissionarylCollege', 'Missionary l College')]
CUV19140610-V19-24-page1.txt: [('effectuallyorganized', 'effectually organized')]
CUV19140624-V19-26-page2.txt: [('thirtyfivecounties', 'thirty five counties')]
CUV19140708-V19-28-page4.txt: [('specialcorrespondent', 'special correspondent')]
CUV19140930-V19-39-page2.txt: [('IngatheringReviews', 'Ingathering Reviews')]
CUV19150114-V20-02-page2.txt: [('consideringaccepting', 'considering accepting')]
CUV19150211-V20-06-page4.txt: [('Temperanceperiodical', 'Temperance periodical')]
CUV19150311-V20-10-page5.txt: [('intereststereopticon', 'interest stereopticon')]
CUV19150401-V20-13-page1.txt: [('yourcontinualeffort', 'your continual effort')]
CUV19150415-V20-15-page6.txt: [('secretarytreasurer', 'secretary treasurer')]
CUV19151223-V20-50-page2.txt: [('brethrenandsisters', 'brethren and sisters')]
CUV19160427-V21-17-page8.txt: [('greatgrandchildren', 'great grandchildren')]
CUV19160615-V21-24-page5.txt: [('studentIcolporteurs', 'student I colporteurs')]
CUV19161207-V21-48-page5.txt: [('accountsreceivable', 'accounts receivable')]
CUV19170111-V22-02-page1.txt: [('greatlystrengthened', 'greatly strengthened')]
CUV19170215-V22-07-page1.txt: [('appreciatecitheirtimely', 'appreciate cit heir timely')]
CUV19170301-V22-09-page3.txt: [('InventoriesExpense', 'Inventories Expense')]
CUV19170503-V22-18-page8.txt: [('DoyoueverthinkofChina', 'Do you ever think of China'), ('Buttohealfromplagueandsickness', 'But to heal from plague and sickness')]
CUV19170712-V22-28-page5.txt: [('distinguishbetWeen', 'distinguish betWeen')]
CUV19170712-V22-28-page8.txt: [('IIIIIIIIOOELIEEEDOOM', 'III III II O O ELI E E E D O O M')]
CUV19170809-V22-32-page8.txt: [('RaisingtheLightHigher', 'Raising the Light Higher')]
CUV19171004-V22-39-page8.txt: [('learnbakinginAdventist', 'learn baking in Adventist')]
CUV19171220-V22-50-page2.txt: [('cultivatedtendencies', 'cultivated tendencies')]
CUV19180103-V23-01-page6.txt: [('throwsbitriselcupon', 'throws bit rise l cup on'), ('thatihiMOrdittanee', 'that i hiM Ord it tan e e')]
CUV19180214-V23-07-page2.txt: [('sufferingfellowmen', 'suffering fellowmen')]
CUV19180314-V23-11-page4.txt: [('consciencestricken', 'conscience stricken')]
CUV19180606-V23-23-page2.txt: [('ShenandoahValleyAcademy', 'Shenandoah Valley Academy')]
CUV19180718-V23-29-page3.txt: [('PennsylvanialConference', 'Pennsylvania l Conference')]
CUV19180822-V23-34-page8.txt: [('hadbeenlookingafterthechurchat', 'had been looking after the church at')]
CUV19180905-V23-35-page8.txt: [('advertiseneglected', 'advertise neglected')]
CUV19181024-V23-42-page5.txt: [('Harvestingathering', 'Harvest ingathering')]
CUV19181024-V23-42-page8.txt: [('inproportiontothetimeyoucan', 'in proportion to the time you can')]
CUV19181031-V23-43-page7.txt: [('secretarytreasurer', 'secretary treasurer')]
CUV19181107-V23-44-page5.txt: [('experimentalknowledge', 'experimental knowledge')]
CUV19190109-V24-02s-page4.txt: [('conductedoespecially', 'conducted o especially')]
CUV19190717-V24-29-page4.txt: [('placeinPhiladelphia', 'place in Philadelphia')]
CUV19190717-V24-29-page6.txt: [('betweenfourandfive', 'between four and five')]
CUV19190904-V24-35-page7.txt: [('superintenelentaskink', 'super in ten el en task ink')]
CUV19191211-V24-49-page6.txt: [('intelligentlooking', 'intelligent looking')]
CUV19200101-V25-01-page4.txt: [('HarvestIngatheringin', 'Harvest Ingathering in')]
CUV19200205-V25-06-page10.txt: [('differentdepartments', 'different departments')]
CUV19200520-V25-21-page5.txt: [('thoroughlyinvestigating', 'thoroughly investigating')]
CUV19200902-V25-35-page10.txt: [('EllwangerWorcester', 'Ell wan ger Worcester')]
CUV19201125-V25-47-page8.txt: [('Approvedadvertisements', 'Approved advertisements')]

Check Correction 9

In [41]:
# %load shared_elements/
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CUV/correction9

Average verified rate: 0.9611371116587403

Average of error rates: 0.04662848

Total token count: 6336405

In [42]:
# %load shared_elements/
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:100]
[('co', 20014),
 ('w', 14213),
 ('e', 13533),
 ('br', 9797),
 ('m', 8888),
 ('d', 7846),
 ('f', 7546),
 ('r', 7538),
 ('g', 7250),
 ("'", 6381),
 ('t', 4920),
 ('gc', 4423),
 ('od', 4168),
 ('n', 3959),
 ('ck', 2988),
 ('pa', 1933),
 ('k', 1760),
 ('mt', 1702),
 ('bf', 1522),
 ('va', 1465),
 ('th', 1134),
 ("canvassers'", 1077),
 ('es', 803),
 ('wm', 675),
 ('u', 612),
 ('pg', 545),
 ('z', 487),
 ("the'", 385),
 ('hm', 365),
 ('sp', 363),
 ('x', 338),
 ('sabbathschool', 332),
 ('reichenbach', 307),
 ('mcelphatrick', 291),
 ('ok', 285),
 ('seventhday', 251),
 ('pp', 250),
 ("colporteurs'", 247),
 ('buttermore', 230),
 ('cc', 208),
 ('bfl', 206),
 ('barto', 204),
 ("to'", 196),
 ('re', 195),
 ('oertley', 189),
 ('wc', 186),
 ('al', 186),
 ('q', 185),
 ('syphers', 182),
 ("''", 180),
 ('ce', 178),
 ('nd', 174),
 ('phila', 172),
 ('tolliver', 169),
 ('charloe', 167),
 ('ca', 156),
 ('pengelly', 156),
 ('da', 155),
 ('dunkinson', 150),
 ("and'", 148),
 ('apsley', 145),
 ('silber', 144),
 ('ti', 142),
 ('ex', 141),
 ('gerhart', 139),
 ('tion', 138),
 ('ga', 134),
 ('midkiff', 132),
 ('ww', 128),
 ('id', 128),
 ('kohr', 126),
 ('harford', 125),
 ('il', 117),
 ("in'", 114),
 ('cd', 114),
 ('zimmerly', 113),
 ('maloney', 110),
 ('-', 109),
 ('eusey', 108),
 ('mahoning', 105),
 ('cabell', 103),
 ('muskingum', 101),
 ('greenspring', 97),
 ('mo', 97),
 ('pickaway', 94),
 ('tt', 93),
 ('bassler', 90),
 ('bentz', 90),
 ("a'", 90),
 ("officers'", 88),
 ('fairhill', 88),
 ('lb', 86),
 ('ia', 86),
 ("colporteur's", 85),
 ('se', 85),
 ('lh', 84),
 ('miscl', 84),
 ('sd', 83),
 ('twentyfive', 82),
 ('monongalia', 82)]
In [ ]: