HR-OCR-Evaluation-and-Correction

In [1]:
%load_ext autoreload
In [2]:
%autoreload 2
In [3]:
from text2topics import reports
from text2topics import utilities
from text2topics import clean
import re
import os
from os import listdir
from os.path import isfile, join
import collections
In [4]:
%matplotlib inline
In [5]:
wordlist_dir = "/Users/jeriwieringa/Dissertation/drafts/data/word-lists"
wordlists = ["2016-12-07-SDA-last-names.txt", 
             "2016-12-07-SDA-place-names.txt", 
             "2016-12-08-SDA-Vocabulary.txt", 
             "2017-01-03-place-names.txt", 
             "2017-02-14-Base-Word-List-SCOWL&KJV.txt",
             "2017-02-14-Roman-Numerals.txt",
             "2017-03-01-Additional-Approved-Words.txt"
            ]
In [6]:
spelling_dictionary = utilities.create_spelling_dictionary(wordlist_dir, wordlists)
In [7]:
title = "HR"
In [8]:
base_dir = "/Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/{}/".format(title)

Baseline

In [9]:
cycle = 'baseline'
In [10]:
stats = reports.overview_report(join(base_dir, cycle), spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/HR/baseline

Average verified rate: 0.9381916199487721

Average of error rates: 0.07917422477146442

Total token count: 14244185

In [11]:
errors_summary = reports.get_errors_summary( stats )
reports.top_errors( errors_summary, 500 )
Out[11]:
[('ñ', 30042),
 ('m', 15023),
 ('-', 14921),
 ('¥', 11779),
 ("'", 11243),
 ('d', 11089),
 ('tion', 9128),
 ('e', 8622),
 ('in-', 7649),
 ('con-', 7490),
 ('re-', 7387),
 ('t', 6864),
 ('w', 6692),
 ('f', 6658),
 ('r', 6358),
 (')', 6073),
 ('co', 5713),
 ('n', 5464),
 ('ex-', 4718),
 ('de-', 4140),
 ('be-', 4091),
 ('ò', 4035),
 ('ment', 3835),
 ('dis-', 3773),
 ('pm', 3748),
 ('ó', 3558),
 ('com-', 3444),
 ('(', 3333),
 ('g', 3071),
 ('ñthe', 2992),
 ('pro-', 2715),
 ('tions', 2609),
 ('un-', 2592),
 ('per-', 2445),
 ('¡', 2260),
 ('*', 2073),
 ('ña', 2049),
 ('im-', 2045),
 ('en-', 1973),
 ('pre-', 1951),
 ('al-', 1711),
 ('ac-', 1652),
 ('ap-', 1646),
 ('th', 1631),
 ('ments', 1583),
 ('sub-', 1572),
 ('ful', 1506),
 ('u', 1499),
 ('ad-', 1368),
 ('/', 1340),
 ('ble', 1324),
 ('k', 1306),
 ('ous', 1265),
 ('_', 1261),
 ('x', 1212),
 ('an-', 1115),
 ('ers', 1112),
 ('at-', 1073),
 ('to-', 1046),
 ('di-', 1036),
 ('ture', 1015),
 ('ence', 1008),
 ('--', 982),
 ('ical', 975),
 ('some-', 974),
 ('ance', 948),
 ('treat-', 927),
 ('ob-', 892),
 ('pa-', 879),
 ('ple', 858),
 ('re', 823),
 ('ity', 822),
 ('dren', 808),
 ('or-', 805),
 ('ent', 805),
 ('with-', 801),
 ('chil-', 799),
 ('fol-', 782),
 ('ure', 747),
 ('sup-', 742),
 ('mo', 738),
 ('%', 726),
 ('ex', 694),
 ('over-', 694),
 ('tem-', 688),
 ('sur-', 680),
 ('z', 679),
 ('ber', 677),
 ('inter-', 676),
 ('gen-', 658),
 ('ab-', 649),
 ('eral', 643),
 ('pa', 642),
 ('em-', 640),
 ('ly', 637),
 ('tive', 619),
 ('par-', 616),
 ('ar-', 602),
 ('pos-', 602),
 ('stom-', 602),
 ('ach', 585),
 ('ma-', 585),
 ('es-', 582),
 ('tle', 580),
 ('hun-', 577),
 ('tem', 576),
 ('condi-', 574),
 ('ô', 570),
 ('ñit', 563),
 ('=', 562),
 ('tained', 557),
 ('for-', 554),
 ('physi-', 543),
 ('¥¥', 540),
 ('as-', 539),
 ('man-', 538),
 ('under-', 537),
 ("an'", 537),
 ('pur-', 534),
 ('sys-', 531),
 ('exer-', 529),
 ("'s", 529),
 ('duced', 525),
 ('can-', 524),
 ('them-', 523),
 ('cer-', 519),
 ('•', 516),
 ('hy-', 504),
 ('quently', 504),
 ('thor-', 502)]

Review Special Characters

In [12]:
reports.tokens_with_special_characters(errors_summary)[:150]
Out[12]:
[('ñ', 30042),
 ('¥', 11779),
 (')', 6073),
 ('ò', 4035),
 ('ó', 3558),
 ('(', 3333),
 ('ñthe', 2992),
 ('¡', 2260),
 ('*', 2073),
 ('ña', 2049),
 ('/', 1340),
 ('_', 1261),
 ('%', 726),
 ('ô', 570),
 ('ñit', 563),
 ('=', 562),
 ('¥¥', 540),
 ('•', 516),
 ('**', 489),
 ('ã', 484),
 (']', 482),
 ('ñsel', 469),
 ('(m)', 440),
 ('ñan', 421),
 ('ñin', 416),
 ('\\', 391),
 ('(the', 374),
 ('ñdr', 366),
 ('(to', 350),
 ('õ', 334),
 ('ñwe', 323),
 ('***', 312),
 ('(illustrated)', 311),
 ('ñthis', 304),
 ('ñand', 277),
 ('¥¥¥', 258),
 ('ñno', 258),
 ('`', 254),
 ('(new)', 251),
 ('ñj', 250),
 ('ñthere', 235),
 ('(a', 234),
 ('(see', 230),
 ('o¡', 218),
 ('ñmrs', 218),
 ('(poetry)', 217),
 ('ñhow', 206),
 ('(and', 194),
 ('(or', 191),
 ('(no', 177),
 ('ñone', 176),
 ('ñthat', 174),
 ('ñto', 165),
 ('ñi', 165),
 ('(which', 163),
 ('ñm', 161),
 ('+', 157),
 ('[', 153),
 ('(bulk', 151),
 ('ñprof', 147),
 ('(new', 144),
 ('(not', 144),
 ('(in', 140),
 ('\ufeff', 139),
 ('(fig', 139),
 ('[the', 135),
 ('ñaccording', 134),
 ('-¥', 134),
 ('(w)', 133),
 ('^', 133),
 ('(france)', 131),
 ('ñfor', 131),
 ('ñif', 131),
 ('ñall', 126),
 ("¥'", 123),
 ('ñe', 122),
 ('¥¥¥¥', 122),
 ('ñc', 119),
 ('(for', 117),
 ('(as', 116),
 ('ñas', 113),
 ('<', 113),
 ('#', 113),
 ('donõt', 112),
 ('(concluded', 112),
 ('(nos', 109),
 ('nõt', 109),
 ('—', 108),
 ('”', 108),
 ('ñs', 107),
 ('ñw', 107),
 ('(a)', 105),
 ('ñat', 104),
 ('i)', 104),
 ('¥-', 104),
 ('(i', 104),
 ('ñh', 101),
 ('ñnot', 95),
 ('ñwhen', 95),
 ("'¥", 93),
 ('ñyes', 93),
 ('ñsome', 93),
 ('ñr', 92),
 ('ñtake', 92),
 ('(mich', 91),
 ('(medicinal)', 90),
 ('(b)', 89),
 ('addressñ', 85),
 ('£', 85),
 ('¥¥¥¥¥', 84),
 ('__', 81),
 ('(i)', 79),
 ('ñt', 79),
 ('ñn', 79),
 ('ñthese', 79),
 ('(if', 78),
 ('(library', 76),
 ('ñen', 76),
 ('(with', 76),
 ('ñwhat', 75),
 ('manõs', 75),
 ('(one', 75),
 ('ñby', 74),
 ('ñbut', 74),
 ('style)', 74),
 ('*n', 71),
 ('(c', 71),
 ('ñf', 70),
 ('ñmr', 69),
 ('ñl', 69),
 ('(f', 68),
 ('|', 66),
 ('¥n', 66),
 ('(c)', 65),
 ('o)', 65),
 ('>', 64),
 ('ofñ', 64),
 ('only)', 63),
 ('ñnew', 63),
 ('¥¥¥¥¥¥¥¥', 62),
 ('ñex', 60),
 ('“', 59),
 ('ñtwo', 58),
 ('womanõs', 58),
 ('(illus', 57),
 ('ñis', 57),
 ('¥-¥', 57),
 ('the¥', 57),
 ('ñmany', 56),
 ('ñof', 56)]

Correction 1 -- Replace "õ" with "'"

In [13]:
# %load shared_elements/replace_accented_o.py
prev = "baseline"
cycle = "correction1"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    with open(join(directories['prev'], filename)) as f:
        content = f.read()
        
    content = re.sub(r"(\w+)(õ|Õ)", r"\1'", content)
    
    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [14]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/HR/correction1

Average verified rate: 0.9383412950618094

Average of error rates: 0.07905354902312242

Total token count: 14244185

In [15]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[15]:
[('ñ', 30042),
 ('m', 15023),
 ('-', 14921),
 ('¥', 11779),
 ("'", 11245),
 ('d', 11089),
 ('tion', 9128),
 ('e', 8622),
 ('in-', 7649),
 ('con-', 7490),
 ('re-', 7387),
 ('t', 6864),
 ('w', 6692),
 ('f', 6658),
 ('r', 6358),
 (')', 6073),
 ('co', 5713),
 ('n', 5464),
 ('ex-', 4718),
 ('de-', 4140),
 ('be-', 4091),
 ('ò', 4035),
 ('ment', 3835),
 ('dis-', 3773),
 ('pm', 3748),
 ('ó', 3558),
 ('com-', 3444),
 ('(', 3333),
 ('g', 3071),
 ('ñthe', 2992),
 ('pro-', 2715),
 ('tions', 2609),
 ('un-', 2592),
 ('per-', 2445),
 ('¡', 2260),
 ('*', 2073),
 ('ña', 2049),
 ('im-', 2045),
 ('en-', 1973),
 ('pre-', 1951),
 ('al-', 1711),
 ('ac-', 1652),
 ('ap-', 1646),
 ('th', 1631),
 ('ments', 1583),
 ('sub-', 1572),
 ('ful', 1506),
 ('u', 1499),
 ('ad-', 1368),
 ('/', 1340)]

Correction 2 -- Normalize Characters

In [16]:
# %load shared_elements/normalize_characters.py
prev = "correction1"
cycle = "correction2"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    # Substitute for all other dashes
    content = re.sub(r"—-—–‑", r"-", content)

    # Substitute formatted apostrophe
    content = re.sub(r"\’\’\‘\'\‛\´", r"'", content)
    
    # Replace all special characters with a space (as these tend to occur at the end of lines)
    content = re.sub(r"[^a-zA-Z0-9\s,.!?$:;\-&\'\"]", r" ", content)
    
    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [17]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/HR/correction2

Average verified rate: 0.9474700623950211

Average of error rates: 0.06657375873812511

Total token count: 14185587

In [18]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[18]:
[('-', 16542),
 ('m', 15895),
 ("'", 12059),
 ('d', 11280),
 ('tion', 9178),
 ('e', 9129),
 ('in-', 7668),
 ('con-', 7499),
 ('t', 7473),
 ('re-', 7402),
 ('w', 7034),
 ('f', 7023),
 ('r', 6884),
 ('n', 5856),
 ('co', 5797),
 ('ex-', 4725),
 ('de-', 4151),
 ('be-', 4103),
 ('ment', 3847),
 ('dis-', 3776),
 ('pm', 3757),
 ('com-', 3446),
 ('g', 3224),
 ('pro-', 2720),
 ('tions', 2617),
 ('un-', 2601),
 ('per-', 2451),
 ('im-', 2045),
 ('en-', 1988),
 ('pre-', 1951),
 ('al-', 1715),
 ('th', 1682),
 ('ap-', 1663),
 ('ac-', 1662),
 ('ments', 1591),
 ('sub-', 1573),
 ('u', 1549),
 ('ful', 1511),
 ('k', 1435),
 ('ad-', 1369),
 ('ble', 1342),
 ('x', 1294),
 ('ous', 1266),
 ('--', 1229),
 ('an-', 1118),
 ('ers', 1118),
 ('at-', 1074),
 ('to-', 1052),
 ('di-', 1036),
 ('ture', 1024)]

Correction 3 -- Fix Line Endings

In [19]:
# %load shared_elements/correct_line_endings.py
prev = "correction2"
cycle = "correction3"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    content = re.sub(r"(\w+)(\-\s{1,})([a-z]+)", r"\1\3", content)

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [20]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/HR/correction3

Average verified rate: 0.9741713414931088

Average of error rates: 0.041146666069188036

Total token count: 13922442

In [21]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[21]:
[('-', 16407),
 ('m', 15881),
 ("'", 12059),
 ('d', 11272),
 ('e', 9110),
 ('t', 7413),
 ('w', 7028),
 ('f', 7003),
 ('r', 6863),
 ('n', 5849),
 ('co', 5770),
 ('pm', 3757),
 ('g', 3217),
 ('th', 1681),
 ('u', 1545),
 ('k', 1431),
 ('x', 1293),
 ('--', 1229),
 ('tion', 1003),
 ('re', 876),
 ('ex', 773),
 ('z', 764),
 ('mo', 747),
 ('pa', 659),
 ('sel', 655),
 ("an'", 558),
 ("'s", 544),
 ('lb', 500),
 ('pp', 475),
 ('al', 471),
 ('ment', 447),
 ('oz', 445),
 ('wm', 354),
 ('mc', 339),
 ('ti', 325),
 ('re-', 318),
 ('q', 304),
 ('---', 298),
 ('tions', 293),
 ('ft', 277),
 ("'t", 272),
 ('se', 270),
 ('ro', 266),
 ('io', 254),
 ('es', 253),
 ('un', 247),
 ('oo', 244),
 ('pt', 241),
 ("'the", 241),
 ('mt', 240)]

Correction 4 -- Remove Extra Dashes

In [22]:
# %load shared_elements/remove_extra_dashes.py
prev = "correction3"
cycle = "correction4"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    replacements = []
    for token in tokens:
        if token[0] is "-":
            replacements.append((token, token[1:]))
            
        elif token[-1] is "-":
            replacements.append((token, token[:-1]))
        else:
            pass
        
    if len(replacements) > 0:
#         print("{}: {}".format(filename, replacements))
        
        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [23]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/HR/correction4

Average verified rate: 0.9767487018539854

Average of error rates: 0.03690231224233734

Total token count: 13925244

In [24]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[24]:
[('m', 15989),
 ("'", 12451),
 ('d', 11322),
 ('e', 9321),
 ('t', 7609),
 ('f', 7097),
 ('r', 7093),
 ('w', 7092),
 ('n', 5918),
 ('co', 5858),
 ('pm', 3761),
 ('g', 3281),
 ('th', 1690),
 ('u', 1555),
 ('k', 1463),
 ('x', 1343),
 ('re', 1277),
 ('tion', 1006),
 ('ex', 937),
 ('z', 832),
 ('mo', 760),
 ('oz', 677),
 ('pa', 676),
 ('sel', 665),
 ("an'", 558),
 ("'s", 544),
 ('lb', 533),
 ('al', 520),
 ('pp', 475),
 ('ment', 449),
 ('-', 424),
 ('wm', 355),
 ('ti', 355),
 ('mc', 351),
 ('q', 316),
 ('un', 313),
 ('tions', 293),
 ('se', 287),
 ('ft', 286),
 ('pre', 274),
 ("'t", 274),
 ('es', 273),
 ('ro', 272),
 ('io', 259),
 ("''", 250),
 ('pt', 244),
 ('oo', 244),
 ('mt', 243),
 ("'the", 241),
 ('ry', 226)]

Correction 5 -- Remove extra quotation marks

In [25]:
# %load shared_elements/replace_extra_quotation_marks.py
prev = "correction4"
cycle = "correction5"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    corrections = []
    for token in tokens:
        token_list = list(token)
        last_char = token_list[-1]

        if last_char is "'":
            if len(token) > 1:
                if token_list[-2] is 's' or 'S':
                    pass
                else:
                    corrections.append((token, re.sub(r"'", r"", token)))
            else:
                pass
        elif token[0] is "'":
            corrections.append((token, re.sub(r"'", r"", token)))   
        else:
            pass
    
    if len(corrections) > 0:
#         print('{}: {}'.format(filename, corrections))

        for correction in corrections:
            content = clean.replace_pair(correction, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [26]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/HR/correction5

Average verified rate: 0.9771528524967492

Average of error rates: 0.03617063989962359

Total token count: 13925327

In [27]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[27]:
[('m', 16115),
 ('d', 11423),
 ("'", 11285),
 ('e', 9354),
 ('t', 8191),
 ('r', 7147),
 ('f', 7111),
 ('w', 7107),
 ('n', 6132),
 ('co', 5859),
 ('pm', 3762),
 ('g', 3283),
 ('th', 1692),
 ('u', 1561),
 ('k', 1466),
 ('x', 1344),
 ('re', 1331),
 ('tion', 1006),
 ('ex', 940),
 ('z', 836),
 ('mo', 760),
 ('pa', 677),
 ('oz', 677),
 ('sel', 665),
 ("an'", 555),
 ('lb', 536),
 ('al', 525),
 ('pp', 475),
 ('ment', 450),
 ('-', 426),
 ('ti', 358),
 ('wm', 355),
 ('mc', 351),
 ('em', 325),
 ('q', 319),
 ('un', 316),
 ('tions', 293),
 ('ft', 291),
 ('se', 287),
 ('es', 276),
 ('pre', 274),
 ('ro', 273),
 ('io', 259),
 ('oo', 249),
 ('pt', 244),
 ('mt', 244),
 ('il', 227),
 ('ry', 226),
 ('li', 222),
 ('ll', 221)]

Correction 6 -- Rejoin Burst Words

In [28]:
# %load shared_elements/rejoin_burst_words.py
prev = "correction5"
cycle = "correction6"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    pattern = re.compile("(\s(\w{1,2}\s){5,})")
    
    replacements = []
    clean.check_splits(pattern, spelling_dictionary, content, replacements)
    
    if len(replacements) > 0:
        print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
HR18660801-V01-01-page1.txt: [('\nT e r m s ', 'Terms')]
HR18660801-V01-01-page15.txt: [('\nT e r m s ', 'Terms')]
HR18660801-V01-01-page21.txt: [('\nC e l e r y ', 'Celery')]
HR18660801-V01-01-page7.txt: [('\nC e l e r y ', 'Celery')]
HR18660901-V01-02-page1.txt: [(' G e s e n i u s ', 'Gesenius'), (' h a m a h ', 'hamah')]
HR18660901-V01-02-page12.txt: [(' R e f o r m e r ', 'Reformer')]
HR18660901-V01-02-page13.txt: [(' W a n t e d ', 'Wanted')]
HR18660901-V01-02-page14.txt: [(' Ca n r ig h t ', 'Canright')]
HR18660901-V01-02-page16.txt: [('\nDe f ic ie n t ', 'Deficient')]
HR18660901-V01-02-page2.txt: [(' B u t t e r ', 'Butter'), (' B u t t e r ', 'Butter')]
HR18660901-V01-02-page3.txt: [(' R e f o r m e r ', 'Reformer')]
HR18661001-V01-03-page1.txt: [('\nT e r m s ', 'Terms')]
HR18661101-V01-04-page1.txt: [(' E D IT O R ', 'EDITOR'), ('\nT e r m s ', 'Terms')]
HR18661101-V01-04-page4.txt: [(' r e f o r m e r ', 'reformer')]
HR18661201-V01-05-page1.txt: [(' E D I T O R ', 'EDITOR'), ('\nT e r m s ', 'Terms'), ('\nHe a l t h ', 'Health')]
HR18661201-V01-05-page11.txt: [(' L a m s o n ', 'Lamson')]
HR18661201-V01-05-page12.txt: [('\nW e a l t h ', 'Wealth')]
HR18661201-V01-05-page7.txt: [(' d e c a y\n', 'decay'), (' f o r e v e r ', 'forever'), ('\nP o r c i n e ', 'Porcine')]
HR18670101-V01-06-page16.txt: [(' I n d i v i d u a l s ', 'Individuals'), ('\nA d v e r t i s e m e n t s ', 'Advertisements')]
HR18670201-V01-07-page1.txt: [(' E D I T O R ', 'EDITOR'), ('\nT e r m s ', 'Terms'), (' W i l l i s ', 'Willis')]
HR18670201-V01-07-page16.txt: [('\nA d v e r t is e m e n t s ', 'Advertisements'), ('On', 'On')]
HR18670301-V01-08-page1.txt: [(' w a t e r ', 'water')]
HR18670301-V01-08-page10.txt: [('\nB r e a t h i n g ', 'Breathing'), ('\nCa l m n e s s ', 'Calmness')]
HR18670301-V01-08-page11.txt: [(' h a v i n g ', 'having'), (' s i s s y ', 'sissy')]
HR18670301-V01-08-page6.txt: [(' c o w a n ', 'cowan')]
HR18670301-V01-08-page9.txt: [('\nA b o u t ', 'About')]
HR18670401-V01-09-page1.txt: [(' E D IT O R ', 'EDITOR'), ('\nT e r m s ', 'Terms')]
HR18670401-V01-09-page14.txt: [('\nI n d o l e n c e ', 'Indolence')]
HR18670401-V01-09-page7.txt: [('\nN a t u r e ', 'Nature')]
HR18670401-V01-09-page9.txt: [('\nS c o l d i n g ', 'Scolding')]
HR18670501-V01-10-page1.txt: [(' e d i t o r ', 'editor'), ('\nT e r m s ', 'Terms')]
HR18670501-V01-10-page10.txt: [(' L o n d o n ', 'London')]
HR18670501-V01-10-page11.txt: [('\nR e p e n t a n c e ', 'Repentance')]
HR18670501-V01-10-page12.txt: [(' W i l l i a m s ', 'Williams')]
HR18670501-V01-10-page4.txt: [('\nP e r h a p s ', 'Perhaps')]
HR18670501-V01-10-page5.txt: [('\nA v e r y ', 'Avery'), ('\nE v e r t ', 'Evert')]
HR18670501-V01-10-page7.txt: [('\nU n d e r ', 'Under')]
HR18670601-V01-11,12-page1.txt: [(' e d i t o r ', 'editor'), ('\nT e r m s ', 'Terms')]
HR18670601-V01-11,12-page12.txt: [('\nB e s id e s ', 'Besides')]
HR18670601-V01-11,12-page19.txt: [('\nN o t h i n g\n', 'Nothing')]
HR18670601-V01-11,12-page22.txt: [(' c a n r i g h t ', 'canright')]
HR18670601-V01-11,12-page28.txt: [(' Sl e e p W a l k in g ', 'SleepWalking')]
HR18670601-V01-11,12-page29.txt: [(' B o d ie s ', 'Bodies')]
HR18670601-V01-11,12-page3.txt: [('\nH E A L T H\n', 'HEALTH'), ('It', 'It')]
HR18670601-V01-11,12-page8.txt: [(' r e f o r m e r ', 'reformer')]
HR18670701-V02-01-page1.txt: [('\nT e r m s ', 'Terms')]
HR18670701-V02-01-page12.txt: [('\nG e o r g e ', 'George'), (' T r a i n ', 'Train')]
HR18670701-V02-01-page14.txt: [(' r h o d a ', 'rhoda')]
HR18670701-V02-01-page15.txt: [('\nN o t h i n g ', 'Nothing'), ('\nC r u e l ', 'Cruel')]
HR18670701-V02-01-page16.txt: [('\nEx c u r s io n ', 'Excursion')]
HR18670701-V02-01-page6.txt: [(' m e a c h a m ', 'meacham')]
HR18670701-V02-01-page8.txt: [('\nQ u i t e\n', 'Quite')]
HR18681001-V03-04-page16.txt: [('It', 'It')]
HR18690401-V03-10-page6.txt: [('Ir', 'Ir')]
HR18711001-V06-04-page22.txt: [('If', 'If')]
HR18711201-V06-06-page31.txt: [('It', 'It')]
HR18730101-V08-01-page5.txt: [('Or', 'Or')]
HR18740801-V09-08-page11.txt: [('It', 'It')]
HR18740801-V09-08-page2.txt: [('If', 'If'), ('In', 'In')]
HR18750501-V10-05-page12.txt: [('If', 'If')]
HR18750601-V10-06-page16.txt: [('It', 'It')]
HR18751001-V10-10-page17.txt: [('Is', 'Is'), ('Do', 'Do')]
HR18760501-V11-05-page14.txt: [('If', 'If')]
HR18770901-V12-09-page11.txt: [('He', 'He')]
HR18790101-V14-01-page29.txt: [('If', 'If')]
HR18790101-V14-01-page6.txt: [('If', 'If')]
HR18790201-V14-02-page1.txt: [('It', 'It')]
HR18790701-V14-07-page1.txt: [('It', 'It')]
HR18800501-V15-05-page1.txt: [('It', 'It')]
HR18800501-V15-05-page26.txt: [('As', 'As')]
HR18800701-V15-07-page14.txt: [('Is', 'Is')]
HR18800701-V15-07-page32.txt: [('Ii', 'Ii')]
HR18800801-V15-08-page6.txt: [('At', 'At')]
HR18801001-V15-10-page1.txt: [('It', 'It'), ('It', 'It'), ('It', 'It')]
HR18801201-V15-12-page40.txt: [('At', 'At')]
HR18810101-V16-01-page1.txt: [('It', 'It')]
HR18810401-V16-04-page17.txt: [('Be', 'Be')]
HR18810401-V16-04-page32.txt: [('Go', 'Go'), ('He', 'He'), ('Go', 'Go'), ('He', 'He')]
HR18810501-V16-05-page14.txt: [('\nO u t s id e ', 'Outside')]
HR18810501-V16-05-page27.txt: [(' J e r o m e ', 'Jerome')]
HR18810501-V16-05-page3.txt: [(' w h i c h ', 'which')]
HR18810501-V16-05-page4.txt: [('\nU n d e r ', 'Under'), (' T o b a c c o ', 'Tobacco'), (' A m a d o n ', 'Amadon')]
HR18810501-V16-05-page7.txt: [('\nA n y t h in g ', 'Anything')]
HR18810601-V16-06-page14.txt: [('\nL e a r n\n', 'Learn'), ('\nW A I T I N G ', 'WAITING'), ('\nC l a r a\n', 'Clara')]
HR18810601-V16-06-page23.txt: [('\nW i t h i n ', 'Within')]
HR18810601-V16-06-page25.txt: [(' M o r e l ', 'Morel')]
HR18810601-V16-06-page30.txt: [('\nW h e r e a s ', 'Whereas')]
HR18810601-V16-06-page31.txt: [('An', 'An')]
HR18810701-V16-07-page16.txt: [(' g o u g h ', 'gough')]
HR18810701-V16-07-page26.txt: [('\nA l m o s t ', 'Almost')]
HR18810801-V16-08-page30.txt: [(' H e r a l d ', 'Herald')]
HR18810801-V16-08-page34.txt: [(' S T Y L E ', 'STYLE')]
HR18810901-V16-09-page16.txt: [('\nW o u ld s t ', 'Wouldst')]
HR18810901-V16-09-page19.txt: [(' Sy l v e s t e r ', 'Sylvester')]
HR18810901-V16-09-page32.txt: [('\nW a n t e d ', 'Wanted')]
HR18811001-V16-10-page1.txt: [(' c o n c l u d e d ', 'concluded')]
HR18811001-V16-10-page11.txt: [('\nCa u s e s ', 'Causes')]
HR18811001-V16-10-page12.txt: [('\nTr e a t me n t ', 'Treatment')]
HR18811001-V16-10-page18.txt: [('\nA b o v e ', 'Above')]
HR18811001-V16-10-page32.txt: [('Go', 'Go'), ('He', 'He'), ('Go', 'Go')]
HR18811001-V16-10-page5.txt: [('\nP r o b a b l y ', 'Probably')]
HR18811101-V16-11-page25.txt: [(' L i v e r m o r e ', 'Livermore'), (' C a r p e n t e r ', 'Carpenter')]
HR18811201-V16-12-page12.txt: [('\nN e a r l y ', 'Nearly')]
HR18811201-V16-12-page22.txt: [('\nS o m e b o d y ', 'Somebody')]
HR18811201-V16-12-page23.txt: [(' H e a l t h ', 'Health')]
HR18811201-V16-12-page24.txt: [('\nU n d e r\n', 'Under')]
HR18811201-V16-12-page26.txt: [('\nCh ie f l y ', 'Chiefly')]
HR18811201-V16-12-page29.txt: [(' St e a r n s ', 'Stearns'), (' Ch il\nt o n ', 'Chilton'), ('No', 'No'), ('Am', 'Am')]
HR18811201-V16-12-page30.txt: [('\nE v e r y ', 'Every'), ('Go', 'Go'), ('He', 'He')]
HR18811201-V16-12-page33.txt: [('Go', 'Go'), ('Go', 'Go'), (' H e a l t h ', 'Health'), (' F a r m e r ', 'Farmer'), ('Go', 'Go'), ('Go', 'Go')]
HR18811201-V16-12-page34.txt: [(' S T Y L E ', 'STYLE')]
HR18811201-V16-12-page36.txt: [(' d is a d v a n ta g e s ', 'disadvantages')]
HR18811201-V16-12-page38.txt: [(' W h i t e ', 'White'), ('\nM e s s e n g e r\n', 'Messenger'), (' t w e n t y ', 'twenty')]
HR18811201-V16-12-page39.txt: [(' W h i t e ', 'White')]
HR18811201-V16-12-page40.txt: [(' S A L I S B U R Y ', 'SALISBURY'), (' H Y G I E N I C ', 'HYGIENIC'), (' C O R S E T ', 'CORSET')]
HR18811201-V16-12-page5.txt: [('It', 'It')]
HR18820101-V17-01-page13.txt: [(' F e l i x ', 'Felix'), (' O s w a l d ', 'Oswald')]
HR18820101-V17-01-page14.txt: [('\nT h o u g h ', 'Though')]
HR18820101-V17-01-page19.txt: [(' H u t c h i n s ', 'Hutchins')]
HR18820101-V17-01-page31.txt: [('Gu', 'Gu')]
HR18820101-V17-01-page32.txt: [('Go', 'Go')]
HR18820201-V17-02-page13.txt: [(' K e l l o g g ', 'Kellogg')]
HR18820201-V17-02-page18.txt: [('Go', 'Go')]
HR18820201-V17-02-page19.txt: [(' H e a l t h ', 'Health')]
HR18820201-V17-02-page2.txt: [('To', 'To')]
HR18820201-V17-02-page9.txt: [('\nS y m p t o m s ', 'Symptoms')]
HR18820301-V17-03-page18.txt: [(' G i b s o n ', 'Gibson')]
HR18820301-V17-03-page23.txt: [('\nT h e r e ', 'There')]
HR18820301-V17-03-page31.txt: [('\nR e a d i n g ', 'Reading')]
HR18820301-V17-03-page7.txt: [('\nSy m p t o m s ', 'Symptoms')]
HR18820401-V17-04-page14.txt: [(' S P R I N G ', 'SPRING')]
HR18820401-V17-04-page28.txt: [(' H e a l t h ', 'Health')]
HR18820401-V17-04-page32.txt: [('Go', 'Go')]
HR18820501-V17-05-page20.txt: [('\nT h o s e ', 'Those')]
HR18820501-V17-05-page22.txt: [('Ar', 'Ar')]
HR18820501-V17-05-page30.txt: [(' H e a l t h ', 'Health')]
HR18820501-V17-05-page7.txt: [('\nP o m e r a n i a ', 'Pomerania'), ('\nP r u s s i a ', 'Prussia'), ('\nSa x o n y ', 'Saxony'), ('\nBa v a r ia ', 'Bavaria'), (' Co u n t r ie s ', 'Countries'), ('\nR u s s i a ', 'Russia'), ('\nI r e l a n d ', 'Ireland'), (' E n g l a n d ', 'England'), (' B e l g i u m ', 'Belgium')]
HR18820601-V17-06-page17.txt: [('\nT h e r e ', 'There')]
HR18820601-V17-06-page31.txt: [(' R e s o u r c e s ', 'Resources')]
HR18820701-V17-07-page14.txt: [(' Pa r k e r ', 'Parker')]
HR18820701-V17-07-page29.txt: [('\nC a u s e s ', 'Causes'), ('\nT h e s e ', 'These'), ('\nT r e a t m e n t ', 'Treatment')]
HR18820801-V17-08-page14.txt: [('\nPr a c t ic e\n', 'Practice')]
HR18820801-V17-08-page22.txt: [('\nG r e a t ', 'Great')]
HR18820801-V17-08-page23.txt: [('\nA c u t e ', 'Acute'), ('\nT h e r e ', 'There')]
HR18820801-V17-08-page24.txt: [(' A y e r s ', 'Ayers')]
HR18820801-V17-08-page25.txt: [('\nS i n c e ', 'Since')]
HR18820801-V17-08-page27.txt: [(' H e a l t h ', 'Health')]
HR18820801-V17-08-page31.txt: [('In', 'In')]
HR18820801-V17-08-page6.txt: [(' W il l ia m ', 'William')]
HR18820901-V17-09-page2.txt: [('\nA p p a r a t u s ', 'Apparatus')]
HR18820901-V17-09-page20.txt: [('\nW o m a n ', 'Woman')]
HR18820901-V17-09-page21.txt: [(' R e c e n t ', 'Recent')]
HR18820901-V17-09-page22.txt: [(' T E M P E R A N C E ', 'TEMPERANCE'), (' B IT T E R S ', 'BITTERS')]
HR18820901-V17-09-page31.txt: [('\nM o n t h l y ', 'Monthly')]
HR18820901-V17-09-page32.txt: [(' T h o m p s o n ', 'Thompson')]
HR18821001-V17-10-page10.txt: [(' Co n t in u e d ', 'Continued')]
HR18821001-V17-10-page18.txt: [('\nP r o b a b l y\n', 'Probably')]
HR18821001-V17-10-page8.txt: [('\nT r e a t m e n t ', 'Treatment')]
HR18821001-V17-10-page9.txt: [('\nT r e a t m e n t ', 'Treatment'), (' T r e a t m e n t ', 'Treatment')]
HR18821101-V17-11-page16.txt: [(' A b s t a in ', 'Abstain')]
HR18821101-V17-11-page22.txt: [('\nM ic h ig a n ', 'Michigan')]
HR18821101-V17-11-page28.txt: [(' Co n t in u e d ', 'Continued')]
HR18821101-V17-11-page31.txt: [('Go', 'Go')]
HR18821201-V17-12-page21.txt: [(' R e y n o l d s ', 'Reynolds')]
HR18821201-V17-12-page28.txt: [(' Sy m pt o m s ', 'Symptoms')]
HR18821201-V17-12-page32.txt: [('\nE D IT O R IA L ', 'EDITORIAL')]
HR18821201-V17-12-page5.txt: [(' Co n t in u e d ', 'Continued')]
HR18821201-V17-12-page8.txt: [('\nW h e n e v e r ', 'Whenever'), ('\nA d v e r s it y ', 'Adversity')]
HR18830101-V18-01-page18.txt: [('Go', 'Go')]
HR18830101-V18-01-page27.txt: [(' S m i t h ', 'Smith')]
HR18830101-V18-01-page28.txt: [(' D E P A R T M E N T ', 'DEPARTMENT')]
HR18830101-V18-01-page29.txt: [('\nW h e n c e ', 'Whence')]
HR18830101-V18-01-page3.txt: [('\nP e r s o n ', 'Person')]
HR18830101-V18-01-page4.txt: [('\nL e a v i n g ', 'Leaving')]
HR18830101-V18-01-page7.txt: [('\ne p i g r a m ', 'epigram')]
HR18830201-V18-02-page20.txt: [(' A m a d o n ', 'Amadon')]
HR18830201-V18-02-page3.txt: [(' r u b e o l a ', 'rubeola')]
HR18830301-V18-03-page18.txt: [('Go', 'Go')]
HR18830301-V18-03-page29.txt: [('So', 'So'), ('\nA n s w e r ', 'Answer'), ('\nA l m o s t ', 'Almost'), ('\nA n sw e r ', 'Answer'), ('\nA n s w e r ', 'Answer')]
HR18830301-V18-03-page31.txt: [(' C h i l d ', 'Child'), (' P a p e r ', 'Paper'), ('\nR e v i e w ', 'Review')]
HR18830301-V18-03-page32.txt: [(' H e a l t h ', 'Health'), ('Go', 'Go')]
HR18830401-V18-04-page1.txt: [(' M A T T E R ', 'MATTER')]
HR18830401-V18-04-page17.txt: [('\nF O R T IT U D E ', 'FORTITUDE')]
HR18830401-V18-04-page20.txt: [('\nCh o r u s ', 'Chorus'), ('\nCh o r u s ', 'Chorus'), ('\nC h o r u s ', 'Chorus')]
HR18830401-V18-04-page23.txt: [('\nA m o n g ', 'Among')]
HR18830401-V18-04-page29.txt: [(' e l i c i t ', 'elicit')]
HR18830401-V18-04-page31.txt: [('St', 'St')]
HR18830601-V18-06-page1.txt: [('Ii', 'Ii')]
HR18830801-V18-08-page1.txt: [('It', 'It')]
HR18831101-V18-11-page17.txt: [('Is', 'Is')]
HR18831201-V18-12-page1.txt: [('Ir', 'Ir')]
HR18831201-V18-12-page32.txt: [('An', 'An')]
HR18840201-V19-02-page1.txt: [('Ii', 'Ii')]
HR18850301-V20-03-page1.txt: [('It', 'It')]
HR18850701-V20-07-page1.txt: [('Ii', 'Ii')]
HR18860301-V21-03-page2.txt: [('\nT H OU G H T S ', 'THOUGHTS')]
HR18870101-V22-01-page44.txt: [('We', 'We')]
HR18870201-V22-02-page44.txt: [('We', 'We')]
HR18870601-V22-06-page22.txt: [('It', 'It')]
HR18870601-V22-06-page30.txt: [('Mr', 'Mr')]
HR18870901-V22-09-page1.txt: [('It', 'It')]
HR18871201-V22-12-page34.txt: [('El', 'El')]
HR18880101-V23-01-page50.txt: [('If', 'If')]
HR18880201-V23-02-page54.txt: [('It', 'It'), ('It', 'It')]
HR18881001-V23-10-page54.txt: [('In', 'In')]
HR18890201-V24-02-page46.txt: [('\nS P EC I A L ', 'SPECIAL')]
HR18890601-V24-06-page45.txt: [(' C O L O R E D ', 'COLORED')]
HR18890801-V24-08-page17.txt: [(' re c e di ng ', 'receding')]
HR18890901-V24-09-page45.txt: [(' I N C U B A T O R S\n', 'INCUBATORS')]
HR18891101-V24-11-page40.txt: [(' M e r r y ', 'Merry')]
HR18891201-V24-12-page5.txt: [('Et', 'Et'), ('If', 'If'), ('We', 'We')]
HR18900301-V25-03-page39.txt: [('Ot', 'Ot')]
HR18900501-V25-05-page14.txt: [('Et', 'Et'), ('To', 'To')]
HR18900501-V25-05-page42.txt: [('De', 'De')]
HR18900601-V25-06-page41.txt: [('Jo', 'Jo')]
HR18900701-V25-07-page41.txt: [('It', 'It')]
HR18901001-V25-10-page41.txt: [('\nSA NI TA RI UM ', 'SANITARIUM')]
HR18901101-V25-11-page34.txt: [('If', 'If')]
HR18901201-V25-12-page49.txt: [('To', 'To')]
HR18911101-V26-11-page30.txt: [('In', 'In')]
HR18911201-V26-12-page48.txt: [('Di', 'Di')]
HR18920101-V27-01-page41.txt: [(' S O C IA L\n', 'SOCIAL')]
HR18920201-V27-02-page17.txt: [('If', 'If')]
HR18920401-V27-04-page20.txt: [('We', 'We')]
HR18920501-V27-05-page44.txt: [('To', 'To'), ('To', 'To')]
HR18920701-V27-07-page44.txt: [('To', 'To'), ('To', 'To'), ('To', 'To'), ('To', 'To')]
HR18920801-V27-08-page44.txt: [('Og', 'Og')]
HR18921001-V27-10-page44.txt: [(' P r in c ip a l ', 'Principal'), ('To', 'To'), ('To', 'To')]
HR18921201-V27-12-page45.txt: [(' B a ls a m ', 'Balsam')]
HR18930601-V28-06-page17.txt: [('Ai', 'Ai')]
HR18930601-V28-06-page41.txt: [('Ms', 'Ms')]
HR18931101-V28-11-page15.txt: [('If', 'If')]
HR18940801-V29-08-page4.txt: [('It', 'It')]
HR18940901-V29-09-page2.txt: [('\nT E R M S ', 'TERMS')]
HR18941001-V29-10-page2.txt: [('\nT E R M S ', 'TERMS')]
HR18941001-V29-10-page42.txt: [('In', 'In')]
HR18950401-V30-04-page45.txt: [('In', 'In')]
HR18950601-V30-06-page41.txt: [('In', 'In')]
HR18950901-V30-09-page43.txt: [('\nB a t t l e\n', 'Battle')]
HR18951101-V30-11-page44.txt: [('In', 'In')]
HR18951201-V30-12-page53.txt: [('\nB a t t l e ', 'Battle')]
HR18960101-V31-01-page24.txt: [('It', 'It')]
HR18960901-V31-09-page46.txt: [('Ed', 'Ed')]
HR18961001-V31-10-page42.txt: [('St', 'St')]
HR18961101-V31-11-page51.txt: [('In', 'In')]
HR18961101-V31-12-page39.txt: [(' B A T T L E\n', 'BATTLE')]
HR18971101-V32-11-page49.txt: [('It', 'It')]
HR18980401-V33-04-page41.txt: [(' p e pp e r ', 'pepper')]
HR18981201-V33-12-page66.txt: [(' K N O W N ', 'KNOWN')]
HR18981201-V33-12-page8.txt: [('If', 'If')]
HR18990201-V34-02-page32.txt: [(' E i g h t ', 'Eight')]
HR18990301-V34-03-page21.txt: [('Is', 'Is')]
HR18991201-V34-12-page38.txt: [(' s u g a r ', 'sugar')]
HR19000501-V35-05-page71.txt: [('Jr', 'Jr')]
HR19000601-V35-06-page11.txt: [(' p e o p l e\n', 'people'), (' s p e ci al ', 'special')]
HR19000701-V35-07-page31.txt: [(' a f t e r ', 'after')]
HR19000701-V35-07-page69.txt: [('It', 'It')]
HR19000801-V35-08-page76.txt: [(' S Y S T E M ', 'SYSTEM')]
HR19001001-V35-10-page34.txt: [('If', 'If')]
HR19001001-V35-10-page56.txt: [(' C H IC A G O ', 'CHICAGO')]
HR19001201-V35-12-page103.txt: [('Is', 'Is')]
HR19001201-V35-12-page117.txt: [('Jr', 'Jr')]
HR19001201-V35-12-page35.txt: [('If', 'If')]
HR19020101-V37-01-page80.txt: [('It', 'It'), ('It', 'It')]
HR19020101-V37-01-page81.txt: [('St', 'St')]
HR19020101-V37-01-page87.txt: [('\nB o s to n ', 'Boston')]
HR19020301-V37-03-page57.txt: [('Ot', 'Ot'), ('Ot', 'Ot')]
HR19020301-V37-03-page75.txt: [('It', 'It')]
HR19020301-V37-03-page76.txt: [('It', 'It')]
HR19020401-V37-04-page15.txt: [(' s an i t a r y ', 'sanitary')]
HR19020501-V37-05-page26.txt: [(' f o o d s\n', 'foods')]
HR19020701-V37-07-page59.txt: [(' M IC H IG A N\n', 'MICHIGAN')]
HR19020701-V37-07-page69.txt: [('Et', 'Et')]
HR19020801-V37-08-page65.txt: [('In', 'In')]
HR19020901-V37-09-page23.txt: [(' b e au t y ', 'beauty')]
HR19030101-V38-01-page17.txt: [(' n e v e r ', 'never'), (' G r e e k\n', 'Greek')]
HR19030101-V38-01-page67.txt: [('El', 'El'), ('El', 'El')]
HR19030201-V38-02-page21.txt: [('\nh a n d s ', 'hands')]
HR19030201-V38-02-page62.txt: [('It', 'It')]
HR19030401-V38-04-page26.txt: [(' c h e s t ', 'chest')]
HR19030401-V38-04-page75.txt: [('In', 'In')]
HR19030901-V38-09-page77.txt: [('It', 'It')]
HR19031001-V38-10-page84.txt: [('\nP E R C Y ', 'PERCY')]
HR19031101-V38-11-page61.txt: [('No', 'No')]
HR19031201-V38-12-page89.txt: [('It', 'It'), ('It', 'It'), ('It', 'It'), ('It', 'It')]
HR19040101-V39-01-page9.txt: [(' f o u n d ', 'found')]
HR19040401-V39-04-page70.txt: [(' f o o d s ', 'foods')]
HR19040501-V39-05-page85.txt: [('If', 'If')]
HR19040801-V39-08-page86.txt: [(' F o o d s ', 'Foods')]
HR19040801-V39-08-page91.txt: [('It', 'It')]
HR19041001-V39-10-page13.txt: [('No', 'No')]
HR19041201-V39-12-page63.txt: [(' t a s t e d ', 'tasted')]
HR19050101-V40-01-page10.txt: [('It', 'It'), ('It', 'It'), ('It', 'It'), ('It', 'It'), ('It', 'It')]
HR19050201-V40-02-page2.txt: [('It', 'It'), ('It', 'It'), ('It', 'It'), ('It', 'It')]
HR19050201-V40-02-page85.txt: [('It', 'It')]
HR19050401-V40-04-page71.txt: [(' M ic h ig a n\n', 'Michigan')]
HR19050501-V40-05-page77.txt: [('It', 'It'), ('It', 'It'), ('It', 'It'), ('It', 'It'), ('It', 'It')]
HR19050601-V40-06-page48.txt: [('Or', 'Or')]
HR19050601-V40-06-page72.txt: [('\nA d d r e s s\n', 'Address')]
HR19050601-V40-06-page79.txt: [('It', 'It')]
HR19050701-V40-07-page15.txt: [(' R o u n d ', 'Round')]
HR19050701-V40-07-page8.txt: [('It', 'It'), ('It', 'It'), ('It', 'It'), ('It', 'It'), ('It', 'It'), ('It', 'It'), ('It', 'It'), ('It', 'It')]
HR19050801-V40-08-page11.txt: [('It', 'It'), ('It', 'It'), ('It', 'It'), ('It', 'It'), ('It', 'It')]
HR19050901-V40-09-page15.txt: [('If', 'If')]
HR19050901-V40-09-page81.txt: [('It', 'It'), ('It', 'It'), ('It', 'It')]
HR19050901-V40-09-page85.txt: [(' R o u n d ', 'Round')]
HR19060201-V41-02-page76.txt: [(' L o u i s ', 'Louis')]
HR19060301-V41-03-page39.txt: [(' e v e r y ', 'every')]
HR19060501-V41-05-page12.txt: [('To', 'To')]
HR19060701-V41-07-page12.txt: [('\nC o u r s e ', 'Course')]
HR19060801-V41-08-page31.txt: [(' B r y an t ', 'Bryant')]
HR19060801-V41-08-page53.txt: [('It', 'It')]
HR19060901-V41-09-page65.txt: [('Cf', 'Cf')]
HR19070101-V42-01-page56.txt: [('Jo', 'Jo')]
HR19070701-V42-07-page65.txt: [('If', 'If')]
HR19070801-V42-08-page90.txt: [('If', 'If')]
HR19070901-V42-09-page19.txt: [('It', 'It')]
In [29]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/HR/correction6

Average verified rate: 0.9772093399046619

Average of error rates: 0.03611597060405091

Total token count: 13924125

In [30]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[30]:
[('m', 16053),
 ('d', 11371),
 ("'", 11286),
 ('e', 9145),
 ('t', 8057),
 ('f', 7096),
 ('w', 7082),
 ('r', 7009),
 ('n', 6039),
 ('co', 5855),
 ('pm', 3762),
 ('g', 3249),
 ('th', 1692),
 ('u', 1518),
 ('k', 1461),
 ('x', 1342),
 ('re', 1330),
 ('tion', 1006),
 ('ex', 939),
 ('z', 836),
 ('mo', 760),
 ('oz', 677),
 ('pa', 676),
 ('sel', 665),
 ("an'", 555),
 ('lb', 536),
 ('al', 524),
 ('pp', 474),
 ('ment', 450),
 ('-', 426),
 ('ti', 358),
 ('wm', 355),
 ('mc', 351),
 ('em', 325),
 ('q', 318),
 ('un', 316),
 ('tions', 293),
 ('ft', 291),
 ('se', 287),
 ('es', 276),
 ('pre', 274),
 ('ro', 273),
 ('io', 258),
 ('oo', 249),
 ('mt', 244),
 ('pt', 243),
 ('ry', 226),
 ('il', 225),
 ('li', 222),
 ('ll', 221)]

Correction 7 -- Rejoin Split Words

In [31]:
# %load shared_elements/rejoin_split_words.py
prev = "correction6"
cycle = "correction7"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    errors = reports.identify_errors(tokens, spelling_dictionary)

    replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=False)
    
    if len(replacements) > 0:
#         print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_split_words(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [32]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/HR/correction7

Average verified rate: 0.9782451294859534

Average of error rates: 0.03500528768596523

Total token count: 13914907

In [33]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[33]:
[('m', 16037),
 ('d', 11345),
 ("'", 11286),
 ('e', 8999),
 ('t', 7978),
 ('f', 7087),
 ('w', 7078),
 ('r', 6972),
 ('n', 5982),
 ('co', 5734),
 ('pm', 3762),
 ('g', 3229),
 ('th', 1594),
 ('u', 1512),
 ('k', 1449),
 ('x', 1336),
 ('z', 836),
 ('mo', 733),
 ('oz', 677),
 ('sel', 657),
 ('ex', 642),
 ('re', 637),
 ('pa', 604),
 ("an'", 555),
 ('lb', 535),
 ('tion', 527),
 ('pp', 474),
 ('-', 426),
 ('ment', 402),
 ('al', 355),
 ('wm', 355),
 ('q', 317),
 ('ti', 305),
 ('ft', 290),
 ('em', 274),
 ('ro', 259),
 ('io', 251),
 ('oo', 245),
 ('mt', 243),
 ('pt', 241),
 ('se', 236),
 ('es', 229),
 ('ry', 220),
 ('ll', 214),
 ('il', 212),
 ('ia', 193),
 ('tt', 183),
 ('li', 181),
 ("hours'", 180),
 ("''", 171)]

Correction 8 -- Rejoin Split Words II

In [34]:
# %load shared_elements/rejoin_split_words.py
prev = "correction7"
cycle = "correction8"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    errors = reports.identify_errors(tokens, spelling_dictionary)

    replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=True)
    
    if len(replacements) > 0:
#         print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_split_words(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [35]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/HR/correction8

Average verified rate: 0.9787064241653848

Average of error rates: 0.03450824520523392

Total token count: 13908655

In [36]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:500]
Out[36]:
[('m', 15949),
 ('d', 11327),
 ("'", 11285),
 ('e', 8969),
 ('t', 7931),
 ('f', 7056),
 ('w', 6940),
 ('r', 6925),
 ('n', 5966),
 ('co', 5725),
 ('pm', 3762),
 ('g', 3208),
 ('th', 1569),
 ('u', 1506),
 ('k', 1445),
 ('x', 1336),
 ('z', 834),
 ('mo', 723),
 ('oz', 676),
 ('sel', 657),
 ('ex', 624),
 ('pa', 603),
 ("an'", 555),
 ('lb', 535),
 ('tion', 489),
 ('pp', 474),
 ('re', 465),
 ('-', 426),
 ('wm', 355),
 ('q', 316),
 ('al', 312),
 ('ti', 298),
 ('ft', 280),
 ('em', 267),
 ('io', 251),
 ('ro', 250),
 ('ment', 248),
 ('mt', 243),
 ('pt', 240),
 ('oo', 239),
 ('ry', 215),
 ('es', 203),
 ('il', 200),
 ('ll', 188),
 ('se', 185),
 ('ia', 183),
 ('tt', 181),
 ("hours'", 180),
 ('li', 178),
 ("''", 171),
 ('vt', 167),
 ('tri', 165),
 ('si', 159),
 ('op', 157),
 ('ma', 153),
 ('zo', 151),
 ('ne', 150),
 ('fl', 143),
 ("of'", 143),
 ('tions', 142),
 ('gr', 141),
 ('te', 140),
 ('deimel', 140),
 ('keeley', 139),
 ('mc', 138),
 ('ni', 137),
 ('un', 136),
 ('ch', 135),
 ('ve', 134),
 ('va', 131),
 ('ip', 131),
 ('ow', 129),
 ('cornaro', 126),
 ("the'", 124),
 ('tr', 119),
 ('wuz', 119),
 ('ic', 119),
 ('oc', 117),
 ('id', 116),
 ('onehalf', 116),
 ('mi', 108),
 ('fehr', 107),
 ("vick's", 106),
 ('ac', 105),
 ('ri', 104),
 ('socalled', 103),
 ('fr', 103),
 ('ili', 103),
 ('ridpath', 102),
 ('ci', 101),
 ("i'", 101),
 ('ky', 99),
 ('ph', 99),
 ("'s", 98),
 ('ea', 97),
 ('ga', 97),
 ('basle', 96),
 ('ble', 94),
 ('ly', 93),
 ('ful', 93),
 ('munn', 92),
 ('centrale', 89),
 ('rt', 89),
 ('ent', 88),
 ('ceo', 88),
 ('da', 88),
 ('bo', 88),
 ('na', 87),
 ('ity', 87),
 ('twentyfive', 87),
 ('sr', 86),
 ('nr', 86),
 ('cr', 85),
 ('fi', 85),
 ('pre', 85),
 ('sc', 84),
 ('soo', 84),
 ('ct', 83),
 ('nd', 83),
 ('ts', 83),
 ('wood-allen', 83),
 ('ments', 82),
 ('rd', 81),
 ('institut', 81),
 ('vo', 81),
 ('rs', 80),
 ('sp', 79),
 ('sa', 77),
 ('lt', 77),
 ('ca', 77),
 ('wi', 77),
 ('twentyfour', 75),
 ('wo', 74),
 ('bloodvessels', 74),
 ('farnum', 73),
 ('ob', 72),
 ('vaux', 72),
 ('ay', 72),
 ('ers', 71),
 ("a'", 71),
 ('ap', 71),
 ('ance', 71),
 ('tl', 71),
 ('ss', 70),
 ('wellknown', 69),
 ('pr', 69),
 ('pawlow', 69),
 ('ioo', 68),
 ('tc', 68),
 ('ei', 68),
 ('--', 68),
 ('nt', 67),
 ('ta', 67),
 ('ealth', 66),
 ('tle', 65),
 ('ce', 65),
 ('marchand', 65),
 ('sano', 64),
 ('ie', 64),
 ('rn', 64),
 ('micr', 64),
 ('seaver', 63),
 ('ao', 63),
 ('ous', 63),
 ('rr', 63),
 ('thos', 62),
 ('od', 62),
 ("cents'", 62),
 ('um', 61),
 ('iu', 61),
 ('wus', 61),
 ('ence', 61),
 ('wa', 61),
 ('ns', 60),
 ('ple', 60),
 ('os', 60),
 ('good-by', 60),
 ('ginley', 60),
 ('bez', 59),
 ('cc', 59),
 ('ra', 59),
 ('dren', 58),
 ('qt', 58),
 ('cood', 58),
 ('mm', 58),
 ("and'", 58),
 ('ture', 57),
 ('ical', 57),
 ('segner', 57),
 ('oi', 57),
 ('po', 56),
 ('cl', 56),
 ('mens', 55),
 ('lmtd', 55),
 ('ure', 55),
 ('ig', 55),
 ('tem', 54),
 ('und', 53),
 ('ef', 53),
 ('hy', 53),
 ('fa', 52),
 ('ff', 51),
 ('ber', 51),
 ("l't'd", 51),
 ('ee', 51),
 ('ny', 51),
 ('cience', 50),
 ('graefenberg', 50),
 ('sez', 50),
 ('illy', 49),
 ('fo', 49),
 ('in-doors', 49),
 ('ke', 49),
 ('gi', 48),
 ('muriatic', 48),
 ('ev', 48),
 ('goguac', 48),
 ('accom', 48),
 ('mal', 48),
 ('pl', 48),
 ('iti', 47),
 ("r'y", 47),
 ('letheby', 47),
 ('neimyer', 47),
 ('jenness', 47),
 ('cd', 47),
 ('td', 47),
 ('ou', 46),
 ('cess-pool', 46),
 ('gg', 46),
 ('ab', 46),
 ('featherby', 46),
 ('thein', 46),
 ('pintsch', 46),
 ('tbe', 46),
 ('ey', 45),
 ('brunton', 45),
 ('umschlag', 45),
 ("powder'", 45),
 ('fbr', 45),
 ('nu', 45),
 ('foie', 45),
 ('sanitariu', 45),
 ('lenna', 44),
 ('ng', 44),
 ('dori', 44),
 ('tkt', 44),
 ('dio', 44),
 ('papanui', 44),
 ('av', 44),
 ('wagnalls', 43),
 ('aa', 43),
 ('fresnillo', 43),
 ('tne', 43),
 ('celaya', 43),
 ('bacco', 43),
 ('georgie', 43),
 ('ofthe', 43),
 ("t'", 43),
 ('ani', 43),
 ('electriclight', 41),
 ('loth', 41),
 ('davos', 41),
 ('ood', 41),
 ('moki', 41),
 ('ozs', 41),
 ("l'", 40),
 ('pmd', 40),
 ('oe', 40),
 ('battlecreek', 40),
 ('oldfashioned', 40),
 ('ive', 39),
 ("booklover's", 39),
 ('ery', 39),
 ('sy', 39),
 ('guanacevi', 39),
 ('ji', 39),
 ('outof-door', 38),
 ('vis', 38),
 ('om', 38),
 ('caf', 38),
 ('ec', 38),
 ('anitary', 38),
 ("to'", 38),
 ('ostermoor', 38),
 ('colax', 38),
 ('tn', 38),
 ('alth', 37),
 ('eh', 37),
 ('ation', 37),
 ('obdomen', 37),
 ('oa', 37),
 ('cs', 37),
 ('lyster', 37),
 ('aduertisements', 37),
 ('gras', 37),
 ('wid', 37),
 ('ge', 37),
 ('ht', 37),
 ('ther', 37),
 ('iterary', 36),
 ('tb', 36),
 ('malic', 36),
 ('-the', 36),
 ('quently', 36),
 ('edmunds', 36),
 ('eral', 36),
 ('firstclass', 36),
 ('itt', 36),
 ('lth', 36),
 ('cise', 36),
 ('irapuato', 36),
 ('tf', 36),
 ('employes', 36),
 ('parkes', 36),
 ('tht', 35),
 ('out-ofdoor', 35),
 ('ik', 35),
 ('pc', 35),
 ('delsarte', 35),
 ("demorest's", 35),
 ('anb', 35),
 ('selfcontrol', 35),
 ('nal', 35),
 ('abbie', 35),
 ('tive', 35),
 ('threefourths', 35),
 ('rm', 35),
 ("in'", 35),
 ('kr', 34),
 ('oat-meal', 34),
 ('kniskern', 34),
 ('tii', 34),
 ('mis', 34),
 ('bergh', 34),
 ('ith', 34),
 ('well-ordered', 34),
 ('sea-sickness', 34),
 ('ut', 34),
 ('lc', 34),
 ('ov', 33),
 ("goin'", 33),
 ('healthwise', 33),
 ('sani', 33),
 ('polypi', 33),
 ('af', 33),
 ('ae', 33),
 ('sible', 33),
 ('sus', 33),
 ('mit', 33),
 ('onethird', 33),
 ('nonflesh', 33),
 ("c'", 33),
 ("bein'", 33),
 ('iiii', 33),
 ('jacokes', 33),
 ('por', 32),
 ('alti', 32),
 ('cess-pools', 32),
 ('trainingschool', 32),
 ('gm', 32),
 ('tolstoi', 32),
 ('gheel', 32),
 ('su', 32),
 ("s'", 32),
 ('wt', 32),
 ('gential', 32),
 ('mn', 32),
 ('mu', 32),
 ('muffet', 31),
 ('bucknum', 31),
 ('beaumetz', 31),
 ("winslow's", 31),
 ('ba', 31),
 ('dujardin', 31),
 ('dodds', 31),
 ('ze', 31),
 ('bos', 31),
 ('chil', 31),
 ('az', 31),
 ('spiring', 31),
 ('pawkins', 31),
 ('agin', 31),
 ('ol', 31),
 ('mantz', 31),
 ('cu', 31),
 ('ij', 31),
 ("p'", 30),
 ('creatin', 30),
 ('wellequipped', 30),
 ('-a', 30),
 ('ist', 29),
 ('iz', 29),
 ('pf', 29),
 ('mapimi', 29),
 ('ach', 29),
 ('bi', 29),
 ('onefourth', 29),
 ('xl', 28),
 ('attle', 28),
 ('aro', 28),
 ('wel', 28),
 ('mokis', 28),
 ('diningroom', 28),
 ('cn', 28),
 ('gofio', 28),
 ('fp', 28),
 ('fiske', 28),
 ('schantz', 28),
 ('exer', 28),
 ("if'", 28),
 ('ject', 27),
 ('self-preservation', 27),
 ('physi', 27),
 ('sw', 27),
 ('sydenham', 27),
 ("harve'", 27),
 ('prt', 27),
 ('steam-heated', 27),
 ('potassa', 27),
 ('miehle', 27),
 ('ole', 27),
 ('kal', 27),
 ('mouth-breathing', 27),
 ('iy', 27),
 ("housekeepers'", 27),
 ('hufeland', 27),
 ('eighty-four', 27),
 ('ets', 27),
 ('vrooman', 27),
 ('whitcomb', 27),
 ('rir', 27),
 ('ight', 27),
 ('water-cures', 27),
 ('ww', 27),
 ('rumseller', 27),
 ('soda-water', 27),
 ('noko', 27),
 ('ul', 27),
 ('well-informed', 27),
 ('cotton-seed', 27),
 ('ag', 27),
 ('loisette', 27),
 ('ramabai', 27),
 ('mioh', 27),
 ("r'", 27),
 ('bric-a-brac', 26),
 ('tv', 26),
 ('ess', 26),
 ('wu', 26),
 ('quired', 26),
 ('meat-eaters', 26),
 ("gov't", 26),
 ('able-bodied', 26),
 ('rosy-cheeked', 26),
 ('prac', 26),
 ('incased', 26),
 ('hights', 26),
 ('atl', 26),
 ('haverly', 26),
 ('otices', 26),
 ('under-clothing', 26),
 ('eighty-two', 26),
 ('dition', 26),
 ('starting-point', 26),
 ('tape-worms', 26),
 ('epartment', 26),
 ('sewer-gas', 26),
 ('sewing-machine', 26),
 ('ents', 26),
 ('wash-tub', 26),
 ("nothin'", 26),
 ('dark-colored', 25),
 ('by-laws', 25),
 ('sary', 25),
 ('rumsellers', 25),
 ('wetsheet', 25),
 ('up-stairs', 25),
 ('lecture-room', 25),
 ('fellow-citizens', 25),
 ('living-room', 25),
 ('yr', 25),
 ('overfat', 25),
 ('coal-tar', 25),
 ('seventy-six', 25),
 ('ano', 25),
 ('woodallen', 25),
 ('fora', 25),
 ('ich', 25),
 ('all-wise', 25),
 ('cast-iron', 25),
 ('df', 25),
 ('whalebones', 25),
 ('sea-water', 25),
 ('yamanoto', 25),
 ('sea-shore', 25),
 ('fusel', 25),
 ('money-making', 25),
 ('living-rooms', 25),
 ('paget', 25)]

Review Remaining Errors

In [37]:
reports.docs_with_high_error_rate(summary)[:100]
Out[37]:
[('HR18920501-V27-05-page3.txt', 1.0),
 ('HR18940101-V29-01-page3.txt', 1.0),
 ('HR18900101-V25-01-page1.txt', 1.0),
 ('HR18910401-V26-04-page3.txt', 1.0),
 ('HR18940301-V29-03-page3.txt', 1.0),
 ('HR19030701-V38-07-page6.txt', 1.0),
 ('HR18920801-V27-08-page3.txt', 1.0),
 ('HR18890601-V24-06-page1.txt', 1.0),
 ('HR18931101-V28-11-page3.txt', 1.0),
 ('HR19040701-V39-07-page2.txt', 1.0),
 ('HR19030501-V38-05-page4.txt', 1.0),
 ('HR18911201-V26-12-page1.txt', 1.0),
 ('HR18971001-V32-10-page3.txt', 1.0),
 ('HR18941201-V29-12-page3.txt', 1.0),
 ('HR18970401-V32-04-page4.txt', 1.0),
 ('HR18911001-V26-10-page3.txt', 1.0),
 ('HR18991001-V34-10-page4.txt', 1.0),
 ('HR18880301-V23-03-page12.txt', 1.0),
 ('HR18890501-V24-05-page3.txt', 1.0),
 ('HR19040301-V39-03-page4.txt', 1.0),
 ('HR18980801-V33-08-page8.txt', 1.0),
 ('HR18890401-V24-04-page1.txt', 1.0),
 ('HR18970901-V32-09-page9.txt', 1.0),
 ('HR18870201-V22-02-page8.txt', 1.0),
 ('HR18990401-V34-04-page4.txt', 1.0),
 ('HR19041101-V39-11-page21.txt', 1.0),
 ('HR19000701-V35-07-page3.txt', 1.0),
 ('HR18920701-V27-07-page4.txt', 1.0),
 ('HR19070401-V42-04-page11.txt', 1.0),
 ('HR18880701-V23-07-page4.txt', 1.0),
 ('HR19030301-V38-03-page4.txt', 1.0),
 ('HR18930501-V28-05-page4.txt', 1.0),
 ('HR18990401-V34-04-page3.txt', 0.923),
 ('HR18921201-V27-12-page7.txt', 0.917),
 ('HR18910801-V26-08-page1.txt', 0.875),
 ('HR18970701-V32-07-page3.txt', 0.857),
 ('HR18970501-V32-05-page5.txt', 0.846),
 ('HR18940701-V29-07-page1.txt', 0.846),
 ('HR18910501-V26-05-page3.txt', 0.833),
 ('HR18930301-V28-03-page3.txt', 0.833),
 ('HR19070901-V42-09-page21.txt', 0.833),
 ('HR18981101-V33-11-page4.txt', 0.833),
 ('HR19061001-V41-10-page19.txt', 0.826),
 ('HR18931201-V28-12-page3.txt', 0.826),
 ('HR18880701-V23-07-page1.txt', 0.818),
 ('HR19070701-V42-07-page21.txt', 0.818),
 ('HR19060901-V41-09-page19.txt', 0.818),
 ('HR18940701-V29-07-page3.txt', 0.812),
 ('HR19061201-V41-12-page19.txt', 0.8),
 ('HR19031101-V38-11-page4.txt', 0.8),
 ('HR19020801-V37-08-page4.txt', 0.8),
 ('HR19070901-V42-09-page1.txt', 0.8),
 ('HR18961101-V31-12-page1.txt', 0.8),
 ('HR18950601-V30-06-page1.txt', 0.8),
 ('HR19000701-V35-07-page52.txt', 0.789),
 ('HR18940201-V29-02-page3.txt', 0.786),
 ('HR19060401-V41-04-page19.txt', 0.783),
 ('HR19020501-V37-05-page3.txt', 0.778),
 ('HR19000601-V35-06-page3.txt', 0.778),
 ('HR18910901-V26-09-page1.txt', 0.778),
 ('HR18870401-V22-04-page3.txt', 0.778),
 ('HR18990401-V34-04-page67.txt', 0.778),
 ('HR18920401-V27-04-page3.txt', 0.769),
 ('HR18920201-V27-02-page3.txt', 0.769),
 ('HR18910101-V26-01-page7.txt', 0.765),
 ('HR18880901-V23-09-page1.txt', 0.762),
 ('HR19030801-V38-08-page3.txt', 0.75),
 ('HR18970701-V32-07-page12.txt', 0.75),
 ('HR18910601-V26-06-page3.txt', 0.75),
 ('HR18990201-V34-02-page3.txt', 0.75),
 ('HR18960301-V31-03-page21.txt', 0.75),
 ('HR19060801-V41-08-page19.txt', 0.75),
 ('HR18890301-V24-03-page3.txt', 0.744),
 ('HR18920301-V27-03-page1.txt', 0.733),
 ('HR18921101-V27-11-page3.txt', 0.733),
 ('HR18921101-V27-11-page1.txt', 0.733),
 ('HR19060301-V41-03-page20.txt', 0.731),
 ('HR19020101-V37-01-page10.txt', 0.727),
 ('HR19031101-V38-11-page3.txt', 0.727),
 ('HR18960401-V31-04-page1.txt', 0.727),
 ('HR18890101-V24-01-page3.txt', 0.722),
 ('HR19061101-V41-11-page19.txt', 0.722),
 ('HR18940901-V29-09-page1.txt', 0.722),
 ('HR19030601-V38-06-page3.txt', 0.72),
 ('HR18880201-V23-02-page1.txt', 0.714),
 ('HR19040201-V39-02-page3.txt', 0.714),
 ('HR19020401-V37-04-page3.txt', 0.714),
 ('HR18951101-V30-11-page1.txt', 0.714),
 ('HR19030301-V38-03-page3.txt', 0.714),
 ('HR19020901-V37-09-page4.txt', 0.714),
 ('HR19051201-V40-12-page19.txt', 0.7),
 ('HR19001201-V35-12-page4.txt', 0.7),
 ('HR18901101-V25-11-page1.txt', 0.7),
 ('HR18960501-V31-05-page20.txt', 0.698),
 ('HR18900801-V25-08-page3.txt', 0.698),
 ('HR18971201-V32-12-page9.txt', 0.696),
 ('HR19020101-V37-01-page12.txt', 0.696),
 ('HR18930601-V28-06-page1.txt', 0.692),
 ('HR19060201-V41-02-page19.txt', 0.69),
 ('HR18991201-V34-12-page3.txt', 0.688)]
In [38]:
reports.long_errors(errors_summary, min_length=15)
Out[38]:
(['sendforcircularsof',
  'consumersvgetadanufaeturingen',
  'somethingelseifications',
  "introductionoftempei'ancetextbooksinto",
  'oneofthegreatcausesofdesire',
  'thatthesepeoplehave',
  'good-for-nothings',
  'fifteen-per-cent',
  'tsaekceoszspiifcytuourews',
  'ifeelthaticanneverfor',
  'ordrawnfromthestill',
  'force-production',
  'ericanmotherhood',
  'anothergentlemanby',
  "thendon'tsaywehavelostevery",
  'andnotstandbyandseeothersdothem',
  'iiscienceinmehouserold',
  'dyspepsiaproducing',
  'thisisoneofthemost',
  'andallmaypartakeofthebene',
  'acorrectposition',
  'environpurchased',
  'medicallmissionary',
  'someothersingulardisease',
  'iftruehappinessistheresult',
  'semi-occasionally',
  'daintily-broiled',
  'poison-factories',
  "injurious'effects",
  'well-authenticated',
  'andthatitisvastlybettertopay',
  'corset-fashioned',
  'intezreissintheanpyubvlsahye',
  'thuslearnhowtokeepoutofit',
  'other-worldliness',
  'thirst-producing',
  'fairlyuponthehollowplacespreparedfor',
  'inhealthanddisease',
  'one-fifteen-thousandth',
  'withoutourrealizingit',
  'fellow-islanders',
  'world-renouncing',
  'chemically-prepared',
  'ifnfegrotordmeersnaann',
  'brain-paralyzing',
  'ghdoohdousekeeping',
  'pseudo-membranous',
  'too-much-planted',
  'apprenticeexercise',
  'frommonthtomonth',
  'exercise-machine',
  "is'ocroictlgeirgeogif",
  'tosystematically',
  'hehimselfisnotcapableofdo',
  'wmoteenbyearnfeusit',
  'eluepeartririairats',
  'towumbowisicoruni',
  'two-dollars-and-fiftycents',
  'passive-exercise',
  'reform-producing',
  'fourteen-year-old',
  'opportunitytonaturally',
  'breach-of-promise',
  'hasshenotasmucharighttolocomotion',
  'suntheloveofsuchahome',
  'ordinarily-healthy',
  "above'appliances",
  'liglrigiiiiiiniiiril',
  'world-renouncers',
  'goodallthewaythrough',
  'itisatimesaverforallbusypeople',
  'canadianexponent',
  'indiffersea-port',
  'milliamperemeter',
  'whitesowiliallachille',
  'thisisamuchpleas',
  'maiirmaimpopprmw',
  'tfroteralyialacento',
  'letusmakeyoulikewiseprosperous',
  'latiffitritalliaditgo',
  'andmorenaturalour',
  'miserable-looking',
  'butwomanhasnofreedomofrespira',
  'corruptshisownlan',
  'chamber-shutters',
  'miasma-absorbing',
  'health-destroyers',
  'ofonagoodmattress',
  'niaplewoodandeuren',
  'aniiniestlgation',
  'beingslaughtered',
  'laudanum-drinkers',
  'itottiodutoovqpnioui',
  'portrait-painter',
  'nutritiousgrasses',
  'penny-in-the-slot',
  'ex-superintendent',
  'itateamomentinabstainingfromthem',
  'world-conquering',
  'over-development',
  'disproporexamining',
  'yilttionluorrdelrie',
  'rehua-kai-tangata',
  'eleven-hundredths',
  'andelectricalsupplyco',
  'liminotimommunniamounionuminiammoinnuoiniiiniiima',
  'fine-flour-bread',
  'suggesindividual',
  'apoplectic-looking',
  'innunnnunmmunmuommommuumummnimumnummnmunnunnumummunnunumnunmummnamnununnunnumuunnummummumumnnunnuummumunnumunm',
  'infectioncausing',
  'promotethecomfortandinterestofothers',
  'shrubbery-garden',
  'fellowcountryman',
  'dead-line-atfifty',
  'buffet-library-smoking',
  'at-present-closed',
  'heat-prostration',
  'sallowness-tan-pimples',
  'encourunderstand',
  'forthetreatmentofdiseasesoftheeye',
  'smoke-swallowing',
  'broodingapprehensions',
  'dyspepsia-producer',
  'saidaphysiciantohisser',
  'hastily-prepared',
  'sell-gratification',
  'special-excursion',
  'spongyincharacter',
  'wwwwwwwwattittyytywittyytt',
  'semi-invatittsabroad',
  'thepaperhavingthe',
  'afewweekswithusandreturnedtotheir',
  'theyicutrefaction',
  'thirty-three--becomes',
  'eiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiie',
  'weekhasbeenbelow',
  'drinking-vessels',
  'letaloneativeness',
  'easily-cultivated',
  'household-ledger',
  'suchkindoffoodasthecreatordesigned',
  "partially'counteracted",
  'osdedapapwethiitset',
  'upas-tree----the',
  'specialadvantages',
  'windpipe-slasher',
  'variousotherways',
  'trouville-sur-mer',
  'lieutenantgovernor',
  'fifty-thousandth',
  'downin-the-mouth',
  'explainstheonlywayofper',
  'texasandlouisiana',
  'csaetnadlofgoure',
  'hydrotherapy--theoretical',
  'augustnumberoftheorremfer',
  'poorlyventilated',
  'periodicalpublished',
  'andaltogetheritwasa',
  'ittittimttittiiittitt',
  'workequallywellforgirls',
  'americanassociation',
  'lightentheirburdens',
  'withthatonwhichyouallowedittogrow',
  'compositionofcanesugariscmh',
  'teethofacowfedondistilleryslops',
  'seek-no-furthers',
  'itcanhardlybedetermined',
  'iylolunkknoowfth',
  'ishouldhaveabottlein',
  'tobeabletoaccommo',
  'malaria-stricken',
  'norcompletelyfatigued',
  'whichnaturehassoabundantlysuppliedus',
  'thought-processes',
  'barbarous-rustic',
  'extepmecptaetriaotneopfelrisfoenosf',
  'prejudice-blinded',
  'movements--manual',
  'intoxicating-liqors',
  'chemico-physiological',
  'all-comprehensive',
  'bursting-with-fun',
  'fashion-banished',
  'hydrotiierapeutic',
  'spinal-curvatured',
  'resolution-breaking',
  'ieieiceikeeeectwcteccircirarczerzoremi',
  'soul-and-body-destroying',
  'iiiiiiiiiiiiiiiiiiii',
  'shouldnotthinkenoughofyou',
  'brain-enervating',
  'walkbutafewstepsatatime',
  'house-furnisharpets',
  'followingparticulars',
  'aremorelikelytodiscoverthemuponthe',
  'consideraappetite',
  'bicwageshustlers',
  'unboltedflourormeal',
  'yanunfashionable',
  'lkinbergartening',
  "generally'uncomfortable",
  'higgledy-piggledy',
  'mixiiiosipcollso',
  'fashion-pampered',
  'forsimilartowhat',
  'bymailofthepublishers',
  'out-of-door-exercise',
  'destruction-proof',
  'neck-handkerchiefs',
  'disease-preventing',
  'classifieddietary',
  'notwithstantling',
  'territory--there',
  'pleasure-hunters',
  'parentwillemployateacher',
  'breathingapparatus',
  'total-abstinence',
  'werepresentedatthesanitaryconven',
  'ectriciiisupplyco',
  'volaucmiuesoupsressoeiurvtieodn',
  'mailingdirectoryof',
  'within-the-reach-of-all',
  'santiicaticiltinel',
  'naturally-derived',
  'thick-skinnedness',
  'foodoutoftheearth',
  'determinapositions',
  'strong-mindedness',
  'commendatorially',
  'perharandexpenses',
  'scienceileaflets',
  'enruimptbioenrstofror',
  'mtnipommmimmmimmitm',
  'voraciously-devouring',
  'self-restoration',
  'icoinsaametimail',
  'stable-imprisoned',
  'tokeepthefamilyingoodhealth',
  'mentionthispublication',
  'layupmoneyfortheir',
  'virtreliraluinmaaft',
  'ex-sergeant-at-arms',
  'mountain-summerers',
  'ball-andsocket-joint',
  'railroad-traveling',
  'super-oxygenated',
  'inreadingandseeking',
  'prosextraordinary',
  'unfortunnecessary',
  'itsoonbringsonanawkward',
  'pamphletissupplemented',
  'incomprenensible',
  'roastwithdressing',
  'tasetinljsqcytuourews',
  'pseudophilosophy',
  'much-to-bedreaded',
  'speech-bewrangler',
  'whatyoudoingthere',
  'nothwithstanding',
  'sahbath-scliools',
  'alolalaallallallaa',
  'nolownwallellitomm',
  'alkaloid-containing',
  'mind-distracting',
  'falsehood-spreader',
  'whichmakethewingpiano',
  'orbyoverloadingit',
  'theseinterestingstatistics',
  'strength-testing',
  'stomach-faintness',
  'trans-continental',
  'delicately-lined',
  'never-endingness',
  'mostcomprehensive',
  'otherscunninglywrought',
  'hypohydrochloria',
  'twentiethcentury',
  'harmless-looking',
  'thatthepeoplebetaughthowtolive',
  'upthismostimportantsubject',
  'lsancriteaerbiucrma',
  'training-grounds',
  'philoprogenitiveness',
  'vammippiiimpopipieimpirwpwripppwappiwiviii',
  'neighborlykindness',
  'iiirufiviplillimbue',
  'writeto-dayforourillustratedprospectus',
  'anti-tea-drinking',
  'suspicious-looking',
  'stonefitforawallwillnotlieontheroad',
  'pippiprimmiiimpumpopimminiiiiiin',
  'all-around-the-world',
  'primitive-looking',
  'filteringremoves',
  'irsaaaaaairipiairaairato',
  'mommomvollinespe',
  'photiphorescence',
  'unphysiologically',
  'sainnytkaormaisutiposis',
  'lustratestheeffectofayellowday',
  'ccitonwvrtiltlelw',
  'apartmentsaremade',
  'gastro-enteritis',
  'balance-circulation',
  'violet-embroidered',
  'iroluivivitallimill',
  'over-sensitiveness',
  'over-squeamishness',
  'undoubtedlyalsoincreasestheactionofall',
  "inminnvinionfifiwpoivio'ri",
  'gold-and-crimson',
  'tahnedcmasoeroetchoemrwf',
  'carefully-covered',
  "andattwoo'clockeatsahearty",
  "twenty-five'cows",
  "readily-'excitement",
  'antiodontalgicus',
  'ninety-nine-hundredths',
  'brain-intoxicating',
  "compensating'offset",
  'inspector-general',
  "elecrtrcic'-dligehtirctiibl",
  'intellectual-looking',
  'candomoretosavelifeandre',
  'indgailrfefwreestr',
  'fleshasfoodmustbedecidedly',
  'forty-five-minute',
  'good-for-nothingness',
  'maid-of-all-work',
  'foldingbathtubco',
  'old-age-producing',
  'spleen-instiller',
  'bottini-freudenberg',
  'cattle-attracting',
  'sterilizing-room',
  'nsmiotkuitniitto',
  'corioyivnritgfelogaph',
  'over-punctilious',
  'ganceonthepretenceofharmony',
  'cciatwripaineoint',
  'effectofstrongliq',
  'tissue-paralyzing',
  'iussossousinsulopsionswinfanipiii',
  'chicken-stealers',
  'life-occupations',
  'half-million-dollar',
  'freedomworshiping',
  'great-grandfathers',
  'shorttimewewereallloadedinalargespringimaginary',
  'system-shatterer',
  "instrument-maker's",
  'uncrystallizable',
  "plaster-o'-paris",
  'cheerful-hearted',
  'statistic-makers',
  'whichwasbeforedamp',
  'oftheirownraising',
  'oftheirintoxicatingpower',
  'hololololololololoo',
  'chinefullywarranted',
  'well-constructed',
  'sonprraecvtaedlenattanthdiseamsily',
  'tooth-extraction',
  'keeplittlechildrenoutoffactories',
  'exacttsirommanonhisphysicalside',
  'rapidly-accumulating',
  'operating-roomsand',
  'mmominaviszemweniai',
  'occasionally-opened',
  'stoke-upon-trent',
  'fliffiliiffivalk',
  'twenty-four-page',
  'healthadvertising',
  'uncertain-tempered',
  'agaentthewloncon',
  'typho-malarialmeningitis',
  'trance-slumbering',
  'afewofthemorewealthy',
  'self-gratification',
  'battlecreekbuggyco',
  'mosquito-infected',
  'comfortable-looking',
  'peiniiviiiniuiiiii',
  'andstandinyourlight',
  'cliandoo-tschandu',
  'poorly-constructed',
  'andinwhichifkeptin',
  'eighteen-monthsold',
  'withthewholeburdenofitspeople',
  'fully-authenticated',
  'arrisonmanufacturingco',
  'anti-vaccinators',
  'fruit-and-nut-eating',
  'salvation-seekers',
  'reachclose-stand',
  'bathing-exercise-diet',
  'sittingperishable',
  'discovcontributes',
  'blizzard-climate',
  'fromcommunicationsreceived',
  'perfectly-formed',
  'sesveevnetneteenntth',
  'elainimimlinieeecinale',
  'breakfast-tables',
  'torideornottoride',
  'experimentaiating',
  'chinese-luncheon',
  'gentlemanly-dressed',
  'pseudo-stricture',
  'mischief-breeder',
  'high-heartedness',
  'siinitityrndectriciilslipplico',
  'weather-sensitive',
  'commanderin-chief',
  'theadulterations',
  'recipewitheachmill',
  'theywillgooutintothecold',
  'antiphysiological',
  'water-receptacle',
  'gantly-decorated',
  'exposedtoadoubledanger',
  'someofthemjoined',
  'icnombviiteedeltracd',
  'ofwineandsamplesofbeer',
  'nothingelselikeit',
  'andisartisticallyprintedand',
  'fictipkessiiewythmerican',
  'secretary-of-war',
  'thatwhichisharmoniousandtemperate',
  'andatlanticexpresses',
  "respectfully'refers",
  'pseudo-diphtheria',
  'half-teaspoonful',
  'reverie-breeding',
  'beenperusingthisarticle',
  'ilirlraralenralogimemiar',
  'state-sanctioned',
  'thefiresidehasalways',
  'staroccultations',
  'mouth-and-nose-wash',
  'anklesfromtheeye',
  'anti-expectoration',
  'health-inspiring',
  'daily-administered',
  'influenza-stricken',
  'theotheriswellqualifiedforthestation',
  'ordinaryphysiological',
  'nerve-strengthening',
  'supplyofhygienic',
  'hefindsitrisingquiterapidlyheknows',
  'ihroun-al-raschid',
  'distilling-apparatus',
  'opfrecvhernontsicfceayseers',
  'anyonedoesyouaninjury',
  'wellillustratesthepointwhich',
  'onthewarpbeamdirectfromthespools',
  'ofworkasksafarmerifhedoesnotwant',
  'portland-cemented',
  'vegetaintelligible',
  'cheicmo-physiologists',
  'iintiediges-iric',
  'andhavethingssoarrangedasnottolose',
  'prolongmylifeuntili',
  'tobetruetonatureasa',
  'nerve-irritating',
  'dust-stirring-up',
  'manypeoplehavegottheno',
  'sewed-through-the-skin',
  'mmtnutintwinnommimumtimitumuiliniminfilllintillta',
  'csonprraecvteadlenattatndsearsnilye',
  'concentratedform',
  'causetheirpresenceisunwelcome',
  'todayforcatalogue',
  'comfortable-feeling',
  'interest-bearing',
  'diogenes-in-the-tub',
  'leathery-skinned',
  'mosquitobreeding',
  "f'ellow-subjects",
  'prohibitionments',
  'great-great-great-great-great-great',
  'pleasant-appearing',
  'constantly-increasing',
  'innocent-looking',
  'intippiilrgrivew',
  'ivvvvvvvvvamaanyv',
  'ofafire-worshiper',
  'complacent-looking',
  'anti-comfortable',
  'soul-and-bodydestroying',
  'forward-bendings',
  'contra-indicated',
  'self-development',
  'neitherdowenowdeemitneces',
  'inexorablelooking',
  'donotdrinkenough',
  'sentenceagainstanevilworkis',
  'apothecary-shops',
  'blood-circulation',
  'innocent-hearted',
  'pervadedbyacharitablespirit',
  'largequantitiesofsaltandothercon',
  'universalsurface',
  'wouldnodoubtberegardedasatyranny',
  'consumptive-looking',
  'reformertoourmanyreaders',
  'wouldbe-educators',
  'countenance-lifting',
  'fast-approaching',
  'lestlifeiusilencepass',
  'fivethousand-fold',
  'machine-manufactories',
  'twenty-five-inch',
  'daughters-in-law',
  'throughthepoisonouseffects',
  "nighttdetr'itimail",
  'everypractitioner',
  'anedwfanneceywork',
  'strength-producing',
  'franklineducationalco',
  'taryprincipleofthemind',
  'lee-e-f-s-a-e-me-e-e-e-e-e-e-',
  'icanneverunderstandher',
  'readbyfromthreetofivethousandtoeverythou',
  'healthofbodyandmind',
  'tbhefiennsplbania',
  'thesconstituents',
  'secondwouldbethirteendollars',
  'tobringupanumber',
  'whoaredeafanddumb',
  'oftiresfromanyonne',
  'abirdlectureonemorning',
  'freshly-expressed',
  'soyoungandunprotected',
  'thespherewhichitistheirgod-givenright',
  'nlnnnumimnimmnnuuun',
  'popularly-educated',
  'readingversusknowledge',
  'disinfecting-urns',
  'theillustratedscien',
  'cigarette-smoking',
  'enmanyoftheso-calledhealthreformers',
  'pirpqmpluirmwerg',
  'strengthcanbemadeforoneortwocents',
  'chaelliefonrainiia',
  "these'precautions",
  'racedeteriorating',
  'thefoundationsofdisease',
  'high-diddle-diddle-the-eat-and-the-fiddle',
  'picitcuturreegallery',
  'presentogeneration',
  'forcircularsaddreets',
  'cheese-poisoning',
  'theeggsoftape-wormsareoften',
  'chemico-physiology',
  'distilleryswill-fed',
  'furoveninapanhalffulloflionigwater',
  'distinguishcause',
  'rabbit-despising',
  'long-accumulated',
  'thermo-electrical',
  'mgakgitrsefrootta',
  'physicianswereconversinginthehearing',
  'gastro-intestinal',
  'generalpublicity',
  'too-oft-repeated',
  'watchandfretover',
  'iiiiiiiiiiiiiiiiiiiiiii',
  'ingonthesubjectofoillamps',
  'underpreparations',
  "drill-sergeant's",
  'sulmountain-side',
  'central-american',
  'nighttdeteittmaila',
  'crlitnttiosovceor',
  'supra-centenarians',
  'stretch-standing',
  'andthefatherislockedoutlikethe',
  'foster-daughters',
  'slaughter-houses',
  'corset-strangulation',
  'eight-months-old',
  'andafterdryingthe',
  'commissioner-in-chief',
  'magneto-electric',
  'trichinae-infected',
  'bazaar-merchants',
  'mounteney-jephson',
  "sawillthere'saway",
  '--------------------------------------------------------------------------------------------k',
  'wouldthinkhecouldlivedressed',
  'forcierysipelatous',
  'godhasblesseduswitha',
  'spinning-jennies',
  'unseasonableness',
  'uniformly-temperate',
  'lightshedabroadfromtheinstitute',
  'beautiful-tinted',
  'gloryofegypthaslongsincede',
  'forest-destructions',
  'hereditarycontinent',
  'ethenylbromophenylenediamine',
  'progressive-euchre',
  'transportaintellectual',
  'thereestablishment',
  'hydrotherapeutically',
  'uric-acid-saturated',
  'goemnetsorfotrave',
  "strawberry'juice",
  'nobiltransmissible',
  'rubbing-wet-sheet',
  'saortdmiandeficivith',
  'semi-professionalism',
  'creeskasnanitiatarriium',
  'tooth-destroying',
  'aggravatprinciple',
  'dangerousgasolinestoves',
  'amoderatedrinker',
  'coodsensecorteid',
  'antisepticdentifrice',
  'stimulatingarticles',
  'transmissibility',
  'nebraskasanitarium',
  'sodifficultisthisworkthat',
  'before-mentioned',
  'attractiveotfabreero',
  'iiimiiimiiimiiiiimiiinniiiiiimiimmer',
  'makingthetwocostthesub',
  'occipito-frontalie',
  'inalatenumberofthecoun',
  'numonomminmstimmmminn',
  'all-the-way-between',
  'acommoncomplication',
  'beinggenerallyunderstood',
  'feredupthatitsprogressmightbestayed',
  'rwcislhuiziothwenpriitc',
  'recently-devised',
  'heredityofalcohol',
  'ponoenntinagtaligoernuetsorfotrr',
  'benetilliocrobero',
  'air-contamination',
  'chloride-of-barium',
  "experiments'upon",
  'electrotherapeutic',
  'trumpeting-calls',
  'wouldgettheirdeathofcold',
  'their-own-living-instead',
  'trans-mississippi',
  'longto-be-remembered',
  'mmcmcmccmccmimmmmmmmmmmummummmwmvmmmmwo',
  'kneading-machine',
  'itismadeofthesoakageoftheswamps',
  'mechano-therapeutic',
  'thusitisthatcertainorgansbecome',
  'ever-accumulating',
  'temperature-being',
  'toiwsritefolrlouribigfrii',
  'professconstitutional',
  'eldermattesonhasalsoissued',
  'brain-circulation',
  'fellow-naturalists',
  'recently-published',
  'halfateaspoonful',
  'freptocoocusproduces',
  'fourteen-stomachpower',
  'immeditwenty-four',
  'disarrangegrowth',
  "plesofhygieneareembracedinthediet'",
  'acceptabletothemostfastidiouspalate',
  'semi-centenarian',
  'knitting-needles',
  'properly-ventilated',
  'tutionandrelationevidentlyshowusthat',
  'twelve-thousandth',
  'iniewileiguimpuipirmt',
  'veryinconsistent',
  'nineteenth-century',
  'terracesseveralof',
  'disease-stupefied',
  'hindasthestarsonthebrowofevening',
  'disease-protaken',
  'mmaucnucimmecancemmacnionmomucamni',
  'advertisingpatthese',
  'prepreposterously',
  'malaria-infected',
  'tiinrecsoainntendielaatima',
  'constrictionfliracles',
  'achargethoucouldstnothear',
  'whichshouldnotbeoverburdened',
  'palmermentionsinalludingtolukell',
  'workisdifficulttobehad',
  'standard-bearers',
  'straight-jackets',
  'botanisir-lrommel',
  'ofwhichoxygenistheprimesup',
  'spiritual-mindedness',
  'ultra-serviceable',
  'memory-treasured',
  'thepineisanativeofamerica',
  'havetreatedwithsuccessa',
  'lpaohrdtmlanadrhtofstahnesfierarrnac',
  'smallof-the-back',
  'bedorleavingaroomforatime',
  'sealing-wax-like',
  'self-condemnation',
  'functional-development',
  'gelatin-liquefying',
  'hygeio-therapeutist',
  'modernmedicinepub',
  'chromo-lithographk',
  'thedistinctivenameoftemperanceschool',
  'naturaldepravity',
  'autoiritoxications',
  'self-unconsciousness',
  'technicallyknown',
  "whenthejob'sneatlydonebythe'fairieswemeet",
  'butheeitherdidnot',
  'curiously-wrought',
  'curvainvestigated',
  'followcommensurate',
  'inconvenimpaired',
  'drug-superstition',
  "thebooklover'smagazine",
  'health-andbeauty-destroying',
  'liquor-prescribing',
  'irgttgeesbwoixtubot',
  'ignordisturbances',
  "educational'institution",
  'winovertotheranksof',
  'two-pair-of-stairs',
  'riedthousandstothegrave',
  'asthelastnumberoftheyearwas',
  'springfashionboo',
  'fashionablydressed',
  'cowwwwwwwwwwwwwww',
  'wereceivethehealth',
  'madeoinapseixrsaoizless',
  'ofthefunguspresumably',
  'energy-containing',
  'inaamanaaaaanwaainaa',
  'moderateinyoureating',
  'newpribmericanyews',
  'whotreadonyourdress',
  'cftofrormounthde-tnriportihckevtservyiadatyhe',
  'temperate-climate',
  'thenourishmentoftwoandahalfpounds',
  'lightning-stroke',
  'pavement-sweeping',
  'brilliant-colored',
  'over-prescription',
  'unpleasstimulated',
  'morning-glorylady',
  'theexcessiveuseofardentspirits',
  'isthenameofaweeklyjournalpub',
  'agentusniwteadntsetadtiens',
  'whichhasbeenvitiatedby',
  'thesignsofthetimes',
  'three-hundredpage',
  'improperpoeition',
  'ibtottotritotitdotlitber',
  "fcst'incareofthesanitarium",
  'umlimpoinumattiminipurunfirixpageu',
  'anesthetic-table',
  'semi-philanthropic',
  'semi-contraction',
  'leadingretailers',
  'atchreosgsretahteeaesnt',
  'complacentlyremarked',
  'stockingsshouldnotbewornmore',
  'totertwynotivizp',
  'icitelpeairtirrivaiticts',
  'completecatalogueofbooksuponrequest',
  'puritanwstaitlel',
  'cccceifeeeimmipicipieeeeipipipiefe',
  'dayschoollessons',
  'danish-norwegian',
  'ventilating-apparatus',
  'feeble-mindedness',
  'considerablylarger',
  'pacifiukesscivevyjkiinierilan',
  'theaboveisthenameofasociety',
  'itwasadogmaamongreflective',
  'umnsarefilledwithexcellent',
  'thenervoussystem',
  'leicester-to-london',
  'canrcoptindirnto',
  'useoftobaccoisbecom',
  'fthsacteicntravras',
  'withpalecheekandtearless',
  'callileucocapillaire',
  'fashion-instructions',
  'threshing-machine',
  'nostudentprivilegedtositun',
  'disease-communicating',
  'plesoftemperance',
  'definitionoftheword',
  'werethoughtofhighlyby',
  'ewmoorkstfciollisnaplwetaenatnlodntiraecganzedoicnon',
  'intercommunications',
  'suchgrossviolationsofthe',
  'disease-infected',
  'withoutsuchcomplementsasmilk',
  'imummarimminommurimii',
  'ghost-yarn-spinning',
  'cigarette-stands',
  'ingwiththeutmostconfidence',
  'family-dwellings',
  'a--s----caivilization',
  'hospital-steward',
  'rubbingvigorously',
  'inthefirstplacenoonecanlong',
  'zakrawawirialgfiakiccceirogif',
  'self-justification',
  'etwaveiweeewesoweveiftergearrwair',
  'drug-prescription',
  'thereareillsthatwecannotescape',
  'stimulant-habits',
  'startsthisspring',
  'whichuponreflectionwillbefoundindi',
  'youcancounteractthelawwhichhe',
  'thiscouponisworth',
  'aaaaaamaaaaaaaivvvvvvv',
  'jshowstheatrophied',
  'whattheeffectmustbeofso',
  'yearsandconsumeseachdayofthatperi',
  'narcotico-disinfectant',
  'givehertheportionofmeattowhichsheisentitled',
  'tender-heartedness',
  'reciprocatefficiency',
  'heardabovetherest',
  'co-operationlofltheihundreds',
  'straight-brimmed',
  'aiiiiiiiiialuilia',
  'non-crystallizable',
  'theterminationofthewar',
  'suchaninstitutionisthatatsouthclasaterr',
  'itoillustrations',
  'areallworthlessinthedark',
  'machinestitching',
  'goxolodioxcialkoio',
  'uponherthefullconsciousnessofhiswork',
  'commissary-general',
  'yiedicalturgical',
  'isnotpublishedasameansof',
  'homuststudythelawsofhealth',
  'fast-diminishing',
  'stomach-digesand',
  'thrashing-machine',
  'stainfromsaltorgreasyfood',
  'ourunderclothing',
  'otrislilsonathnes',
  'ifthesepreceptsarenotfollowed',
  'thoiraezimragosfarneflniirpti',
  'amaaaaaaaaaabovvmwwvvvvvv',
  'visithadbeenmade',
  'whittlesey-phelps',
  'weeeeeeeeeeeeeeeeeee',
  'aglowwithpromise',
  'ttittyyttititittittittty',
  'ourdesignistomakethisausefuljournal',
  'whenproperlycooked',
  'artist-photographer',
  'riffillrillrinikimiwif',
  'bone-and-musclebuilding',
  'corretoprottento',
  'sulphureted-hydrogen',
  'chewing-contrivances',
  'provision-markets',
  'cointspripatioiv',
  'complicationsare',
  'saygoodornothing',
  'andnesumemaerystlees',
  'withoutanyofthesesuddenshocks',
  'invention-wonderful',
  'counter-influences',
  'misfortune-proof',
  'csondrraecvtaedlenattatnhdi',
  'supposischool-room',
  'right-sightedness',
  'andtherapidexten',
  'partially-organized',
  'andwinethatmakethgladthe',
  'cultureexperiments',
  'iilstrtalatiltgle',
  'forthepurposeoftakingtreatmentfortherecov',
  'ultra-rationalists',
  'isespeciallyadaptedtoweavingragcarpets',
  'kelloggwilldeliver',
  "boy'smagazineintheworld",
  'tioninregardtohealthandprivilegesbe',
  'astacniocmoucnorytt',
  'muco-menibranous',
  'backward-bendings',
  'eighteen-months-old',
  'ehtohwylliefregorshort',
  'itailkiliiiilikilifkifff',
  'self-stultification',
  'iseamaitearicaavea',
  'stomach-scouring',
  'anti-vaccinationist',
  'mimimimiyaihmimimi',
  'under-great-ohligatiens-to--the--wisconsin',
  'square-shouldered',
  'perineosinuexereeinator',
  'neuropathologist',
  'adjutant-general',
  'thirteenyear-old',
  'asepticoperating-rooms',
  "children'sindisposition",
  'socialsettlement',
  'marchnumberofthenorth',
  'diseaseproducing',
  'therearelovingwords',
  'cold-water-drinking',
  'respectable-looking',
  'chrommitrograpric',
  "greenhillsof'malvern",
  'carefullydressed',
  'drop-the-handkerchief',
  'directly-connected',
  'theinformationwhichwilltendtoenlightenthem',
  'milk-and-vegetable',
  'soerilitaitivsinin',
  'workpoethewealthy',
  'monthlytoourtable',
  'kitchen-middings',
  'temperance-school',
  'littexxcicoliptx',
  'andwewillsendpat',
  'antoiretreilcieti',
  'applicavasoconstrictors',
  'howstrongisakindword',
  'uponadietofgoodbreadmadeofunbolted',
  'instubborncasesof',
  'long-neglectedto-be-buried',
  'self-examination',
  'licrtatesalxcolcil',
  'street-crossings',
  'ten-thousand-franc',
  'entirely-depopulated',
  'lieutenant-governor',
  'anti-vivisectionists',
  'ailiiiiiiimillill',
  'attorneys-at-law',
  'consciencestricken',
  'peratureatthecloseofthebath',
  'two-thousand-mile',
  'onthestreetcorners',
  'pneumatic-solid-tip',
  'fieldforthoughtandobservation',
  'theboardslaysmore',
  'iwateredsatvheosuslaivneds',
  'rifiliragronipin',
  'tdiullemnoaunryisyhemaresnot',
  'ndeiectricirsupplyco',
  'lilifillswilthlelswwlsiudtini',
  'turetorenderitimpossibleforhimtoremain',
  'supposed-to-be-incurable',
  'butididhaveabitofarow',
  'twwwwwwwwwwwwwwwwittitittit',
  'forourbigfribiyeeato',
  'hippopotamus-liver',
  'addedtomakeuptheoriginalquantity',
  'fence-philosophers',
  'arenodronesinthesystem',
  'self-registering',
  'character-forming',
  'gaveitninecalomelpowders',
  'tempest-scatterer',
  'orthoamidotoluenesulphonic',
  'andhestchaiirrssiin',
  'irgiftgltittirst',
  'absinthe-gullets',
  'literarymiscellany',
  'mmmmmmmmmmmmmmmmmmmmmmm',
  'mountainclimbing',
  'srutlenrtiltplitbiobnrs',
  'home-surroundings',
  'koproeuodmotikion',
  'passingthroughthe',
  "three-months'-old",
  'ofacenturyagotheuseof',
  "fellow-creature's",
  'digesting-machine',
  'graefenbergesses',
  'partywhosogrosslyinsultedyour',
  'cosmetically-daubed',
  'fashion-fettered',
  'scavenger-in-chief',
  'disease-conditions',
  'poison-producing',
  'sumptuous-looking',
  'anti-vaccination',
  'rough-and-tumble',
  'small-of-the-back',
  'hob-nailed-liver',
  'copper-poisoning',
  'kneesofthechubbymorsel',
  ...],
 15)

Correction 9 -- Remove long errors strings

In [43]:
# %load shared_elements/remove-tokens-with-long-strings-of-characters.py
prev = "correction8"
cycle = "correction9"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    replacements = []
    replacements.append(clean.check_for_repeating_characters(tokens, "m|M"))
    replacements.append(clean.check_for_repeating_characters(tokens, "l|L"))
    replacements.append(clean.check_for_repeating_characters(tokens, "e|E"))
    replacements.append(clean.check_for_repeating_characters(tokens, "i|I"))
    replacements.append(clean.check_for_repeating_characters(tokens, "a|A"))
    replacements.append(clean.check_for_repeating_characters(tokens, "n|N"))
    replacements.append(clean.check_for_repeating_characters(tokens, "e|E"))
    replacements.append(clean.check_for_repeating_characters(tokens, "v|V"))
    
    replacements = [item for sublist in replacements for item in sublist]
            
    if len(replacements) > 0:
        print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    else:
        pass
    
    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
HR18671101-V02-05-page12.txt: [('EPEIENEERIREESERUEEIENIElli', ' '), ('EPEIENEERIREESERUEEIENIElli', ' ')]
HR18680901-V03-03-page14.txt: [('U.PEINIIVIIINIUIIIII', ' ')]
HR18681201-V03-06-page12.txt: [('IMUMMARIMMINOMMURIMII', ' ')]
HR18681201-V03-06-page2.txt: [('WWINJuniglilliallIMIIIIPPIPIIIINIIMIUTIMPIP"', ' ')]
HR18690101-V03-07-page12.txt: [('OFIIIIIIINIIRIA', ' ')]
HR18701201-V05-06-page8.txt: [('erIwilltellyouallaboutit."I', ' ')]
HR18740401-V09-04-page14.txt: [('P"M"rommmultimmmilmirmongumrmmlwtr.mm.Alloommi."', ' ')]
HR18741001-V09-10-page6.txt: [('PRIIINWRIPPligiPPWrinFRORIPWWTMIIMMIIIMIR', ' ')]
HR18780801-V13-08-page2.txt: [('PIPPIPRIMMIIIMPUMPOPIMMINIIIIIIN', ' ')]
HR18780801-V13-08-page30.txt: [('IMINVIIMPIIIIIIMAgglIgggiggliglIPSYNIMINIMPAPPIlla', ' ')]
HR18800101-V15-01-page1.txt: [('EIMELNIMMILMEMIIIIIIIIIIIIMIKIRIMMIIIIIIIIIII', ' ')]
HR18801201-V15-12-page1.txt: [('.IIIIIIIIIIIIIII', ' ')]
HR18810201-V16-02-page1.txt: [('IIIitIIIIIIIIIII', ' '), ('IIIIIIIIIIIIIIII', ' ')]
HR18840401-V19-04-page1.txt: [('IIIIIIIIIIIIIIIIIIIIIII', ' ')]
HR18850701-V20-07-page1.txt: [('IIIIIIIIIIIIIIIIIIII', ' ')]
HR18870101-V22-01-page35.txt: [('......A.MAMMAMA.MMAMAISAAMMAA.....W.MAMM......A.....AMM', ' ')]
HR18870201-V22-02-page36.txt: [('nnynnm..umnnmauuui', ' ')]
HR18870301-V22-03-page18.txt: [("EIEEE'EMEIENEEENEE", ' '), ("EIEEE'EMEIENEEENEE", ' ')]
HR18870301-V22-03-page33.txt: [('cmMwMMMMMM.Uwyr.qm..', ' ')]
HR18870401-V22-04-page42.txt: [('ligLrIgiiiiiiniiiril.i', ' ')]
HR18870601-V22-06-page34.txt: [('iiiiiiiiiiiiiiiiiiiii', ' ')]
HR18870601-V22-06-page42.txt: [('rymiloriiiinfiiiiiffklurigii', ' ')]
HR18870801-V22-08-page18.txt: [('MIMIIIIIMIIIIIMIIIIIMMIIMHIMIIIMIIPIMMIIIII', ' ')]
HR18870801-V22-08-page30.txt: [('MMMMMMMMMMMMMMMMM', ' ')]
HR18870901-V22-09-page18.txt: [('mmmmmmmmmmmmmmm', ' ')]
HR18870901-V22-09-page30.txt: [('MMMMMMMMMMMMM', ' ')]
HR18870901-V22-09-page34.txt: [('...mmcMCMCCMCcmimmmmmmmMMMUMMumMMwMvMMMMWO.', ' ')]
HR18870901-V22-09-page43.txt: [("II'Illiiillllllh", ' ')]
HR18871001-V22-10-page3.txt: [('illffilliElliiallar', ' ')]
HR18871001-V22-10-page30.txt: [('MMMMMMMMMMMMMMMMMMMMMMM', ' ')]
HR18871001-V22-10-page34.txt: [('NAAAAAAAAAAAAAAAA', ' ')]
HR18880201-V23-02-page5.txt: [('ffiiIs.II.IIIIIIIi.rx.ililff', ' ')]
HR18880301-V23-03-page10.txt: [('II\'IIIUIIIIIa"\'', ' ')]
HR18880401-V23-04-page45.txt: [('innunnnunmmunmuommommuumummnimumnummnmunnunnumummunnunumnunmummnamnununnunnumuunnummummumumnnunnuummumunnumunm', ' '), ('innunnnunmmunmuommommuumummnimumnummnmunnunnumummunnunumnunmummnamnununnunnumuunnummummumumnnunnuummumunnumunm', ' ')]
HR18880401-V23-04-page50.txt: [('IIIVIIIIIIIUIIUIIIiI', ' ')]
HR18880501-V23-05-page49.txt: [('muummummimmimmummmummmmum', ' '), ('LIMIIIIIIMIHIMMITIMMITMIMIRMITITTIMMlllllllllMMIMMIMMMIRIMMni', ' '), ('lllllIIIMIIIIIMITMMTIIIMITIMMIITIMR.', ' ')]
HR18880601-V23-06-page45.txt: [('mmamumnmunumnnunontoollumummummnummttnunnmunummummunnumunnumnmnumtn', ' '), ('mmamumnmunumnnunontoollumummummnummttnunnmunummummunnumunnumnmnumtn', ' ')]
HR18880601-V23-06-page48.txt: [('lllllllllllllllllllllllllllllllllll', ' '), ('lllllllllllll', ' ')]
HR18880801-V23-08-page45.txt: [('illilllllllllmlaal', ' '), ('iiiiiiiiiiiiuIuui', ' ')]
HR18880801-V23-08-page51.txt: [('mtnipommmimmmimmiTm', ' '), ('llllllllllllll', ' '), ('llllllllllllllll', ' ')]
HR18880901-V23-09-page45.txt: [('MEATuassuilimusumsavesesseuraosumniummussminsinsumuumensuountammtommusiaSeSSLIWASiilerlirellliellillenii', ' '), ('MEATuassuilimusumsavesesseuraosumniummussminsinsumuumensuountammtommusiaSeSSLIWASiilerlirellliellillenii', ' ')]
HR18881101-V23-11-page50.txt: [('ITOrrnmmimmmmnzinmnimnimumiumennzmummwnzammmmunmenmwmnymn', ' ')]
HR18881201-V23-12-page57.txt: [('llllllllllllllllllll', ' ')]
HR18911201-V26-12-page8.txt: [('iiiIIIIiillililiiiiIIIII.', ' ')]
HR18930601-V28-06-page18.txt: [('mmaucnucimmecancEmmacnionmomucamni..', ' '), ('CEELEIBICEIELECIEEMILELEEMEHMERE', ' '), ('ciEEEmemEEEEDEEEREEci', ' '), ('CEELEIBICEIELECIEEMILELEEMEHMERE', ' '), ('ciEEEmemEEEEDEEEREEci', ' ')]
HR18931001-V28-10-page40.txt: [('MIUMBIUMMIIIMMIUMMNIMIN', ' ')]
HR18931001-V28-10-page41.txt: [('MMIIIIIIMMIIMIMMIIIMIMIIMMIIIIIIMMITIMIMMIMIIMMMIN', ' '), ('MMIIIIIIMMIIMIMMIIIMIMIIMMIIIIIIMMITIMIMMIMIIMMMIN', ' ')]
HR18940201-V29-02-page2.txt: [('imilliiiiiiiiiiiiiiiiiiiminiiiiiiiiiiiiiliiiiiiiiiiiiiiiiiiiiillaniiiliiiimilliiiiiimiiiiiiiiiiiiiiiiiiiiiiiiimillia', ' '), ('imilliiiiiiiiiiiiiiiiiiiminiiiiiiiiiiiiiliiiiiiiiiiiiiiiiiiiiillaniiiliiiimilliiiiiimiiiiiiiiiiiiiiiiiiiiiiiiimillia', ' ')]
HR18940301-V29-03-page44.txt: [('Ainfilimuniiiiiiiiiiiminiminioniomiiimimounioulinillininuminwilimillioni', ' ')]
HR18940401-V29-04-page41.txt: [('VIIIIIIIIIIIIIIIIilliwummulunin', ' ')]
HR18940501-V29-05-page41.txt: [('MIMMIIMMIIIIIIIIIIIIIIIIIIIIIIIIIIIIII', ' ')]
HR18940601-V29-06-page43.txt: [('MIIIIIIIIIIIII', ' ')]
HR18940801-V29-08-page41.txt: [('iiimiiiiiimiliniminiiiiimmiumummummuiminimiumitimiliummiiiiminiumiiimilr', ' '), ('iiimiiiiiimiliniminiiiiimmiumummummuiminimiumitimiliummiiiiminiumiiimilr', ' ')]
HR18940901-V29-09-page1.txt: [('MNIMMMannOMMOMMIMMImmillMIIIIII', ' ')]
HR18941001-V29-10-page43.txt: [('louthmuullommurnumuummumuliwimiffillimuummuilliimilimilimumuumui', ' '), ('louthmuullommurnumuummumuliwimiffillimuummuilliimilimilimumuumui', ' ')]
HR18950201-V30-02-page41.txt: [('EIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIE', ' ')]
HR18960101-V31-01-page42.txt: [('effeeeeeeleeeeee', ' '), ('eeeeeeeeeeeleeeeeee', ' '), ('effeeeeeeleeeeee', ' '), ('eeeeeeeeeeeleeeeeee', ' ')]
HR18960101-V31-01-page45.txt: [('Coilleilleill', ' ')]
HR18960201-V31-02-page40.txt: [('eotoecoeceeeeetopeookeecoeeeeeeeo', ' '), ('eotoecoeceeeeetopeookeecoeeeeeeeo', ' ')]
HR18960301-V31-03-page45.txt: [('MIIIIIMIIIIII', ' ')]
HR18960301-V31-03-page47.txt: [('SIINIIIIRIIINDELECTRICALSUPPLYCO.', ' ')]
HR18960301-V31-03-page48.txt: [('MMTIVITMITTMTITIMMMTIMMTIM', ' ')]
HR18960401-V31-04-page41.txt: [('Neeeeeeeeeeeeeeeereeeeeeeeeee', ' '), ('eeeereeeeeeeeeeeeeeeereeeeeeeee', ' '), ('Neeeeeeeeeeeeeeeereeeeeeeeeee', ' '), ('eeeereeeeeeeeeeeeeeeereeeeeeeee', ' ')]
HR18960501-V31-05-page42.txt: [('Nocenceeeceeceopececeeecenece', ' '), ('SIINIIRRIANDELECTRICIIISUPPLYCO.', ' '), ('Nocenceeeceeceopececeeecenece', ' ')]
HR18960601-V31-06-page41.txt: [('weeeeeeeeeeeeeeeeeee', ' '), ('weeeeeeeeeeeeeeeeeee', ' ')]
HR18960701-V31-07-page46.txt: [('mmillitmlmmMtIttIrmmmii', ' ')]
HR18961001-V31-10-page61.txt: [('hillfillillCilliefS', ' ')]
HR18961001-V31-10-page64.txt: [('MITIMIIIIIIMITIMMIIIMMTIIMITIMMITMITIMIT', ' '), ('MITIMIIIIIIMITIMMIIIMMTIIMITIMMITMITIMIT', ' ')]
HR18961101-V31-11-page52.txt: [('Mullillcillfers', ' ')]
HR18961101-V31-12-page42.txt: [('"reeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeco', ' '), ('"reeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeco', ' ')]
HR18971101-V32-11-page68.txt: [('rieefeeocielereemee', ' '), ('rieefeeocielereemee', ' ')]
HR19000101-V35-01-page70.txt: [('AMAAAAAAAAAABOVVMWWVVVVvV', ' '), ('AAAAWAAAAWAAAAAA', ' '), ('AMAAAAAAAAAABOVVMWWVVVVvV', ' ')]
HR19000101-V35-01-page85.txt: [('IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII', ' ')]
HR19000201-V35-02-page78.txt: [('VvVVVVVVYVVVYWVVVNI', ' ')]
HR19000301-V35-03-page78.txt: [('kAAAAAAAAAAArVYYVNAAAAAAA', ' '), ('VVVIAAAAAAAAAA', ' ')]
HR19000401-V35-04-page78.txt: [('lllllllllllllllllllllllllllllllllllllllllllll', ' ')]
HR19000501-V35-05-page72.txt: [('alolalaallallallaa', ' ')]
HR19000601-V35-06-page83.txt: [('.airairaaaaa.aaaaaa', ' ')]
HR19000701-V35-07-page66.txt: [('IIIIIUlIIIIIIIIIIIIIII', ' ')]
HR19000701-V35-07-page81.txt: [("kie'VVVYVVVVOINVVVVV", ' ')]
HR19000801-V35-08-page57.txt: [('SAAAA.A.A.WAAAWAAAAAAA.', ' ')]
HR19000801-V35-08-page67.txt: [('AAAIVWSAAAAMAAWAIVc', ' ')]
HR19000801-V35-08-page71.txt: [('EtEEEEEEEEECCEWCFEE', ' '), ('FEIPEEEEEEFEEEE', ' '), ('aaisaalvsliralaaaalla', ' '), ('irsaaaaaairipiairaairato.', ' '), ('EtEEEEEEEEECCEWCFEE', ' '), ('FEIPEEEEEEFEEEE', ' ')]
HR19000901-V35-09-page67.txt: [('INAAMANAAAAANWAAINAA"AAAAAAMAAAAAAAIVVVVVVV', ' '), ('INAAAAAAAAAAAAAAAAAMA', ' '), ('WOWAAAWAAAAAAAAWAAAAAAAAAAAAAA', ' ')]
HR19001001-V35-10-page67.txt: [('WIVVNAAAAAAWAAAAAAAAAAAAAAAAAAAAAAW".', ' ')]
HR19001001-V35-10-page70.txt: [('povviviovvvvvvvv', ' ')]
HR19001001-V35-10-page76.txt: [('AIVVVVVVIANWVYVVVVVVVVVVVyvY', ' ')]
HR19001201-V35-12-page101.txt: [('AAAAAAAAAAAAA"AAAAAA', ' '), ('wkosAAAAAAAAAAAAAAMAAAAWAA', ' ')]
HR19001201-V35-12-page108.txt: [('MPP"\'is\'allIrmIlliwillroll', ' ')]
HR19001201-V35-12-page110.txt: [('AAAAAAAWAAAWANWA', ' ')]
HR19020501-V37-05-page61.txt: [('umulllllllgiiillulglol', ' ')]
HR19031001-V38-10-page75.txt: [('EIMEIMMEIMMEIMMMEIME', ' ')]
HR19040301-V39-03-page21.txt: [('IIIIVIIVPiIIiiIIIV', ' ')]
HR19040501-V39-05-page76.txt: [('PaNiiieWCiiiiCEWEVgiiiMifigaMECON', ' ')]
HR19050401-V40-04-page14.txt: [('runalwllilliallnall', ' ')]
HR19051001-V40-10-page89.txt: [('amintuummouttunummomonnommonaninnutomulto', ' ')]
HR19051101-V40-11-page13.txt: [('mmutminonnumuninnumounnitimmoummunnommumit', ' '), ('mmutminonnumuninnumounnitimmoummunnommumit', ' ')]
HR19060101-V41-01-page5.txt: [('Smtmomtimommitumnimmumuminuosimutunummuunutoi', ' ')]
HR19060301-V41-03-page50.txt: [('chickadee-dee-dee', ' '), ('chickadee-dee-dee', ' ')]
HR19060401-V41-04-page95.txt: [('wovvvovvowwwwwovvw', ' ')]
HR19070801-V42-08-page107.txt: [('IIIMIIIMIIIMIIIIIMIIINNIIIIIIMIIMMEr', ' ')]
HR19070801-V42-08-page95.txt: [('illivillirillivillb.lb', ' '), ('AllikvillibAllb', ' ')]
In [44]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/HR/correction9

Average verified rate: 0.9787164744677069

Average of error rates: 0.034487049650475

Total token count: 13908504

In [47]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[47]:
[('m', 15948),
 ('d', 11327),
 ("'", 11284),
 ('e', 8969),
 ('t', 7931),
 ('f', 7056),
 ('w', 6939),
 ('r', 6925),
 ('n', 5966),
 ('co', 5725),
 ('pm', 3762),
 ('g', 3208),
 ('th', 1569),
 ('u', 1505),
 ('k', 1445),
 ('x', 1336),
 ('z', 834),
 ('mo', 723),
 ('oz', 676),
 ('sel', 657),
 ('ex', 624),
 ('pa', 603),
 ("an'", 555),
 ('lb', 534),
 ('tion', 489),
 ('pp', 474),
 ('re', 465),
 ('-', 426),
 ('wm', 355),
 ('q', 316),
 ('al', 312),
 ('ti', 298),
 ('ft', 280),
 ('em', 267),
 ('io', 251),
 ('ro', 250),
 ('ment', 248),
 ('mt', 243),
 ('pt', 240),
 ('oo', 239),
 ('ry', 215),
 ('es', 203),
 ('il', 200),
 ('ll', 189),
 ('se', 185),
 ('ia', 183),
 ('tt', 181),
 ("hours'", 180),
 ('li', 178),
 ("''", 171)]

Correction 9 -- Split Squashed Words

In [50]:
# %load shared_elements/separate_squashed_words.py
import pandas as pd
from math import log

prev = cycle
cycle = "correction9"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

verified_tokens = []

for filename in corpus:  
    content = utilities.readfile(directories['prev'], filename)
    clean.get_approved_tokens(content, spelling_dictionary, verified_tokens)

tokens_with_freq = dict(collections.Counter(verified_tokens))
words = pd.DataFrame(list(tokens_with_freq.items()), columns=['token','freq'])
words_sorted = words.sort_values('freq', ascending=False)
words_sorted_short = words_sorted[words_sorted.freq > 2]

sorted_list_of_words = list(words_sorted_short['token'])

wordcost = dict((k, log((i+1)*log(len(sorted_list_of_words)))) for i,k in enumerate(sorted_list_of_words))
maxword = max(len(x) for x in sorted_list_of_words)

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    text = utilities.strip_punct(content)
    tokens = utilities.tokenize_text(text)
    
    replacements = []
    
    for token in tokens:
        if not token.lower() in spelling_dictionary:
            if len(token) > 17:
                if re.search(r"[\-\-\'\"]", token):
                    pass
                else:
                    split_string = clean.infer_spaces(token, wordcost, maxword)
                    list_split_string = split_string.split()
                    
                    if clean.verify_split_string(list_split_string, spelling_dictionary):
                        replacements.append((token, split_string))
                    else:
                        pass
            else:
                pass
        else:
            pass
        
    if len(replacements) > 0:
        print("{}: {}".format(filename, replacements))
        
        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
HR18670601-V01-11,12-page28.txt: [('fleshasfoodmustbedecidedly', 'flesh as food must be decidedly')]
HR18670601-V01-11,12-page29.txt: [('anyonedoesyouaninjury', 'anyone does you an injury')]
HR18670601-V01-11,12-page30.txt: [('diseaseandsuddendeathdaughter', 'disease and sudden death daughter'), ('theastonisheddoctor', 'the astonished doctor')]
HR18670601-V01-11,12-page31.txt: [('ifthepowderhadcuredtheboy', 'if the powder had cured the boy'), ('TotheAdvantageopthe', 'To the Advantage opt he')]
HR18670701-V02-01-page1.txt: [('andbyeatinglargely', 'and by eating largely')]
HR18670701-V02-01-page14.txt: [('saidaphysiciantohisser', 'said a physician to his ser')]
HR18670701-V02-01-page15.txt: [('andyoucannnoturgeiton', 'and you c ann not urge it on')]
HR18670701-V02-01-page16.txt: [('readbyfromthreetofivethousandtoeverythou', 'read by from three to five thousand to every thou'), ('andaltogetheritwasa', 'and altogether it was a')]
HR18670701-V02-01-page7.txt: [('aremorelikelytodiscoverthemuponthe', 'are more likely to discover them upon the'), ('Ifhesoldamanarattlesnakeforten', 'If he sold a man a rattlesnake for ten')]
HR18670701-V02-01-page9.txt: [('everyyearofdiarrhea', 'every year of diarrhea')]
HR18670801-V02-02-page3.txt: [('reciprocatefficiency', 'rec i proc at efficiency')]
HR18671101-V02-05-page5.txt: [('garianpeasantsfromtheCarpathian', 'gar ian peasants from the Carpathian')]
HR18671201-V02-06-page1.txt: [('variousfeverstowhichthehumankind', 'various fevers to which the humankind')]
HR18680101-V02-07-page12.txt: [('romismINIMPIMPIPRIMPIII', 'rom ism IN IMP IMP I P RIM P III')]
HR18680301-V02-09-page1.txt: [('ingwhattheywereassuredwasessential', 'ing what they were assured was essential')]
HR18680501-V02-11-page11.txt: [('thesewillsoonresultinpleasure', 'these will soon result in pleasure'), ('aspapimadefromgood', 'as pap i made from good'), ('unboltedflourormeal', 'unbolted flour or meal')]
HR18680501-V02-11-page5.txt: [('malignantremittent', 'malignant remittent')]
HR18680701-V03-01-page4.txt: [('norcompletelyfatigued', 'nor completely fatigued'), ('thatwhichisharmoniousandtemperate', 'that which is harmonious and temperate')]
HR18680801-V03-02-page7.txt: [('advertisingpatthese', 'advertising pat these')]
HR18681101-V03-05-page16.txt: [('administrathemselves', 'ad min i st rat hem selves')]
HR18681201-V03-06-page15.txt: [('medicaacquaintance', 'medica acquaintance')]
HR18681201-V03-06-page2.txt: [('nolownwallellitOMM', 'no l own wall ell it O M M')]
HR18690301-V03-09-page23.txt: [('Addressalllettersto', 'Address all letters to')]
HR18690401-V03-10-page14.txt: [('fearfullyprevalent', 'fearfully prevalent')]
HR18690501-V03-11-page2.txt: [('supposethecaseofwateratatemperatureof', 'suppose the case of water at a temperature of')]
HR18690501-V03-11-page23.txt: [('GREATWESTERNRAILWAY', 'GREAT WESTERN RAILWAY')]
HR18690601-V03-12-page23.txt: [('AddressMissWILLIAMS', 'Address Miss WILLIAMS')]
HR18690701-V04-01-page8.txt: [('hygeiotherapeutics', 'hygeio therapeutics')]
HR18690901-V04-03-page16.txt: [('easilycomprehended', 'easily comprehended')]
HR18700101-V04-07-page7.txt: [('istocauseitsownendbyits', 'is to cause its own end by its')]
HR18701201-V05-06-page4.txt: [('antiphlogistication', 'antiphlogistic at ion')]
HR18710401-V05-10-page24.txt: [('neverfailedtoproveasuccessinrecovering', 'never failed to prove a success in recovering')]
HR18710801-V06-02-page23.txt: [('consideredwholesome', 'considered wholesome'), ('knowlcontradictions', 'know l contradictions')]
HR18710901-V06-03-page22.txt: [('debilitatedcondition', 'debilitated condition')]
HR18711001-V06-04-page12.txt: [('guiltlesslamentable', 'guiltless lamentable')]
HR18711101-V06-05-page14.txt: [('snuffmanufacturers', 'snuff manufacturers')]
HR18720401-V07-04-page26.txt: [('tranquillityefficient', 'tranquillity efficient')]
HR18721101-V07-11-page7.txt: [('vegetaintelligible', 'veg eta intelligible')]
HR18730501-V08-05-page7.txt: [('otherscunninglywrought', 'others cunningly wrought')]
HR18731101-V08-11-page14.txt: [('lAspoirigppipuppeaw', 'l As poi rig p pi pup pea w')]
HR18731101-V08-11-page32.txt: [('distilistuapplanissitsal', 'dis til i stu a p plan is sit sal')]
HR18731201-V08-12-page6.txt: [('lislisoptilipytelo', 'lis lis opt i lip y tel o')]
HR18740101-V09-01-page14.txt: [('slovenlymindedness', 'slovenly mindedness')]
HR18740101-V09-01-page27.txt: [('followingparticulars', 'following particulars')]
HR18740201-V09-02-page10.txt: [('outofdoorssomewhere', 'out of doors somewhere')]
HR18740201-V09-02-page30.txt: [('ordinaryphysiological', 'ordinary physiological')]
HR18740501-V09-05-page15.txt: [('somewherecounterfeit', 'somewhere counterfeit')]
HR18740601-V09-06-page9.txt: [('Sendusthesetotdeuc', 'Send us these tot de u c')]
HR18740701-V09-07-page27.txt: [('beenperusingthisarticle', 'been perusing this article'), ('thusremovingthetemptation', 'thus removing the temptation')]
HR18740701-V09-07-page8.txt: [('pasammoiasmairtitawiramP', 'pas ammo i as mai r t i t a w i r a m P')]
HR18740801-V09-08-page15.txt: [('Themanofsuperiorintellectwouldtakethe', 'The man of superior intellect would take the')]
HR18740801-V09-08-page18.txt: [('givehertheportionofmeattowhichsheisentitled', 'give her the portion of meat to which she is entitled')]
HR18740901-V09-09-page9.txt: [('complacentlyremarked', 'complacently remarked')]
HR18741001-V09-10-page21.txt: [('Themoneythatisspentonhorsesanddress', 'The money that is spent on horses and dress')]
HR18741001-V09-10-page25.txt: [('Thewholemansmellslikeadistillery', 'The whole man smells like a distillery')]
HR18741101-V09-11-page19.txt: [('representawritings', 'represent a writings')]
HR18741201-V09-12-page19.txt: [('particthusiastically', 'par tic thus i as tic ally')]
HR18750101-V10-01-page20.txt: [('absolutelyessential', 'absolutely essential')]
HR18750601-V10-06-page22.txt: [('followcommensurate', 'follow commensurate')]
HR18751201-V10-12-page13.txt: [('ilpersuadinghimtoliedown', 'i l persuading him to lie down'), ('lustratestheeffectofayellowday', 'lust rates the effect of a yellow day')]
HR18751201-V10-12-page26.txt: [('AREMARKABLEdiscoveryWasmadeonan', 'A REMARKABLE discovery Was made on an')]
HR18751201-V10-12-page32.txt: [('theChristianChurch', 'the Christian Church')]
HR18760101-V11-01-page5.txt: [('householdwithintelligencethantospeak', 'household with intelligence than to speak'), ('withpalecheekandtearless', 'with pale cheek and tear less')]
HR18760501-V11-05-page15.txt: [('Butusedfrequenthuman', 'But used frequent human')]
HR18760601-V11-06-page1.txt: [('andwinethatmakethgladthe', 'and wine that maketh glad the')]
HR18760801-V11-08-page19.txt: [('professconstitutional', 'profess constitutional')]
HR18760901-V11-09-page31.txt: [('AMERICANASSOCIATION', 'AMERICAN ASSOCIATION')]
HR18770401-V12-04-page12.txt: [('shorttimewewereallloadedinalargespringimaginary', 'short time we were all loaded in a large spring imaginary'), ('inagenuineRockynotuain', 'in a genuine Rocky not u a i n')]
HR18770401-V12-04-page31.txt: [('PlainFactsaboutSongLife', 'Plain Facts about Song Life')]
HR18770901-V12-09-page11.txt: [('impressuponthiremindsthefactthatifGodisimpossibleforhimtothuseducatethem', 'impress up ont hire minds the fact that if God is impossible for him to thus educate them')]
HR18771201-V12-12-page9.txt: [('notwithstandvomits', 'not withstand vomits')]
HR18780401-V13-04-page27.txt: [('pockethandkerchiefs', 'pocket handkerchiefs')]
HR18780501-V13-05-page7.txt: [('breathingapparatus', 'breathing apparatus')]
HR18780701-V13-07-page13.txt: [('representtransgression', 'represent transgression')]
HR18780701-V13-07-page29.txt: [('PUBLICATIONSRECEIVED', 'PUBLICATIONS RECEIVED')]
HR18780901-V13-09-page32.txt: [('PlainFactsaboutSonalLi', 'Plain Facts about Son alL i')]
HR18781101-V13-11-page31.txt: [('SPECIALTERMSTOAGENTS', 'SPECIAL TERMS TO AGENTS')]
HR18790501-V14-05-page12.txt: [('LITERARYMISCELLANY', 'LITERARY MISCELLANY')]
HR18800201-V15-02-page18.txt: [('somethingelseifications', 'something else if i cations'), ('sticktogetherations', 'stick together at ions')]
HR18801201-V15-12-page35.txt: [('tobaccostimulation', 'tobacco stimulation')]
HR18810301-V16-03-page29.txt: [('Otaititspantulttis', 'Ot a it its pant ult tis')]
HR18810401-V16-04-page14.txt: [('ofwineandsamplesofbeer', 'of wine and samples of beer'), ('Samplesofallkindsof', 'Samples of all kinds of')]
HR18810401-V16-04-page15.txt: [('seewhatwasthematter', 'see what was the matter')]
HR18810401-V16-04-page16.txt: [('weallhavegottowither', 'we all have got to wither'), ('temperanceworldisjustnowcon', 'temperance world is just now con')]
HR18810401-V16-04-page17.txt: [('ganceonthepretenceofharmony', 'gan c eon the pretence of harmony'), ('temperancemovement', 'temperance movement'), ('Wouldhalfapintofwineadaymakea', 'Would half a pint of wine a day make a')]
HR18810401-V16-04-page18.txt: [('theexcessiveuseofardentspirits', 'the excessive use of ardent spirits'), ('andthefatherislockedoutlikethe', 'and the father is locked out like the')]
HR18810401-V16-04-page23.txt: [('notafewofthoseveryre', 'not a few of those very r e'), ('demodexfolliculorum', 'demodex fol l i c u l o r u m'), ('portionsoftheircutaneouscovering', 'portions of their cutaneous covering')]
HR18810401-V16-04-page25.txt: [('considerablequantitiesofsulphuricacid', 'considerable quantities of sulphuric acid')]
HR18810401-V16-04-page26.txt: [('pleasingandinstructive', 'pleasing and instructive')]
HR18810401-V16-04-page29.txt: [('Ashorttimeagohewastryingtomake', 'A short time ago he was trying to make')]
HR18810401-V16-04-page31.txt: [('anditsTreatmentwith', 'and its Treatment with'), ('havereceivedacopyof', 'have received a copy of')]
HR18810401-V16-04-page8.txt: [('withoutsuchcomplementsasmilk', 'without such complement s as milk'), ('cyclesofthePtolemaicsystem', 'cycles of the P tole mai c system')]
HR18810501-V16-05-page12.txt: [('ofbadthingsabouttheplumber', 'of bad things about the plumber')]
HR18810501-V16-05-page17.txt: [('Theyarestillsearchingforatheorem', 'They are still searching for a theorem'), ('Johnisveryparticularaboutthemak', 'John is very particular about them a k')]
HR18810501-V16-05-page2.txt: [('Socompletelyisthewholebody', 'So completely is the whole body')]
HR18810501-V16-05-page21.txt: [('classicalmancameintoseehim', 'classical man came into see him')]
HR18810501-V16-05-page25.txt: [('wellillustratesthepointwhich', 'well illustrates the point which'), ('Inafewdaysallisforgotten', 'In a few days all is forgotten'), ('ditionallegislationisneeded', 'dit ion a l legislation is needed')]
HR18810501-V16-05-page30.txt: [('Anexchangestatesthatpellagra', 'An exchange states that p ell agr a')]
HR18810601-V16-06-page13.txt: [('wecanhopetofightconsump', 'we can hope to fight con sum p')]
HR18810601-V16-06-page14.txt: [('uponherthefullconsciousnessofHiswork', 'upon her the full consciousness of His work')]
HR18810601-V16-06-page19.txt: [('werethoughtofhighlyby', 'were thought of highly by')]
HR18810601-V16-06-page26.txt: [('whenoccasionmayrequire', 'when occasion may require')]
HR18810601-V16-06-page28.txt: [('Letthedietconsistprin', 'Let the diet consist p r i n'), ('andtheothersecondaryaf', 'and the other secondary a f')]
HR18810601-V16-06-page30.txt: [('Byunanimousvotethereportofthecommit', 'By unanimous vote the report of the commit'), ('TheIllustratedScien', 'The Illustrated Sci en'), ('bymailofthepublishers', 'by mail of the publishers')]
HR18810601-V16-06-page31.txt: [('Thisisoneofthemostvaluablemedicaljour', 'This is one of the most valuable medical jour')]
HR18810601-V16-06-page32.txt: [('successinintroducingGood', 'success in introducing Good')]
HR18810701-V16-07-page18.txt: [('woreacapinthosedays', 'wore a cap in those days')]
HR18810701-V16-07-page20.txt: [('capitaloffenseforamagistratetobedrunk', 'capital offense for a magistrate to be drunk')]
HR18810701-V16-07-page5.txt: [('fromthehonestphysician', 'from the honest physician')]
HR18810801-V16-08-page14.txt: [('thegirlsofMilburnwould', 'the girls of Mil burn would')]
HR18810801-V16-08-page19.txt: [('butIdidhaveabitofarow', 'but I did have a bit of a row')]
HR18810801-V16-08-page20.txt: [('nineteentwentieths', 'nineteen twentieth s')]
HR18810801-V16-08-page23.txt: [('verityoftheattackhaspassedaway', 'verity of the attack has passed away')]
HR18810801-V16-08-page30.txt: [('manycompetitorsinthecountry', 'many competitors in the country')]
HR18810801-V16-08-page31.txt: [('ofthelecturesareeminentlypracticalinchar', 'of the lectures are eminently practical in char')]
HR18810801-V16-08-page32.txt: [('candomoretosavelifeandre', 'can do more to save life andre'), ('TheeditoraimstomakeGood', 'The editor aims to make Good'), ('andsuchtractsandpam', 'and such tracts and pam')]
HR18810801-V16-08-page4.txt: [('thepallorofthefaceassumesape', 'the pallor of the face assumes ape')]
HR18810801-V16-08-page7.txt: [('Noremedyismorecertainlysuccessful', 'No remedy is more certainly successful'), ('undoubtedlyalsoincreasestheactionofall', 'undoubtedly also increases the action of all')]
HR18810901-V16-09-page1.txt: [('ThismonththereadersofGoodHealth', 'This month the readers of Good Health'), ('Thissilentandinstructive', 'This silent and instructive')]
HR18810901-V16-09-page12.txt: [('withoutfoodordrinkformanyhours', 'without food or drink for many hours'), ('ofacenturyagotheuseof', 'of a century ago the use of')]
HR18810901-V16-09-page15.txt: [('theaccumulationofdirtandfilth', 'the accumulation of dirt and filth')]
HR18810901-V16-09-page16.txt: [('achargethoucouldstnothear', 'a charge thou could st not hear'), ('ofthecrimsoncurtainflushingherpaleface', 'of the crimson curtain flushing her paleface')]
HR18810901-V16-09-page17.txt: [('workequallywellforgirls', 'work equally well for girls')]
HR18810901-V16-09-page18.txt: [('ofbecomingadrunkardifsheeverchances', 'of becoming a drunkard if she ever chances')]
HR18810901-V16-09-page21.txt: [('eitherinregardtohealth', 'either in regard to health'), ('hindasthestarsonthebrowofevening', 'hind as the stars on the brow of evening')]
HR18810901-V16-09-page25.txt: [('Thousandsoflivesarean', 'Thousands of lives are an')]
HR18810901-V16-09-page27.txt: [('hefindsitrisingquiterapidlyheknows', 'he finds it rising quite rapidly he knows')]
HR18810901-V16-09-page28.txt: [('Themethodofeliminationforwhich', 'The method of elimination for which')]
HR18810901-V16-09-page30.txt: [('Encouragestheheart', 'Encourage s the heart')]
HR18810901-V16-09-page31.txt: [('OntheSelfCurabilityofDisease', 'On the Self Curability of Disease'), ('pervadedbyacharitablespirit', 'pervaded by a charitable spirit'), ('itwasadogmaamongreflective', 'it was a dogma among reflective'), ('naturalphilosophers', 'natural philosophers')]
HR18810901-V16-09-page32.txt: [('Kelloggwilldeliver', 'Kellogg will deliver')]
HR18810901-V16-09-page8.txt: [('thewholeorapartofthebodytoairwhich', 'the whole or apart of the body to air which')]
HR18811001-V16-10-page11.txt: [('acaseofthiskindwhich', 'a case of this kind which')]
HR18811001-V16-10-page13.txt: [('siderablequantityoffluidmaybeabsorbed', 's i der able quantity of fluid may be absorbed')]
HR18811001-V16-10-page19.txt: [('gloryofEgypthaslongsincede', 'glory of Egypt has long since de')]
HR18811001-V16-10-page23.txt: [('ofdiatomaceousearth', 'of diatomaceous earth')]
HR18811001-V16-10-page28.txt: [('Everyphysicianisacquaintedwith', 'Every physician is acquainted with')]
HR18811001-V16-10-page32.txt: [('shouldsendsoonforpremiumand', 'should send soon for premium and')]
HR18811101-V16-11-page16.txt: [('itnotsaidbysomegreatsage', 'it not said by some great sage')]
HR18811101-V16-11-page19.txt: [('ofthefunguspresumably', 'of the fungus presumably')]
HR18811101-V16-11-page20.txt: [('givestonetothought', 'give stone to thought')]
HR18811101-V16-11-page24.txt: [('thecoldairofwinter', 'the cold air of winter')]
HR18811101-V16-11-page27.txt: [('whichisentirelyworthy', 'which is entirely worthy')]
HR18811101-V16-11-page30.txt: [('thingsisfrankenoughtotelltheplain', 'things is frank enough to tell the plain')]
HR18811101-V16-11-page31.txt: [('werepresentedatthesanitaryconven', 'we represented at the sanitary con ven')]
HR18811101-V16-11-page6.txt: [('asortofSamsonshorn', 'a sort of Samson shorn'), ('ofadocumentpublishedbytheState', 'of a document published by the State'), ('fairerandfatterinflesh', 'fairer and fatter in flesh'), ('diphtheriashouldbepr', 'diphtheria should be p r')]
HR18811101-V16-11-page7.txt: [('Securetheisolationofthosesickwith', 'Secure the isolation of those sick with')]
HR18811201-V16-12-page12.txt: [('cultureexperiments', 'culture experiments')]
HR18811201-V16-12-page18.txt: [('ragstuckintoawindowtokeep', 'rag stuck into a window to keep')]
HR18811201-V16-12-page25.txt: [('inquiresouropinion', 'inquires our opinion')]
HR18811201-V16-12-page26.txt: [('strengthcanbemadeforoneortwocents', 'strength can be made for one or two cents'), ('itcanhardlybedetermined', 'it can hardly be determined'), ('thesefiguresthatthewholesaleprice', 'these figures that the wholesale price'), ('ginofprofitissolargebetweentheprice', 'gin of profit is so large between the price')]
HR18811201-V16-12-page30.txt: [('TheHouseholdManual', 'The Household Manual'), ('makingthetwocostthesub', 'making the two cost the sub'), ('Ifthejournalhasdoneyougood', 'If the journal has done you good'), ('receivethesadintelli', 'receive the sad in tell i')]
HR18811201-V16-12-page31.txt: [('WhatistheChiefEndofWoman', 'What is the Chief End of Woman')]
HR18811201-V16-12-page35.txt: [('AddresstheAuthoratBattle', 'Address the Author at Battle'), ('warnpeopleagainstadulterationwith', 'warn people against adulteration with')]
HR18811201-V16-12-page38.txt: [('makingthispaperaSong', 'making this paper a Song'), ('duceourfriendstoworkwithawillwe', 'duce our friends to work with a will we')]
HR18820101-V17-01-page19.txt: [('isanoldfablethatsaysanIrish', 'is an old fable that says an Irish')]
HR18820101-V17-01-page2.txt: [('AutobiographicalPoemhethusdescribesthe', 'Auto biographical Poem he thus describes the'), ('theproductofhispeculiarphilo', 'the product of his peculiar phi lo')]
HR18820101-V17-01-page22.txt: [('anotheryearhasrolledaway', 'another year has rolled away'), ('andthatitisvastlybettertopay', 'and that it is vastly better to pay'), ('asthelastnumberoftheyearwas', 'as the last number of the year was')]
HR18820101-V17-01-page27.txt: [('whichflowsindrains', 'which flows in drains')]
HR18820101-V17-01-page29.txt: [('reportsgavethisitem', 'reports gave this item'), ('theactualnumberofper', 'the actual number of per')]
HR18820101-V17-01-page30.txt: [('Iwillnotusetobaccoinanyform', 'I will not use tobacco in any form')]
HR18820101-V17-01-page31.txt: [('contestisjustnowtakingplacebe', 'contest is just now taking place be'), ('Thisisoneofthebest', 'This is one of the best')]
HR18820101-V17-01-page32.txt: [('inthecourseoftheyear', 'in the course of the year'), ('Wearemadesadtoreceive', 'We are made sad to receive'), ('andbecameStatePrinter', 'and became State Printer'), ('winovertotheranksof', 'win over to the ranks of'), ('prohibitioniststhemostinvet', 'prohibitionists the most in vet')]
HR18820201-V17-02-page19.txt: [('wherethetruedietquestionisadvocated', 'where the true diet question is advocated'), ('Icanneverunderstandher', 'I can never understand her')]
HR18820201-V17-02-page2.txt: [('therehasbeenamarkedincreaseinvital', 'there has been a marked increase in vital')]
HR18820201-V17-02-page26.txt: [('areweaskedthequestion', 'are we asked the question')]
HR18820201-V17-02-page31.txt: [('andforsalebyJosephHarris', 'and for sale by Joseph Harris'), ('TheProblemofHumanLifeHereandHere', 'The Problem of Human Life Here and Here')]
HR18820201-V17-02-page32.txt: [('anduncleanlyhabitsoccasionadevelopmentof', 'and uncleanly habits occasion a development of')]
HR18820301-V17-03-page10.txt: [('mostintelligentmanintown', 'most intelligent man in town'), ('appetiteisrestoredtoitsnormalcondition', 'appetite is restored to its normal condition')]
HR18820301-V17-03-page12.txt: [('beenusedfortwentyyearsintheNew', 'been used for twenty years in the New')]
HR18820301-V17-03-page14.txt: [('kneesofthechubbymorsel', 'knees of the chubby morsel'), ('andforeverythingelseasfarasIcansee', 'and for everything else as far as I can see')]
HR18820301-V17-03-page19.txt: [('portantthanforachildtobeathomeatnight', 'port ant than for a child to be at home at night'), ('hassaidthattrueeduca', 'has said that true ed u c a'), ('andifitbeonlytheseandnothingmore', 'and if it be only these and nothing more')]
HR18820301-V17-03-page21.txt: [('lockedintothemforasuitablelengthoftime', 'locked into them for a suitable length of time'), ('totheAtlanticMonthly', 'to the Atlantic Monthly'), ('teredwithgreatcare', 'ter ed with great care'), ('Thelightisshadedat', 'The light is shaded at')]
HR18820301-V17-03-page28.txt: [('oringtosecureathoroughpreparation', 'or ing to secure a thorough preparation')]
HR18820301-V17-03-page29.txt: [('andevenlessofsurgeryand', 'and even less of surgery and')]
HR18820301-V17-03-page32.txt: [('Wearesendingoutscoresofoutfitstoagents', 'We are sending out scores of outfits to agents'), ('OrdersmaybeforwardedtoGood', 'Orders may be forwarded to Good'), ('asapremiumwiththeGood', 'as a premium with the Good')]
HR18820301-V17-03-page4.txt: [('remaintheretodriveofftheheadache', 'remain there to drive off the headache'), ('thatthesepeoplehave', 'that these people have')]
HR18820301-V17-03-page7.txt: [('andseriousofwhichareinflammationof', 'and serious of which are inflammation of')]
HR18820401-V17-04-page12.txt: [('evermakesavigorousman', 'ever makes a vigorous man')]
HR18820401-V17-04-page14.txt: [('uponthegratefulearth', 'upon the grateful earth')]
HR18820401-V17-04-page25.txt: [('correspondentwritesus', 'correspondent writes us')]
HR18820401-V17-04-page29.txt: [('solutionsasmaybecomeasourceofirritathe', 'solutions as may become a source of ir rit a the')]
HR18820401-V17-04-page7.txt: [('amotherunfitformaternity', 'a mother unfit for maternity')]
HR18820501-V17-05-page10.txt: [('Ineversawawellmanintheex', 'I never saw a well man in thee x')]
HR18820501-V17-05-page26.txt: [('peoplethatisfastbecomingacongrega', 'people that is fast becoming a c ong reg a')]
HR18820501-V17-05-page32.txt: [('nowcomingshouldsend', 'now coming should send')]
HR18820601-V17-06-page11.txt: [('claimantsfortheBengal', 'claimants for the Bengal')]
HR18820601-V17-06-page14.txt: [('Itwasasmuchaseverthattheykeptoutof', 'It was as much as ever that they kept out of')]
HR18820601-V17-06-page18.txt: [('whethersheneedstoenterapro', 'whether she needs to enter a pro'), ('thewinepuddingsauce', 'the wine pudding sauce')]
HR18820601-V17-06-page25.txt: [('thepresentisanageofhumbuggery', 'the present is an age of humbuggery')]
HR18820601-V17-06-page3.txt: [('SpongesorElectrodes', 'Sponges or Electrodes')]
HR18820601-V17-06-page31.txt: [('alreadyhasapayinglistofsubscriberswhichisconstantlyincreasing', 'already has a paying list of subscribers which is constantly increasing'), ('ElderMattesonhasalsoissued', 'Elder Matteson has also issued')]
HR18820601-V17-06-page4.txt: [('articlesonwhichitmay', 'articles on which it may'), ('fairlyuponthehollowplacespreparedfor', 'fairly upon the hollow places prepared for'), ('thespongeconnectedwith', 'the sponge connected with'), ('addedtomakeuptheoriginalquantity', 'added to make up the original quantity')]
HR18820701-V17-07-page12.txt: [('largequantitiesofsaltandothercon', 'large quantities of salt and other con')]
HR18820701-V17-07-page13.txt: [('upthismostimportantsubject', 'up this most important subject')]
HR18820701-V17-07-page15.txt: [('greatvarietyofways', 'great variety of ways'), ('mostlyinthehandsoftheSuperintendent', 'mostly in the hands of the Superintendent')]
HR18820701-V17-07-page16.txt: [('thedistinctivenameofTemperanceSchool', 'the distinctive name of Temperance School')]
HR18820701-V17-07-page25.txt: [('articleintheAprilnumberofGood', 'article in the April number of Good'), ('seemstohavemadeconsiderable', 'seems to have made considerable'), ('receivedtheenclosedreplywhichIsendto', 'received the enclosed reply which I send to')]
HR18820701-V17-07-page31.txt: [('Thefirstnumberofanew', 'The first number of a new')]
HR18820801-V17-08-page13.txt: [('unaccompaniedbyanyconsider', 'unaccompanied by any consider')]
HR18820801-V17-08-page16.txt: [('cordialthatroomsinnolessthansixteen', 'cordial that rooms in no less than sixteen')]
HR18820801-V17-08-page18.txt: [('aretracesofprimalloveliness', 'are traces of primal loveliness')]
HR18820801-V17-08-page2.txt: [('thejailofRangoonthatitisfound', 'the jail of Rangoon that it is found')]
HR18820801-V17-08-page31.txt: [('anddoesnotreallyaddtothe', 'and does not really add to the')]
HR18820801-V17-08-page4.txt: [('moremenwalkpastthecor', 'more men walk past the cor')]
HR18820901-V17-09-page1.txt: [('areexcellentmeansoftreatment', 'are excellent means of treatment')]
HR18820901-V17-09-page18.txt: [('butwedothinkitsintospendmoney', 'but we do think its into spend money')]
HR18820901-V17-09-page24.txt: [('numberoflittlecreatureswhich', 'number of little creatures which')]
HR18820901-V17-09-page25.txt: [('obligedtowaitafewhoursfora', 'obliged to wait a few hours for a')]
HR18820901-V17-09-page27.txt: [('inallpublicandprivateplaces', 'in all public and private places')]
HR18820901-V17-09-page5.txt: [('Sodifficultisthisworkthat', 'So difficult is this work that'), ('thatthepeoplebetaughthowtolive', 'that the people be taught how to live')]
HR18821001-V17-10-page1.txt: [('followingbriefsketch', 'following brief sketch')]
HR18821001-V17-10-page11.txt: [('oftheirintoxicatingpower', 'of their intoxicating power')]
HR18821001-V17-10-page12.txt: [('Iheardofamanwhowas', 'I heard of a man who was'), ('formalltheworkofthebody', 'form all the work of the body'), ('oftencalledastimulant', 'often called a stimulant')]
HR18821001-V17-10-page2.txt: [('Slightfeverforeightorten', 'Slight fever for eight or ten')]
HR18821001-V17-10-page31.txt: [('ofthemareunusuallyfittedtoattractattention', 'of them are unusually fitted to attract attention'), ('finditasusualfullofinterestandvariety', 'find it as usual full of interest and variety')]
HR18821001-V17-10-page32.txt: [('wehavearrangedtosendGood', 'we have arranged to send Good')]
HR18821001-V17-10-page8.txt: [('whichmayboappliedbymeansoffomenoughly', 'which may b o applied by means off omen o ugh l y')]
HR18821101-V17-11-page15.txt: [('tospareinfightingalcohol', 'to spare in fighting alcohol')]
HR18821101-V17-11-page27.txt: [('littlehealthofwomen', 'little health of women'), ('Habituallossofsufficientandhealthy', 'Habitual loss of sufficient and healthy')]
HR18821101-V17-11-page31.txt: [('Thisistheuniquename', 'This is the unique name'), ('ofpersonsaffectedbycertainformsofinsanity', 'of persons affected by certain forms of insanity')]
HR18821101-V17-11-page4.txt: [('sureandsosafeasthethirst', 'sure and so safe as the thirst'), ('Manypeoplehavegottheno', 'Many people have got the no')]
HR18821101-V17-11-page5.txt: [('ciallytothenarcoticactionofthenieotine', 'c i ally to the narcotic action of then i e o t i n e')]
HR18821201-V17-12-page24.txt: [('andnowandthenaclearnoteofwarningis', 'and now and then a clear note of warning is')]
HR18821201-V17-12-page3.txt: [('WrittenforGoodHealth', 'Written for Good Health'), ('issomereasonfornearlyeveryevent', 'is some reason for nearly every event')]
HR18821201-V17-12-page30.txt: [('allwillbeinterested', 'all will be interested'), ('Forfoursubscriberswewillsendby', 'For four subscribers we will send by')]
HR18821201-V17-12-page6.txt: [('itwillbeobservedthatthe', 'it will be observed that the'), ('IIrepresentsHealthy', 'II represents Healthy')]
HR18830101-V18-01-page10.txt: [('wasinaverybadcondition', 'was in a very bad condition'), ('feredupthatitsprogressmightbestayed', 'fe red up that its progress might be stayed')]
HR18830101-V18-01-page12.txt: [('foundationofTemperance', 'foundation of Temperance'), ('definitionoftheword', 'definition of the word')]
HR18830101-V18-01-page17.txt: [('Thekitchenissomewhatlikean', 'The kitchen is somewhat like an')]
HR18830101-V18-01-page18.txt: [('Fortheinfinitevarietyoftypeswemust', 'For the infinite variety of types we must')]
HR18830101-V18-01-page2.txt: [('illustratestheEnlarged', 'illustrates the Enlarged'), ('JshowstheAtrophied', 'J shows the Atrophied')]
HR18830101-V18-01-page29.txt: [('neitheryenoryoursons', 'neither yen or your sons'), ('foruntothisdaythey', 'for unto this day they')]
HR18830101-V18-01-page3.txt: [('Bytheaidofalittleinstrumentknown', 'By the aid of a little instrument known')]
HR18830101-V18-01-page30.txt: [('publishedbyWilliamsand', 'published by Williams and')]
HR18830101-V18-01-page31.txt: [('fromthepenofarespectedmem', 'from the pen of a respected mem')]
HR18830101-V18-01-page32.txt: [('arehappytobeabletoreportforGoonHealth', 'are happy to be able to report for Goon Health')]
HR18830101-V18-01-page5.txt: [('WhilearrangingtoplaceGood', 'While arranging to place Good')]
HR18830201-V18-02-page14.txt: [('andhecomeoutaspure', 'and he come out as pure')]
HR18830201-V18-02-page17.txt: [('Musthaveatimeonceinawhile', 'Must have a time once in awhile')]
HR18830201-V18-02-page31.txt: [('missionsofProtestantwriters', 'missions of Protestant writers'), ('Thisannualvisitoris', 'This annual visitor is')]
HR18830201-V18-02-page32.txt: [('theinformationwhichwilltendtoenlightenthem', 'the information which will tend to enlighten them'), ('Fromcommunicationsreceived', 'From communications received')]
HR18830301-V18-03-page15.txt: [('promotethecomfortandinterestofothers', 'promote the comfort and interest of others')]
HR18830301-V18-03-page16.txt: [('numberandvarietyoftheirdishes', 'number and variety of their dishes')]
HR18830301-V18-03-page2.txt: [('theterminationofthewar', 'the termination of the war'), ('accountofthememorablemarchwaspub', 'account of the memorable march was pub')]
HR18830301-V18-03-page26.txt: [('tokeepthefamilyingoodhealth', 'to keep the family in good health')]
HR18830301-V18-03-page27.txt: [('Itismadeofthesoakageoftheswamps', 'It is made of the soakage of the swamps')]
HR18830301-V18-03-page29.txt: [('wishestoknowwhetherbuck', 'wishes to know whether buck')]
HR18830301-V18-03-page30.txt: [('isthenameofaweeklyjournalpub', 'is the name of a weekly journal pub')]
HR18830301-V18-03-page31.txt: [('pamphletissupplemented', 'pamphlet is supplemented'), ('MarchnumberoftheNorth', 'March number of the North'), ('Thisclaimstobetheoldestprophetic', 'This claims to be the oldest prophetic')]
HR18830301-V18-03-page32.txt: [('inconnectionwiththe', 'in connection with the')]
HR18830301-V18-03-page9.txt: [('homuststudythelawsofhealth', 'ho must study the laws of health')]
HR18830401-V18-04-page11.txt: [('theasceticismoftheancient', 'the asceticism of the ancient')]
HR18830401-V18-04-page13.txt: [('andareaverypowerfulclassofmen', 'and are a very powerful class of men')]
HR18830401-V18-04-page19.txt: [('ofthishabitofconformingtothehygienic', 'of this habit of conforming to the hygienic'), ('thusexposingourignorance', 'thus exposing our ignorance')]
HR18830401-V18-04-page2.txt: [('inthesmallestdropofwater', 'in the smallest drop of water'), ('seenfoessallyforthtopreyuponourdear', 'seen foes sally forth to prey upon our dear')]
HR18830401-V18-04-page20.txt: [('keeplittlechildrenoutoffactories', 'keep little children out of factories'), ('theweedbythedaylightfair', 'the weed by the daylight fair')]
HR18830401-V18-04-page22.txt: [('weeksagowewerecalledonto', 'weeks ago we were called on to')]
HR18830401-V18-04-page24.txt: [('benodoubtthatfacts', 'be no doubt that facts')]
HR18830401-V18-04-page29.txt: [('havehotwetclothsappliedtothehead', 'have hot wet cloths applied to the head')]
HR18830801-V18-08-page24.txt: [('fashionablydressed', 'fashionably dressed')]
HR18830901-V18-09-page32.txt: [('thearrangementswhichhadbeenmadefortheconvention', 'the arrangements which had been made for the convention'), ('questionofvegetable', 'question of vegetable')]
HR18831101-V18-11-page15.txt: [('surprisingproperties', 'surprising properties')]
HR18831101-V18-11-page39.txt: [('questionofvegetableversusanimalfood', 'question of vegetable versus animal food')]
HR18840201-V19-02-page8.txt: [('Itisassumedthatthestomach', 'It is assumed that the stomach')]
HR18840401-V19-04-page25.txt: [('Therecommendations', 'The recommendations')]
HR18840701-V19-07-page29.txt: [('whichnaturehassoabundantlysuppliedus', 'which nature has so abundantly supplied us'), ('furoveninapanhalffulloflionigwater', 'fur oven in a pan half full of lion i g w a t e r')]
HR18840701-V19-07-page4.txt: [('thereestablishment', 'there establishment')]
HR18840901-V19-09-page18.txt: [('universallyaccepted', 'universally accepted')]
HR18841201-V19-12-page31.txt: [('ReadingversusKnowledge', 'Reading versus Knowledge')]
HR18850101-V20-01-page13.txt: [('GEORGEWESTBROOKwere', 'GEORGE WEST BROOK were')]
HR18850101-V20-01-page9.txt: [('remarkablyefficacious', 'remarkably efficacious')]
HR18851001-V20-10-page26.txt: [('polypharmaceutists', 'polyp harm ace u t i s t s')]
HR18860801-V21-08-page4.txt: [('innocentgeneralization', 'innocent generalization')]
HR18861201-V21-12-page2.txt: [('andAtlanticExpresses', 'and Atlantic Expresses')]
HR18861201-V21-12-page39.txt: [('SpecialDepartments', 'Special Departments')]
HR18861201-V21-12-page41.txt: [('iCnombviiteedelTracd', 'i C nom b vii tee del T r a c d')]
HR18870101-V22-01-page22.txt: [('presentogeneration', 'present o generation')]
HR18870101-V22-01-page37.txt: [('Agriculturistdescribes', 'Agriculturist describes')]
HR18870101-V22-01-page5.txt: [('AllInvalidsandTouristsmayfeelassuredthattheywill', 'All Invalids and Tourists may feel assured that they will'), ('becourteouslyreceived', 'be courteously received')]
HR18870301-V22-03-page23.txt: [('ignorantenthusiast', 'ignorant enthusiast')]
HR18870901-V22-09-page1.txt: [('ConSsuimirpitifoTnoContagious', 'Con S sui mir pit if oT no Contagious')]
HR18871101-V22-11-page44.txt: [('attractiveillustrations', 'attractive illustrations')]
HR18871201-V22-12-page54.txt: [('AbdominalSupporters', 'Abdominal Supporters')]
HR18880101-V23-01-page55.txt: [('ForeignoPeriodical', 'Foreign o Periodical')]
HR18880201-V23-02-page4.txt: [('picitcuturreeGallery', 'pic it cut ur ree Gallery')]
HR18880201-V23-02-page50.txt: [('onthewarpbeamdirectfromthespools', 'on the warp beam direct from the spools')]
HR18880301-V23-03-page3.txt: [('periodicalpublished', 'periodical published')]
HR18880301-V23-03-page9.txt: [('BATTLECREEKBUGGYCO', 'BATTLE CREEK BUGGY C O')]
HR18880401-V23-04-page57.txt: [('BATTLECREEKBUGGYCO', 'BATTLE CREEK BUGGY C O')]
HR18880601-V23-06-page45.txt: [('INHEALTHANDDISEASE', 'IN HEALTH AND DISEASE')]
HR18880701-V23-07-page49.txt: [('BATTLECREEKBUGGYCO', 'BATTLE CREEK BUGGY C O')]
HR18880701-V23-07-page54.txt: [('fortheTreatmentofDiseasesoftheEYE', 'for the Treatment of Diseases of the EYE')]
HR18880801-V23-08-page46.txt: [('NEWSPAPERSINCITIES', 'NEWSPAPERS IN CITIES')]
HR18880801-V23-08-page47.txt: [('satisfactioninallparticulars', 'satisfaction in all particulars')]
HR18881101-V23-11-page53.txt: [('INHEALTHANDDISEASE', 'IN HEALTH AND DISEASE')]
HR18881101-V23-11-page57.txt: [('thanthisHolidayGem', 'than this Holiday Gem'), ('THESIGNSOfTHETIMES', 'THE SIGNS Of THE TIMES')]
HR18881201-V23-12-page46.txt: [('extensiveexperience', 'extensive experience')]
HR18881201-V23-12-page50.txt: [('EnglishIllustrated', 'English Illustrated')]
HR18881201-V23-12-page55.txt: [('GOODHEALTHPUBLISHING', 'GOOD HEALTH PUBLISHING')]
HR18890101-V24-01-page43.txt: [('EnglishIllustrated', 'English Illustrated')]
HR18890101-V24-01-page46.txt: [('ForCircularsAddress', 'For Circulars Address')]
HR18890201-V24-02-page43.txt: [('EnglishIllustrated', 'English Illustrated')]
HR18890301-V24-03-page26.txt: [('oxydimethylchinizine', 'ox y dim ethyl chin i zine')]
HR18890501-V24-05-page5.txt: [('hereditarycontinent', 'hereditary continent')]
HR18890601-V24-06-page45.txt: [('mostcomprehensiveCycling', 'most comprehensive Cycling')]
HR18890701-V24-07-page8.txt: [('consciencestricken', 'conscience stricken')]
HR18890801-V24-08-page11.txt: [('polypharmaceutists', 'polyp harm ace u t i s t s')]
HR18890801-V24-08-page43.txt: [('IsEspeciallyAdaptedtoweavingRagCarpets', 'Is Especially Adapted to weaving Rag Carpets')]
HR18891101-V24-11-page44.txt: [('TheNiagaraFallsRoute', 'The Niagara Falls Route')]
HR18891201-V24-12-page16.txt: [('disproporexamining', 'dis prop or examining')]
HR18900201-V25-02-page39.txt: [('EnglishIllustrated', 'English Illustrated')]
HR18900201-V25-02-page42.txt: [('SCIENTIFICAMERICAN', 'SCIENTIFIC AMERICAN')]
HR18900301-V25-03-page42.txt: [('SCIENTIFICAMERICAN', 'SCIENTIFIC AMERICAN')]
HR18900301-V25-03-page46.txt: [('ForCircularsAddress', 'For Circulars Address')]
HR18900501-V25-05-page42.txt: [('EnglishIllustrated', 'English Illustrated')]
HR18900601-V25-06-page42.txt: [('ForCircularsAddress', 'For Circulars Address')]
HR18900801-V25-08-page37.txt: [('ISERIESOFTENCHROMO', 'I SERIES OFTEN CHROMO')]
HR18900901-V25-09-page11.txt: [('kindergartentrained', 'kindergarten trained')]
HR18900901-V25-09-page46.txt: [('ForCircularsAddreets', 'For Circulars Add ree t s')]
HR18901101-V25-11-page4.txt: [('LOSTTHANKSGIVINGDINNER', 'LOST THANKSGIVING DINNER')]
HR18910301-V26-03-page34.txt: [('temperaturereducing', 'temperature reducing')]
HR18910401-V26-04-page45.txt: [('sellingsubscription', 'selling subscription')]
HR18910501-V26-05-page45.txt: [('sellingsubscription', 'selling subscription')]
HR18911001-V26-10-page39.txt: [('PEROXIDEofHYDROGEN', 'PEROXIDE of HYDROGEN')]
HR18911001-V26-10-page41.txt: [('HEALTHPUBLISHINGCOMPANY', 'HEALTH PUBLISHING COMPANY')]
HR18920101-V27-01-page22.txt: [('greatgrandchildren', 'great grandchildren')]
HR18920601-V27-06-page45.txt: [('sellingsubscription', 'selling subscription')]
HR18920701-V27-07-page31.txt: [('somewhatdiminished', 'somewhat diminished')]
HR18920701-V27-07-page45.txt: [('withitsgreatvarietyofpatients', 'with its great variety of patients')]
HR18920901-V27-09-page11.txt: [('groupingthemtogetherasrespiratoryandforcegenerating', 'grouping them together as respiratory and force generating')]
HR18921001-V27-10-page42.txt: [('AcceptabletotheMostFastidiouspalate', 'Acceptable to the Most Fastidious palate'), ('Sendforcircularsof', 'Send for circulars of')]
HR18930101-V28-01-page45.txt: [('AndtoEveryCivilized', 'And to Every Civilized')]
HR18930301-V28-03-page41.txt: [('ScientificAmerican', 'Scientific American')]
HR18930501-V28-05-page48.txt: [('sellingsubscription', 'selling subscription')]
HR18930901-V28-09-page45.txt: [('Acittatiearocollft', 'A cit tat i ear o col l f t')]
HR18931001-V28-10-page40.txt: [('sitonststennwoutaims', 'sit on sts ten n w o u t a i m s')]
HR18940101-V29-01-page27.txt: [('physiologicalinterval', 'physiological interval')]
HR18940101-V29-01-page41.txt: [('beequippedwithawheel', 'be equipped with a wheel')]
HR18940301-V29-03-page44.txt: [('beequippedwithawheel', 'be equipped with a wheel')]
HR18940401-V29-04-page38.txt: [('MedicallMissionary', 'Medical l Missionary')]
HR18940501-V29-05-page10.txt: [('pockethandkerchiefs', 'pocket handkerchiefs')]
HR18940601-V29-06-page41.txt: [('chinefullywarranted', 'chine fully warranted')]
HR18940801-V29-08-page36.txt: [('landscapegardening', 'landscape gardening')]
HR18941001-V29-10-page44.txt: [('ingeniouslydevised', 'ingeniously devised')]
HR18941201-V29-12-page45.txt: [('SANITARYANDELECTRICALSUPPLYCO', 'SANITARY AND ELECTRICAL SUPPLY C O')]
HR18941201-V29-12-page47.txt: [('ditilyexceptSanday', 'di til y except San day')]
HR18950101-V30-01-page40.txt: [('WILLBEBETTERTHANEVER', 'WILL BE BETTER THAN EVER')]
HR18950101-V30-01-page44.txt: [('ingeniouslydevised', 'ingeniously devised')]
HR18950201-V30-02-page41.txt: [('ingeniouslydevised', 'ingeniously devised')]
HR18950301-V30-03-page10.txt: [('supersensitiveness', 'supersensitive ness')]
HR18950501-V30-05-page41.txt: [('highestpricedwheelonthemarket', 'highest priced wheel on the market')]
HR18950501-V30-05-page42.txt: [('RetinaCollegeandPallsWoolHalm', 'Retina College and Palls Wool Hal m')]
HR18950701-V30-07-page40.txt: [('ANDELECTRICALSUPPLYCO', 'AND ELECTRICAL SUPPLY C O')]
HR18950801-V30-08-page40.txt: [('andworthyofthetitleitbears', 'and worthy of the title it bears')]
HR18951101-V30-11-page43.txt: [('SurgicalandVeterinaryInstruments', 'Surgical and Veterinary Instruments')]
HR18951201-V30-12-page46.txt: [('SURGICALANDVETERINARYINSTRUMENTS', 'SURGICAL AND VETERINARY INSTRUMENTS')]
HR18951201-V30-12-page53.txt: [('ANDELECTRICALSUPPLYCO', 'AND ELECTRICAL SUPPLY C O')]
HR18960101-V31-01-page47.txt: [('SANITARYANDELECTRICALSUPPLYCO', 'SANITARY AND ELECTRICAL SUPPLY C O')]
HR18960201-V31-02-page40.txt: [('SANITARYANDELECTRICALSUPPLYCO', 'SANITARY AND ELECTRICAL SUPPLY C O')]
HR18960501-V31-05-page44.txt: [('SURGICALANDVETERINARYINSTRUMENTS', 'SURGICAL AND VETERINARY INSTRUMENTS')]
HR18960701-V31-07-page42.txt: [('SANITARYANDELECTRICALSUPPLYCO', 'SANITARY AND ELECTRICAL SUPPLY C O')]
HR18960801-V31-08-page30.txt: [('Isthereanyhelpforanenlarged', 'Is there any help for an enlarged')]
HR18960901-V31-09-page46.txt: [('mathAleantrementrof', 'mat h Ale ant rem en t r o f')]
HR18960901-V31-09-page62.txt: [('SANITARYANDELECTRICALSUPPLYCO', 'SANITARY AND ELECTRICAL SUPPLY C O')]
HR18970101-V32-01-page57.txt: [('Mentionthispublication', 'Mention this publication')]
HR18970201-V32-02-page31.txt: [('theirmismanagement', 'their mismanagement')]
HR18970501-V32-05-page74.txt: [('NEBRASKASANITARIUM', 'NEBRASKA SANITARIUM')]
HR18970601-V32-06-page25.txt: [('pockethandkerchiefs', 'pocket handkerchiefs')]
HR18970601-V32-06-page63.txt: [('digestivedisorders', 'digestive disorders')]
HR18970601-V32-06-page70.txt: [('dyspepsiaproducing', 'dyspepsia producing')]
HR18970701-V32-07-page25.txt: [('magneticelectrical', 'magnetic electrical')]
HR18970801-V32-08-page26.txt: [('apprenticeexercise', 'apprentice exercise')]
HR18971101-V32-11-page20.txt: [('supersensitiveness', 'supersensitive ness')]
HR18971201-V32-12-page65.txt: [('HEALTHOFBODYANDMIND', 'HEALTH OF BODY AND MIND')]
HR18980101-V33-01-page49.txt: [('considerablylarger', 'considerably larger')]
HR18980301-V33-03-page21.txt: [('transportaintellectual', 'transport a intellectual')]
HR18980401-V33-04-page40.txt: [('fragranceeverywhere', 'fragrance everywhere')]
HR18980701-V33-07-page17.txt: [('distinconstriction', 'dist in constriction')]
HR18981001-V33-10-page68.txt: [('InelNonderlallsolthe', 'In el Non der l all sol the')]
HR18981101-V33-11-page38.txt: [('importantdepartments', 'important departments')]
HR18990201-V34-02-page35.txt: [('hydrotherapeutists', 'hydro therapeutist s')]
HR18990201-V34-02-page42.txt: [('intellectuallooking', 'intellectual looking')]
HR18990301-V34-03-page56.txt: [('dyspepsiaproducing', 'dyspepsia producing')]
HR18990401-V34-04-page15.txt: [('troublesomesymptom', 'troublesome symptom')]
HR18990401-V34-04-page2.txt: [('dyspepsiaproducing', 'dyspepsia producing')]
HR18990401-V34-04-page64.txt: [('CleanineDepartment', 'C lea nine Department')]
HR18990801-V34-08-page41.txt: [('begintablespoonful', 'begin tablespoonful')]
HR19000101-V35-01-page74.txt: [('ThisHartwellDoubleGlassDoorBookcase', 'This Hartwell Double Glass Door Book case')]
HR19000101-V35-01-page83.txt: [('adjustablebearings', 'adjustable bearings')]
HR19000201-V35-02-page79.txt: [('StandardTypewriter', 'Standard Typewriter')]
HR19000301-V35-03-page66.txt: [('miscellaneousmatter', 'miscellaneous matter')]
HR19000401-V35-04-page28.txt: [('hydrotherapeutically', 'hydrotherapeutic ally')]
HR19000501-V35-05-page89.txt: [('TheBattleCreekSanitarium', 'The Battle Creek Sanitarium')]
HR19000601-V35-06-page16.txt: [('factorsaccompanying', 'factors accompanying')]
HR19000701-V35-07-page47.txt: [('musclestrengthening', 'muscle strengthening')]
HR19000701-V35-07-page81.txt: [('BeveridgeSteamCooker', 'Beveridge Steam Cooker')]
HR19000701-V35-07-page84.txt: [('preventsescapeofallgerms', 'prevents escape of all germs'), ('satisfiesthestarvinglikebread', 'satisfies the starving like bread'), ('nothingsuppliescertain', 'nothing supplies certain')]
HR19000701-V35-07-page95.txt: [('SWEDENBOISMIIMIIIMEMMIZMINE', 'SWEDEN B O ISM I IM III MEM M I Z M I N E')]
HR19000801-V35-08-page73.txt: [('Inreadingandseeking', 'In reading and seeking')]
HR19000901-V35-09-page12.txt: [('dressmakerobstacle', 'dressmaker obstacle')]
HR19000901-V35-09-page53.txt: [('ReducesFuelBillsOneHalf', 'Reduces Fuel Bills One Half')]
HR19000901-V35-09-page68.txt: [('TheSanitasNutFoodCo', 'The Sanitas Nut Food C o')]
HR19000901-V35-09-page71.txt: [('OmahaleMinneapolis', 'Omaha le Minneapolis')]
HR19000901-V35-09-page77.txt: [('THEBESTFOODFORSUMMER', 'THE BEST FOOD FOR SUMMER')]
HR19001001-V35-10-page59.txt: [('specialillustratednoticeintheAmerican', 'special illustrated notice in the American')]
HR19001001-V35-10-page73.txt: [('inatnedriodrecotrhae', 'in at ned rio dr eco t r h a e')]
HR19001201-V35-12-page62.txt: [('electrotherapeutic', 'electro therapeutic')]
HR19020101-V37-01-page20.txt: [('universallyclassified', 'universally classified')]
HR19020101-V37-01-page79.txt: [('andisartisticallyprintedand', 'and is artistically printed and')]
HR19020201-V37-02-page29.txt: [('dangerouscharacter', 'dangerous character')]
HR19020301-V37-03-page53.txt: [('AddressHealthLibraryAssociation', 'Address Health Library Association')]
HR19020301-V37-03-page78.txt: [('BathsofEveryDescription', 'Baths of Every Description')]
HR19020401-V37-04-page49.txt: [('SuchaninstitutionisthatatSouthcLasaterr', 'Such an institution is that at South c Las at err'), ('otherrespectsunhealthful', 'other respects unhealthful')]
HR19020401-V37-04-page51.txt: [('unconventionalities', 'unconventional i ties')]
HR19020401-V37-04-page67.txt: [('ThePearloftheRepublic', 'The Pearl of the Republic')]
HR19020501-V37-05-page61.txt: [('MAILINGDIRECTORYof', 'MAIL ING DIRECTORY of')]
HR19020501-V37-05-page71.txt: [('Inreplyingtoadvertisementspleasemention', 'In replying to advertisements please mention')]
HR19020501-V37-05-page76.txt: [('etwaveiweeewesoweveiftergearrwair', 'et wave i wee ewe sow eve if ter gear r w a i r')]
HR19020601-V37-06-page54.txt: [('Intheheartofthecity', 'In the heart of the city')]
HR19020601-V37-06-page55.txt: [('TOLEDOSPRINGANDMATTRESSCO', 'TOLEDO SPRING AND MATTRESS C O')]
HR19020701-V37-07-page40.txt: [('autoiritoxications', 'auto iri toxic at ions')]
HR19020701-V37-07-page70.txt: [('withallmodernconveniences', 'with all modern conveniences')]
HR19021001-V37-10-page52.txt: [('Themotiveinchoosing', 'The motive in choosing')]
HR19030301-V38-03-page59.txt: [('FRIGHTFULDEATHRATE', 'FRIGHTFUL DEATH RATE')]
HR19030301-V38-03-page68.txt: [('DailyPullmanservicebetweenSt', 'Daily Pullman service between St')]
HR19030401-V38-04-page65.txt: [('willbeimprovedbythecontinued', 'will be improved by the continued')]
HR19030401-V38-04-page66.txt: [('MenandWomenathOrne', 'Men and Women at h Or n e')]
HR19030501-V38-05-page36.txt: [('greatgrandchildren', 'great grandchildren')]
HR19030501-V38-05-page44.txt: [('oftenunconsciously', 'often unconsciously')]
HR19030701-V38-07-page61.txt: [('DANGEROUSGASOLINESTOVES', 'DANGEROUS GASOLINE STOVES')]
HR19030701-V38-07-page74.txt: [('toadvertisementspleasemention', 'to advertisements please mention')]
HR19030701-V38-07-page75.txt: [('Inreplyingtoadvertisementspleasemention', 'In replying to advertisements please mention')]
HR19030801-V38-08-page33.txt: [('passionstimulating', 'passion stimulating')]
HR19030801-V38-08-page62.txt: [('lieutenantgovernor', 'lieutenant governor')]
HR19031001-V38-10-page70.txt: [('toadvertisementspleasemention', 'to advertisements please mention')]
HR19031001-V38-10-page76.txt: [('toadvertisementspleasemention', 'to advertisements please mention')]
HR19031101-V38-11-page61.txt: [('InreplyingtoadvertisementspleasementionGOOD', 'In replying to advertisements please mention GOOD')]
HR19031101-V38-11-page66.txt: [('distillationmaking', 'distillation making')]
HR19040101-V39-01-page30.txt: [('delightfulbreakfast', 'delightful breakfast')]
HR19040101-V39-01-page63.txt: [('NoMemberofyourFamily', 'No Member of your Family')]
HR19040201-V39-02-page70.txt: [('ANTISEPTICDENTIFRICE', 'ANTISEPTIC DENTIFRICE')]
HR19040301-V39-03-page70.txt: [('SplendidforCookingandHeating', 'Splendid for Cooking and Heating')]
HR19040401-V39-04-page78.txt: [('orderssnanadlappoianttinaggents', 'orders s nan ad lap poi ant tin a g g e n t s'), ('SplendidforCookingandHeating', 'Splendid for Cooking and Heating')]
HR19040501-V39-05-page35.txt: [('interchangeableness', 'interchangeable ness')]
HR19040501-V39-05-page69.txt: [('advertisementplease', 'advertisement please')]
HR19040801-V39-08-page95.txt: [('whichmaketheWingPiano', 'which make the Wing Piano')]
HR19040801-V39-08-page98.txt: [('ALLENMANUFACTURING', 'ALLEN MANUFACTURING')]
HR19041001-V39-10-page75.txt: [('WithGoonHEALTHoneyear', 'With Goon HEALTH one year')]
HR19041101-V39-11-page12.txt: [('GOODHEALTHPUBLISHINGCO', 'GOOD HEALTH PUBLISHING C O')]
HR19041101-V39-11-page64.txt: [('greatgrandchildren', 'great grandchildren')]
HR19041101-V39-11-page85.txt: [('YourHealthwillbeimprovedbythecon', 'Your Health will be improved by the con')]
HR19050101-V40-01-page51.txt: [('occupationdeformities', 'occupation deformities')]
HR19050201-V40-02-page12.txt: [('RecipewitheachMill', 'Recipe with each Mill')]
HR19050201-V40-02-page76.txt: [('THENEWVOICECOMPANY', 'THE NEW VOICE COMPANY')]
HR19050201-V40-02-page77.txt: [('LaythisPaperDownandWriteNOW', 'Lay this Paper Down and Write NOW')]
HR19050201-V40-02-page78.txt: [('DIRECTFROMOURFACTORIES', 'DIRECT FROM OUR FACTORIES')]
HR19050201-V40-02-page79.txt: [('MICHIGANCENTRALhas', 'MICHIGAN CENTRAL has')]
HR19050201-V40-02-page80.txt: [('Cashoreasymonthlypayments', 'Cash or easy monthly payments')]
HR19050401-V40-04-page77.txt: [('Samplesofleatherssentonrequest', 'Samples of leathers sent on request')]
HR19050501-V40-05-page66.txt: [('andMassageInstruments', 'and Massage Instruments')]
HR19050901-V40-09-page72.txt: [('completecatalogueofbooksuponrequest', 'complete catalogue of books upon request')]
HR19051001-V40-10-page92.txt: [('CALIFORNIASANITARIUM', 'CALIFORNIA SANITARIUM')]
HR19051101-V40-11-page4.txt: [('THESTATISTICSOFCOCOAIMPORTATIONS', 'THE STATISTICS OF COCOA IMP OR T A T I O N S')]
HR19051201-V40-12-page10.txt: [('givingvaluableInformationaboutcorrect', 'giving valuable Information about correct')]
HR19051201-V40-12-page48.txt: [('disagreeablelooking', 'disagreeable looking')]
HR19060101-V41-01-page73.txt: [('LETUSMAKEYOULIKEWISEPROSPEROUS', 'LET US MAKE YOU LIKEWISE PROSPEROUS'), ('Scientificiimerican', 'Scientific ii mer i can')]
HR19060101-V41-01-page75.txt: [('Pleasestatewhether', 'Please state whether')]
HR19060201-V41-02-page82.txt: [('GOODALLTHEWAYTHROUGH', 'GOOD ALL THE WAY THROUGH')]
HR19060301-V41-03-page21.txt: [('dehydrochlorinated', 'de hydro chlorinated')]
HR19060301-V41-03-page70.txt: [('associationsthroughout', 'associations throughout')]
HR19060401-V41-04-page96.txt: [('Itisatimesaverforallbusypeople', 'It is a time saver for all busy people')]
HR19060601-V41-06-page103.txt: [('TwoHundredThousandpairsnowinactualuse', 'Two Hundred Thousand pairs now in actual use'), ('rioterauspairpostailetsoday', 'riot era us pair post ai let so day'), ('forourbigFRIBIYEeato', 'for our big F R I B I Y E e a t o')]
HR19060701-V41-07-page14.txt: [('alBETTERBICYCLESatLOWERPRICES', 'a l BETTER BICYCLES at LOWER PRICES')]
HR19060801-V41-08-page6.txt: [('printwoeuirghpsrice', 'print woe u i r g h p s r i c e')]
HR19060901-V41-09-page13.txt: [('MagicPocketVaporizer', 'Magic Pocket Vaporizer')]
HR19060901-V41-09-page65.txt: [('whichwouldseemonlywarmtoit', 'which would seem only warm to it')]
HR19061001-V41-10-page4.txt: [('EXPLAINSTHEONLYWAYOFPER', 'EXPLAINS THE ONLY WAY OF PER'), ('GOODALLTHEWAYTHROUGH', 'GOOD ALL THE WAY THROUGH')]
HR19061001-V41-10-page81.txt: [('appetitestimulating', 'appetite stimulating')]
HR19061201-V41-12-page84.txt: [('EXPLAINSTHEONLYWAYOFPER', 'EXPLAINS THE ONLY WAY OF PER')]
HR19061201-V41-12-page95.txt: [('ANDINFECTIOUSDISEASES', 'AND INFECTIOUS DISEASES')]
HR19070201-V42-02-page69.txt: [('TheNilesBryantSchoolofPianoTuning', 'The Niles Bryant School of Piano Tuning')]
HR19070301-V42-03-page85.txt: [('EquippedSanitariumNearChicago', 'Equipped Sanitarium Near Chicago')]
HR19070301-V42-03-page86.txt: [('seldonilluniforrkinfoornpostrio', 's eld on ill uni for r k i n f o o r n p o s t r i o')]
HR19070401-V42-04-page6.txt: [('valuableinformation', 'valuable information')]
HR19070501-V42-05-page49.txt: [('diseasepredisposing', 'disease predisposing')]
HR19070501-V42-05-page82.txt: [('oisensearItgtuelmls', 'o is ens ear It g t u e l m l s')]
HR19070601-V42-06-page104.txt: [('HELVETIAMILKCONDENSING', 'HELVETIA MILK CONDENSING')]
HR19070601-V42-06-page6.txt: [('thoroughlyequipped', 'thoroughly equipped')]
HR19070601-V42-06-page7.txt: [('TwoHundredThousandpairsnowinactualuse', 'Two Hundred Thousand pairs now in actual use')]
HR19070601-V42-06-page89.txt: [('ANDINFECTIOUSDISEASES', 'AND INFECTIOUS DISEASES'), ('WILLSEMIFREESInLessons', 'WILL SEMI FREE S In Lessons')]
HR19070601-V42-06-page92.txt: [('EquippedSanitariumNearChicago', 'Equipped Sanitarium Near Chicago')]
HR19070701-V42-07-page12.txt: [('progressivehhousekeeper', 'progressive h housekeeper')]
HR19070701-V42-07-page87.txt: [('thoroughlyequipped', 'thoroughly equipped')]
HR19070901-V42-09-page103.txt: [('yourgrocerhasyourinterestsinviewhewill', 'your grocer has your interests in view he will')]
HR19071001-V42-10-page26.txt: [('oftiresfromanyonNe', 'of tires from any on N e')]
HR19071001-V42-10-page62.txt: [('tothepartswhicharebeingrubbed', 'to the parts which are being rubbed')]
HR19071101-V42-11-page108.txt: [('FORPROSPECTUSADDRESS', 'FOR PROSPECTUS ADDRESS')]
HR19071101-V42-11-page92.txt: [('BICYCLEcatalogiule', 'BICYCLE catalog i u l e')]
HR19071201-V42-12-page89.txt: [('willbeincreasedfrom', 'will be increased from')]
In [51]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/HR/correction9

Average verified rate: 0.978788437440143

Average of error rates: 0.0343930811973472

Total token count: 13913355

In [52]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[52]:
[('m', 15959),
 ('d', 11328),
 ("'", 11284),
 ('e', 8989),
 ('t', 7953),
 ('f', 7061),
 ('r', 6946),
 ('w', 6945),
 ('n', 5979),
 ('co', 5725),
 ('pm', 3762),
 ('g', 3213),
 ('th', 1569),
 ('u', 1516),
 ('k', 1449),
 ('x', 1337),
 ('z', 835),
 ('mo', 723),
 ('oz', 676),
 ('sel', 657),
 ('ex', 624),
 ('pa', 603),
 ("an'", 555),
 ('lb', 534),
 ('tion', 489),
 ('pp', 474),
 ('re', 465),
 ('-', 426),
 ('wm', 355),
 ('q', 316),
 ('al', 312),
 ('ti', 298),
 ('ft', 280),
 ('em', 267),
 ('io', 251),
 ('ro', 250),
 ('ment', 248),
 ('mt', 243),
 ('pt', 240),
 ('oo', 239),
 ('ry', 215),
 ('es', 203),
 ('il', 200),
 ('ll', 189),
 ('se', 185),
 ('ia', 183),
 ('tt', 181),
 ("hours'", 180),
 ('li', 178),
 ("''", 171)]
In [ ]: