NMN-OCR-Evaluation-and-Correction

In [1]:
%load_ext autoreload
In [2]:
%autoreload 2
In [3]:
from text2topics import reports
from text2topics import utilities
from text2topics import clean
import re
import os
from os import listdir
from os.path import isfile, join
import collections
In [4]:
%matplotlib inline
In [5]:
wordlist_dir = "/Users/jeriwieringa/Dissertation/drafts/data/word-lists"
wordlists = ["2016-12-07-SDA-last-names.txt", 
             "2016-12-07-SDA-place-names.txt", 
             "2016-12-08-SDA-Vocabulary.txt", 
             "2017-01-03-place-names.txt", 
             "2017-02-14-Base-Word-List-SCOWL&KJV.txt",
             "2017-02-14-Roman-Numerals.txt",
             "2017-03-01-Additional-Approved-Words.txt"
            ]
In [6]:
spelling_dictionary = utilities.create_spelling_dictionary(wordlist_dir, wordlists)
In [7]:
title = "NMN"
In [8]:
base_dir = "/Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/{}/".format(title)

Baseline

In [9]:
cycle = 'baseline'
In [10]:
stats = reports.overview_report(join(base_dir, cycle), spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/NMN/baseline

Average verified rate: 0.8992190982814151

Average of error rates: 0.10566570605187318

Total token count: 199641

In [11]:
errors_summary = reports.get_errors_summary( stats )
reports.top_errors( errors_summary, 50 )
Out[11]:
[('-', 916),
 ('e', 500),
 ('¥', 337),
 ('t', 323),
 ("'", 250),
 ('th', 244),
 ('r', 207),
 ('n', 204),
 ('w', 198),
 ('--', 191),
 (')', 158),
 ('m', 153),
 ('d', 140),
 ('f', 140),
 ('/', 130),
 ('g', 108),
 ('aro', 89),
 ('(', 82),
 ('u', 78),
 ('bo', 72),
 ('wo', 64),
 ('k', 63),
 ('co', 53),
 ('re', 51)]

Review Special Character Use

In [12]:
reports.tokens_with_special_characters(errors_summary)[:100]
Out[12]:
[('¥', 337),
 (')', 158),
 ('/', 130),
 ('(', 82),
 ('*', 47),
 ('_', 44),
 ('ñ', 35),
 ('%', 34),
 ('+', 34),
 ('¥¥¥¥', 32),
 ('¥¥¥¥¥', 23),
 ('¥¥', 21),
 ('¥¥¥', 19),
 ('•', 15),
 ('#', 9),
 ('()', 8),
 ('(a)', 8),
 ('¥¥¥¥¥¥', 7),
 ('(the', 7),
 ('`', 7),
 ('/per', 7),
 ('(to', 7),
 ('__', 6),
 ('(b)', 6),
 ('\ufeff', 6),
 ("')", 6),
 ('¡', 6),
 ('(c)', 5),
 (']', 5),
 ('=', 5),
 ('(for', 5),
 ('/t', 4),
 ('-¥', 4),
 ('(i', 4),
 ('continued)', 4),
 ('ò', 4),
 ('¥¥¥¥¥¥¥¥¥', 4),
 ('^', 3),
 (')-', 3),
 ('e)', 3),
 ('~~', 3),
 ('(continued', 3),
 ('-_', 3),
 ('work*', 3),
 ('(f)', 3),
 ('t¥', 3),
 ('♦', 3),
 ('f_', 3),
 ('r)', 3),
 ('ã', 3),
 ('al]', 3),
 ("%'", 2),
 ('-)f', 2),
 ('(col', 2),
 ('(appointed)', 2),
 ('re/', 2),
 ('wil_', 2),
 ('s¥', 2),
 ('of_', 2),
 ('/each', 2),
 ('fir/', 2),
 ('_-', 2),
 ('*me', 2),
 ('the)', 2),
 ("'¥", 2),
 ('student)', 2),
 ('¥¥¥¥¥¥¥¥¥¥¥', 2),
 (')er', 2),
 ('_s', 2),
 ('(acts', 2),
 ('(new', 2),
 ('_of', 2),
 ('in(', 2),
 ('bank)', 2),
 ('(d)', 2),
 ('¥t', 2),
 ('°', 2),
 ('(r', 2),
 ('it¥', 2),
 ('¥¥¥¥¥¥¥', 2),
 ('=n', 2),
 ('))', 2),
 ('/a', 2),
 ('~', 2),
 ('price)', 2),
 ('{', 2),
 ('c/', 2),
 ('<', 2),
 ('teacher)', 2),
 ('£', 2),
 ('_we', 2),
 ('_ng', 2),
 ('/z', 2),
 ("('", 2),
 ('/i', 2),
 ('/e', 2),
 ('hem¡', 2),
 ('*his', 1),
 ('sebscriptions)', 1),
 ('can¥', 1)]

Correction 1 -- Normalize Character Use

In [13]:
# %load shared_elements/normalize_characters.py
prev = "baseline"
cycle = "correction1"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    # Substitute for all other dashes
    content = re.sub(r"—-—–‑", r"-", content)

    # Substitute formatted apostrophe
    content = re.sub(r"\’\’\‘\'\‛\´", r"'", content)
    
    # Replace all special characters with a space (as these tend to occur at the end of lines)
    content = re.sub(r"[^a-zA-Z0-9\s,.!?$:;\-&\'\"]", r" ", content)
    
    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()

Check Correction 1

In [14]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/NMN/correction1

Average verified rate: 0.9070753720305472

Average of error rates: 0.09680691642651297

Total token count: 198774

In [15]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[15]:
[('-', 959),
 ('e', 529),
 ('t', 342),
 ("'", 274),
 ('th', 246),
 ('r', 224),
 ('n', 212),
 ('w', 204),
 ('--', 199),
 ('f', 156),
 ('m', 155),
 ('d', 150),
 ('g', 110),
 ('aro', 89),
 ('u', 82),
 ('bo', 72),
 ('k', 66),
 ('wo', 64),
 ('re', 57),
 ('co', 54),
 ('x', 36),
 ('tc', 36),
 ('se', 33),
 ('---', 30),
 ('te', 29),
 ('al', 28),
 ('leetsville', 28),
 ('li', 28),
 ('es', 28),
 ('ie', 28),
 ("canvassers'", 27),
 ('mt', 27),
 ('nd', 27),
 ('z', 26),
 ('willaman', 26),
 ('tt', 25),
 ('con-', 25),
 ('ti', 24),
 ('--o--', 22),
 ('ft', 21),
 ('--selected', 20),
 ('ne', 20),
 ('soo', 20),
 ('ce', 19),
 ("elders'", 19),
 ('q', 19),
 ('myrta', 18),
 ('rs', 18),
 ('altho', 18),
 ('ly', 17)]

Correction 2 -- Fix Line Endings

In [16]:
# %load shared_elements/correct_line_endings.py
prev = cycle
cycle = "correction2"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    content = re.sub(r"(\w+)(\-\s{1,})([a-z]+)", r"\1\3", content)

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()

Check Correction 2

In [17]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/NMN/correction2

Average verified rate: 0.9108103059158542

Average of error rates: 0.09321037463976946

Total token count: 198061

In [18]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[18]:
[('-', 946),
 ('e', 528),
 ('t', 341),
 ("'", 274),
 ('th', 246),
 ('r', 223),
 ('n', 212),
 ('w', 203),
 ('--', 199),
 ('f', 155),
 ('m', 154),
 ('d', 149),
 ('g', 110),
 ('aro', 89),
 ('u', 81),
 ('bo', 71),
 ('k', 66),
 ('wo', 64),
 ('re', 57),
 ('co', 54),
 ('tc', 36),
 ('x', 36),
 ('se', 33),
 ('---', 30),
 ('te', 29),
 ('es', 28),
 ('li', 28),
 ('ie', 28),
 ('leetsville', 28),
 ("canvassers'", 27),
 ('mt', 27),
 ('nd', 27),
 ('z', 26),
 ('willaman', 26),
 ('tt', 25),
 ('ti', 24),
 ('al', 23),
 ('--o--', 22),
 ('ft', 21),
 ('ne', 20),
 ('--selected', 20),
 ('soo', 20),
 ('ce', 19),
 ("elders'", 19),
 ('q', 19),
 ('myrta', 18),
 ('altho', 18),
 ('rs', 18),
 ('ca', 17),
 ('il', 16)]

Correction 3 -- Remove Extra Dashes

In [19]:
# %load shared_elements/remove_extra_dashes.py
prev = cycle
cycle = "correction3"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    replacements = []
    for token in tokens:
        if token[0] is "-":
            replacements.append((token, token[1:]))
            
        elif token[-1] is "-":
            replacements.append((token, token[:-1]))
        else:
            pass
        
    if len(replacements) > 0:
        print("{}: {}".format(filename, replacements))
        
        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
NMN19070108-V03-01-page1.txt: [('--Selected.', '-Selected.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
NMN19070108-V03-01-page2.txt: [('neighbor-', 'neighbor'), ('no-', 'no'), ('-.', '.'), ('Zn-', 'Zn')]
NMN19070108-V03-01-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-.', '.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('-', ''), ('-', '')]
NMN19070108-V03-01-page4.txt: [('t-', 't')]
NMN19070129-V03-02-page1.txt: [('e-ooCOOoo--', 'e-ooCOOoo-'), ('-', ''), ('hone-', 'hone'), ('-', ''), ('p.-', 'p.'), ('-heir', 'heir')]
NMN19070129-V03-02-page2.txt: [('--', '-'), ('-e', 'e'), ('--o--', '-o--')]
NMN19070129-V03-02-page3.txt: [('--', '-'), ('--', '-'), ('--', '-'), ('-the', 'the')]
NMN19070129-V03-02-page6.txt: [('-', '')]
NMN19070219-V03-03-page1.txt: [('C-', 'C')]
NMN19070219-V03-03-page2.txt: [('seal--', 'seal-')]
NMN19070219-V03-03-page3.txt: [('--', '-'), ('--', '-'), ('Feb-', 'Feb'), ('-', ''), ('-', ''), ('-', '')]
NMN19070219-V03-03-page4.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('---', '--'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-.', '.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
NMN19070219-V03-03-page5.txt: [('-will', 'will')]
NMN19070312-V03-04-page1.txt: [('--a', '-a'), ('-', ''), ('-titles', 'titles')]
NMN19070312-V03-04-page2.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')]
NMN19070312-V03-04-page3.txt: [('--', '-'), ('--', '-'), ('"--', '"-'), ('-', '')]
NMN19070312-V03-04-page4.txt: [('-id', 'id')]
NMN19070312-V03-04-page5.txt: [('--o--', '-o--'), ('fractions.--', 'fractions.-'), ('hesitancy--', 'hesitancy-')]
NMN19070401-V03-05-page1.txt: [('-o', 'o'), ('--', '-'), ('for-', 'for'), ('--', '-'), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('--Chrty', '-Chrty')]
NMN19070401-V03-05-page2.txt: [('--', '-'), ('-', ''), ('-this', 'this'), ('-', ''), ('-', '')]
NMN19070401-V03-05-page3.txt: [('talkie-', 'talkie')]
NMN19070401-V03-05-page4.txt: [('-', '')]
NMN19070401-V03-05-page5.txt: [('-', ''), ('-ataer', 'ataer'), ('-', '')]
NMN19070401-V03-05-page6.txt: [('..-', '..')]
NMN19070401-V03-05-page7.txt: [('-', ''), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('-', ''), ('-', ''), ('Teevorte-', 'Teevorte'), ('-', ''), ('-nommen', 'nommen'), ('-', ''), ('-', '')]
NMN19070423-V03-06-page1.txt: [('mine-', 'mine'), ('--Selected.', '-Selected.'), ('tas-', 'tas')]
NMN19070423-V03-06-page3.txt: [('-', ''), ('saving-', 'saving'), ('-', ''), ('-', ''), ('--', '-')]
NMN19070423-V03-06-page4.txt: [('-', ''), ('-', ''), ('-', ''), ('ex-', 'ex')]
NMN19070423-V03-06-page5.txt: [('Member-', 'Member'), ('Member-', 'Member')]
NMN19070423-V03-06-page7.txt: [('to-', 'to')]
NMN19070514-V03-07-page1.txt: [('-', ''), ('-', ''), ('--W.', '-W.')]
NMN19070514-V03-07-page2.txt: [('-', ''), ('--', '-'), ('-', ''), ('-by', 'by')]
NMN19070514-V03-07-page3.txt: [('--', '-'), ('--', '-'), ('-', ''), ('purity--', 'purity-'), ('mind--', 'mind-')]
NMN19070514-V03-07-page4.txt: [('Mich.--', 'Mich.-'), ('--o--', '-o--'), ('--Eugene', '-Eugene')]
NMN19070514-V03-07-page5.txt: [('--', '-'), ('-', ''), ('--', '-'), ('-', '')]
NMN19070514-V03-07-page7.txt: [('-TEACHERS', 'TEACHERS'), ('con.-', 'con.'), ('--', '-'), ('--', '-')]
NMN19070604-V03-08-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('pain-', 'pain'), ('-', '')]
NMN19070604-V03-08-page2.txt: [('-', ''), ('com-', 'com')]
NMN19070604-V03-08-page3.txt: [('-', '')]
NMN19070604-V03-08-page4.txt: [('-Iool', 'Iool')]
NMN19070604-V03-08-page5.txt: [('-', ''), ('-', '')]
NMN19070625-V03-09-page1.txt: [('--n--', '-n--'), ('--H.', '-H.'), ('offer--', 'offer-'), ('-', ''), ('-iath', 'iath'), ('--T.', '-T.')]
NMN19070625-V03-09-page2.txt: [('-', ''), ('-', ''), ('---', '--'), ('SABBATH-', 'SABBATH'), ('--all', '-all')]
NMN19070625-V03-09-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-dog', 'dog'), ('something--', 'something-'), ('-ner', 'ner'), ('bark--', 'bark-'), ('--bark', '-bark'), ('--Selected.', '-Selected.'), ('-', '')]
NMN19070625-V03-09-page4.txt: [('Confer-', 'Confer'), ('--', '-'), ('--', '-')]
NMN19070625-V03-09-page5.txt: [('-th', 'th'), ('wan-', 'wan')]
NMN19070625-V03-09-page6.txt: [('-', ''), ('-', '')]
NMN19070716-V03-10-page1.txt: [('-', ''), ('-eparate', 'eparate'), ('-They', 'They'), ('-are', 'are')]
NMN19070716-V03-10-page3.txt: [('--Selected.', '-Selected.'), ('shining--', 'shining-')]
NMN19070716-V03-10-page4.txt: [('CAleT-', 'CAleT'), ('-rut', 'rut'), ('camp--', 'camp-'), ('-to', 'to')]
NMN19070716-V03-10-page5.txt: [('C-', 'C'), ('-.', '.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-crie', 'crie'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
NMN19070813-V03-11-page1.txt: [('-', ''), ('-', ''), ('--Helen', '-Helen'), ('box-', 'box')]
NMN19070813-V03-11-page2.txt: [('be-', 'be')]
NMN19070813-V03-11-page3.txt: [('--', '-'), ('--', '-'), ("-'gachera.", "'gachera."), ('-people', 'people'), ('-hear', 'hear')]
NMN19070813-V03-11-page5.txt: [('--', '-')]
NMN19070813-V03-11-page6.txt: [('-', ''), ('-', '')]
NMN19070813-V03-11-page7.txt: [('-', ''), ('--', '-'), ('-page', 'page')]
NMN19070917-V03-12-page1.txt: [('--', '-')]
NMN19070917-V03-12-page2.txt: [('-', ''), ('-', ''), ('--', '-'), ('-Lake', 'Lake')]
NMN19070917-V03-12-page3.txt: [('-', ''), ('-', ''), ('--', '-'), ('J-', 'J'), ('A-', 'A')]
NMN19070917-V03-12-page5.txt: [('-ft', 'ft'), ('ralred-', 'ralred'), ('-', ''), ('-', ''), ('-', '')]
NMN19070917-V03-12-page6.txt: [('-', ''), ('Hansen-', 'Hansen')]
NMN19071008-V03-13-page1.txt: [('-yield', 'yield'), ('-ark', 'ark'), ('-', '')]
NMN19071008-V03-13-page2.txt: [('-', ''), ('-', ''), ('-le', 'le'), ('-', ''), ('-', ''), ('-', ''), ('---', '--'), ('-the', 'the'), ('-Field', 'Field'), ('-come', 'come'), ('"-', '"')]
NMN19071008-V03-13-page3.txt: [('SEItaS---', 'SEItaS--'), ('-', ''), ('CLUBS.-', 'CLUBS.')]
NMN19071008-V03-13-page4.txt: [('build-', 'build'), ('-Ince', 'Ince'), ('-..apeisur', '..apeisur'), ('-', ''), ('O.-', 'O.')]
NMN19071008-V03-13-page5.txt: [('-', ''), ('-i', 'i'), ('-hath.', 'hath.'), ('-wo', 'wo'), ('-', ''), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
NMN19071008-V03-13-page6.txt: [('-ants.', 'ants.')]
NMN19071008-V03-13-page7.txt: [('NA-', 'NA')]
NMN19071008-V03-13-page8.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('subscrip-', 'subscrip')]
NMN19071008-V03-13-page9.txt: [('fur-', 'fur')]
NMN19071029-V03-14-page1.txt: [('-angels', 'angels'), ('-visitants', 'visitants'), ('.-', '.'), ('--So', '-So'), ('y-', 'y'), ('-et', 'et')]
NMN19071029-V03-14-page2.txt: [('-', ''), ('--', '-'), ('--Floyd', '-Floyd'), ('-ing', 'ing')]
NMN19071029-V03-14-page3.txt: [('--', '-'), ('--', '-'), ('School."-', 'School."'), ('-There', 'There'), ('--UA', '-UA'), ('cheer-', 'cheer')]
NMN19071029-V03-14-page4.txt: [('-', ''), ('-', ''), ('-.', '.')]
NMN19071029-V03-14-page5.txt: [('lca-', 'lca'), ('-ietrskeyl', 'ietrskeyl'), ('-', ''), ('-', ''), ('-circulatioL', 'circulatioL'), ('-f', 'f'), ('-th', 'th')]
NMN19071029-V03-14-page6.txt: [('-', ''), ('-', ''), ('--', '-')]
NMN19071029-V03-14-page7.txt: [('--', '-'), ('--', '-'), ('-TIPPER', 'TIPPER'), ('--Presbyterian', '-Presbyterian')]
NMN19071119-V03-15-page1.txt: [('--coPoo--', '-coPoo--'), ('-such', 'such')]
NMN19071119-V03-15-page2.txt: [('-', ''), ('-', ''), ('eke-', 'eke'), ('wIthdrawal-', 'wIthdrawal'), ('-.general', '.general')]
NMN19071119-V03-15-page3.txt: [('-best', 'best')]
NMN19071119-V03-15-page4.txt: [('-', ''), ('Seventh-', 'Seventh'), ('half-', 'half')]
NMN19071119-V03-15-page5.txt: [('--', '-'), ('--', '-'), ('-coma.', 'coma.')]
NMN19071119-V03-15-page6.txt: [('-', ''), ('-', ''), ('--', '-'), ('-', ''), ('--', '-')]
NMN19071119-V03-15-page7.txt: [('--', '-'), ('---', '--'), ('-', ''), ('neigh-', 'neigh'), ('-', ''), ('-', ''), ('-winter', 'winter'), ('--', '-'), ('---....', '--....'), ('-', ''), ('-', ''), ('-', '')]
NMN19071119-V03-15-page8.txt: [('--roportud', '-roportud')]
NMN19071210-V03-16-page1.txt: [('.-', '.'), ('shine--', 'shine-'), ('flame--', 'flame-'), ('--Missionary', '-Missionary'), ('feel-', 'feel')]
NMN19071210-V03-16-page2.txt: [('Christ-', 'Christ'), ('--', '-'), ('--', '-')]
NMN19071210-V03-16-page3.txt: [('-', ''), ('--', '-'), ('--', '-'), ('--ready', '-ready'), ("wood'-", "wood'")]
NMN19071210-V03-16-page4.txt: [('-.ruth', '.ruth')]
NMN19071210-V03-16-page5.txt: [('--', '-'), ('--', '-'), ('-flers', 'flers'), ('-', ''), ('-', ''), ('MoIrrn--', 'MoIrrn-'), ('-st', 'st')]
NMN19071210-V03-16-page6.txt: [('--', '-'), ('--', '-'), ('-', ''), ('meeting.-', 'meeting.')]
NMN19071210-V03-16-page7.txt: [('home-', 'home'), ('God-', 'God'), ('--', '-')]
NMN19071210-V03-16-page8.txt: [('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
NMN19080107-V04-01-page1.txt: [('-ient', 'ient'), ('-', ''), ('-.unjust', '.unjust'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('--L.', '-L.'), ('-ono', 'ono')]
NMN19080107-V04-01-page2.txt: [('-rho', 'rho')]
NMN19080107-V04-01-page3.txt: [('-.', '.')]
NMN19080107-V04-01-page4.txt: [('-ran', 'ran'), ('--powor', '-powor'), ('-oOo-', 'oOo-')]
NMN19080107-V04-01-page5.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('Total-', 'Total'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
NMN19080107-V04-01-page6.txt: [('--', '-'), ('--', '-'), ('-', '')]
NMN19080128-V04-02-page1.txt: [('--', '-'), ('-LOCOLLTIVE', 'LOCOLLTIVE')]
NMN19080128-V04-02-page2.txt: [('-eelonged.', 'eelonged.'), ('Nun.-', 'Nun.'), ('var-', 'var')]
NMN19080128-V04-02-page3.txt: [('-a', 'a'), ('--', '-'), ('-net', 'net'), ('T-', 'T')]
NMN19080128-V04-02-page4.txt: [('K.-', 'K.'), ('-', ''), ('-', '')]
NMN19080128-V04-02-page5.txt: [('--', '-'), ('-', '')]
NMN19080128-V04-02-page6.txt: [('-', ''), ('.-', '.'), ('AN-', 'AN'), ('-roar.', 'roar.'), ('TI-', 'TI'), ('-', '')]
NMN19080128-V04-02-page7.txt: [('-', ''), ('--', '-'), ('--', '-'), ('angel---', 'angel--')]
NMN19080218-V04-03-page1.txt: [('--eocO', '-eocO'), ('--', '-'), ('faithful-', 'faithful')]
NMN19080218-V04-03-page2.txt: [('-', '')]
NMN19080218-V04-03-page3.txt: [('-', '')]
NMN19080218-V04-03-page4.txt: [('-Lain', 'Lain'), ('-', ''), ('tS-', 'tS'), ('themr-', 'themr')]
NMN19080218-V04-03-page5.txt: [('--', '-'), ('--', '-'), ('attend-', 'attend'), ('-', '')]
NMN19080218-V04-03-page6.txt: [('sto-', 'sto'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('.r.-', '.r.')]
NMN19080218-V04-03-page7.txt: [('-to', 'to'), ('-sent', 'sent'), ('--', '-'), ('-', ''), ('Mich-', 'Mich')]
NMN19080310-V04-04-page1.txt: [('-', ''), ('-', ''), ('k-', 'k'), ('-', ''), ('-en', 'en')]
NMN19080310-V04-04-page2.txt: [('sanctuary--', 'sanctuary-')]
NMN19080310-V04-04-page3.txt: [('-live', 'live')]
NMN19080310-V04-04-page4.txt: [('ex.-', 'ex.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('W-', 'W'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
NMN19080310-V04-04-page5.txt: [('-rho', 'rho'), ('-many', 'many'), ('-oom', 'oom')]
NMN19080310-V04-04-page6.txt: [('monotonous--', 'monotonous-'), ('-', ''), ('--e', '-e'), ('-', ''), ('peeiod.-', 'peeiod.')]
NMN19080310-V04-04-page7.txt: [('s-', 's')]
NMN19080310-V04-04-page8.txt: [('-.', '.'), ('-', '')]
NMN19080310-V04-04-page9.txt: [('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('-', '')]
NMN19080331-V04-05-page1.txt: [('--Selected.', '-Selected.')]
NMN19080331-V04-05-page2.txt: [('t-', 't'), ('--', '-'), ('--', '-'), ('-', '')]
NMN19080331-V04-05-page3.txt: [('pos-', 'pos')]
NMN19080331-V04-05-page4.txt: [('-', ''), ('-e', 'e')]
NMN19080331-V04-05-page5.txt: [('-Aave', 'Aave')]
NMN19080331-V04-05-page6.txt: [('-', ''), ('--', '-')]
NMN19080331-V04-05-page7.txt: [('-lour', 'lour')]
NMN19080331-V04-05-page8.txt: [('-', ''), ('--Selected.', '-Selected.')]
NMN19080421-V04-06-page1.txt: [('-', ''), ('-', '')]
NMN19080421-V04-06-page2.txt: [('-espect', 'espect'), ('-heir', 'heir'), ('-bout', 'bout'), ("-'o", "'o"), ('-', '')]
NMN19080421-V04-06-page3.txt: [('-', ''), ('-', ''), ("-.'il", ".'il"), ('-dens', 'dens'), ('-my', 'my')]
NMN19080421-V04-06-page4.txt: [('-rot', 'rot'), ('-', ''), ('-nu', 'nu'), ('-nd', 'nd'), ('-', ''), ('-', ''), ('--', '-'), ('-', '')]
NMN19080421-V04-06-page5.txt: [('pre-', 'pre'), ('-twenty', 'twenty'), ('uis-', 'uis'), ('-nd', 'nd'), ('-', '')]
NMN19080421-V04-06-page6.txt: [('--', '-'), ('--', '-'), ('-eeuested', 'eeuested'), ('-follows', 'follows'), ('Insti-', 'Insti')]
NMN19080421-V04-06-page7.txt: [('-', ''), ('report-', 'report'), ('or-', 'or')]
NMN19080421-V04-06-page8.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('Gay-', 'Gay'), ('-', ''), ('-', ''), ('Fetos-', 'Fetos')]
NMN19080512-V04-07-page1.txt: [('---somewhere."', '--somewhere."'), ('-', ''), ('-', ''), ('-to', 'to'), ('after-', 'after'), ('-mu-a', 'mu-a')]
NMN19080512-V04-07-page2.txt: [('.-', '.'), ('-', ''), ('-', ''), ('-', '')]
NMN19080512-V04-07-page3.txt: [('-y', 'y'), ('-our', 'our'), ('Sabbath-', 'Sabbath')]
NMN19080512-V04-07-page4.txt: [('--Selected.', '-Selected.'), ('--', '-'), ('-..', '..'), ('-', ''), ('ffor-', 'ffor'), ('-', '')]
NMN19080512-V04-07-page5.txt: [('-', '')]
NMN19080512-V04-07-page6.txt: [('-a', 'a'), ('-.', '.')]
NMN19080512-V04-07-page7.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-atod', 'atod'), ('essen-', 'essen'), ('--o--', '-o--'), ('-', ''), ('-', ''), ('emper-', 'emper'), ('-', ''), ('ow-', 'ow'), ('--o--', '-o--'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-e-', 'e-'), ('frau-', 'frau'), ('--o--', '-o--'), ('-', ''), ('-', ''), ('-t.', 't.'), ('-T', 'T')]
NMN19080512-V04-07-page8.txt: [('--Selected.', '-Selected.'), ('--o--', '-o--'), ('-', ''), ('-', ''), ('-', ''), ('Blosse.-', 'Blosse.'), ('EXHIB-', 'EXHIB')]
NMN19080512-V04-07-page9.txt: [('--', '-'), ('--', '-'), ('c-', 'c'), ('-ent', 'ent'), ('Pro-', 'Pro')]
NMN19080623-V04-08-page3.txt: [('-', ''), ('--', '-'), ('-', ''), ('-Caine', 'Caine'), ('-', ''), ('-', ''), ('--the', '-the')]
NMN19080623-V04-08-page4.txt: [('-', '')]
NMN19080623-V04-08-page5.txt: [('.-', '.'), ('--', '-')]
NMN19080623-V04-08-page6.txt: [('a-', 'a'), ('-', ''), ('-cheer', 'cheer')]
NMN19080623-V04-08-page7.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
NMN19080623-V04-08-page8.txt: [('-', '')]
NMN19080623-V04-08-page9.txt: [('--', '-')]
NMN19080714-V04-09-page1.txt: [('e-', 'e'), ('Lord."---', 'Lord."--')]
NMN19080714-V04-09-page2.txt: [('-dar', 'dar'), ('--o--', '-o--'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('WATCH-', 'WATCH'), ('-', ''), ('-.', '.'), ('-', '')]
NMN19080714-V04-09-page4.txt: [('tencent-', 'tencent'), ('---', '--'), ('-', ''), ('in-', 'in')]
NMN19080714-V04-09-page5.txt: [('INSTRUCT-', 'INSTRUCT')]
NMN19080714-V04-09-page6.txt: [('--', '-'), ('--', '-'), ('--e....', '-e....'), ('-', '')]
NMN19080714-V04-09-page7.txt: [('-', ''), ('--', '-'), ('-', ''), ('step-', 'step')]
NMN19080714-V04-09-page8.txt: [('--a--', '-a--'), ('Exhib-', 'Exhib'), ('-', ''), ('-', ''), ('-', ''), ('Sabbath-', 'Sabbath')]
NMN19080714-V04-09-page9.txt: [('-', ''), ('--gathered', '-gathered')]
NMN19080804-V04-10-page1.txt: [('necess-', 'necess'), ('--Selected.', '-Selected.')]
NMN19080804-V04-10-page3.txt: [('-', ''), ('-', '')]
NMN19080804-V04-10-page4.txt: [('-eeligious', 'eeligious'), ('-', ''), ('--', '-')]
NMN19080804-V04-10-page6.txt: [('-.', '.'), ('.-', '.'), ('-or', 'or'), ('-', '')]
NMN19080804-V04-10-page7.txt: [('-', ''), ('-', ''), ('-', ''), ('-nd', 'nd')]
NMN19080804-V04-10-page8.txt: [('-', ''), ('Wilson---', 'Wilson--'), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-lave', 'lave'), ('-', '')]
NMN19080818-V04-11-page1.txt: [('-', ''), ('-M', 'M'), ('View.-', 'View.'), ('-', ''), ('in-', 'in')]
NMN19080818-V04-11-page2.txt: [('-nal', 'nal'), ('-ance', 'ance'), ('-', '')]
NMN19080915-V04-12-page1.txt: [('-are', 'are'), ('-', '')]
NMN19080915-V04-12-page2.txt: [('-', '')]
NMN19080915-V04-12-page3.txt: [('-', ''), ('-eis', 'eis'), ('-frequent', 'frequent')]
NMN19080915-V04-12-page4.txt: [('Mr-', 'Mr')]
NMN19080915-V04-12-page5.txt: [('--.', '-.'), ('--', '-'), ('-quite', 'quite')]
NMN19080915-V04-12-page6.txt: [('--', '-'), ('--', '-'), ('-', ''), ('---', '--')]
NMN19080915-V04-12-page7.txt: [('Ilissionary-', 'Ilissionary'), ('also.--', 'also.-')]
NMN19080915-V04-12-page8.txt: [('car-', 'car'), ('sw-', 'sw')]
NMN19080915-V04-12-page9.txt: [('-', ''), ('-', ''), ('Fenne-', 'Fenne')]
NMN19080922-V04-13-page1.txt: [('-', '')]
NMN19080922-V04-13-page2.txt: [('-i', 'i')]
NMN19081006-V04-14-page1.txt: [('--Selected.', '-Selected.'), ('-', ''), ('--rulpit', '-rulpit')]
NMN19081006-V04-14-page2.txt: [('--', '-'), ('-Joreted', 'Joreted'), ('Board-', 'Board'), ('--', '-')]
NMN19081006-V04-14-page3.txt: [('alass-', 'alass'), ('charge.-', 'charge.'), ('re-', 're'), ('-leading', 'leading')]
NMN19081006-V04-14-page4.txt: [('--', '-'), ('--', '-'), ('-', ''), ('-Allan', 'Allan'), ('-egioas', 'egioas')]
NMN19081006-V04-14-page5.txt: [('--', '-'), ('--', '-'), ('govern-', 'govern'), ('--', '-'), ('--', '-'), ('-', '')]
NMN19081006-V04-14-page6.txt: [('-a', 'a'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
NMN19081006-V04-14-page7.txt: [('---........', '--........')]
NMN19081027-V04-15-page1.txt: [('dead-', 'dead'), ('-.', '.'), ('--', '-'), ('--Selected.', '-Selected.')]
NMN19081027-V04-15-page2.txt: [('-', ''), ('-', ''), ('-', ''), ('little-', 'little'), ('week--', 'week-'), ('work.--', 'work.-'), ('future.--', 'future.-'), ('be-', 'be')]
NMN19081027-V04-15-page3.txt: [('ha-', 'ha')]
NMN19081027-V04-15-page4.txt: [('-', ''), ('---', '--'), ('--', '-'), ('---', '--'), ('---', '--'), ('---', '--'), ('---', '--'), ('--', '-'), ('Ha-', 'Ha'), ('---', '--'), ('---', '--'), ('---', '--'), ('---', '--'), ('--', '-'), ('--', '-')]
NMN19081027-V04-15-page5.txt: [('-y', 'y'), ('-', ''), ('---', '--')]
NMN19081027-V04-15-page6.txt: [('-time', 'time'), ('--Dr.', '-Dr.')]
NMN19081027-V04-15-page7.txt: [('the-', 'the'), ('-', '')]
NMN19081027-V04-15-page8.txt: [('-', ''), ('--', '-'), ('met-', 'met')]
NMN19081124-V04-16-page1.txt: [('-', ''), ('it.--', 'it.-'), ('all."--', 'all."-')]
NMN19081124-V04-16-page2.txt: [('--', '-'), ('..-', '..'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-.', '.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
NMN19081124-V04-16-page3.txt: [('-', '')]
NMN19081124-V04-16-page4.txt: [('--', '-'), ('--', '-'), ('-', ''), ('Lake-', 'Lake'), ('near-', 'near')]
NMN19081124-V04-16-page5.txt: [('-', ''), ('-', ''), ('-o', 'o'), ('-the', 'the')]
NMN19081124-V04-16-page6.txt: [('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
NMN19081124-V04-16-page7.txt: [('--', '-'), ('-feee-a', 'feee-a'), ('C-', 'C')]
NMN19081215-V04-17-page1.txt: [('--It', '-It'), ('-', ''), ('-', ''), ('-', ''), ('-a', 'a'), ('-', ''), ('-', '')]
NMN19081215-V04-17-page2.txt: [('-Course', 'Course'), ('deter-', 'deter'), ('-IAT', 'IAT'), ('--', '-'), ('--', '-'), ('O--', 'O-'), ('In-', 'In')]
NMN19081215-V04-17-page3.txt: [('"Thereto-', '"Thereto'), ('-', '')]
NMN19081215-V04-17-page4.txt: [('-', ''), ('---', '--'), ("-cd's", "cd's"), ('--Irene', '-Irene'), ('Year...-', 'Year...'), ('-', '')]
NMN19081215-V04-17-page5.txt: [('-ust', 'ust'), ('God.--', 'God.-'), ('--Elmo', '-Elmo'), ('-Iurely', 'Iurely'), ('message.--', 'message.-'), ('-', ''), ('received.--', 'received.-'), ('amino.-', 'amino.'), ('Saviour.--', 'Saviour.-')]
NMN19081215-V04-17-page6.txt: [('money-', 'money')]
NMN19081215-V04-17-page7.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-.', '.'), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
NMN19090115-V05-01-page1.txt: [('--Selected.', '-Selected.')]
NMN19090115-V05-01-page2.txt: [('--o--', '-o--'), ('--o--', '-o--'), ('--oo--', '-oo--'), ('McDonald.--', 'McDonald.-'), ('Marks.--', 'Marks.-'), ('-', ''), ('--o--', '-o--')]
NMN19090115-V05-01-page3.txt: [('--these', '-these'), ('s-', 's')]
NMN19090115-V05-01-page4.txt: [('-c', 'c'), ('-', ''), ('-', '')]
NMN19090115-V05-01-page5.txt: [('--', '-'), ('--', '-'), ('-his', 'his'), ('-emainder', 'emainder')]
NMN19090115-V05-01-page7.txt: [('-emellber', 'emellber')]
NMN19090115-V05-01-page8.txt: [('-', ''), ('-', ''), ('-T', 'T'), ('-s.', 's.'), ('-', ''), ('-', ''), ('-COT', 'COT'), ('-i', 'i'), ('-', '')]
NMN19090115-V05-01-page9.txt: [('-yers', 'yers'), ('there-', 'there'), ('--', '-'), ('--', '-')]
NMN19090215-V05-02-page1.txt: [('multitude--', 'multitude-'), ('roam--', 'roam-'), ('-', ''), ('-Allen', 'Allen')]
NMN19090215-V05-02-page2.txt: [('-twenty-five.', 'twenty-five.')]
NMN19090215-V05-02-page3.txt: [('-myself', 'myself'), ('-ebeing', 'ebeing'), ('--', '-'), ('--', '-'), ('-', ''), ('-', ''), ('-', '')]
NMN19090215-V05-02-page4.txt: [('--', '-'), ('.-', '.'), ('go.-', 'go.'), ('-', '')]
NMN19090215-V05-02-page5.txt: [('-i', 'i'), ('-', '')]
NMN19090215-V05-02-page6.txt: [('-J""', 'J""'), ('--ow-', '-ow-'), ('-', ''), ('Secretary-', 'Secretary'), ('H.-', 'H.')]
NMN19090215-V05-02-page7.txt: [('Morocco--', 'Morocco-'), ('-', ''), ('-', ''), ('Onaway-', 'Onaway'), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-.', '.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('Hancock-', 'Hancock'), ('-', ''), ('-', ''), ('-', ''), ('-yirease', 'yirease'), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
NMN19090318-V05-03-page1.txt: [('debt--', 'debt-'), ('-rou', 'rou'), ('aside--', 'aside-'), ('-', ''), ('debt--', 'debt-'), ('--Selected.', '-Selected.'), ('--Selected.', '-Selected.'), ('--o--', '-o--'), ('-', '')]
NMN19090318-V05-03-page2.txt: [('not-', 'not'), ('-', ''), ('--A.', '-A.'), ('-do', 'do')]
NMN19090318-V05-03-page3.txt: [('-', '')]
NMN19090318-V05-03-page4.txt: [('-', '')]
NMN19090318-V05-03-page5.txt: [('there-', 'there')]
NMN19090318-V05-03-page6.txt: [('-planning', 'planning'), ('-', ''), ('-', '')]
NMN19090318-V05-03-page7.txt: [('-found', 'found')]
NMN19090318-V05-03-page8.txt: [('--e-..', '-e-..')]
NMN19090318-V05-03-page9.txt: [('dila-c-', 'dila-c'), ('-', ''), ('-', ''), ("-'-", "'-"), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('Eastport-', 'Eastport'), ('-', ''), ('Total-', 'Total'), ('-', ''), ('-', ''), ('-..-', '..-'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
NMN19090415-V05-04-page1.txt: [('-', ''), ('--Selected.', '-Selected.'), ('-ichigan', 'ichigan'), ('indebted-', 'indebted'), ('-north', 'north'), ('-', '')]
NMN19090415-V05-04-page2.txt: [('-', ''), ('-', ''), ('---', '--'), ('-', ''), ('-he', 'he'), ('ba-', 'ba'), ('-', ''), ('-', ''), ('-', '')]
NMN19090415-V05-04-page3.txt: [('-ea', 'ea')]
NMN19090415-V05-04-page4.txt: [('-', ''), ('-', ''), ('-', ''), ('Johnson-', 'Johnson')]
NMN19090415-V05-04-page5.txt: [('-', ''), ('-', ''), ('--Hubert', '-Hubert'), ('-olth', 'olth')]
NMN19090415-V05-04-page6.txt: [('--', '-'), ('--', '-'), ('-t-e', 't-e'), ('-', ''), ('--s.', '-s.'), ('dab-', 'dab'), ('.-', '.')]
NMN19090415-V05-04-page7.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('gradua-', 'gradua')]
NMN19090514-V05-05-page1.txt: [('-', ''), ('-elan', 'elan'), ('tp-', 'tp'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
NMN19090514-V05-05-page2.txt: [('-', ''), ('-', '')]
NMN19090514-V05-05-page3.txt: [('-iaa', 'iaa'), ('-', '')]
NMN19090514-V05-05-page4.txt: [('-', ''), ('-.', '.'), ('member.-', 'member.'), ('-we', 'we'), ('Sabbath-keeper-', 'Sabbath-keeper')]
NMN19090514-V05-05-page6.txt: [('-T.', 'T.'), ('--', '-'), ('--', '-'), ('-', ''), ('-', ''), ('-', '')]
NMN19090514-V05-05-page7.txt: [('-', ''), ('-', ''), ('--o--', '-o--')]
NMN19090514-V05-05-page8.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')]
NMN19090514-V05-05-page9.txt: [('-', ''), ('-', ''), ('--o--', '-o--'), ('--', '-'), ('--', '-')]
NMN19090615-V05-06-page2.txt: [('-Ho', 'Ho'), ('society-', 'society')]
NMN19090615-V05-06-page3.txt: [('I-', 'I'), ('-pear', 'pear'), ('-', ''), ('.--thr-', '.--thr')]
NMN19090615-V05-06-page5.txt: [('re--', 're-')]
NMN19090615-V05-06-page6.txt: [('-f-Vie', 'f-Vie')]
NMN19090615-V05-06-page7.txt: [('-', '')]
NMN19090615-V05-06-page8.txt: [('-', ''), ('-', ''), ('-ankfort', 'ankfort'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
NMN19090615-V05-06-page9.txt: [('-meeting', 'meeting')]
NMN19090715-V05-07-page1.txt: [('-', ''), ('--Selected.', '-Selected.')]
NMN19090715-V05-07-page2.txt: [('--Selected.', '-Selected.'), ('.-', '.'), ('earth--', 'earth-'), ('-leaders', 'leaders')]
NMN19090715-V05-07-page3.txt: [('--o', '-o'), ('--', '-'), ('-', ''), ('--o--', '-o--'), ('-guide', 'guide')]
NMN19090715-V05-07-page4.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('--', '-'), ('--', '-'), ('--Eugene', '-Eugene')]
NMN19090715-V05-07-page5.txt: [('N-', 'N'), ('-', ''), ('-', ''), ('-', ''), ('-spy', 'spy'), ('-', '')]
NMN19090715-V05-07-page6.txt: [("'lbs.-ch.-", "'lbs.-ch."), ('-', ''), ('-', ''), ('-t', 't'), ('n-', 'n'), ('-', ''), ('-..d', '..d'), ('-th', 'th'), ('campiinxe-', 'campiinxe'), ("-cn'ennee.", "cn'ennee."), ('-c', 'c'), ('Mag-', 'Mag'), ('-nnif', 'nnif'), ('s-', 's'), ('-inn-', 'inn-'), ('-nnAnn...', 'nnAnn...'), ('-', ''), ('-ook', 'ook'), ('ter-', 'ter'), ('n-rt.-', 'n-rt.'), ('-businenr', 'businenr'), ('t-tol-', 't-tol'), ('-.', '.'), ('-.', '.'), ('-tt', 'tt'), ('pubjnIshcnr-.-', 'pubjnIshcnr-.'), ('-uhre', 'uhre'), ('faecira.-', 'faecira.'), ("cit'cu-", "cit'cu"), ('YUKON-', 'YUKON'), ('-', ''), ('-early.', 'early.')]
NMN19090715-V05-07-page7.txt: [('-Sept', 'Sept'), ('-', ''), ('-', '')]
NMN19090715-V05-07-page8.txt: [('--', '-'), ('-', ''), ('us--', 'us-'), ('-made', 'made'), ('Cadillac-', 'Cadillac'), ('-', ''), ('River-', 'River'), ('---', '--'), ('---', '--'), ('------', '-----'), ('---', '--'), ('Omer-', 'Omer'), ('-', ''), ('rotoskey-----------', 'rotoskey----------'), ('-', ''), ('Soottvalo--', 'Soottvalo-'), ('-----.-', '----.-'), ('-.', '.'), ('--', '-'), ('-', ''), ('Fraukfnrt-', 'Fraukfnrt'), ('-----', '----'), ('TOTAL----', 'TOTAL---'), ('Mesiok-', 'Mesiok'), ('Ensign---', 'Ensign--')]
NMN19090715-V05-07-page9.txt: [('-', ''), ('-fel', 'fel')]
NMN19090812-V05-08-page1.txt: [('-t.', 't.'), ('camp-', 'camp'), ('Pas-', 'Pas'), ('-', ''), ('ay-', 'ay'), ('-ook', 'ook'), ('-neeitrrgs', 'neeitrrgs'), ('-', ''), ('J.-', 'J.')]
NMN19090812-V05-08-page2.txt: [('-rimes', 'rimes'), ('-loot', 'loot'), ('--', '-'), ('History.--', 'History.-')]
NMN19090812-V05-08-page3.txt: [('speech-', 'speech'), ('Greek.-', 'Greek.'), ('ljatheJatics--', 'ljatheJatics-'), ('Science..-', 'Science..'), ('-', ''), ('-', '')]
NMN19090812-V05-08-page4.txt: [('-', ''), ('iJtephen.-', 'iJtephen.'), ('Gra-', 'Gra'), ('Re-', 'Re'), ('-', ''), ('-', ''), ('bless-', 'bless')]
NMN19090812-V05-08-page5.txt: [('-ea.', 'ea.'), ('-cemed', 'cemed'), ('-.', '.'), ('-', ''), ('--"Oh', '-"Oh'), ('-L.', 'L.'), ('-.', '.'), ('-o', 'o'), ('-', '')]
NMN19090812-V05-08-page6.txt: [('-', ''), ('-jeople', 'jeople'), ('-', ''), ('-Acple', 'Acple'), ('lase-', 'lase'), ('Colfax-', 'Colfax'), ('-', ''), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('-', ''), ('-ICHIUN', 'ICHIUN'), ('-', ''), ('-', ''), ('-', ''), ('assoc.-', 'assoc.'), ('-t.', 't.'), ('TraverseCity---', 'TraverseCity--')]
NMN19090812-V05-08-page7.txt: [('-hat', 'hat'), ('-', ''), ('-', ''), ("-ca'actical", "ca'actical"), ('-', '')]
NMN19090812-V05-08-page8.txt: [('-for', 'for'), ('--ake', '-ake'), ('-', ''), ('-.', '.'), ('-Present', 'Present'), ('--', '-'), ('cerep--', 'cerep-'), ('-meeting', 'meeting'), ('-q-', 'q-'), ('-t', 't'), ('-e', 'e'), ('-ed', 'ed'), ('a--', 'a-'), ('-', ''), ('-', ''), ('-', ''), ('-ting', 'ting'), ('-otheii.', 'otheii.'), ('heelth-', 'heelth'), ('-.', '.'), ('-d', 'd')]
NMN19090915-V05-09-page1.txt: [('--Selected.', '-Selected.'), ('bell--', 'bell-'), ('dwell--', 'dwell-'), ('--Driftwood.', '-Driftwood.')]
NMN19090915-V05-09-page2.txt: [('-', ''), ('-.he', '.he'), ('-con', 'con'), ('tem-', 'tem'), ('-.', '.'), ('r.oee-', 'r.oee'), ('TrEk.-', 'TrEk.'), ('-', ''), ('-', ''), ('-', ''), ('-IL', 'IL'), ("--'", "-'"), ('-.', '.'), ('-p', 'p'), ('Ni-', 'Ni'), ('-', ''), ('-i-..hese', 'i-..hese'), ('E-', 'E'), ('-', ''), ('th-', 'th'), ('-a-ch', 'a-ch'), ('-..', '..'), ('pcssi-', 'pcssi'), ('-EL', 'EL'), ('a-', 'a')]
NMN19090915-V05-09-page4.txt: [('Mc-', 'Mc'), ('Peter-', 'Peter'), ('Goffai-', 'Goffai'), ('Received-', 'Received'), ('C-', 'C')]
NMN19090915-V05-09-page5.txt: [('-LM', 'LM'), ('-', ''), ('-', ''), ('Cali-', 'Cali'), ('-', ''), ('-', ''), ('move-', 'move'), ('-r-a', 'r-a')]
NMN19090915-V05-09-page6.txt: [('-Ielps', 'Ielps'), ('-', ''), ('-', ''), ('-t', 't')]
NMN19090915-V05-09-page7.txt: [('-JAITY', 'JAITY'), ('-ee', 'ee'), ('erit--', 'erit-'), ('-a', 'a'), ('-olanteee', 'olanteee'), ('secretary-', 'secretary'), ('.-', '.'), ('--', '-'), ('-ee', 'ee')]
NMN19090915-V05-09-page8.txt: [('-is', 'is'), ('anistee---', 'anistee--'), ('--', '-'), ('dscellaneous-------', 'dscellaneous------'), ('---', '--'), ('..-', '..'), ('.-', '.'), ('.-------', '.------'), ("L'estport--", "L'estport-"), ('----', '---'), ('Yrankfort------', 'Yrankfort-----'), ('......."-------', '......."------'), ('.side-', '.side'), ('Scottville------', 'Scottville-----'), ('-', ''), ('..ctalcock-----', '..ctalcock----'), ('resick--------------', 'resick-------------'), ('Wilson-----..--', 'Wilson-----..-')]
NMN19090915-V05-09-page9.txt: [('R-', 'R'), ('-', ''), ("'-", "'"), ('Br-', 'Br'), ('w-', 'w'), ('rele-e-', 'rele-e'), ('-LF', 'LF'), ('---', '--'), ('-', ''), ('-', ''), ('-', ''), ('-laJmusson', 'laJmusson'), ('ICI-', 'ICI'), ('-.nt', '.nt'), ('-Jee.', 'Jee.'), ('aioint-', 'aioint'), ("-'e", "'e")]
NMN19091015-V05-10-page1.txt: [('lad--', 'lad-'), ('-Page', 'Page'), ('ITD.RTHMICMACANCO.N.-', 'ITD.RTHMICMACANCO.N.'), ('-hing', 'hing')]
NMN19091015-V05-10-page2.txt: [('cAR--', 'cAR-'), ('-', ''), ('-ill', 'ill'), ('-', ''), ('-', '')]
NMN19091015-V05-10-page3.txt: [('-', ''), ('-', ''), ('a-.-', 'a-.'), ('-nce', 'nce'), ('rocei-', 'rocei'), ('-she', 'she'), ('-he', 'he')]
NMN19091015-V05-10-page4.txt: [('-re', 're'), ('-', ''), ('--that', '-that'), ('--will', '-will'), ('--', '-'), ('--', '-')]
NMN19091015-V05-10-page5.txt: [('-', ''), ('MOM-', 'MOM'), ('-', ''), ('-Jorth', 'Jorth')]
NMN19091015-V05-10-page6.txt: [('couin-', 'couin'), ('ou--', 'ou-'), ('-.', '.'), ('-.', '.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('.-', '.'), ('.------.------', '.------.-----'), ('-"', '"'), ('-.', '.')]
NMN19091015-V05-10-page7.txt: [('-ake', 'ake'), ('-belona', 'belona'), ('yen--', 'yen-'), ('estimon-', 'estimon')]
NMN19091015-V05-10-page8.txt: [('.-', '.'), ('-.', '.'), ('.-', '.'), ('.-', '.'), ('-', ''), ('--', '-'), ('--BIBLE', '-BIBLE'), ('--', '-'), ("'-'-", "'-'"), ('--', '-'), ('.-', '.')]
NMN19091015-V05-10-page9.txt: [('-', ''), ('-.', '.'), ('--o--', '-o--'), ('--o--', '-o--'), ('--o', '-o'), ('--', '-')]
NMN19091115-V05-11-page1.txt: [('IC-', 'IC'), ('-', ''), ('-', ''), ('ow-', 'ow'), ('-fe', 'fe'), ('-', ''), ('--', '-'), ('--', '-'), ('-pernie', 'pernie'), ('-', '')]
NMN19091115-V05-11-page2.txt: [('-', ''), ('-', ''), ('livin-', 'livin'), ('.-', '.')]
NMN19091115-V05-11-page3.txt: [('-S.', 'S.'), ('Brothel-', 'Brothel'), ('-in', 'in')]
NMN19091115-V05-11-page4.txt: [('-', ''), ('-', ''), ('-rke', 'rke'), ('prayin-', 'prayin'), ('-call', 'call')]
NMN19091115-V05-11-page5.txt: [('--', '-'), ('--', '-'), ('-', ''), ('taus-', 'taus'), ('-', ''), ('-', ''), ('-.', '.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-..', '..')]
NMN19091115-V05-11-page6.txt: [('canvassersllea.-', 'canvassersllea.'), ('--', '-'), ('--', '-'), ('--o--', '-o--'), ('Baldwin-', 'Baldwin'), ('Cadillac--', 'Cadillac-'), ('Colfax--', 'Colfax-'), ('Dighton--', 'Dighton-'), ('..-', '..'), ('Evart---', 'Evart--'), ('Frankfort-', 'Frankfort'), ('Hancock-', 'Hancock'), ('River-', 'River'), ('---------', '--------'), ('-------', '------'), ('---', '--'), ('--', '-'), ('--', '-'), ('----', '---'), ('--', '-'), ('-----', '----'), ('--', '-'), ('Ordway-', 'Ordway'), ('City--', 'City-'), ('Wildwood--------', 'Wildwood-------'), ('wilsom---------', 'wilsom--------'), ('--', '-'), ('---', '--'), ('.----', '.---'), ('------', '-----'), ('------', '-----')]
NMN19091115-V05-11-page7.txt: [('wh-', 'wh')]
NMN19091115-V05-11-page8.txt: [('fellow-men-', 'fellow-men'), ('rocuismonts-', 'rocuismonts'), ('-ako', 'ako'), ('-', ''), ('-would', 'would'), ('-i', 'i'), ('-vor-', 'vor-'), ('-se', 'se')]
NMN19091115-V05-11-page9.txt: [('-.', '.'), ('-', ''), ('TZ....-', 'TZ....'), ('-', ''), ('-', '')]
NMN19091209-V05-12-page1.txt: [('ro-', 'ro'), ('-', ''), ('litI-', 'litI'), ('-Cy', 'Cy'), ('-von', 'von'), ('---Stes', '--Stes'), ('-', ''), ('-', ''), ('.-', '.'), ('-', ''), ('l.-', 'l.'), ('-', ''), ('-', ''), ('-throne', 'throne'), ('.-', '.'), ('-', ''), ("-'''.'", "'''.'"), ('arid-', 'arid'), ('-', ''), ('--the', '-the'), ('-suop', 'suop'), ('-', ''), ("--Ea'i.l.y", "-Ea'i.l.y"), ('-', ''), ('---Stop', '--Stop')]
NMN19091209-V05-12-page2.txt: [('-', ''), ('-', ''), ('sendin-', 'sendin'), ('-', ''), ('-', ''), ('---J.', '--J.'), ('ez.-', 'ez.'), ('-heir', 'heir')]
NMN19091209-V05-12-page3.txt: [('months--', 'months-'), ('-', ''), ('-', ''), ('-ro', 'ro'), ('-chose', 'chose'), ('---', '--'), ('-', ''), ('-faeir', 'faeir'), ('months-', 'months'), ('----', '---')]
NMN19091209-V05-12-page4.txt: [('-"I', '"I'), ('-the', 'the'), ('u-', 'u'), ('-', ''), ('Coon.-', 'Coon.'), ('bor-', 'bor'), ('-the', 'the'), ('-', ''), ('Tlicken--', 'Tlicken-'), ('-', ''), ('-der', 'der'), ('-this', 'this'), ('-.', '.'), ('-today.', 'today.')]
NMN19091209-V05-12-page5.txt: [('-', ''), ('-sal', 'sal'), ('-', ''), ('.-', '.'), ('.-', '.'), ('-', ''), ('-uilding.', 'uilding.'), ('-', ''), ('-', ''), ('-', ''), ('Tan-', 'Tan')]
NMN19091209-V05-12-page6.txt: [('-', ''), ('--', '-'), ('-Admoro', 'Admoro'), ('-ier', 'ier'), ('b-', 'b'), ('-', ''), ('-', ''), ('--', '-'), ('-r', 'r')]
NMN19091209-V05-12-page7.txt: [('-', ''), ('-', ''), ('-.', '.'), ('-y', 'y'), ('C-', 'C'), ('apona----', 'apona---'), ('---', '--'), ('--', '-'), ('-----', '----'), ('-', ''), ('-------------', '------------'), ('--', '-'), ('Hancock--', 'Hancock-'), ('River----------', 'River---------'), ('----...', '---...'), ('-..', '..'), ('-"', '"'), ('--', '-'), ('-', ''), ('Onaway---', 'Onaway--'), ('Omer-', 'Omer'), ('Petoskey-', 'Petoskey'), ('City---', 'City--'), ('-..-----------', '..-----------'), ('thittemore.--', 'thittemore.-'), ('----', '---'), ('----', '---'), ('--', '-'), ('--', '-'), ('--', '-'), ('Lvi-', 'Lvi'), ('.....--', '.....-'), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('--------', '-------'), ('n.......-----', 'n.......----')]
NMN19091209-V05-12-page8.txt: [('Mrer-', 'Mrer'), ('.-', '.'), ('-', ''), ('-day', 'day'), ('-', ''), ('-w', 'w'), ('--', '-'), ('--o--', '-o--'), ('-Joen', 'Joen')]
NMN19100120-V06-01-page1.txt: [('-', ''), ('-', '')]
NMN19100120-V06-01-page2.txt: [('--', '-'), ('-', ''), ('-', ''), ('-to', 'to'), ('--', '-'), ('-', ''), ('S-', 'S'), ('iir-tcP-', 'iir-tcP')]
NMN19100120-V06-01-page3.txt: [('-', ''), ('camT-', 'camT'), ('-', '')]
NMN19100120-V06-01-page4.txt: [('s-', 's'), ('-.', '.'), ('--', '-'), ('--o--', '-o--')]
NMN19100120-V06-01-page5.txt: [('IT-', 'IT'), ('--r', '-r'), (".-'-", ".-'"), ('-', ''), ('-', ''), ('--', '-'), ('-..W.', '..W.'), ('-.', '.'), ('-', ''), ('L-', 'L'), ('-', ''), ('-', ''), ('.ot-', '.ot'), ('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('-', ''), ('-', ''), ('-r', 'r')]
NMN19100120-V06-01-page6.txt: [('-', ''), ('-Adam', 'Adam'), ('-each.', 'each.'), ('-be', 'be'), ('-Died', 'Died')]
NMN19100120-V06-01-page7.txt: [('-', ''), ('--', '-'), ('r-', 'r'), ('-the', 'the'), ('-.', '.'), ('.-', '.'), ('-that', 'that'), ('-ess', 'ess'), ('v-', 'v'), ('-en.Lssl', 'en.Lssl'), ('Jar..-', 'Jar..')]
NMN19100120-V06-01-page8.txt: [('Pm-', 'Pm')]
NMN19100120-V06-01-page9.txt: [('-it-wised.', 'it-wised.'), ('-', '')]
NMN19100224-V06-02-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-n', 'n')]
NMN19100224-V06-02-page2.txt: [('-', ''), ('--', '-')]
NMN19100224-V06-02-page3.txt: [('-', ''), ('--', '-'), ('--Alya', '-Alya'), ('--Mrs.', '-Mrs.'), ('-', ''), ('-', ''), ('--o--', '-o--'), ('service--', 'service-')]
NMN19100224-V06-02-page4.txt: [('-', ''), ('-', ''), ('-', '')]
NMN19100224-V06-02-page5.txt: [('-', ''), ('-', ''), ('--', '-'), ('-', ''), ('-.', '.')]
NMN19100224-V06-02-page6.txt: [('--', '-'), ('--', '-')]
NMN19100224-V06-02-page7.txt: [('-WANTED.', 'WANTED.'), ('papers-', 'papers'), ('--o--', '-o--'), ('-', ''), ('-', ''), ('-her', 'her'), ('--', '-')]

Check Correction 3

In [20]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/NMN/correction3

Average verified rate: 0.9186944190255666

Average of error rates: 0.08445244956772334

Total token count: 197797

In [21]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[21]:
[('e', 557),
 ('t', 371),
 ("'", 289),
 ('th', 256),
 ('r', 236),
 ('n', 224),
 ('w', 208),
 ('f', 159),
 ('m', 157),
 ('d', 154),
 ('g', 111),
 ('aro', 89),
 ('u', 83),
 ('co', 73),
 ('bo', 73),
 ('re', 69),
 ('-', 69),
 ('k', 68),
 ('wo', 66),
 ('se', 38),
 ('x', 38),
 ('tc', 36),
 ('te', 33),
 ('nd', 31),
 ('es', 29),
 ('ti', 29),
 ('z', 28),
 ('li', 28),
 ('ie', 28),
 ('leetsville', 28),
 ("canvassers'", 27),
 ('mt', 27),
 ('al', 27),
 ('tt', 26),
 ('willaman', 26),
 ('ft', 23),
 ('ay', 23),
 ('ne', 20),
 ('ce', 20),
 ('soo', 20),
 ('q', 20),
 ('il', 19),
 ("elders'", 19),
 ('myrta', 18),
 ('ee', 18),
 ('altho', 18),
 ('rs', 18),
 ('ca', 17),
 ('ro', 17),
 ('tr', 16)]

Correction 4 -- Remove Extra Quotation Marks

In [22]:
# %load shared_elements/replace_extra_quotation_marks.py
prev = cycle
cycle = "correction4"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    corrections = []
    for token in tokens:
        token_list = list(token)
        last_char = token_list[-1]

        if last_char is "'":
            if len(token) > 1:
                if token_list[-2] is 's' or 'S':
                    pass
                else:
                    corrections.append((token, re.sub(r"'", r"", token)))
            else:
                pass
        elif token[0] is "'":
            corrections.append((token, re.sub(r"'", r"", token)))   
        else:
            pass
    
    if len(corrections) > 0:
        print('{}: {}'.format(filename, corrections))

        for correction in corrections:
            content = clean.replace_pair(correction, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
NMN19070108-V03-01-page1.txt: [("'calling", 'calling'), ("'ICH.", 'ICH.')]
NMN19070108-V03-01-page2.txt: [("'interesting.", 'interesting.'), ("'I", 'I')]
NMN19070129-V03-02-page1.txt: [("'What", 'What'), ("'work", 'work'), ("'There", 'There')]
NMN19070129-V03-02-page4.txt: [("'bay", 'bay')]
NMN19070129-V03-02-page6.txt: [("'Society", 'Society'), ("'Menominee", 'Menominee')]
NMN19070219-V03-03-page1.txt: [("'E", 'E')]
NMN19070219-V03-03-page5.txt: [("'rith", 'rith')]
NMN19070312-V03-04-page1.txt: [("'LICH.", 'LICH.')]
NMN19070401-V03-05-page3.txt: [("'.hat", '.hat'), ("'io", 'io')]
NMN19070401-V03-05-page7.txt: [("'NOTIC", 'NOTIC')]
NMN19070401-V03-05-page8.txt: [("'sets.", 'sets.')]
NMN19070423-V03-06-page1.txt: [("'with", 'with'), ("'you", 'you')]
NMN19070423-V03-06-page6.txt: [("'t", 't'), ("'wing", 'wing')]
NMN19070514-V03-07-page1.txt: [("'The", 'The')]
NMN19070514-V03-07-page2.txt: [("'soreheads'.", 'soreheads.')]
NMN19070514-V03-07-page6.txt: [("'We", 'We'), ("'I", 'I'), ("'The", 'The')]
NMN19070604-V03-08-page1.txt: [("'The", 'The')]
NMN19070604-V03-08-page2.txt: [("'From", 'From')]
NMN19070604-V03-08-page3.txt: [("'were", 'were')]
NMN19070625-V03-09-page2.txt: [("'We", 'We')]
NMN19070625-V03-09-page5.txt: [("'could", 'could')]
NMN19070625-V03-09-page6.txt: [("'Christian", 'Christian')]
NMN19070716-V03-10-page1.txt: [("'ne", 'ne')]
NMN19070716-V03-10-page3.txt: [("'Twas", 'Twas')]
NMN19070716-V03-10-page4.txt: [("'heirs", 'heirs')]
NMN19070716-V03-10-page5.txt: [("'Brother", 'Brother')]
NMN19070813-V03-11-page2.txt: [("'cause", 'cause'), ("'There", 'There')]
NMN19070813-V03-11-page3.txt: [("'reople's", 'reoples'), ("'gachera.", 'gachera.')]
NMN19070917-V03-12-page1.txt: [("'Church", 'Church')]
NMN19070917-V03-12-page5.txt: [("'seen", 'seen')]
NMN19070917-V03-12-page6.txt: [("'ave", 'ave')]
NMN19071008-V03-13-page1.txt: [("'i.", 'i.'), ("'act", 'act'), ("'.", '.')]
NMN19071008-V03-13-page2.txt: [("'UTE", 'UTE'), ("'eeen", 'eeen'), ("'eleasures.", 'eleasures.')]
NMN19071008-V03-13-page3.txt: [("'kl", 'kl')]
NMN19071008-V03-13-page4.txt: [("'fr.", 'fr.'), ("'...", '...')]
NMN19071008-V03-13-page6.txt: [("'Takes", 'Takes')]
NMN19071029-V03-14-page4.txt: [("'two", 'two')]
NMN19071029-V03-14-page5.txt: [("'c", 'c')]
NMN19071119-V03-15-page1.txt: [("'WHERE", 'WHERE')]
NMN19071119-V03-15-page2.txt: [("'continue.", 'continue.'), ("'Ur", 'Ur')]
NMN19071119-V03-15-page4.txt: [("'Mrs.", 'Mrs.')]
NMN19071119-V03-15-page5.txt: [("'Standard", 'Standard')]
NMN19071119-V03-15-page6.txt: [("'cannot", 'cannot')]
NMN19071119-V03-15-page7.txt: [("'married", 'married')]
NMN19071119-V03-15-page8.txt: [("'.or", '.or')]
NMN19071119-V03-15-page9.txt: [("'ell", 'ell')]
NMN19071210-V03-16-page1.txt: [("'JICHIi", 'JICHIi')]
NMN19071210-V03-16-page2.txt: [("'rod", 'rod'), ("'Oreee", 'Oreee'), ("'Iced", 'Iced'), ("'an", 'an'), ("'Rand", 'Rand')]
NMN19071210-V03-16-page3.txt: [("'ATE", 'ATE'), ("'jerk's", 'jerks')]
NMN19071210-V03-16-page4.txt: [("'with", 'with'), ("'.Copper", '.Copper')]
NMN19071210-V03-16-page6.txt: [("'Report", 'Report'), ('\'e."', 'e."')]
NMN19071210-V03-16-page7.txt: [("'s", 's'), ("'rho", 'rho'), ("'Timbers", 'Timbers')]
NMN19080107-V04-01-page1.txt: [("'military", 'military')]
NMN19080107-V04-01-page2.txt: [("'leo.", 'leo.')]
NMN19080107-V04-01-page5.txt: [("'Cno.t", 'Cno.t')]
NMN19080107-V04-01-page6.txt: [("'Auong", 'Auong')]
NMN19080128-V04-02-page1.txt: [("'noath", 'noath'), ('\'"', '"'), ('\'"', '"')]
NMN19080128-V04-02-page3.txt: [("'.re", '.re')]
NMN19080128-V04-02-page4.txt: [("'That", 'That'), ("'or", 'or')]
NMN19080128-V04-02-page5.txt: [("'to", 'to'), ("'that", 'that'), ("'Aril", 'Aril')]
NMN19080128-V04-02-page6.txt: [("'To", 'To')]
NMN19080128-V04-02-page8.txt: [("'I", 'I'), ("'that", 'that')]
NMN19080218-V04-03-page2.txt: [("'The", 'The')]
NMN19080218-V04-03-page4.txt: [("'i.", 'i.')]
NMN19080218-V04-03-page5.txt: [("'the", 'the')]
NMN19080218-V04-03-page6.txt: [("'v", 'v'), ("'Noe", 'Noe'), ("'before", 'before')]
NMN19080310-V04-04-page3.txt: [("'eT", 'eT'), ("'WS", 'WS')]
NMN19080310-V04-04-page4.txt: [("'s", 's')]
NMN19080310-V04-04-page6.txt: [("'TLUNTEra", 'TLUNTEra')]
NMN19080310-V04-04-page9.txt: [("'better", 'better')]
NMN19080331-V04-05-page1.txt: [("'VOL.", 'VOL.'), ("'For", 'For')]
NMN19080331-V04-05-page2.txt: [("'net", 'net'), ("'to", 'to')]
NMN19080331-V04-05-page3.txt: [("'nfluonce", 'nfluonce'), ("'Tod", 'Tod')]
NMN19080331-V04-05-page8.txt: [("'our", 'our'), ('\'sick."', 'sick."')]
NMN19080421-V04-06-page1.txt: [("'TOL.", 'TOL.'), ("'us", 'us')]
NMN19080421-V04-06-page2.txt: [("'W", 'W'), ("'.re", '.re'), ("'o", 'o'), ("'thereof", 'thereof')]
NMN19080421-V04-06-page3.txt: [("'pie", 'pie')]
NMN19080421-V04-06-page4.txt: [("'.", '.')]
NMN19080421-V04-06-page5.txt: [("'Tin", 'Tin'), ("'.", '.')]
NMN19080421-V04-06-page6.txt: [("'iOhK.", 'iOhK.')]
NMN19080421-V04-06-page8.txt: [("'iatt.", 'iatt.'), ("'inciples", 'inciples')]
NMN19080512-V04-07-page1.txt: [("'Publish", 'Publish'), ("'ten", 'ten')]
NMN19080512-V04-07-page3.txt: [("'ecoive", 'ecoive'), ("''neighborhooe", 'neighborhooe'), ("'e", 'e'), ("'Sons", 'Sons')]
NMN19080512-V04-07-page4.txt: [("'Dom", 'Dom'), ("'Now", 'Now'), ("'growing", 'growing')]
NMN19080512-V04-07-page7.txt: [("'it.", 'it.')]
NMN19080623-V04-08-page1.txt: [("'Lamb", 'Lamb')]
NMN19080623-V04-08-page2.txt: [("'blessed", 'blessed')]
NMN19080623-V04-08-page3.txt: [("'eody", 'eody')]
NMN19080623-V04-08-page5.txt: [("'A", 'A'), ("'What", 'What')]
NMN19080714-V04-09-page1.txt: [("'in", 'in'), ("'and", 'and'), ("'..ee", '..ee'), ("'diligent", 'diligent')]
NMN19080714-V04-09-page3.txt: [("'and", 'and')]
NMN19080714-V04-09-page6.txt: [("'fifty", 'fifty')]
NMN19080714-V04-09-page7.txt: [("'e", 'e')]
NMN19080714-V04-09-page9.txt: [("'t", 't')]
NMN19080804-V04-10-page2.txt: [("''or", 'or')]
NMN19080804-V04-10-page3.txt: [("'aefore", 'aefore'), ("'sited", 'sited')]
NMN19080804-V04-10-page4.txt: [("'God's", 'Gods')]
NMN19080804-V04-10-page5.txt: [("'field", 'field')]
NMN19080804-V04-10-page6.txt: [("'I", 'I'), ("'Sound", 'Sound')]
NMN19080804-V04-10-page8.txt: [("'vve", 'vve')]
NMN19080818-V04-11-page1.txt: [("'re", 're')]
NMN19080818-V04-11-page2.txt: [("'Your", 'Your')]
NMN19080915-V04-12-page1.txt: [("'L", 'L'), ("'Tight", 'Tight'), ("'Tony", 'Tony')]
NMN19080915-V04-12-page5.txt: [("'resented", 'resented')]
NMN19080915-V04-12-page6.txt: [("'eto", 'eto'), ("'Janie", 'Janie')]
NMN19080922-V04-13-page1.txt: [('\'"What', '"What'), ("'filled", 'filled')]
NMN19081006-V04-14-page1.txt: [("'e", 'e'), ("'bemuse", 'bemuse')]
NMN19081006-V04-14-page3.txt: [("'move", 'move'), ("'la", 'la')]
NMN19081006-V04-14-page4.txt: [("'iwy", 'iwy')]
NMN19081006-V04-14-page5.txt: [("'We", 'We')]
NMN19081006-V04-14-page6.txt: [("'The", 'The')]
NMN19081027-V04-15-page1.txt: [("'Twas", 'Twas')]
NMN19081027-V04-15-page4.txt: [("'Comer", 'Comer')]
NMN19081027-V04-15-page5.txt: [("'Abstain", 'Abstain')]
NMN19081027-V04-15-page7.txt: [("'PPER", 'PPER')]
NMN19081124-V04-16-page1.txt: [("'.", '.'), ("'Behold", 'Behold'), ("'If", 'If'), ("'I", 'I'), ("'I", 'I')]
NMN19081124-V04-16-page2.txt: [("'Nv", 'Nv')]
NMN19081124-V04-16-page3.txt: [("'acmes", 'acmes')]
NMN19081124-V04-16-page5.txt: [("'running", 'running'), ("'rowing", 'rowing')]
NMN19081124-V04-16-page7.txt: [("'etoskey", 'etoskey')]
NMN19081215-V04-17-page1.txt: [("'Your", 'Your')]
NMN19081215-V04-17-page3.txt: [("'.", '.')]
NMN19081215-V04-17-page7.txt: [("'Tao", 'Tao')]
NMN19090115-V05-01-page3.txt: [("''MISSIONARY", 'MISSIONARY')]
NMN19090115-V05-01-page4.txt: [("'ehe", 'ehe')]
NMN19090115-V05-01-page5.txt: [("'workmen", 'workmen'), ("'pause", 'pause')]
NMN19090115-V05-01-page7.txt: [("'ust", 'ust')]
NMN19090115-V05-01-page8.txt: [("'f.", 'f.'), ("''.", '.')]
NMN19090115-V05-01-page9.txt: [("'retheen", 'retheen')]
NMN19090215-V05-02-page1.txt: [("'Tis", 'Tis')]
NMN19090215-V05-02-page2.txt: [("'of", 'of'), ("'are", 'are')]
NMN19090215-V05-02-page4.txt: [("'Ake", 'Ake'), ("'Adch.", 'Adch.')]
NMN19090215-V05-02-page6.txt: [("'thorough", 'thorough'), ("'Medical", 'Medical')]
NMN19090215-V05-02-page7.txt: [("'Let", 'Let'), ("'at", 'at'), ("'hole", 'hole')]
NMN19090318-V05-03-page1.txt: [("'Tis", 'Tis')]
NMN19090318-V05-03-page4.txt: [("'held", 'held'), ("'long", 'long'), ("'YE", 'YE')]
NMN19090318-V05-03-page5.txt: [("'arch", 'arch')]
NMN19090318-V05-03-page6.txt: [("'Coming", 'Coming')]
NMN19090318-V05-03-page8.txt: [("'sirs.", 'sirs.')]
NMN19090318-V05-03-page9.txt: [("'mavay", 'mavay')]
NMN19090415-V05-04-page1.txt: [("'ut", 'ut')]
NMN19090415-V05-04-page2.txt: [("'.", '.'), ("'he", 'he'), ("'There", 'There')]
NMN19090415-V05-04-page3.txt: [("'work", 'work'), ("'eeve", 'eeve')]
NMN19090415-V05-04-page4.txt: [("'ceased", 'ceased')]
NMN19090415-V05-04-page5.txt: [("'uman", 'uman'), ("'.ro.", '.ro.'), ("'SISSIONARYj'O", 'SISSIONARYjO'), ("'egae", 'egae')]
NMN19090514-V05-05-page2.txt: [("'e", 'e'), ("'How", 'How'), ("'No", 'No')]
NMN19090514-V05-05-page3.txt: [("'e", 'e'), ("'Y", 'Y'), ("'Necessity", 'Necessity'), ("'Let", 'Let'), ("'which", 'which')]
NMN19090514-V05-05-page5.txt: [("'liner", 'liner')]
NMN19090514-V05-05-page7.txt: [("'third", 'third')]
NMN19090514-V05-05-page9.txt: [("'.s", '.s'), ("'NI", 'NI')]
NMN19090615-V05-06-page1.txt: [("'.", '.'), ("'What", 'What'), ("'You", 'You'), ("'Len", 'Len')]
NMN19090615-V05-06-page3.txt: [("'Nerd", 'Nerd')]
NMN19090615-V05-06-page4.txt: [("'lace", 'lace'), ("'stumps", 'stumps'), ("'IRON", 'IRON')]
NMN19090615-V05-06-page6.txt: [("'ors.", 'ors.'), ("'Business", 'Business')]
NMN19090615-V05-06-page7.txt: [("'Raw", 'Raw'), ("'having", 'having')]
NMN19090615-V05-06-page8.txt: [("'This", 'This')]
NMN19090715-V05-07-page1.txt: [("'NORT", 'NORT'), ("'recrma", 'recrma')]
NMN19090715-V05-07-page2.txt: [("'.", '.')]
NMN19090715-V05-07-page3.txt: [("'.nciple", '.nciple')]
NMN19090715-V05-07-page5.txt: [("'Mara", 'Mara')]
NMN19090715-V05-07-page6.txt: [("'lbs.", 'lbs.'), ("'ich", 'ich'), ("'o", 'o'), ("'Any", 'Any'), ("'ature", 'ature'), ("'Lie", 'Lie'), ("'but", 'but'), ("'T", 'T'), ("'aitnessed", 'aitnessed'), ("'ACIFT", 'ACIFT')]
NMN19090812-V05-08-page1.txt: [("'PrIN", 'PrIN'), ("'Te", 'Te'), ("'yelp", 'yelp'), ("'radar", 'radar')]
NMN19090812-V05-08-page2.txt: [("'LiM", 'LiM'), ("'-ill", '-ill'), ("'se", 'se')]
NMN19090812-V05-08-page4.txt: [("'Go", 'Go'), ("'hem", 'hem')]
NMN19090812-V05-08-page5.txt: [("'eve.", 'eve.'), ("'nfi", 'nfi'), ("'nook", 'nook'), ("'as", 'as'), ("'Visit", 'Visit')]
NMN19090812-V05-08-page6.txt: [("'MTH", 'MTH')]
NMN19090812-V05-08-page8.txt: [("'tether", 'tether'), ("'Tract", 'Tract')]
NMN19090915-V05-09-page2.txt: [("'..ance", '..ance'), ("'END", 'END'), ("'.", '.'), ("'.cork", '.cork'), ("'CLT", 'CLT')]
NMN19090915-V05-09-page3.txt: [("'Conference", 'Conference'), ("'When", 'When'), ("'Jill", 'Jill'), ("'Our", 'Our')]
NMN19090915-V05-09-page4.txt: [("'weextend", 'weextend'), ("'On", 'On'), ("'F.", 'F.'), ("'D.", 'D.'), ("'On", 'On'), ("'Rernast", 'Rernast')]
NMN19090915-V05-09-page5.txt: [("'Ee", 'Ee'), ("'Iome", 'Iome')]
NMN19090915-V05-09-page6.txt: [("'j", 'j')]
NMN19090915-V05-09-page7.txt: [("'he", 'he')]
NMN19090915-V05-09-page8.txt: [("'Volunteer", 'Volunteer'), ("'blase", 'blase'), ("'TOR", 'TOR')]
NMN19090915-V05-09-page9.txt: [("'Tao", 'Tao'), ("'.ess", '.ess'), ("'t", 't'), ("'e", 'e')]
NMN19091015-V05-10-page1.txt: [("'put", 'put'), ("'We.", 'We.')]
NMN19091015-V05-10-page2.txt: [("'at", 'at'), ("'in", 'in'), ("'Mine", 'Mine'), ("'We.", 'We.')]
NMN19091015-V05-10-page3.txt: [("'co.", 'co.'), ("'Icarts.", 'Icarts.'), ("'he", 'he'), ("'e", 'e')]
NMN19091015-V05-10-page5.txt: [("'reading", 'reading'), ("'Mess", 'Mess')]
NMN19091015-V05-10-page6.txt: [("'chat", 'chat'), ("'i", 'i')]
NMN19091015-V05-10-page7.txt: [("'olunteer", 'olunteer'), ("'All", 'All'), ("'moaning", 'moaning'), ("'aive", 'aive'), ("'a", 'a')]
NMN19091015-V05-10-page8.txt: [("'..f", '..f')]
NMN19091015-V05-10-page9.txt: [("''very", 'very')]
NMN19091115-V05-11-page1.txt: [("''after", 'after'), ("'.oyalty", '.oyalty'), ("'.", '.'), ("'or", 'or')]
NMN19091115-V05-11-page2.txt: [("'S", 'S'), ('\'Cations".', 'Cations".'), ("'Pe", 'Pe')]
NMN19091115-V05-11-page3.txt: [("'elp", 'elp')]
NMN19091115-V05-11-page4.txt: [("'Jou", 'Jou'), ("'ave", 'ave')]
NMN19091115-V05-11-page5.txt: [("'e", 'e')]
NMN19091115-V05-11-page6.txt: [("'ee", 'ee')]
NMN19091115-V05-11-page7.txt: [("'osier's", 'osiers'), ("'Outline", 'Outline')]
NMN19091115-V05-11-page8.txt: [("'.fares", '.fares'), ("'Yo", 'Yo'), ("'too", 'too')]
NMN19091209-V05-12-page1.txt: [("'TOL.", 'TOL.'), ("'Qum", 'Qum'), ('\'"', '"'), ("'should", 'should'), ("'b", 'b'), ("'Are", 'Are'), ("'sh", 'sh'), ("'.ors", '.ors')]
NMN19091209-V05-12-page2.txt: [("'THE", 'THE'), ("'.nest", '.nest')]
NMN19091209-V05-12-page5.txt: [("'wiolo", 'wiolo'), ("'''roo.", 'roo.'), ("'.", '.'), ("'irothror.", 'irothror.')]
NMN19091209-V05-12-page6.txt: [("'..tore", '..tore'), ("'tic", 'tic'), ("'loll.", 'loll.'), ("'orGuilford", 'orGuilford')]
NMN19091209-V05-12-page7.txt: [("'se", 'se'), ("'roe", 'roe')]
NMN19100120-V06-01-page2.txt: [("'WE", 'WE')]
NMN19100120-V06-01-page5.txt: [("'Do", 'Do'), ("'ZL.", 'ZL.'), ("'P.", 'P.'), ("'Zip", 'Zip')]
NMN19100120-V06-01-page7.txt: [("'Lash.", 'Lash.'), ("'to", 'to'), ("'looting", 'looting'), ("'The", 'The')]
NMN19100120-V06-01-page8.txt: [("'sere", 'sere'), ("'the", 'the'), ("'save", 'save'), ("'mica", 'mica')]
NMN19100120-V06-01-page9.txt: [("'nefore", 'nefore'), ("'Y.", 'Y.')]
NMN19100224-V06-02-page2.txt: [("'They", 'They'), ("'The", 'The')]
NMN19100224-V06-02-page4.txt: [("'The", 'The')]
NMN19100224-V06-02-page7.txt: [("'the", 'the'), ("'We", 'We')]

Check Correction 4

In [23]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/NMN/correction4

Average verified rate: 0.9202623368611607

Average of error rates: 0.08289913544668587

Total token count: 197761

In [24]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[24]:
[('e', 571),
 ('t', 379),
 ('th', 256),
 ('r', 237),
 ("'", 236),
 ('n', 224),
 ('w', 210),
 ('f', 163),
 ('m', 157),
 ('d', 155),
 ('g', 111),
 ('aro', 89),
 ('u', 83),
 ('co', 74),
 ('bo', 73),
 ('re', 70),
 ('-', 69),
 ('k', 68),
 ('wo', 67),
 ('se', 40),
 ('x', 38),
 ('tc', 36),
 ('te', 34),
 ('nd', 31),
 ('es', 29),
 ('ti', 29),
 ('z', 28),
 ('li', 28),
 ('ie', 28),
 ('leetsville', 28),
 ("canvassers'", 27),
 ('al', 27),
 ('mt', 27),
 ('tt', 26),
 ('willaman', 26),
 ('ft', 23),
 ('ay', 23),
 ('ne', 21),
 ('ce', 20),
 ('ee', 20),
 ('soo', 20),
 ('q', 20),
 ('il', 19),
 ('myrta', 18),
 ("elders'", 18),
 ('altho', 18),
 ('rs', 18),
 ('ca', 17),
 ('ro', 17),
 ('tr', 16)]

Correction 5 -- Rejoin Burst Words

In [25]:
# %load shared_elements/rejoin_burst_words.py
prev = cycle
cycle = "correction5"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    pattern = re.compile("(\s(\w{1,2}\s){5,})")
    
    replacements = []
    clean.check_splits(pattern, spelling_dictionary, content, replacements)
    
    if len(replacements) > 0:
        print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
NMN19070312-V03-04-page1.txt: [(' C O N F E R E N C E ', 'CONFERENCE')]
NMN19080512-V04-07-page7.txt: [('Ii', 'Ii')]
NMN19081027-V04-15-page1.txt: [(' S H E E T\n', 'SHEET')]
NMN19090215-V05-02-page1.txt: [(' S H E E T\n', 'SHEET')]
NMN19090215-V05-02-page7.txt: [('In', 'In')]
NMN19090715-V05-07-page1.txt: [('\nh E N C E\n', 'hENCE')]
NMN19090812-V05-08-page6.txt: [('No', 'No')]
NMN19090915-V05-09-page1.txt: [('Be', 'Be')]
NMN19100120-V06-01-page5.txt: [('It', 'It'), ('It', 'It')]
NMN19100224-V06-02-page5.txt: [('As', 'As')]

Check Correction 5

In [26]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/NMN/correction5

Average verified rate: 0.9203216263369491

Average of error rates: 0.08282997118155619

Total token count: 197745

In [27]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[27]:
[('e', 562),
 ('t', 377),
 ('th', 256),
 ("'", 236),
 ('r', 236),
 ('n', 221),
 ('w', 210),
 ('f', 162),
 ('m', 157),
 ('d', 155),
 ('g', 111),
 ('aro', 89),
 ('u', 83),
 ('co', 74),
 ('bo', 73),
 ('re', 70),
 ('-', 69),
 ('k', 68),
 ('wo', 67),
 ('se', 40),
 ('x', 38),
 ('tc', 36),
 ('te', 34),
 ('nd', 31),
 ('es', 29),
 ('ti', 29),
 ('z', 28),
 ('li', 28),
 ('ie', 28),
 ('leetsville', 28),
 ("canvassers'", 27),
 ('al', 27),
 ('mt', 27),
 ('tt', 26),
 ('willaman', 26),
 ('ft', 23),
 ('ay', 23),
 ('ne', 21),
 ('ce', 20),
 ('ee', 20),
 ('soo', 20),
 ('q', 20),
 ('il', 19),
 ('myrta', 18),
 ("elders'", 18),
 ('altho', 18),
 ('rs', 18),
 ('ca', 17),
 ('ro', 17),
 ('tr', 16)]

Correction 6 -- Rejoin Split Words

In [28]:
# %load shared_elements/rejoin_split_words.py
prev = cycle
cycle = "correction6"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    errors = reports.identify_errors(tokens, spelling_dictionary)

    replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=False)
    
    if len(replacements) > 0:
        print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_split_words(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
NMN19070108-V03-01-page2.txt: [('te', 'r')]
NMN19070108-V03-01-page4.txt: [('th', 'at'), ('bo', 'A')]
NMN19070129-V03-02-page6.txt: [('co', 'well'), ('Soci', 'ety')]
NMN19070219-V03-03-page1.txt: [('SI', 'C')]
NMN19070219-V03-03-page3.txt: [('OBITUAR', 'Y')]
NMN19070312-V03-04-page1.txt: [('wo', 'ld')]
NMN19070401-V03-05-page1.txt: [('MICHIG', 'AN')]
NMN19070401-V03-05-page4.txt: [('th', 'at')]
NMN19070401-V03-05-page5.txt: [('co', 'operate')]
NMN19070401-V03-05-page7.txt: [('IC', 'Ed')]
NMN19070423-V03-06-page3.txt: [('responsibl', 'e')]
NMN19070514-V03-07-page3.txt: [('distrib', 'uted')]
NMN19070514-V03-07-page5.txt: [('Ra', 'ds')]
NMN19070604-V03-08-page3.txt: [('co', 'operation')]
NMN19070716-V03-10-page2.txt: [('th', 'a'), ('grat', 'ifying')]
NMN19070716-V03-10-page4.txt: [('camp-', 'meeting'), ('ne', 'cessary')]
NMN19070716-V03-10-page5.txt: [('ca', 'en')]
NMN19070813-V03-11-page1.txt: [('SHEE', 'T')]
NMN19070813-V03-11-page6.txt: [('Peo', 'ple')]
NMN19070813-V03-11-page7.txt: [('CA', 'T')]
NMN19070917-V03-12-page1.txt: [('SHEE', 'T')]
NMN19071008-V03-13-page1.txt: [('co', 'operate'), ('fa', 't')]
NMN19071008-V03-13-page2.txt: [('ta', 'le'), ('CU', 'E')]
NMN19071008-V03-13-page7.txt: [('NA', 'TIONS'), ('CREA', 'TURE')]
NMN19071029-V03-14-page1.txt: [('cura', 'te'), ('Al', 'to')]
NMN19071029-V03-14-page7.txt: [('th', 'at')]
NMN19071119-V03-15-page2.txt: [('co', 'operate'), ('re', 'te')]
NMN19071119-V03-15-page4.txt: [('cer', 'tainly'), ('ne', 'man')]
NMN19071210-V03-16-page1.txt: [('bo', 'n')]
NMN19071210-V03-16-page5.txt: [('co', 'operation'), ('re', 'organize')]
NMN19080107-V04-01-page2.txt: [('ascer', 'tained')]
NMN19080107-V04-01-page3.txt: [('contribut', 'ions')]
NMN19080107-V04-01-page4.txt: [('bo', 'a')]
NMN19080107-V04-01-page5.txt: [('bo', 'a')]
NMN19080128-V04-02-page6.txt: [('TI', 'C')]
NMN19080128-V04-02-page7.txt: [('oppor', 'tunities')]
NMN19080218-V04-03-page1.txt: [('th', 'at')]
NMN19080218-V04-03-page2.txt: [('bo', 'a')]
NMN19080218-V04-03-page4.txt: [('TAS', 'S')]
NMN19080218-V04-03-page5.txt: [('accom', 'plishing')]
NMN19080218-V04-03-page6.txt: [('sto', 'at')]
NMN19080310-V04-04-page5.txt: [('se', 'a')]
NMN19080331-V04-05-page4.txt: [('th', 'in'), ('co', 'operate')]
NMN19080421-V04-06-page2.txt: [('peo', 'ple')]
NMN19080421-V04-06-page4.txt: [('RE', 'E'), ('fi', 'st')]
NMN19080421-V04-06-page6.txt: [('es', 'to')]
NMN19080512-V04-07-page3.txt: [('sh', 'ould')]
NMN19080512-V04-07-page5.txt: [('ca', 'ro')]
NMN19080623-V04-08-page4.txt: [('th', 'in')]
NMN19080714-V04-09-page7.txt: [('ay', 'in'), ('Soo', 'the')]
NMN19080714-V04-09-page9.txt: [('Cheboy', 'gan')]
NMN19080804-V04-10-page1.txt: [('th', 'at')]
NMN19080804-V04-10-page3.txt: [('ves', 't')]
NMN19080804-V04-10-page5.txt: [('ch', 'at')]
NMN19080818-V04-11-page1.txt: [('th', 'or')]
NMN19080818-V04-11-page2.txt: [('th', 'in')]
NMN19080915-V04-12-page1.txt: [('AB', 'a')]
NMN19080915-V04-12-page2.txt: [('co', 'operation')]
NMN19080915-V04-12-page7.txt: [('OD', 'UM')]
NMN19080915-V04-12-page8.txt: [('DEPARTMEN', 'T')]
NMN19080915-V04-12-page9.txt: [('Esca', 'naba')]
NMN19081006-V04-14-page6.txt: [('Wo', 't')]
NMN19081027-V04-15-page1.txt: [('WIR', 'E')]
NMN19081027-V04-15-page2.txt: [('TI', 'M')]
NMN19081027-V04-15-page4.txt: [('SCHOO', 'L')]
NMN19081027-V04-15-page7.txt: [('CO', 'PPER')]
NMN19081027-V04-15-page8.txt: [('ca', 'ry'), ('ti', 'e')]
NMN19081124-V04-16-page4.txt: [('re', 'a')]
NMN19081124-V04-16-page7.txt: [('treasur', 'e')]
NMN19081215-V04-17-page5.txt: [('educat', 'ion')]
NMN19090115-V05-01-page8.txt: [('Co', 'rd')]
NMN19090215-V05-02-page3.txt: [('al', 'lays')]
NMN19090215-V05-02-page7.txt: [('se', 'at')]
NMN19090318-V05-03-page7.txt: [('al', 'ready')]
NMN19090415-V05-04-page2.txt: [('wh', 'e')]
NMN19090415-V05-04-page5.txt: [('Hr', 's')]
NMN19090415-V05-04-page6.txt: [('re', 'used'), ('bo', 'ne')]
NMN19090514-V05-05-page8.txt: [('Tra', 'verse')]
NMN19090615-V05-06-page2.txt: [('bo', 'used')]
NMN19090615-V05-06-page7.txt: [('cho', 'sen')]
NMN19090615-V05-06-page9.txt: [('daug', 'hter')]
NMN19090715-V05-07-page6.txt: [('ve', 'to'), ('LI', 'n'), ('ay', 'in'), ('re', 'e'), ('ne', 'd'), ('bo', 'n')]
NMN19090715-V05-07-page7.txt: [('NOTIC', 'E')]
NMN19090715-V05-07-page9.txt: [('re', 'sent')]
NMN19090812-V05-08-page1.txt: [('condi', 't'), ('co', 'me')]
NMN19090812-V05-08-page2.txt: [('wa', 'h')]
NMN19090812-V05-08-page3.txt: [('Appli', 'ed')]
NMN19090812-V05-08-page4.txt: [('M.', ''), ('canva', 'sed')]
NMN19090812-V05-08-page5.txt: [('ta', 'ke')]
NMN19090812-V05-08-page6.txt: [('PO', 'S'), ('se', 'e'), ('fi', 'st')]
NMN19090812-V05-08-page7.txt: [('ta', 'Co'), ('co', 'at')]
NMN19090812-V05-08-page8.txt: [('ti', 'e')]
NMN19090915-V05-09-page1.txt: [('SH', 'E'), ('Oppo', 'rtunity')]
NMN19090915-V05-09-page2.txt: [('th', 'a'), ('tem', 'PE')]
NMN19090915-V05-09-page4.txt: [('Mc', 'Clellan'), ('credentia', 'ls'), ('Publi', 'cation')]
NMN19090915-V05-09-page6.txt: [('attentio', 'n')]
NMN19090915-V05-09-page8.txt: [('re', 'order')]
NMN19091015-V05-10-page3.txt: [('wo', 'k')]
NMN19091015-V05-10-page4.txt: [('re', 'called')]
NMN19091015-V05-10-page5.txt: [('op', 'es'), ('ti', 'ed')]
NMN19091015-V05-10-page7.txt: [('mo', 'I'), ('unt', 'o')]
NMN19091015-V05-10-page8.txt: [('Tw', 'o')]
NMN19091015-V05-10-page9.txt: [('re', 'ship')]
NMN19091115-V05-11-page1.txt: [('ro', 'sin')]
NMN19091115-V05-11-page2.txt: [('re', 'baptized')]
NMN19091115-V05-11-page5.txt: [('co', 'operation'), ('organi', 'zed')]
NMN19091115-V05-11-page6.txt: [('th', 'o')]
NMN19091115-V05-11-page7.txt: [('vit', 'A')]
NMN19091115-V05-11-page8.txt: [('ou', 'se')]
NMN19091115-V05-11-page9.txt: [('ch', 'in')]
NMN19091209-V05-12-page2.txt: [('unt', 'il')]
NMN19091209-V05-12-page3.txt: [('rato', 'on')]
NMN19091209-V05-12-page4.txt: [('ri', 'o')]
NMN19091209-V05-12-page6.txt: [('Br', 'other')]
NMN19091209-V05-12-page8.txt: [('sHa', 'T')]
NMN19100120-V06-01-page1.txt: [('Mo', 'A'), ('oppo', 'rtunity')]
NMN19100120-V06-01-page2.txt: [('da', 'y'), ('co', 'operate')]
NMN19100120-V06-01-page5.txt: [('re', 'C'), ('nu', 'n'), ('ti', 'a'), ('ie', 'r')]
NMN19100120-V06-01-page7.txt: [('appr', 'oaching')]
NMN19100120-V06-01-page9.txt: [('re', 'lation')]
NMN19100224-V06-02-page4.txt: [('AC', 'E')]
NMN19100224-V06-02-page7.txt: [('Pa', 'T')]

Check Correction 6

In [29]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/NMN/correction6

Average verified rate: 0.9211084291013243

Average of error rates: 0.08189625360230549

Total token count: 197613

In [30]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[30]:
[('e', 554),
 ('t', 372),
 ('th', 248),
 ("'", 236),
 ('r', 234),
 ('n', 218),
 ('w', 210),
 ('f', 162),
 ('m', 156),
 ('d', 155),
 ('g', 111),
 ('aro', 89),
 ('u', 83),
 ('-', 69),
 ('k', 68),
 ('bo', 67),
 ('wo', 66),
 ('co', 62),
 ('re', 59),
 ('se', 38),
 ('x', 38),
 ('tc', 36),
 ('nd', 31),
 ('te', 31),
 ('z', 28),
 ('li', 28),
 ('leetsville', 28),
 ("canvassers'", 27),
 ('es', 27),
 ('mt', 27),
 ('ie', 27),
 ('al', 26),
 ('tt', 26),
 ('willaman', 26),
 ('ft', 23),
 ('ti', 23),
 ('ay', 21),
 ('ce', 20),
 ('ee', 20),
 ('q', 20),
 ('soo', 19),
 ('il', 18),
 ('myrta', 18),
 ("elders'", 18),
 ('altho', 18),
 ('rs', 18),
 ('ne', 18),
 ('tr', 16),
 ('id', 16),
 ('ea', 16)]

Correction 7 -- Rejoin Split Words II

In [31]:
# %load shared_elements/rejoin_split_words.py
prev = cycle
cycle = "correction7"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    errors = reports.identify_errors(tokens, spelling_dictionary)

    replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=True)
    
    if len(replacements) > 0:
        print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_split_words(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
NMN19070108-V03-01-page4.txt: [('A', 'Tt')]
NMN19070129-V03-02-page6.txt: [('the', 're')]
NMN19070312-V03-04-page1.txt: [('h', "aven's"), ('for', 'th'), ('wo', 'ld')]
NMN19070312-V03-04-page4.txt: [('a', 'id')]
NMN19070401-V03-05-page1.txt: [('Confer', 'ence')]
NMN19070401-V03-05-page3.txt: [('Ho', 'rt')]
NMN19070401-V03-05-page7.txt: [('a', 'id')]
NMN19070514-V03-07-page1.txt: [('C', 'ONFERENCE'), ('mission', 'ary'), ('plain', 'ly')]
NMN19070514-V03-07-page5.txt: [('Ra', 'ds')]
NMN19070625-V03-09-page5.txt: [('wonder', 'ful'), ('be', 'rm'), ('car', 'ded')]
NMN19070716-V03-10-page2.txt: [('tho', 'th')]
NMN19070716-V03-10-page4.txt: [('can', 'ey')]
NMN19070813-V03-11-page1.txt: [('box', 'es')]
NMN19070813-V03-11-page7.txt: [('THE', 'CA')]
NMN19071008-V03-13-page2.txt: [('C', 'AL')]
NMN19071029-V03-14-page1.txt: [('for', 'ay'), ('k', 'ep')]
NMN19071029-V03-14-page4.txt: [('dona', 'tions')]
NMN19071029-V03-14-page5.txt: [('N', 'th')]
NMN19071119-V03-15-page1.txt: [('N', 'EWS')]
NMN19071119-V03-15-page8.txt: [('W', 'ite')]
NMN19071210-V03-16-page7.txt: [('N', 'OTICE')]
NMN19071210-V03-16-page8.txt: [('J', 'OS')]
NMN19080107-V04-01-page2.txt: [('P', 'ea')]
NMN19080128-V04-02-page2.txt: [('var', 'ious')]
NMN19080128-V04-02-page3.txt: [('Depart', 'ment')]
NMN19080128-V04-02-page4.txt: [('in', 'Ch')]
NMN19080128-V04-02-page6.txt: [('can', 'vassers')]
NMN19080218-V04-03-page5.txt: [('attend', 'ance')]
NMN19080310-V04-04-page6.txt: [('m', 'eeting')]
NMN19080310-V04-04-page8.txt: [('S', 'ubscribe')]
NMN19080331-V04-05-page2.txt: [('wild', 'erness')]
NMN19080331-V04-05-page3.txt: [('a', 'gr')]
NMN19080331-V04-05-page5.txt: [('and', 'RE')]
NMN19080331-V04-05-page7.txt: [('Gene', 'al'), ('T', 'ACH')]
NMN19080421-V04-06-page1.txt: [('C', 'ONFERENCE')]
NMN19080421-V04-06-page4.txt: [('fin', 'ish'), ('b', 'es'), ('the', 'RE')]
NMN19080512-V04-07-page4.txt: [('to', 'co')]
NMN19080512-V04-07-page5.txt: [('p', 'ay')]
NMN19080714-V04-09-page4.txt: [('He', 'ft')]
NMN19080714-V04-09-page7.txt: [('b', 'aptismal')]
NMN19080804-V04-10-page1.txt: [('Confer', 'ence')]
NMN19080804-V04-10-page3.txt: [('a', 'ves')]
NMN19080804-V04-10-page5.txt: [('s', 'Ixth'), ('Me', 'ch')]
NMN19080804-V04-10-page8.txt: [('spec', 'Ial')]
NMN19080818-V04-11-page2.txt: [('a', 'nal')]
NMN19080915-V04-12-page1.txt: [('C', 'ONFERENCE')]
NMN19080915-V04-12-page3.txt: [('a', 'pr')]
NMN19080915-V04-12-page4.txt: [('the', 'RE')]
NMN19080922-V04-13-page1.txt: [('S', 'ri')]
NMN19081006-V04-14-page3.txt: [('fur', 'nished')]
NMN19081006-V04-14-page5.txt: [('miss', 'ionary')]
NMN19081006-V04-14-page6.txt: [('C', 'TS')]
NMN19081027-V04-15-page2.txt: [('ha', 're')]
NMN19081027-V04-15-page8.txt: [('adv', 'Anced')]
NMN19081124-V04-16-page1.txt: [('C', 'ONFERENCE')]
NMN19081215-V04-17-page1.txt: [('C', 'IG')]
NMN19081215-V04-17-page7.txt: [('A', 'lian')]
NMN19090115-V05-01-page4.txt: [('confer', 'ence'), ('r', 'esponsibility')]
NMN19090115-V05-01-page8.txt: [('Co', 'rd')]
NMN19090215-V05-02-page2.txt: [('s', 'unshine')]
NMN19090215-V05-02-page5.txt: [('can', 'vassing')]
NMN19090215-V05-02-page6.txt: [('le', 'ft')]
NMN19090215-V05-02-page7.txt: [('w', 'Op'), ('In', 'dia')]
NMN19090318-V05-03-page1.txt: [('perse', 'cution'), ('N', 'TH')]
NMN19090318-V05-03-page5.txt: [('G', 'reat')]
NMN19090318-V05-03-page6.txt: [('be', 'sai')]
NMN19090415-V05-04-page1.txt: [("'", 're')]
NMN19090415-V05-04-page2.txt: [('n', 'eed')]
NMN19090415-V05-04-page4.txt: [('FOR', 'TH')]
NMN19090415-V05-04-page6.txt: [('A', 'IL')]
NMN19090514-V05-05-page1.txt: [('b', 'AY')]
NMN19090514-V05-05-page9.txt: [('the', 'ol')]
NMN19090615-V05-06-page1.txt: [('evi', 'dence')]
NMN19090615-V05-06-page4.txt: [('a', 'LI')]
NMN19090615-V05-06-page7.txt: [('a', 'tt')]
NMN19090715-V05-07-page2.txt: [('A', 'nu')]
NMN19090715-V05-07-page6.txt: [('to', 'il'), ('o', 're'), ('m', 'Ight'), ('t', 'ic')]
NMN19090715-V05-07-page7.txt: [('A', 'IL')]
NMN19090812-V05-08-page1.txt: [('can', 'al')]
NMN19090812-V05-08-page5.txt: [('the', 'cae')]
NMN19090812-V05-08-page6.txt: [('the', 'ca'), ('N', 'TH'), ('No', 'rth'), ('the', 're')]
NMN19090915-V05-09-page3.txt: [('a', 'ny')]
NMN19090915-V05-09-page4.txt: [('encourage', 'ment')]
NMN19090915-V05-09-page5.txt: [('T', 'IE'), ('mar', 'ts')]
NMN19090915-V05-09-page8.txt: [('to', 'Co'), ('p', 'ay')]
NMN19090915-V05-09-page9.txt: [('e', 'ri'), ('the', 'RE')]
NMN19091015-V05-10-page1.txt: [('hoo', 'doos')]
NMN19091015-V05-10-page6.txt: [('e', 'co')]
NMN19091015-V05-10-page7.txt: [('down', 'Ey')]
NMN19091015-V05-10-page9.txt: [('Jo', 'nson')]
NMN19091115-V05-11-page1.txt: [('L', 'ive'), ('a', 'li')]
NMN19091115-V05-11-page8.txt: [('v', 'arious'), ('ou', 'se'), ('p', 'Lainly')]
NMN19091209-V05-12-page1.txt: [('i', 'tS'), ('to', 'ro')]
NMN19091209-V05-12-page2.txt: [('P', 'OM')]
NMN19091209-V05-12-page3.txt: [('ho', 'pper'), ('he', 'RE')]
NMN19091209-V05-12-page4.txt: [('the', 'Es'), ('and', 're')]
NMN19091209-V05-12-page5.txt: [('T', 'oo')]
NMN19091209-V05-12-page6.txt: [('R', 'ums')]
NMN19091209-V05-12-page7.txt: [('for', 'th')]
NMN19100120-V06-01-page1.txt: [('A', 'IL')]
NMN19100120-V06-01-page5.txt: [('e', 'rk'), ('L', 'eas')]
NMN19100120-V06-01-page7.txt: [('A', 'nd'), ('N', 'ow'), ('E', 'arnestly')]
NMN19100224-V06-02-page1.txt: [('S', 'HEET'), ('PET', 'OSKEY'), ('t', 'ie')]
NMN19100224-V06-02-page3.txt: [('A', 'shtabula')]
NMN19100224-V06-02-page7.txt: [('d', 'ay')]

Review Remaining Errors

In [32]:
reports.docs_with_high_error_rate(summary)
Out[32]:
[('NMN19100120-V06-01-page5.txt', 0.495),
 ('NMN19090715-V05-07-page6.txt', 0.34),
 ('NMN19090915-V05-09-page2.txt', 0.297),
 ('NMN19090915-V05-09-page9.txt', 0.285),
 ('NMN19091209-V05-12-page5.txt', 0.281),
 ('NMN19090812-V05-08-page8.txt', 0.269),
 ('NMN19091015-V05-10-page3.txt', 0.26),
 ('NMN19091209-V05-12-page6.txt', 0.243),
 ('NMN19090915-V05-09-page8.txt', 0.24),
 ('NMN19091209-V05-12-page8.txt', 0.234),
 ('NMN19071029-V03-14-page1.txt', 0.232),
 ('NMN19091209-V05-12-page7.txt', 0.225),
 ('NMN19090812-V05-08-page6.txt', 0.223),
 ('NMN19091209-V05-12-page1.txt', 0.217),
 ('NMN19091015-V05-10-page6.txt', 0.211),
 ('NMN19090812-V05-08-page1.txt', 0.211),
 ('NMN19071008-V03-13-page5.txt', 0.21)]
In [33]:
# %load shared_elements/high_error_rates.py
doc_keys = [x[0] for x in reports.docs_with_high_error_rate(summary) if x[1] > 0.3]

# utilities.open_original_docs(doc_keys, directories['cycle'])

Two documents with high errors are tables and very faint typeface.

In [34]:
reports.long_errors(errors_summary, min_length=15)
Out[34]:
(['superintendentswritos',
  "sabbatheschool''was",
  'supplementinwhat',
  'mieeinaaryelrerikeelth',
  'wavarorinarngwommannlp',
  "the'thirtyaeight",
  'miscellamiousoil',
  'sleeeliameeelenql',
  'amongthealationa',
  'gasfeemalielfteesebefile',
  'notwhithstanding',
  'publisherevdonreby-awnh',
  'temperliterature',
  'logibtrationwilli',
  'of-the-lecuvantixinwe',
  'buthishandisuper',
  'importanceofthis',
  'volunteerreading',
  'illastrertedieracring',
  "beautifully'illustrated",
  'theeopportemities',
  'vluatletestimony',
  'teetureetiatee-all',
  'hidingthomsolvus',
  'spirit--meekness',
  'countor-campaign',
  'faithfulconference',
  'reccmuiendations',
  "committees'appointed",
  'spiritually-minded',
  'hichigaagernference',
  'individvalwastos',
  'wmgeleftremodimarm',
  'whisperedtemptationsoftheenemyenticethemtosin',
  'mierepreeentations',
  "cannot'sympathize",
  'edacationalaeork',
  'iialispeakingsevisoofe',
  'petoskey-october',
  'gemerarconserance',
  'nothwithstanding',
  "oursabbath-schools'",
  'aabbethelreereer',
  'writtenespecially'],
 15)

Correction 8 -- Separate Squashed Errors

In [35]:
# %load shared_elements/separate_squashed_words.py
import pandas as pd
from math import log

prev = cycle
cycle = "correction8"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

verified_tokens = []

for filename in corpus:  
    content = utilities.readfile(directories['prev'], filename)
    clean.get_approved_tokens(content, spelling_dictionary, verified_tokens)

tokens_with_freq = dict(collections.Counter(verified_tokens))
words = pd.DataFrame(list(tokens_with_freq.items()), columns=['token','freq'])
words_sorted = words.sort_values('freq', ascending=False)
words_sorted_short = words_sorted[words_sorted.freq > 2]

sorted_list_of_words = list(words_sorted_short['token'])

wordcost = dict((k, log((i+1)*log(len(sorted_list_of_words)))) for i,k in enumerate(sorted_list_of_words))
maxword = max(len(x) for x in sorted_list_of_words)

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    text = utilities.strip_punct(content)
    tokens = utilities.tokenize_text(text)
    
    replacements = []
    
    for token in tokens:
        if not token.lower() in spelling_dictionary:
            if len(token) > 17:
                if re.search(r"[\-\-\'\"]", token):
                    pass
                else:
                    split_string = clean.infer_spaces(token, wordcost, maxword)
                    list_split_string = split_string.split()
                    
                    if clean.verify_split_string(list_split_string, spelling_dictionary):
                        replacements.append((token, split_string))
                    else:
                        pass
            else:
                pass
        else:
            pass
        
    if len(replacements) > 0:
        print("{}: {}".format(filename, replacements))
        
        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
NMN19071008-V03-13-page6.txt: [('faithfulConference', 'faithful Conference')]
NMN19100120-V06-01-page2.txt: [('IIALIspeakingSevisoofe', 'II A L I speaking S e v i s o o f e')]

Review Correction 8

In [36]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/NMN/correction8

Average verified rate: 0.9216250151889506

Average of error rates: 0.08140345821325648

Total token count: 197512

In [37]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[37]:
[('e', 552),
 ('t', 370),
 ('th', 245),
 ("'", 236),
 ('r', 232),
 ('n', 214),
 ('w', 209),
 ('f', 163),
 ('m', 156),
 ('d', 154),
 ('g', 110),
 ('aro', 89),
 ('u', 83),
 ('-', 69),
 ('k', 67),
 ('bo', 67),
 ('wo', 66),
 ('co', 59),
 ('re', 48),
 ('se', 38),
 ('x', 38),
 ('tc', 36),
 ('te', 31),
 ('nd', 30),
 ('z', 28),
 ('leetsville', 28),
 ("canvassers'", 27),
 ('mt', 27),
 ('li', 27),
 ('es', 26),
 ('ie', 26),
 ('willaman', 26),
 ('tt', 25),
 ('al', 24),
 ('ti', 23),
 ('ft', 22),
 ('ce', 20),
 ('ee', 20),
 ('q', 20),
 ('soo', 19),
 ('myrta', 18),
 ("elders'", 18),
 ('altho', 18),
 ('rs', 18),
 ('ne', 18),
 ('ay', 17),
 ('tr', 16),
 ('ea', 16),
 ('il', 15),
 ('wi', 15)]
In [ ]: